Chinese Characters - Image Analysis

Data Exploration

This notebook is an example of early-stage image exploration of Chinese characters. The exploration helped inform my GAN neural net design and dot-product visualizations. The focus here is on skeletonizing the images to improve feature detection (a sketch of that step appears after the preprocessing helpers below).

Libraries

In [3]:
from gapcv.vision import Image, Images
from keras.callbacks import LambdaCallback
import cv2
Using TensorFlow backend.
In [4]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import ndimage
In [5]:
import keras
from keras.applications.vgg16 import VGG16
from keras import backend as K
from keras.models import Sequential, load_model
from keras.layers import activations, BatchNormalization
from keras.layers.core import Flatten, Dense, Dropout
from keras.optimizers import SGD, Adam
from keras.metrics import categorical_crossentropy
from keras import regularizers
from keras.layers.convolutional import *
from keras.callbacks import ModelCheckpoint
In [6]:
import warnings
warnings.filterwarnings(action ='ignore')

Load Data

Data Import via GapCV

In [7]:
images = Images('chinese/ch_train_set/',num_proc='all',config=['nostore','gray'],name='chinese')
In [8]:
def convert_size(row):
    if row == 0:
        return "0B"
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(row, 1024)))
    p = math.pow(1024, i)
    s = round(row / p, 2)
    return "{} {}".format(s, size_name[i])
In [9]:
images[1]
Out[9]:
<gapcv.vision.Image at 0x12d39fcf8>
In [10]:
images[0].data
Out[10]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Data Preprocessing

In [11]:
# via JohnB/MatthewB/AndrewF

def line_thickness(im):
    # estimate stroke thickness: grow the erosion kernel until nothing is left
    i = 1
    eroded = im
    while np.sum(eroded) > 0:
        kernel = np.ones((i, i), np.uint8)
        eroded = cv2.erode(im, kernel, iterations=1)
        i += 1
    return i


def normalize_images(im):
    # map grayscale pixel values to {0, 1} by integer division
    im //= 128
    return im

def vis_image(im):
    fig = plt.figure(figsize=(2,2))
    img_show=im
    plt.axis('off')
    plt.imshow(img_show)
    plt.show()
    
# via Charles
def erosion(im):
    kernel = np.ones((2,2),np.uint8)
    er1 = cv2.erode(im,kernel,iterations = 1)
    vis_image(im)
    vis_image(er1)
    return er1

def toph(im):
    kernel = np.ones((3,3),np.uint8)
    tophat = cv2.morphologyEx(im, cv2.MORPH_TOPHAT, kernel)
    vis_image(im)
    vis_image(tophat)
    return tophat

def blackh(im):
    kernel = np.ones((2,2),np.uint8)
    blackhat = cv2.morphologyEx(im, cv2.MORPH_BLACKHAT, kernel)
    vis_image(im)
    vis_image(blackhat)
    return blackhat

# via AndrewF
def img_centering(im):
    com = ndimage.measurements.center_of_mass(im)

    # Translation distances in x and y axis
    x_trans = int(im.shape[0]//2-com[0])
    y_trans = int(im.shape[1]//2-com[1])

    # Pad and remove pixels from image to perform translation
    if x_trans > 0:
        im2 = np.pad(im, ((x_trans, 0), (0, 0)), mode='constant')
        im2 = im2[:im.shape[0], :]
    else:
        im2 = np.pad(im, ((0, -x_trans), (0, 0)), mode='constant')
        im2 = im2[-x_trans:, :]

    if y_trans > 0:
        im3 = np.pad(im2, ((0, 0), (y_trans, 0)), mode='constant')
        im3 = im3[:, :im.shape[1]]
    else:
        im3 = np.pad(im2, ((0, 0), (0, -y_trans)), mode='constant')
        im3 = im3[:, -y_trans:]

    print("Before: {} \n After: {}".format(com,ndimage.measurements.center_of_mass(im3)))
    vis_image(im)
    vis_image(im3)
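
Since the stated focus of this exploration is skeletonizing the characters, below is a minimal sketch of a skeletonization helper in the same style as the functions above; it assumes a binary 0/1 input like images[i].data and uses only standard OpenCV morphology calls (cv2.erode, cv2.dilate, cv2.subtract):

In [ ]:
# sketch: morphological skeleton via repeated erosion and opening
def skeletonize(im):
    img = (im > 0).astype(np.uint8)        # binarize to 0/1
    skel = np.zeros_like(img)
    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
    while np.count_nonzero(img) > 0:
        eroded = cv2.erode(img, kernel)
        opened = cv2.dilate(eroded, kernel)
        # pixels removed by the opening approximate the stroke centerline
        skel = cv2.bitwise_or(skel, cv2.subtract(img, opened))
        img = eroded
    return skel.astype(np.float32)

# e.g. vis_image(skeletonize(images[6002].data))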

Image Transformations

Centering
In [69]:
img_centering(images[6002].data)
Before: (33.75694893341952, 41.60310277957337) 
 After: (32.75694893341952, 39.60310277957337)
In [70]:
erosion(images[6002].data);
In [14]:
np.sum(images[6002].data)
Out[14]:
1547.0
In [58]:
def p_line_thickness(im):
    # same as line_thickness, but visualize each erosion step
    i = 1
    eroded = im
    while np.sum(eroded) > 0:
        vis_image(eroded)
        kernel = np.ones((i, i), np.uint8)
        eroded = cv2.erode(im, kernel, iterations=1)
        i += 1
    return i
In [59]:
p_line_thickness(images[20].data)
Out[59]:
11
In [16]:
toph(images[6002].data);
In [17]:
blackh(images[6002].data)
Out[17]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Dataframe Construction

In [65]:
def return_script(i, data):
    # parse the script style (jinwen, oracle, etc.) from the file name;
    # file names that do not follow that pattern are treated as 'modern'
    try:
        return data.image.split('_')[2].split('-')[1]
    except Exception:
        return "modern"
        

dict_list={i:{'img_class':img.image.split('/')[-2],
              'image':img.image.split('/')[-1],
              'script':return_script(i,img),
              'label':img.label,
              'thickness':line_thickness(img.data),
              'name':img.name,
              'rawshape':img.rawshape,
              'ressize':img.ressize,
              'size':img.size,
              'shape':img.shape,
              'type':img.type} for i, img in enumerate(images._data)}

# get data frame
train_set=pd.DataFrame(dict_list).T

# min-max scale the stroke thickness feature
thickness = train_set['thickness']
train_set['thickness'] = (thickness - thickness.min()) / (thickness.max() - thickness.min())

# fix data types and duplicate size for the min/max aggregations below
train_set['size'] = train_set['size'].astype('int')
train_set['size_max'] = train_set['size']
train_set['size_min'] = train_set['size']
train_set.head(3)
Out[65]:
image img_class label name rawshape ressize script shape size thickness type size_max size_min
0 a5df-jinwen_013.png a5df 0 a5df-jinwen_013 (65, 79) 0 jinwen (65, 79) 242 0.16 png 242 242
1 a5df-jinwen_007.png a5df 0 a5df-jinwen_007 (65, 79) 0 jinwen (65, 79) 252 0.28 png 252 252
2 a5df-jinwen_012.png a5df 0 a5df-jinwen_012 (65, 79) 0 jinwen (65, 79) 249 0.24 png 249 249
In [44]:
train_set.isna().any()
Out[44]:
image        False
img_class    False
label        False
name         False
rawshape     False
ressize      False
script       False
shape        False
size         False
thickness    False
type         False
size_max     False
size_min     False
dtype: bool
In [45]:
train_set_grouped=train_set.pivot_table(index=['label','type','shape','img_class','script'],
                   values=['image','size_max','size','size_min'],
                   aggfunc={'image':len,'size_max':max,'size':np.mean,'size_min':min}
                  )

train_set_grouped.reset_index(inplace=True)

train_set_grouped.rename(columns={'size':'size_avg'}, inplace=True)
train_set_grouped['size_avg']=train_set_grouped.apply(lambda row: convert_size(row['size_avg']), axis=1)
train_set_grouped['size_max']=train_set_grouped.apply(lambda row: convert_size(row['size_max']), axis=1)
train_set_grouped['size_min']=train_set_grouped.apply(lambda row: convert_size(row['size_min']), axis=1)
train_set_grouped.head()
Out[45]:
label type shape img_class script image size_avg size_max size_min
0 0 png (65, 79) a5df chuxi 7 304.71 B 329.0 B 265.0 B
1 0 png (65, 79) a5df jinwen 24 254.79 B 291.0 B 195.0 B
2 0 png (65, 79) a5df modern 1 233.0 B 233.0 B 233.0 B
3 0 png (65, 79) a5df oracle 1 281.0 B 281.0 B 281.0 B
4 1 png (65, 79) a447 chuxi 170 205.68 B 315.0 B 174.0 B

Data Visualization

In [46]:
images.split = 0.2
X_train, X_test, Y_train, Y_test = images.split

Below are some examples of the images from the data set:

In [47]:
fig=plt.figure(figsize=(12, 12))
columns = 4
rows = 4
for i in range(1, columns*rows +1):
    img_show=X_train[i]
    fig.add_subplot(rows, columns, i)
    plt.axis('off')
    plt.imshow(img_show)
plt.show()
In [48]:
n,c = train_set_grouped['label'].values, train_set_grouped['image'].values
In [49]:
plt.figure(figsize=(20,5))
plt.bar(n,c)
plt.title('Character Distribution by png file count')
plt.xlabel('Character Folder')
plt.ylabel('File count')
plt.show()
In [50]:
sns.set(style="whitegrid")
tips = sns.load_dataset("tips")
tips.head()
Out[50]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [51]:
script_set = train_set[['script','size']]
plt.figure(figsize=(10,6))
ax = sns.violinplot(x="size", y="script",data=script_set,scale="width", palette="Set3")
plt.show()
np.log(train_set['thickness'][train_set['script'] == 'oracle'].astype(float))
In [66]:
# distribution of normalized stroke thickness for each script style
labels = list(train_set['script'].unique())
targets = []

for s in labels:
    targets.append(list(train_set['thickness'][train_set['script'] == s].values))

for t, l in zip(targets, labels):
    print(l)
    sns.distplot(t)
    plt.xlim(0, 1)
    plt.show()
jinwen
modern
chuxi
oracle
smallseal
In [124]:
labels = train_set.img_class.unique()[0:10]

for l in labels:
    label_set = train_set[['script','size','img_class']]
    plt.figure(figsize=(10,2))
    plt.title(l,fontsize=15)
    ax = sns.violinplot(x="size", y="script",data=label_set[label_set['img_class']==l],scale="width", palette="Set3")
    plt.xlabel('',fontsize=10)
    plt.show()
In [24]:
char_set = train_set[['img_class','size']]
plt.figure(figsize=(18,80))
ax = sns.violinplot(x="size", y="img_class",kind='script',data=char_set,scale="width", palette="Set3")

My Model for Keras (different shape)

X1_sh, X2_sh = X_train.shape, X_test.shape
X_train, X_test = X_train.reshape(X1_sh[0], X1_sh[1], X1_sh[2], 1), X_test.reshape(X2_sh[0], X2_sh[1], X2_sh[2], 1)
In [25]:
X_train,X_test = X_train.reshape(X_train.shape[0],65,79,1),X_test.reshape(X_test.shape[0],65,79,1)
In [26]:
batch_size = 64
num_classes = Y_train.shape[1]
epochs = 20
input_shape = X_train.shape[1:]

Keras Layers template: MNIST for Beginners

In [27]:
from keras.callbacks import Callback
In [28]:
model = Sequential()
model.add(Conv2D(64, kernel_size=(3, 3),activation='relu',kernel_initializer='RandomNormal',input_shape=(65,79,1,)))
model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',kernel_initializer='RandomNormal', kernel_regularizer=regularizers.l2(0.3)))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.20))
model.add(Conv2D(64, (3, 3), activation='relu',padding='same',kernel_initializer='RandomNormal', kernel_regularizer=regularizers.l2(0.3)))
model.add(Conv2D(64, (3, 3), activation='relu',padding='same',kernel_initializer='RandomNormal', kernel_regularizer=regularizers.l2(0.2)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Conv2D(128, (3, 3), activation='relu',padding='same',kernel_initializer='RandomNormal',kernel_regularizer=regularizers.l2(0.2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.2)))
model.add(BatchNormalization())
model.add(Dropout(0.25))
model.add(Dense(num_classes, activation='softmax'))

# checkpoint
# filepath="weights-improvement-{epoch:02d}.hdf5"
# checkpoint = ModelCheckpoint(filepath, verbose=1, save_best_only=True, mode='max')
# print_weights = LambdaCallback(on_epoch_end=lambda batch, logs: print(model.layers[0].get_weights()))

# callbacks_list = [checkpoint]



model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.RMSprop(),
              metrics=['accuracy'])
In [29]:
model.fit(X_train,Y_train,epochs=3)
Epoch 1/3
6585/6585 [==============================] - 139s 21ms/step - loss: 15.6359 - acc: 0.3332
Epoch 2/3
6585/6585 [==============================] - 132s 20ms/step - loss: 4.1664 - acc: 0.4612
Epoch 3/3
6585/6585 [==============================] - 132s 20ms/step - loss: 3.8857 - acc: 0.5008
Out[29]:
<keras.callbacks.History at 0x228151ad5f8>
In [30]:
model.evaluate(X_test,Y_test)
1647/1647 [==============================] - 10s 6ms/step
Out[30]:
[4.678720671984378, 0.36612021866970956]
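
As a quick sketch of where that roughly 37% test accuracy comes from (assuming scikit-learn is available; it is not imported in the cells above), the predictions and one-hot labels can be compared class by class:

In [ ]:
# sketch: per-class breakdown of held-out performance (assumes scikit-learn)
from sklearn.metrics import confusion_matrix, classification_report

y_pred = model.predict(X_test).argmax(axis=1)   # predicted class index per image
y_true = Y_test.argmax(axis=1)                  # true class index from the one-hot labels

print(classification_report(y_true, y_pred))

cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(8, 8))
sns.heatmap(cm, cmap='viridis')
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()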
answers = [labels[a.argmax()] for a in answers]
answers = model.predict(X_test_fixed)
answers = [labels[a.argmax()] for a in answers]
y_answer = [labels[y.argmax()] for y in Y_test]

prediction_set = Images('chinese/ch_test/', config=['nostore','gray'])
final_images = np.array([im._imgdata.reshape(65,79,1) for im in prediction_set._data[:]])
final_labels = np.array([im.name for im in prediction_set])
pred = model.predict(final_images)

actual_labels = df2['img_class'].values
actual_label = [prediction_set[i].name for i in range(len(prediction_set))]
pred_label = [final_labels[p.argmax()] for p in pred]
[p.argmax() for p in pred]
In [ ]:
v = 0
for i in range(20):
    if actual_label[i] == pred_label[i]:
        v+=1
        print('{}: CORRECT!'.format(i))
    else:
        print('{}: WRONG!'.format(i))
# print(v,v/200)
In [ ]:
im=Image('chinese/19981.gif',config=['resize=(65,79)','gray'])
im = im.data.reshape(65,79,1)
im = np.expand_dims(im,axis=0)
pred = model.predict(im).argmax()
actual_labels[pred]
In [ ]:
im=Image('chinese/a4a3-real.jpg',config=['resize=(65,79)','gray'])
im = im.data.reshape(65,79,1)
im = np.expand_dims(im,axis=0)
pred = model.predict(im).argmax()
actual_labels[pred]