機械学習の技術向上のためにKaggleのKernelの写経を始めたので、
記録として記事を書いていきます。
画像認識系の勉強をしたかったので、
「Plant Seedling Classification」という植物の分類「画像分類」を行う
コンペに注目してい見ました(既に終了しているコンペです)
「Plant Seedling Classification」
通常画像認識の学習を1から行う場合は時間が掛かってしまいますが、
既にパラメータを学習したモデルを使用する(転移学習)ことで
効率よく同時データを学習させることができるようです。
%matplotlib inline import datetime as dt import matplotlib.pyplot as plt plt.rcParams['figure.figsize'] = [16, 10] plt.rcParams['font.size'] = 16 import numpy as np import os import pandas as pd import seaborn as sns from keras.applications import xception from keras.preprocessing import image from mpl_toolkits.axes_grid1 import ImageGrid from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score , confusion_matrix from tqdm import tqdm from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" start = dt.datetime.now() cache_dir = os.path.expanduser(os.path.join('~', '.keras')) if not os.path.exists(cache_dir): os.makedirs(cache_dir) models_dir = os.path.join(cache_dir, 'models') if not os.path.exists(models_dir): os.makedirs(models_dir) #カテゴリの種類をリストに格納 CATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen', 'Loose Silky-bent', 'Maize', 'Scentless Mayweed', 'Shepherds Purse', 'Small-flowered Cranesbill', 'Sugar beet'] NUM_CATEGORIES = len(CATEGORIES) #それぞれのカテゴリごとのデータ数? ディレクトリの指定 SAMPLE_PER_CATEGORY = 200 SEED = 1987 data_dir = "E:\Kaggle\plant-seedlings-classification" train_dir = os.path.join(data_dir, 'train') test_dir = os.path.join(data_dir, 'test') sample_submission = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv')) #提出ファイルの内容確認 sample_submission.head(2) #カテゴリごとの画像枚数の確認 for category in CATEGORIES: print('{} {} images'.format(category, len(os.listdir(os.path.join(train_dir, category))))) train = [] for category_id, category in enumerate(CATEGORIES): for file in os.listdir(os.path.join(train_dir, category)): train.append(['train/{}/{}'.format(category, file), category_id, category]) train = pd.DataFrame(train, columns=['file', 'category_id', 'category']) train.head(2) train.shape train = pd.concat([train[train['category'] == c][:SAMPLE_PER_CATEGORY] for c in CATEGORIES]) train = train.sample(frac=1) train.index = np.arange(len(train)) train.head(2) train.shape test = [] for file in os.listdir(test_dir): test.append(['test/{}'.format(file), file]) test = pd.DataFrame(test, columns=['filepath', 'file']) test.head(2) test.shape def read_img(filepath, size): img = image.load_img(os.path.join(data_dir, filepath), target_size=size) img = image.img_to_array(img) return img fig = plt.figure(1, figsize=(NUM_CATEGORIES, NUM_CATEGORIES)) grid = ImageGrid(fig, 111, nrows_ncols=(NUM_CATEGORIES, NUM_CATEGORIES), axes_pad=0.05) i = 0 for category_id, category in enumerate(CATEGORIES): for filepath in train[train['category'] == category]['file'].values[:NUM_CATEGORIES]: ax = grid[i] img = read_img(filepath, (224, 224)) ax.imshow(img / 255.) ax.axis('off') if i % NUM_CATEGORIES == NUM_CATEGORIES - 1: ax.text(250, 112, filepath.split('/')[1], verticalalignment='center') i += 1 plt.show() np.random.seed(seed=SEED) rnd = np.random.random(len(train)) train_idx = rnd < 0.8 valid_idx = rnd >= 0.8 ytr = train.loc[train_idx, 'category_id'].values yv = train.loc[valid_idx, 'category_id'].values len(ytr), len(yv) INPUT_SIZE = 299 POOLING = 'avg' x_train = np.zeros((len(train), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32') for i, file in tqdm(enumerate(train['file'])): img = read_img(file, (INPUT_SIZE, INPUT_SIZE)) x = xception.preprocess_input(np.expand_dims(img.copy(), axis=0)) x_train[i] = x print('Train Images shape: {} size: {:,}'.format(x_train.shape, x_train.size)) Xtr = x_train[train_idx] Xv = x_train[valid_idx] print((Xtr.shape, Xv.shape, ytr.shape, yv.shape)) xception_bottleneck = xception.Xception(weights='imagenet', include_top=False, pooling=POOLING) train_x_bf = xception_bottleneck.predict(Xtr, batch_size=32, verbose=1) valid_x_bf = xception_bottleneck.predict(Xv, batch_size=32, verbose=1) print('Xception train bottleneck features shape: {} size: {:,}'.format(train_x_bf.shape, train_x_bf.size)) print('Xception valid bottleneck features shape: {} size: {:,}'.format(valid_x_bf.shape, valid_x_bf.size)) logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=SEED) logreg.fit(train_x_bf, ytr) valid_probs = logreg.predict_proba(valid_x_bf) valid_preds = logreg.predict(valid_x_bf) print('Validation Xception Accuracy {}'.format(accuracy_score(yv, valid_preds))) cnf_matrix = confusion_matrix(yv, valid_preds) abbreviation = ['BG', 'Ch', 'Cl', 'CC', 'CW', 'FH', 'LSB', 'M', 'SM', 'SP', 'SFC', 'SB'] pd.DataFrame({'class': CATEGORIES, 'abbreviation': abbreviation}) fig, ax = plt.subplots(1) ax = sns.heatmap(cnf_matrix, ax=ax, cmap=plt.cm.Greens, annot=True) ax.set_xticklabels(abbreviation) ax.set_yticklabels(abbreviation) plt.title('Confusion Matrix') plt.ylabel('True class') plt.xlabel('Predicted class') fig.savefig('Confusion matrix.png', dpi=300) plt.show(); x_test = np.zeros((len(test), INPUT_SIZE, INPUT_SIZE, 3), dtype='float32') for i, filepath in tqdm(enumerate(test['filepath'])): img = read_img(filepath, (INPUT_SIZE, INPUT_SIZE)) x = xception.preprocess_input(np.expand_dims(img.copy(), axis=0)) x_test[i] = x print('test Images shape: {} size: {:,}'.format(x_test.shape, x_test.size)) test_x_bf = xception_bottleneck.predict(x_test, batch_size=32, verbose=1) print('Xception test bottleneck features shape: {} size: {:,}'.format(test_x_bf.shape, test_x_bf.size)) test_preds = logreg.predict(test_x_bf) test['category_id'] = test_preds test['species'] = [CATEGORIES[c] for c in test_preds] test[['file', 'species']].to_csv('submission.csv', index=False) end = dt.datetime.now() print('Total time {} s.'.format((end - start).seconds)) print('We almost used the one hour time limit.')