今回はこちらのコンペをもとに、CNNモデルの作成までをまとめてみます。
以下がコードです。
Kerasを使用してモデルを作成しています。
# Load the required libraries.
# The NumPy and TensorFlow seeds are set right after their imports, before the
# rest of the stack is loaded, so that runs are as repeatable as possible.
from numpy.random import seed
seed(101)
# NOTE: `set_random_seed` is the TensorFlow 1.x name; TensorFlow 2.x exposes
# `tf.random.set_seed` instead.
from tensorflow import set_random_seed
set_random_seed(101)

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import os
import cv2

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt

# Same effect as the `%matplotlib inline` notebook magic, written as plain
# Python so this code also parses when run as a regular script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # Not running under IPython/Jupyter: nothing to configure.
    pass

IMAGE_SIZE = 96
IMAGE_CHANNELS = 3
SAMPLE_SIZE = 80000

os.listdir('../input')

# Count the training and test image files
print(len(os.listdir('../input/train')))
print(len(os.listdir('../input/test')))

# Build a data frame containing every labelled sample
df_data = pd.read_csv('../input/train_labels.csv')

# Remove some of the data.
# Boolean indexing returns a new frame, so the result has to be assigned
# back; otherwise the rows are never actually dropped.
df_data = df_data[df_data['id'] != 'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2']
df_data = df_data[df_data['id'] != '9369c7278ec8bcc6c880d99194de09fc2bd4efbe']
print(df_data.shape)

# Check how many samples each category has
df_data['label'].value_counts()


# Display a few images from each of the 0 / 1 categories
def draw_category_images(col_name, figure_cols, df, IMAGE_PATH):
    """Plot a grid of random sample images, one row per category.

    Args:
        col_name: Name of the column in ``df`` that holds the category.
        figure_cols: Number of images to draw per category; this is also
            the number of rows sampled from each category.
        df: Data frame with an ``id`` column (image file name without the
            ``.tif`` extension) and the ``col_name`` column.
        IMAGE_PATH: Directory prefix (including the trailing separator)
            that is prepended to ``<id>.tif``.
    """
    categories = (df.groupby([col_name])[col_name].nunique()).index
    # squeeze=False keeps `ax` two-dimensional even with a single category
    # or a single column, so ax[i, j] is always valid.
    f, ax = plt.subplots(nrows=len(categories), ncols=figure_cols,
                         figsize=(4 * figure_cols, 4 * len(categories)),
                         squeeze=False)
    for i, cat in enumerate(categories):
        # figure_cols is also the sample size
        sample = df[df[col_name] == cat].sample(figure_cols)
        for j in range(figure_cols):
            path = IMAGE_PATH + sample.iloc[j]['id'] + '.tif'
            im = cv2.imread(path)
            # OpenCV loads images as BGR while matplotlib expects RGB;
            # convert so the displayed colours are correct.
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            ax[i, j].imshow(im, resample=True, cmap='gray')
            ax[i, j].set_title(cat, fontsize=16)
    plt.tight_layout()
    plt.show()


IMAGE_PATH = '../input/train/'
draw_category_images('label', 4, df_data, IMAGE_PATH)

# Build the data set
df_data.head()
# The two classes have different sample counts, so draw SAMPLE_SIZE rows
# from each label to get a balanced data set.
df_0 = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state=101)
df_1 = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state=101)

# Concatenate the two data frames
df_data = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
# Shuffle
df_data = shuffle(df_data)

df_data['label'].value_counts()

# Split the data set (stratified on the label)
y = df_data['label']
df_train, df_val = train_test_split(df_data, test_size=0.10, random_state=101, stratify=y)

print(df_train.shape)
print(df_val.shape)

# Check the class counts after the split
df_train['label'].value_counts()
df_val['label'].value_counts()

# Rearrange the files into a per-split / per-class directory layout.
# os.mkdir is used on purpose: it fails if the directory already exists, so
# a rerun cannot silently mix files from a previous split into this one.
base_dir = 'base_dir'
os.mkdir(base_dir)

train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)

no_tumor_tissue = os.path.join(train_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(train_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)

no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)

# Check the directories that were created
os.listdir('base_dir/train_dir')

# Get the lists of training / validation image ids
train_list = list(df_train['id'])
val_list = list(df_val['id'])

# df_data has a plain integer index here (reset_index above), so
# `df_data.loc[image, 'label']` with an id string would raise KeyError.
# Look the label up through an explicit id -> label mapping instead.
label_by_id = dict(zip(df_data['id'], df_data['label']))
class_dir_names = {0: 'a_no_tumor_tissue', 1: 'b_has_tumor_tissue'}

# Copy every image into the folder matching its split and its label
for id_list, split_dir in ((train_list, train_dir), (val_list, val_dir)):
    for image in id_list:
        fname = image + '.tif'
        target = label_by_id[image]
        label = class_dir_names[target]
        src = os.path.join('../input/train', fname)
        dst = os.path.join(split_dir, label, fname)
        shutil.copyfile(src, dst)

# Build the CNN model: hyper-parameters
kernel_size = (3, 3)
pool_size = (2, 2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3

# The network is three convolutional stages (three 3x3 convolutions, then
# max-pooling and dropout) with 32 / 64 / 128 filters, followed by a dense
# classifier head that outputs a two-way softmax.
model = Sequential([
    # Stage 1
    Conv2D(first_filters, kernel_size, activation='relu', input_shape=(96, 96, 3)),
    Conv2D(first_filters, kernel_size, activation='relu'),
    Conv2D(first_filters, kernel_size, activation='relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(dropout_conv),

    # Stage 2
    Conv2D(second_filters, kernel_size, activation='relu'),
    Conv2D(second_filters, kernel_size, activation='relu'),
    Conv2D(second_filters, kernel_size, activation='relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(dropout_conv),

    # Stage 3
    Conv2D(third_filters, kernel_size, activation='relu'),
    Conv2D(third_filters, kernel_size, activation='relu'),
    Conv2D(third_filters, kernel_size, activation='relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(dropout_conv),

    # Classifier head
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(dropout_dense),
    Dense(2, activation="softmax"),
])

model.summary()