PythonMania

普段はロボットとAIを組み合わせて色々作ってます。Python関係以外も色々投稿していくと思います。

【Python】画像認識 - CNNモデルの作成【DeepLearning】



今回はこちらのコンペをもとに、CNNモデルの作成までをまとめてみます。




www.kaggle.com



以下コードになります。
Kerasを使用してモデルを作成しています。

# Load the required libraries and fix the random seeds for reproducibility.
from numpy.random import seed
seed(101)
# NOTE(review): `from tensorflow import set_random_seed` is the TensorFlow 1.x
# API; on TF 2.x this import fails and `tf.random.set_seed` is the
# replacement — confirm which TF version this notebook targets.
from tensorflow import set_random_seed
set_random_seed(101)

import pandas as pd
import numpy as np


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import os
import cv2

from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import itertools
import shutil
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline (notebook-only syntax,
# not valid in a plain .py file).
%matplotlib inline
# Input tiles are 96x96 RGB .tif images.
IMAGE_SIZE = 96
IMAGE_CHANNELS = 3

# Number of rows sampled from each class to build a balanced dataset.
SAMPLE_SIZE = 80000


# Inspect the input directory layout.
os.listdir('../input')


# Count the training / test images.
print(len(os.listdir('../input/train')))
print(len(os.listdir('../input/test')))


# Load the full label table (image id -> label) into a DataFrame.
df_data = pd.read_csv('../input/train_labels.csv')

# Drop two known-bad images.
# BUG FIX: the original filtered without assigning the result back to
# df_data, so the rows were never actually removed.
df_data = df_data[df_data['id'] != 'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2']
df_data = df_data[df_data['id'] != '9369c7278ec8bcc6c880d99194de09fc2bd4efbe']


print(df_data.shape)


# Check how many samples each class has.
df_data['label'].value_counts()



#0,1カテゴリそれぞれの写真を表示してみる
def draw_category_images(col_name, figure_cols, df, IMAGE_PATH):
    """Display `figure_cols` randomly sampled images for every category.

    One subplot row per distinct value of `df[col_name]`; each image is
    read from IMAGE_PATH + '<id>.tif' with OpenCV and shown with its
    category as the subplot title.
    """
    categories = (df.groupby([col_name])[col_name].nunique()).index
    fig, axes = plt.subplots(
        nrows=len(categories),
        ncols=figure_cols,
        figsize=(4 * figure_cols, 4 * len(categories)),
    )
    for row, category in enumerate(categories):
        # figure_cols doubles as the per-category sample size.
        picked = df[df[col_name] == category].sample(figure_cols)
        for col in range(figure_cols):
            path = IMAGE_PATH + picked.iloc[col]['id'] + '.tif'
            img = cv2.imread(path)
            axes[row, col].imshow(img, resample=True, cmap='gray')
            axes[row, col].set_title(category, fontsize=16)
    plt.tight_layout()
    plt.show()


IMAGE_PATH = '../input/train/' 
# Show 4 sample images for each label value (0 and 1).
draw_category_images('label',4, df_data, IMAGE_PATH)


# Build a balanced dataset.
df_data.head()
# The two classes are imbalanced, so draw an equal number of rows
# (SAMPLE_SIZE each) from label 0 and label 1.
negatives = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state = 101)
positives = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state = 101)
# Stack the two samples (ignore_index=True renumbers rows, equivalent to
# concat + reset_index(drop=True)) and shuffle the result.
df_data = shuffle(pd.concat([negatives, positives], axis=0, ignore_index=True))

df_data['label'].value_counts()

# Split off 10% as a validation set, stratified on the label.
y = df_data['label']
df_train, df_val = train_test_split(df_data, test_size=0.10,
                                    random_state=101, stratify=y)
print(df_train.shape)
print(df_val.shape)


# Confirm the class balance after the split.

df_train['label'].value_counts()
df_val['label'].value_counts()


# Build the directory layout Keras' flow_from_directory-style generators
# expect: base_dir/{train_dir,val_dir}/{a_no_tumor_tissue,b_has_tumor_tissue}
# IMPROVEMENT: os.makedirs(..., exist_ok=True) creates each leaf (and any
# missing parents) in one call and makes this cell safe to re-run, unlike
# the original chain of bare os.mkdir calls.
base_dir = 'base_dir'
train_dir = os.path.join(base_dir, 'train_dir')
val_dir = os.path.join(base_dir, 'val_dir')
for split_dir in (train_dir, val_dir):
    for class_name in ('a_no_tumor_tissue', 'b_has_tumor_tissue'):
        os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

# Keep the same final variable values the original code left behind
# (paths of the validation class folders), in case later cells use them.
no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')


# Confirm the created directories.
os.listdir('base_dir/train_dir')


# Image-id lists for the training and validation splits.
train_list = list(df_train['id'])
val_list = list(df_val['id'])


# Copy every training image into the class sub-folder matching its label.
# BUG FIX: df_data was rebuilt with a plain RangeIndex (reset_index), so the
# original lookup df_data.loc[image, 'label'] — keyed by the id string —
# raises KeyError. Build an explicit id -> label mapping once instead
# (also O(1) per image rather than a per-image DataFrame scan).
label_of = dict(zip(df_data['id'], df_data['label']))

for image in train_list:
    fname = image + '.tif'
    target = label_of[image]

    # Class sub-directory per label; if/else also guarantees `label` is
    # always bound (the original pair of independent ifs did not).
    if target == 0:
        label = 'a_no_tumor_tissue'
    else:
        label = 'b_has_tumor_tissue'

    src = os.path.join('../input/train', fname)
    dst = os.path.join(train_dir, label, fname)
    shutil.copyfile(src, dst)




# Copy every validation image into the class sub-folder matching its label.
# BUG FIX: df_data has a plain RangeIndex after reset_index, so
# df_data.loc[image, 'label'] (keyed by the id string) raises KeyError —
# look labels up through an explicit id -> label dict instead.
val_label_of = dict(zip(df_data['id'], df_data['label']))

for image in val_list:
    fname = image + '.tif'
    target = val_label_of[image]

    # if/else guarantees `label` is always bound.
    if target == 0:
        label = 'a_no_tumor_tissue'
    else:
        label = 'b_has_tumor_tissue'

    src = os.path.join('../input/train', fname)
    dst = os.path.join(val_dir, label, fname)
    shutil.copyfile(src, dst)



# CNN architecture: three convolutional stages (32 -> 64 -> 128 filters),
# each made of three 3x3 ReLU convolutions followed by 2x2 max-pooling and
# dropout, then a 256-unit dense head with a 2-way softmax output.
kernel_size = (3,3)
pool_size= (2,2)
first_filters = 32
second_filters = 64
third_filters = 128

dropout_conv = 0.3
dropout_dense = 0.3


model = Sequential()

# First conv declares the input shape; later layers infer theirs.
# CONSISTENCY FIX: use the IMAGE_SIZE / IMAGE_CHANNELS constants defined at
# the top of the script instead of repeating the hard-coded (96, 96, 3).
model.add(Conv2D(first_filters, kernel_size, activation = 'relu',
                 input_shape = (IMAGE_SIZE, IMAGE_SIZE, IMAGE_CHANNELS)))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(Conv2D(first_filters, kernel_size, activation = 'relu'))
model.add(MaxPooling2D(pool_size = pool_size))
model.add(Dropout(dropout_conv))

# Remaining two conv stages follow the same conv-conv-conv-pool-drop pattern.
for n_filters in (second_filters, third_filters):
    for _ in range(3):
        model.add(Conv2D(n_filters, kernel_size, activation = 'relu'))
    model.add(MaxPooling2D(pool_size = pool_size))
    model.add(Dropout(dropout_conv))

# Classifier head.
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(dropout_dense))
model.add(Dense(2, activation = "softmax"))

model.summary()