多クラス分類 csvのラベルファイルがある場合
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from sklearn.model_selection import train_test_split

# Load the labels and keep only the 16 most frequent classes.
LABELS = "../input/train.csv"
train_df = pd.read_csv(LABELS)
top = sorted(train_df['label_id'].value_counts().head(16).index)
# .copy() so that the column added below is written to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
train_df = train_df[train_df['label_id'].isin(top)].copy()

# Build the dataset.
SEED = 1234
TRAIN_FOLDER = "../input/train/train/"
TEST_FOLDER = "../input/test/test/"
DIM = 299

# Vectorised string concatenation instead of a row-wise apply().
train_df['image_path'] = TRAIN_FOLDER + train_df["file_name"]

# Load every image resized to DIM x DIM and stack them into one float32 array.
train_data = np.array(
    [
        img_to_array(load_img(img, target_size=(DIM, DIM)))
        for img in train_df['image_path'].values.tolist()
    ]
).astype('float32')
train_labels = train_df['label_id']

# Split the data (stratified so both splits keep the class proportions).
x_train, x_validation, y_train, y_validation = train_test_split(
    train_data,
    train_labels,
    test_size=0.2,
    stratify=np.array(train_labels),
    random_state=SEED,
)


def _one_hot(labels):
    """Return a one-hot uint8 array for *labels* with one column per id in ``top``.

    Reindexing on ``top`` pins the column order and guarantees that the
    training and validation matrices have the same width even if a class is
    missing from one split.  The explicit cast keeps the 0/1 integer encoding
    on pandas versions where ``get_dummies`` returns booleans.
    """
    dummies = pd.get_dummies(labels.reset_index(drop=True))
    return dummies.reindex(columns=top, fill_value=0).astype('uint8').values


# Convert the labels to one-hot (categorical) variables.
y_train = _one_hot(y_train)
y_validation = _one_hot(y_validation)
カテゴリがフォルダ毎に分かれている場合
import os

IMG_SIZE = 48  # side length in pixels of the square images fed to the model

# Collect the image files with glob; each category lives in its own folder.
z = glob.glob("../input/train/*/*.png")
ori_label = []
ori_imgs = []
for fn in z:
    if fn[-3:] != 'png':
        continue
    # The parent directory name is the label.  os.path is used instead of
    # splitting on '/' so this also works when glob returns '\\' separators.
    ori_label.append(os.path.basename(os.path.dirname(fn)))
    # Context manager closes the underlying file handle once the resized
    # copy has been produced.
    with Image.open(fn) as new_img:
        # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
        ori_imgs.append(
            ImageOps.fit(new_img, (IMG_SIZE, IMG_SIZE), Image.LANCZOS).convert('RGB')
        )

if not ori_imgs:
    raise ValueError("no .png images found under ../input/train/*/")

# Convert the images to an array, scale pixel values to [0, 1], and
# binarise the category labels.
imgs = np.array([np.array(im) for im in ori_imgs])
imgs = imgs.reshape(imgs.shape[0], IMG_SIZE, IMG_SIZE, 3) / 255
lb = LabelBinarizer().fit(ori_label)
# NOTE(review): with exactly two categories LabelBinarizer yields a single
# 0/1 column rather than two one-hot columns — confirm the model expects that.
label = lb.transform(ori_label)

# Split the data.
trainX, validX, trainY, validY = train_test_split(
    imgs, label, test_size=0.05, random_state=42
)