{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 種々のベンチマークデータ\n", "\n", "個別のノートブックを用意していないデータセットを整える段取りを記録している。" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__目次__\n", "\n", "- Irisデータ\n", "\n", "___" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "## Irisデータ\n", "\n", "20世紀前半からある古くて小さなデータセットであるが、線形モデルでは完全に識別できないことはわかっているので、多種の学習機のプロトタイプを手軽に試すデータセットとして重宝されてきた。学習課題はアヤメの花の品種を、花びらの長さなどの指標で識別することである。" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import csv\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5.1,3.5,1.4,0.2,Iris-setosa\n", "4.9,3.0,1.4,0.2,Iris-setosa\n", "4.7,3.2,1.3,0.2,Iris-setosa\n", "4.6,3.1,1.5,0.2,Iris-setosa\n", "5.0,3.6,1.4,0.2,Iris-setosa\n", "6.3,2.5,5.0,1.9,Iris-virginica\n", "6.5,3.0,5.2,2.0,Iris-virginica\n", "6.2,3.4,5.4,2.3,Iris-virginica\n", "5.9,3.0,5.1,1.8,Iris-virginica\n", "\n" ] } ], "source": [ "! cat data/iris/iris.data | head -n 5\n", "! cat data/iris/iris.data | tail -n 5" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ここで、先頭の5行はデータであるのに対して、末尾の5行はデータのない空の行が一つある。__このからの行を削除し__、新たに`iris_rev.data`として保存しておくこと。修正を加えたファイルの中身を覗いてみると以下のとおりである。" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "5.1,3.5,1.4,0.2,Iris-setosa\n", "4.9,3.0,1.4,0.2,Iris-setosa\n", "4.7,3.2,1.3,0.2,Iris-setosa\n", "4.6,3.1,1.5,0.2,Iris-setosa\n", "5.0,3.6,1.4,0.2,Iris-setosa\n", "6.7,3.0,5.2,2.3,Iris-virginica\n", "6.3,2.5,5.0,1.9,Iris-virginica\n", "6.5,3.0,5.2,2.0,Iris-virginica\n", "6.2,3.4,5.4,2.3,Iris-virginica\n", "5.9,3.0,5.1,1.8,Iris-virginica\n" ] } ], "source": [ "! cat data/iris/iris_rev.data | head -n 5\n", "! 
cat data/iris/iris_rev.data | tail -n 5" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "150 data/iris/iris_rev.data\r\n" ] } ], "source": [ "! wc -l data/iris/iris_rev.data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "データがきちんと150点あることは上記からわかったので、安心して進むことができる。" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Sizes of the data set and of the train/test split.\n", "NUM_DATA = 150\n", "NUM_TRAIN = 100 # Set manually.\n", "NUM_TEST = NUM_DATA - NUM_TRAIN\n", "NUM_FEATURES = 4\n", "NUM_CLASSES = 3\n", "NUM_LABELS = 1\n", "# Integer code stored for each class name found in the CSV file.\n", "LABEL_DICT = {\"Iris-setosa\": 0,\n", "              \"Iris-versicolor\": 1,\n", "              \"Iris-virginica\": 2}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Path to the cleaned CSV file.\n", "toread = os.path.join(\"data\", \"iris\", \"iris_rev.data\")\n", "\n", "# Pre-allocate storage for every observation.\n", "data_X = np.zeros((NUM_DATA,NUM_FEATURES), dtype=np.float32)\n", "data_y = np.zeros((NUM_DATA,NUM_LABELS), dtype=np.int8)\n", "\n", "num_read = 0\n", "with open(toread, newline=\"\") as f_table:\n", "\n", "    f_reader = csv.reader(f_table, delimiter=\",\")\n", "\n", "    for line in f_reader:\n", "        if not line:\n", "            continue # csv.reader yields [] for a blank line; skip it.\n", "        # Every field but the last is a feature; the last is the class name.\n", "        data_X[num_read,:] = np.array(line[0:-1], dtype=data_X.dtype)\n", "        data_y[num_read,:] = np.array(LABEL_DICT[line[-1]], dtype=data_y.dtype)\n", "        num_read += 1\n", "\n", "# Fail loudly rather than leave zero-filled rows behind.\n", "assert num_read == NUM_DATA, \"expected {} rows, read {}\".format(NUM_DATA, num_read)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "訓練データを読み込んだのだが、検証データとともに一つの階層型ファイルにまとめるために、__PyTables__というパッケージを利用する。" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "import tables" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/iris/data.h5 (File) 'Iris data'\n", "Last modif.: 'Tue Aug 28 15:26:37 2018'\n", "Object Tree: \n", "/ (RootGroup) 'Iris data'\n", "\n" ] } ], "source": [ "# Open file connection, writing new file to disk.\n", "myh5 = tables.open_file(\"data/iris/data.h5\",\n", "                        mode=\"w\",\n", "                        title=\"Iris data\")\n", 
"print(myh5) # currently empty." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/iris/data.h5 (File) 'Iris data'\n", "Last modif.: 'Tue Aug 28 15:26:37 2018'\n", "Object Tree: \n", "/ (RootGroup) 'Iris data'\n", "/test (Group) 'Testing data'\n", "/train (Group) 'Training data'\n", "\n" ] } ], "source": [ "myh5.create_group(myh5.root, \"train\", \"Training data\")\n", "myh5.create_group(myh5.root, \"test\", \"Testing data\")\n", "print(myh5)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/iris/data.h5 (File) 'Iris data'\n", "Last modif.: 'Tue Aug 28 15:26:37 2018'\n", "Object Tree: \n", "/ (RootGroup) 'Iris data'\n", "/test (Group) 'Testing data'\n", "/test/inputs (EArray(0, 4)) 'Input images'\n", "/test/labels (EArray(0, 1)) 'Label values'\n", "/train (Group) 'Training data'\n", "/train/inputs (EArray(0, 4)) 'Input images'\n", "/train/labels (EArray(0, 1)) 'Label values'\n", "\n" ] } ], "source": [ "# Training data arrays.\n", "a = tables.Int8Atom()\n", "myh5.create_earray(myh5.root.train,\n", " name=\"labels\",\n", " atom=a,\n", " shape=(0,NUM_LABELS),\n", " title=\"Label values\")\n", "a = tables.Float32Atom()\n", "myh5.create_earray(myh5.root.train,\n", " name=\"inputs\",\n", " atom=a,\n", " shape=(0,NUM_FEATURES),\n", " title=\"Input images\")\n", "\n", "# Testing data arrays.\n", "a = tables.Int8Atom()\n", "myh5.create_earray(myh5.root.test,\n", " name=\"labels\",\n", " atom=a,\n", " shape=(0,NUM_LABELS),\n", " title=\"Label values\")\n", "a = tables.Float32Atom()\n", "myh5.create_earray(myh5.root.test,\n", " name=\"inputs\",\n", " atom=a,\n", " shape=(0,NUM_FEATURES),\n", " title=\"Input images\")\n", "\n", "print(myh5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "訓練データと検証データに分ける前に、クラス分布が偏らないようにシャッフルしておく。" ] }, { "cell_type": "code", "execution_count": 11, 
"metadata": {}, "outputs": [], "source": [ "shufidx = np.random.choice(a=NUM_DATA, size=NUM_DATA, replace=False)\n", "idx_tr = shufidx[0:NUM_TRAIN]\n", "idx_te = shufidx[NUM_TRAIN:]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/iris/data.h5 (File) 'Iris data'\n", "Last modif.: 'Tue Aug 28 15:26:37 2018'\n", "Object Tree: \n", "/ (RootGroup) 'Iris data'\n", "/test (Group) 'Testing data'\n", "/test/inputs (EArray(0, 4)) 'Input images'\n", "/test/labels (EArray(0, 1)) 'Label values'\n", "/train (Group) 'Training data'\n", "/train/inputs (EArray(100, 4)) 'Input images'\n", "/train/labels (EArray(100, 1)) 'Label values'\n", "\n" ] } ], "source": [ "# Training data\n", "for i in idx_tr:\n", " myh5.root.train.inputs.append([data_X[i,:]])\n", " myh5.root.train.labels.append([data_y[i,:]])\n", "print(myh5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data/iris/data.h5 (File) 'Iris data'\n", "Last modif.: 'Tue Aug 28 15:26:37 2018'\n", "Object Tree: \n", "/ (RootGroup) 'Iris data'\n", "/test (Group) 'Testing data'\n", "/test/inputs (EArray(50, 4)) 'Input images'\n", "/test/labels (EArray(50, 1)) 'Label values'\n", "/train (Group) 'Training data'\n", "/train/inputs (EArray(100, 4)) 'Input images'\n", "/train/labels (EArray(100, 1)) 'Label values'\n", "\n" ] } ], "source": [ "# Testing data\n", "for i in idx_te:\n", " myh5.root.test.inputs.append([data_X[i,:]])\n", " myh5.root.test.labels.append([data_y[i,:]])\n", "print(myh5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "ファイルとの接続をここで打ち切る。" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "myh5.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "___" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }