{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Майнор по Анализу Данных, Группа ИАД-4\n",
"## 19/10/2017 Практика с нейронными сетями"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"\n",
"import numpy\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.metrics import roc_auc_score, roc_curve\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"RND_SEED = 7\n",
"plt.style.use('ggplot')\n",
"\n",
"numpy.random.seed(RND_SEED)\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Классификация"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Загрузка данных"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для тренировки мы будем использовать достаточно известный набор данных [Pima Indians](http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes).\n",
"\n",
"Признаки такие:\n",
"1. Number of times pregnant\n",
"2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test\n",
"3. Diastolic blood pressure (mm Hg)\n",
"4. Triceps skin fold thickness (mm)\n",
"5. 2-Hour serum insulin (mu U/ml)\n",
"6. Body mass index (weight in kg/(height in m)^2)\n",
"7. Diabetes pedigree function\n",
"8. Age (years)\n",
"9. Class variable (0 or 1)\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data',\n",
" sep=',', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"   0    1   2   3    4     5      6   7  8\n",
"0  6  148  72  35    0  33.6  0.627  50  1\n",
"1  1   85  66  29    0  26.6  0.351  31  0\n",
"2  8  183  64   0    0  23.3  0.672  32  1\n",
"3  1   89  66  23   94  28.1  0.167  21  0\n",
"4  0  137  40  35  168  43.1  2.288  33  1"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.datasets import mnist\n",
"from keras.utils.np_utils import to_categorical\n",
"\n",
"from keras.layers import Activation\n",
"from keras.layers.convolutional import Convolution2D, MaxPooling2D\n",
"from keras.layers import Dropout, Flatten, Dense\n",
"\n",
"from keras import backend as K"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.optimizers import SGD"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"(X_train, y_train), (X_test, y_test) = mnist.load_data()\n",
"\n",
"n_train, img_rows, img_cols = X_train.shape\n",
"n_test, _, _ = X_test.shape\n",
"\n",
"n_train, n_test, img_rows, img_cols"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Посмотрим на данные"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Готовим данные\n",
"Есть некоторые заморочки связанные с тем, в каком виде должны быть данные"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"img_rows, img_cols = 28, 28\n",
"\n",
"if K.image_dim_ordering() != 'tf':\n",
" shape_ord = (1, img_rows, img_cols)\n",
"else: # channel_last\n",
" shape_ord = (img_rows, img_cols, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train = X_train.reshape((X_train.shape[0],) + shape_ord)\n",
"X_test = X_test.reshape((X_test.shape[0],) + shape_ord)\n",
"\n",
"X_train = X_train.astype('float32')\n",
"X_test = X_test.astype('float32')\n",
"\n",
"X_train /= 255\n",
"X_test /= 255\n",
"\n",
"\n",
"# numbers 0-9, so ten classes\n",
"n_classes = 10\n",
"\n",
"y_train = to_categorical(y_train, n_classes)\n",
"y_test = to_categorical(y_test, n_classes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_train.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Составляем модель"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Количество конволюционных фильтров\n",
"n_filters = 32\n",
"\n",
"# Размер фильтра\n",
"n_conv = 3\n",
"\n",
"# Размер пуллинга\n",
"n_pool = 2\n",
"\n",
"model = Sequential()\n",
"model.add(Convolution2D( \n",
" n_filters, n_conv, n_conv,\n",
" border_mode='valid',\n",
" input_shape=shape_ord,\n",
" activation='relu'\n",
"))\n",
"\n",
"# Конволюция\n",
"model.add(Convolution2D(n_filters, n_conv, n_conv, activation='relu'))\n",
"\n",
"# Пуллинг\n",
"model.add(MaxPooling2D(pool_size=(n_pool, n_pool)))\n",
"model.add(Dropout(0.25))\n",
"\n",
"# Выравниваем\n",
"model.add(Flatten())\n",
"\n",
"# Делаем полносвязный слой\n",
"model.add(Dense(128, activation='relu'))\n",
"model.add(Dropout(0.5))\n",
"\n",
"# Softmax в конце\n",
"model.add(Dense(n_classes, activation='softmax'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model.compile(\n",
" loss='categorical_crossentropy',\n",
" optimizer='adam',\n",
" metrics=['accuracy']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model.fit(X_train,\n",
" y_train,\n",
" batch_size=32,\n",
" nb_epoch=10,\n",
" validation_data=(X_test, y_test),\n",
" verbose=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Внутри модели"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for i, layer in enumerate(model.layers):\n",
" print (\"Layer\", i, \"\\t\", layer.name, \"\\t\\t\", layer.input_shape, \"\\t\", layer.output_shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for i, layer in enumerate(model.layers):\n",
" if len(layer.get_weights()) > 0:\n",
" W, b = layer.get_weights()\n",
" print(\"Layer\", i, \"\\t\", layer.name, \"\\t\\t\", W.shape, \"\\t\", b.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Цветные картинки"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.datasets import cifar10\n",
"from keras.utils import np_utils\n",
"\n",
"(X_train, y_train), (X_test, y_test) = cifar10.load_data()\n",
"\n",
"nb_classes = 10\n",
"Y_train = np_utils.to_categorical(y_train, nb_classes)\n",
"Y_test = np_utils.to_categorical(y_test, nb_classes)\n",
"X_train = X_train.astype(\"float32\")\n",
"X_test = X_test.astype(\"float32\")\n",
"X_train /= 255\n",
"X_test /= 255"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"По сути это уменьшенные изображения, которые нам повсюду встречаются. Сеть должна уметь выполнять классификацию независимо от того, повернут ли объект, уменьшен он или увеличен и т.п.\n",
"\n",
"То есть в обучающей выборке должны содержаться такие элементарные трансформации"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.preprocessing.image import ImageDataGenerator\n",
"\n",
"generated_images = ImageDataGenerator(\n",
" featurewise_center=True, # set input mean to 0 over the dataset\n",
" samplewise_center=False, # set each sample mean to 0\n",
" featurewise_std_normalization=True, # divide inputs by std of the dataset\n",
" samplewise_std_normalization=False, # divide each input by its std\n",
" zca_whitening=False, # apply ZCA whitening\n",
" rotation_range=0, # randomly rotate images in the range (degrees, 0 to 180)\n",
" width_shift_range=0.2, # randomly shift images horizontally (fraction of total width)\n",
" height_shift_range=0.2, # randomly shift images vertically (fraction of total height)\n",
" horizontal_flip=True, # randomly flip images\n",
" vertical_flip=False) # randomly flip images\n",
"\n",
"generated_images.fit(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Далее, продолжаем по https://github.com/fchollet/keras/blob/master/examples/cifar10_cnn.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
},
"nav_menu": {},
"toc": {
"colors": {
"hover_highlight": "#DAA520",
"navigate_num": "#000000",
"navigate_text": "#333333",
"running_highlight": "#FF0000",
"selected_highlight": "#FFD700",
"sidebar_border": "#EEEEEE",
"wrapper_background": "#FFFFFF"
},
"moveMenuLeft": true,
"nav_menu": {
"height": "254px",
"width": "253px"
},
"navigate_menu": true,
"number_sections": false,
"sideBar": true,
"threshold": 4,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false,
"widenNotebook": false
}
},
"nbformat": 4,
"nbformat_minor": 1
}