{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Numpy Introduction\n", "## numpy arrays" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 4, 5, 6])" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "arr = np.array([1,3,4,5,6])\n", "arr" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5,)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('int32')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr.dtype" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dtype('\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[1;32mC:\\Users\\sharmatu\\AppData\\Local\\Continuum\\Anaconda\\envs\\Python3.5\\lib\\site-packages\\numpy\\core\\fromnumeric.py\u001b[0m in \u001b[0;36msum\u001b[1;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[0;32m 1812\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0msum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1813\u001b[0m return _methods._sum(a, axis=axis, dtype=dtype,\n\u001b[1;32m-> 
1814\u001b[1;33m out=out, **kwargs)\n\u001b[0m\u001b[0;32m 1815\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1816\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[1;32mC:\\Users\\sharmatu\\AppData\\Local\\Continuum\\Anaconda\\envs\\Python3.5\\lib\\site-packages\\numpy\\core\\_methods.py\u001b[0m in \u001b[0;36m_sum\u001b[1;34m(a, axis, dtype, out, keepdims)\u001b[0m\n\u001b[0;32m 30\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 31\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_sum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 32\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mumr_sum\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 33\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_prod\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeepdims\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 
"\u001b[1;31mTypeError\u001b[0m: cannot perform reduce with flexible type" ] } ], "source": [ "np.sum(arr)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating arrays" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3, 3)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.array([[1,2,3],[2,4,6],[8,8,8]])\n", "arr.shape" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 2, 3],\n", " [2, 4, 6],\n", " [8, 8, 8]])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0., 0., 0., 0.],\n", " [ 0., 0., 0., 0.]])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.zeros((2,4))\n", "arr" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1., 1., 1., 1.],\n", " [ 1., 1., 1., 1.]])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.ones((2,4))\n", "arr" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 1., 0., 0.],\n", " [ 0., 1., 0.],\n", " [ 0., 0., 1.]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.identity(3)\n", "arr" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.11069212, -1.3712359 , -0.35438971, 0.03397169],\n", " [ 0.35755146, -1.15864674, 0.49294546, -0.59452261],\n", " [ 0.85139437, 0.75329689, -0.57315488, -0.02419983]])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = 
np.random.randn(3,4)\n", "arr" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 2. , 23. , 33. ],\n", " [ 32. , 42. , 63.4],\n", " [ 35. , 77. , 12. ]])" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from io import BytesIO\n", "b = BytesIO(b\"2,23,33\\n32,42,63.4\\n35,77,12\")\n", "arr = np.genfromtxt(b, delimiter=\",\")\n", "arr" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Accessing array elements\n", "#### Simple indexing" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 32. , 42. , 63.4])" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[1]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[[ 0, 1, 2],\n", " [ 3, 4, 5]],\n", "\n", " [[ 6, 7, 8],\n", " [ 9, 10, 11]]])" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(12).reshape(2,2,3)\n", "arr" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 1, 2],\n", " [3, 4, 5]])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[0]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5, 6, 7, 8, 9])" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(10)\n", "arr[5:]\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5, 6, 7])" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[5:8]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ 
"array([0, 1, 2, 3, 4])" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[:-5]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[[ 0, 1, 2],\n", " [ 3, 4, 5]],\n", "\n", " [[ 6, 7, 8],\n", " [ 9, 10, 11]]])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(12).reshape(2,2,3)\n", "arr" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[[ 6, 7, 8],\n", " [ 9, 10, 11]]])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[1:2]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[[ 0, 1, 2],\n", " [ 3, 4, 5],\n", " [ 6, 7, 8]],\n", "\n", " [[ 9, 10, 11],\n", " [12, 13, 14],\n", " [15, 16, 17]],\n", "\n", " [[18, 19, 20],\n", " [21, 22, 23],\n", " [24, 25, 26]]])" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(27).reshape(3,3,3)\n", "arr" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 2, 5, 8],\n", " [11, 14, 17],\n", " [20, 23, 26]])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[:,:,2]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 2, 5, 8],\n", " [11, 14, 17],\n", " [20, 23, 26]])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[...,2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Advanced Indexing" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 1, 2],\n", " [3, 4, 5],\n", " [6, 7, 8]])" ] }, "execution_count": 29, 
"metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(9).reshape(3,3)\n", "arr" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 6])" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr[[0,1,2],[1,0,0]]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Boolean Indexing" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.04941315, -0.41476745, -0.60236098],\n", " [-1.75033842, 0.62559942, -0.58148095],\n", " [ 0.43502897, -0.06588454, -0.40865494],\n", " [-0.53978394, -0.7317352 , -0.66959325],\n", " [ 0.45550659, -0.53018559, -0.2241479 ]])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities = np.array([\"delhi\",\"banglaore\",\"mumbai\",\"chennai\",\"bhopal\"])\n", "city_data = np.random.randn(5,3)\n", "city_data" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.04941315, -0.41476745, -0.60236098]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[cities ==\"delhi\"]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 0.62559942, 0.43502897, 0.45550659])" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[city_data >0]" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.04941315, -0.41476745, -0.60236098],\n", " [-1.75033842, 0. , -0.58148095],\n", " [ 0. , -0.06588454, -0.40865494],\n", " [-0.53978394, -0.7317352 , -0.66959325],\n", " [ 0. 
, -0.53018559, -0.2241479 ]])" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[city_data >0] = 0\n", "city_data\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Operations on arrays" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2, 3, 4],\n", " [ 5, 6, 7, 8, 9],\n", " [10, 11, 12, 13, 14]])" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr = np.arange(15).reshape(3,5)\n", "arr" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 5, 6, 7, 8, 9],\n", " [10, 11, 12, 13, 14],\n", " [15, 16, 17, 18, 19]])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr + 5" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 2, 4, 6, 8],\n", " [10, 12, 14, 16, 18],\n", " [20, 22, 24, 26, 28]])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr * 2" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [ 4, 5, 6],\n", " [ 8, 9, 10],\n", " [12, 13, 14],\n", " [16, 17, 18]])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr1 = np.arange(15).reshape(5,3)\n", "arr2 = np.arange(5).reshape(5,1)\n", "arr2 + arr1\n" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 1, 2],\n", " [ 3, 4, 5],\n", " [ 6, 7, 8],\n", " [ 9, 10, 11],\n", " [12, 13, 14]])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr1" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0],\n", " 
[1],\n", " [2],\n", " [3],\n", " [4]])" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr2" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[-0.92631238, -0.75087049, 0.38818842],\n", " [ 1.34359452, -0.68896739, -0.58429706],\n", " [ 1.06638747, -0.40104143, 0.99089011],\n", " [ 0.26232893, 1.4349162 , -0.97503394],\n", " [ 0.35716111, 0.20198017, 0.08151897]])" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arr1 = np.random.randn(5,3)\n", "arr1" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([[-0.92631238, -0.75087049, 0.38818842],\n", " [ 0.34359452, -0.68896739, -0.58429706],\n", " [ 0.06638747, -0.40104143, 0.99089011],\n", " [ 0.26232893, 0.4349162 , -0.97503394],\n", " [ 0.35716111, 0.20198017, 0.08151897]]), array([[-0., -0., 0.],\n", " [ 1., -0., -0.],\n", " [ 1., -0., 0.],\n", " [ 0., 1., -0.],\n", " [ 0., 0., 0.]]))" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.modf(arr1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Linear algebra using numpy" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 24, 24, 24],\n", " [ 72, 69, 66],\n", " [120, 114, 108]])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A = np.array([[1,2,3],[4,5,6],[7,8,9]])\n", "B = np.array([[9,8,7],[6,5,4],[1,2,3]])\n", "A.dot(B)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0, 5, 10],\n", " [ 1, 6, 11],\n", " [ 2, 7, 12],\n", " [ 3, 8, 13],\n", " [ 4, 9, 14]])" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A = np.arange(15).reshape(3,5)\n", "A.T" ] }, { 
"cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(array([[-0.15425367, 0.89974393, 0.40824829],\n", " [-0.50248417, 0.28432901, -0.81649658],\n", " [-0.85071468, -0.3310859 , 0.40824829]]),\n", " array([ 3.17420265e+01, 2.72832424e+00, 4.58204637e-16]),\n", " array([[-0.34716018, -0.39465093, -0.44214167, -0.48963242, -0.53712316],\n", " [-0.69244481, -0.37980343, -0.06716206, 0.24547932, 0.55812069],\n", " [ 0.33717486, -0.77044776, 0.28661392, 0.38941603, -0.24275704],\n", " [-0.36583339, 0.32092943, -0.08854543, 0.67763613, -0.54418674],\n", " [-0.39048565, 0.05843412, 0.8426222 , -0.29860414, -0.21196653]]))" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.linalg.svd(A)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 1., 3., 2.])" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "a = np.array([[7,5,-3], [3,-5,2],[5,3,-7]])\n", "b = np.array([16,-8,0])\n", "x = np.linalg.solve(a, b)\n", "x" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.allclose(np.dot(a, x), b)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas\n", "## Data frames" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citydata
0Delhi1000
1Banglaore2000
2Mumbai1000
\n", "
" ], "text/plain": [ " city data\n", "0 Delhi 1000\n", "1 Banglaore 2000\n", "2 Mumbai 1000" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "d = [{'city':'Delhi',\"data\":1000},\n", " {'city':'Banglaore',\"data\":2000},\n", " {'city':'Mumbai',\"data\":1000}]\n", "pd.DataFrame(d)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(d)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading in data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "city_data = pd.read_csv(filepath_or_buffer='simplemaps-worldcities-basic.csv')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopcountryiso2iso3province
0Qal eh-ye NowQal eh-ye34.98300063.1333002997.0AfghanistanAFAFGBadghis
1ChaghcharanChaghcharan34.51670165.25000115000.0AfghanistanAFAFGGhor
2Lashkar GahLashkar Gah31.58299864.360000201546.0AfghanistanAFAFGHilmand
3ZaranjZaranj31.11200161.88699849851.0AfghanistanAFAFGNimroz
4Tarin KowtTarin Kowt32.63329865.86669910000.0AfghanistanAFAFGUruzgan
5Zareh SharanZareh Sharan32.85000068.41670513737.0AfghanistanAFAFGPaktika
6AsadabadAsadabad34.86600071.15000548400.0AfghanistanAFAFGKunar
7TaloqanTaloqan36.72999969.54000464256.0AfghanistanAFAFGTakhar
8Mahmud-E EraqiMahmud-E Eraqi35.01669669.3333017407.0AfghanistanAFAFGKapisa
9Mehtar LamMehtar Lam34.65000070.16670117345.0AfghanistanAFAFGLaghman
\n", "
" ], "text/plain": [ " city city_ascii lat lng pop \\\n", "0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 \n", "1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 \n", "2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 \n", "3 Zaranj Zaranj 31.112001 61.886998 49851.0 \n", "4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 \n", "5 Zareh Sharan Zareh Sharan 32.850000 68.416705 13737.0 \n", "6 Asadabad Asadabad 34.866000 71.150005 48400.0 \n", "7 Taloqan Taloqan 36.729999 69.540004 64256.0 \n", "8 Mahmud-E Eraqi Mahmud-E Eraqi 35.016696 69.333301 7407.0 \n", "9 Mehtar Lam Mehtar Lam 34.650000 70.166701 17345.0 \n", "\n", " country iso2 iso3 province \n", "0 Afghanistan AF AFG Badghis \n", "1 Afghanistan AF AFG Ghor \n", "2 Afghanistan AF AFG Hilmand \n", "3 Afghanistan AF AFG Nimroz \n", "4 Afghanistan AF AFG Uruzgan \n", "5 Afghanistan AF AFG Paktika \n", "6 Afghanistan AF AFG Kunar \n", "7 Afghanistan AF AFG Takhar \n", "8 Afghanistan AF AFG Kapisa \n", "9 Afghanistan AF AFG Laghman " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data.head(n=10)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopcountryiso2iso3province
7317MutareMutare-18.97001932.650038216785.0ZimbabweZWZWEManicaland
7318KadomaKadoma-18.33000629.90994756400.0ZimbabweZWZWEMashonaland West
7319ChitungwizaChitungwiza-18.00000131.100003331071.0ZimbabweZWZWEHarare
7320HarareHarare-17.81779031.0447091557406.5ZimbabweZWZWEHarare
7321BulawayoBulawayo-20.16999828.580002697096.0ZimbabweZWZWEBulawayo
\n", "
" ], "text/plain": [ " city city_ascii lat lng pop country \\\n", "7317 Mutare Mutare -18.970019 32.650038 216785.0 Zimbabwe \n", "7318 Kadoma Kadoma -18.330006 29.909947 56400.0 Zimbabwe \n", "7319 Chitungwiza Chitungwiza -18.000001 31.100003 331071.0 Zimbabwe \n", "7320 Harare Harare -17.817790 31.044709 1557406.5 Zimbabwe \n", "7321 Bulawayo Bulawayo -20.169998 28.580002 697096.0 Zimbabwe \n", "\n", " iso2 iso3 province \n", "7317 ZW ZWE Manicaland \n", "7318 ZW ZWE Mashonaland West \n", "7319 ZW ZWE Harare \n", "7320 ZW ZWE Harare \n", "7321 ZW ZWE Bulawayo " ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data.tail()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": true }, "outputs": [], "source": [ "series_es = city_data.lat" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(series_es)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 34.516701\n", "3 31.112001\n", "5 32.850000\n", "7 36.729999\n", "9 34.650000\n", "Name: lat, dtype: float64" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_es[1:10:2]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 34.983000\n", "1 34.516701\n", "2 31.582998\n", "3 31.112001\n", "4 32.633298\n", "5 32.850000\n", "6 34.866000\n", "Name: lat, dtype: float64" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_es[:7]" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 34.983000\n", "1 34.516701\n", "2 31.582998\n", "3 31.112001\n", "4 32.633298\n", "5 32.850000\n", "6 34.866000\n", 
"Name: lat, dtype: float64" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "series_es[:-7315]" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopcountryiso2iso3province
0Qal eh-ye NowQal eh-ye34.98300063.1333002997.0AfghanistanAFAFGBadghis
1ChaghcharanChaghcharan34.51670165.25000115000.0AfghanistanAFAFGGhor
2Lashkar GahLashkar Gah31.58299864.360000201546.0AfghanistanAFAFGHilmand
3ZaranjZaranj31.11200161.88699849851.0AfghanistanAFAFGNimroz
4Tarin KowtTarin Kowt32.63329865.86669910000.0AfghanistanAFAFGUruzgan
5Zareh SharanZareh Sharan32.85000068.41670513737.0AfghanistanAFAFGPaktika
6AsadabadAsadabad34.86600071.15000548400.0AfghanistanAFAFGKunar
\n", "
" ], "text/plain": [ " city city_ascii lat lng pop country \\\n", "0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 Afghanistan \n", "1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 Afghanistan \n", "2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 Afghanistan \n", "3 Zaranj Zaranj 31.112001 61.886998 49851.0 Afghanistan \n", "4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 Afghanistan \n", "5 Zareh Sharan Zareh Sharan 32.850000 68.416705 13737.0 Afghanistan \n", "6 Asadabad Asadabad 34.866000 71.150005 48400.0 Afghanistan \n", "\n", " iso2 iso3 province \n", "0 AF AFG Badghis \n", "1 AF AFG Ghor \n", "2 AF AFG Hilmand \n", "3 AF AFG Nimroz \n", "4 AF AFG Uruzgan \n", "5 AF AFG Paktika \n", "6 AF AFG Kunar " ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[:7]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlng
0Qal eh-ye NowQal eh-ye34.98300063.133300
1ChaghcharanChaghcharan34.51670165.250001
2Lashkar GahLashkar Gah31.58299864.360000
3ZaranjZaranj31.11200161.886998
4Tarin KowtTarin Kowt32.63329865.866699
\n", "
" ], "text/plain": [ " city city_ascii lat lng\n", "0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300\n", "1 Chaghcharan Chaghcharan 34.516701 65.250001\n", "2 Lashkar Gah Lashkar Gah 31.582998 64.360000\n", "3 Zaranj Zaranj 31.112001 61.886998\n", "4 Tarin Kowt Tarin Kowt 32.633298 65.866699" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data.iloc[:5,:4]" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latlng
360-34.602502-58.397531
1171-23.558680-46.625020
206831.216452121.436505
309828.66999377.230004
311019.01699072.856989
349235.685017139.751407
407419.442442-99.130988
451324.86999266.990009
539455.75216437.615523
612441.10499629.010002
707140.749979-73.980017
\n", "
" ], "text/plain": [ " lat lng\n", "360 -34.602502 -58.397531\n", "1171 -23.558680 -46.625020\n", "2068 31.216452 121.436505\n", "3098 28.669993 77.230004\n", "3110 19.016990 72.856989\n", "3492 35.685017 139.751407\n", "4074 19.442442 -99.130988\n", "4513 24.869992 66.990009\n", "5394 55.752164 37.615523\n", "6124 41.104996 29.010002\n", "7071 40.749979 -73.980017" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One .loc call: boolean row mask (pop > 10 million) plus boolean column mask\n", "# (column names starting with 'l'), instead of two chained [] lookups.\n", "city_data.loc[city_data['pop'] > 10000000, city_data.columns.str.startswith('l')]" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\sharmatu\\AppData\\Local\\Continuum\\Anaconda\\envs\\Python3.5\\lib\\site-packages\\pandas\\core\\frame.py:2746: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", " **kwargs)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopulationcountryiso2iso3province
360NaNNaNNaNNaNNaNNaNNaNNaNNaN
1171NaNNaNNaNNaNNaNNaNNaNNaNNaN
2068NaNNaNNaNNaNNaNNaNNaNNaNNaN
3098NaNNaNNaNNaNNaNNaNNaNNaNNaN
3110MumbaiMumbai19.01699072.85698915834918.0IndiaININDMaharashtra
3492TokyoTokyo35.685017139.75140722006299.5JapanJPJPNTokyo
4074NaNNaNNaNNaNNaNNaNNaNNaNNaN
4513NaNNaNNaNNaNNaNNaNNaNNaNNaN
5394NaNNaNNaNNaNNaNNaNNaNNaNNaN
6124NaNNaNNaNNaNNaNNaNNaNNaNNaN
7071NaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " city city_ascii lat lng population country iso2 iso3 \\\n", "360 NaN NaN NaN NaN NaN NaN NaN NaN \n", "1171 NaN NaN NaN NaN NaN NaN NaN NaN \n", "2068 NaN NaN NaN NaN NaN NaN NaN NaN \n", "3098 NaN NaN NaN NaN NaN NaN NaN NaN \n", "3110 Mumbai Mumbai 19.016990 72.856989 15834918.0 India IN IND \n", "3492 Tokyo Tokyo 35.685017 139.751407 22006299.5 Japan JP JPN \n", "4074 NaN NaN NaN NaN NaN NaN NaN NaN \n", "4513 NaN NaN NaN NaN NaN NaN NaN NaN \n", "5394 NaN NaN NaN NaN NaN NaN NaN NaN \n", "6124 NaN NaN NaN NaN NaN NaN NaN NaN \n", "7071 NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", " province \n", "360 NaN \n", "1171 NaN \n", "2068 NaN \n", "3098 NaN \n", "3110 Maharashtra \n", "3492 Tokyo \n", "4074 NaN \n", "4513 NaN \n", "5394 NaN \n", "6124 NaN \n", "7071 NaN " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename() returns a new DataFrame, so nothing is mutated on the filtered slice;\n", "# the previous in-place rename on that slice raised SettingWithCopyWarning.\n", "city_greater_10mil = city_data[city_data['pop'] > 10000000].rename(columns={'pop':'population'})\n", "city_greater_10mil.where(city_greater_10mil.population > 15000000)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(np.random.randn(8, 3),\n", "columns=['A', 'B', 'C'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Operations on dataframes" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "numpy.ndarray" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nparray = df.values\n", "type(nparray)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "from numpy import nan\n", "df.iloc[4,2] = nan" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-1.279701-0.074395-1.370447
11.5360380.0604530.856685
20.4754071.029245-0.420355
3-1.636635-0.385956-0.261129
41.2595451.916660NaN
51.5914680.8132090.605695
6-1.2703610.2003580.035595
7-0.189060-1.874718-1.088224
\n", "
" ], "text/plain": [ " A B C\n", "0 -1.279701 -0.074395 -1.370447\n", "1 1.536038 0.060453 0.856685\n", "2 0.475407 1.029245 -0.420355\n", "3 -1.636635 -0.385956 -0.261129\n", "4 1.259545 1.916660 NaN\n", "5 1.591468 0.813209 0.605695\n", "6 -1.270361 0.200358 0.035595\n", "7 -0.189060 -1.874718 -1.088224" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABC
0-1.279701-0.074395-1.370447
11.5360380.0604530.856685
20.4754071.029245-0.420355
3-1.636635-0.385956-0.261129
41.2595451.9166600.000000
51.5914680.8132090.605695
6-1.2703610.2003580.035595
7-0.189060-1.874718-1.088224
\n", "
" ], "text/plain": [ " A B C\n", "0 -1.279701 -0.074395 -1.370447\n", "1 1.536038 0.060453 0.856685\n", "2 0.475407 1.029245 -0.420355\n", "3 -1.636635 -0.385956 -0.261129\n", "4 1.259545 1.916660 0.000000\n", "5 1.591468 0.813209 0.605695\n", "6 -1.270361 0.200358 0.035595\n", "7 -0.189060 -1.874718 -1.088224" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": { "collapsed": true }, "outputs": [], "source": [ "columns_numeric = ['lat','lng','pop']" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lat 20.662876\n", "lng 10.711914\n", "pop 265463.071633\n", "dtype: float64" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].mean()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lat 1.512936e+05\n", "lng 7.843263e+04\n", "pop 1.943721e+09\n", "dtype: float64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].sum()" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lat 7322\n", "lng 7322\n", "pop 7322\n", "dtype: int64" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].count()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lat 26.792730\n", "lng 18.617509\n", "pop 61322.750000\n", "dtype: float64" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].median()" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "lat 46.852480\n", "lng 89.900018\n", "pop 269210.000000\n", "Name: 0.8, 
dtype: float64" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].quantile(0.8)" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 3095.116300\n", "1 15099.766702\n", "2 201641.942998\n", "3 49943.998999\n", "4 10098.499997\n", "dtype: float64" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].sum(axis = 1).head()" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latlngpop
count7322.0000007322.0000007.322000e+03
mean20.66287610.7119142.654631e+05
std29.13481879.0446158.287622e+05
min-89.982894-179.589979-9.900000e+01
25%-0.324710-64.7884721.734425e+04
50%26.79273018.6175096.132275e+04
75%43.57544873.1036282.001726e+05
max82.483323179.3833042.200630e+07
\n", "
" ], "text/plain": [ " lat lng pop\n", "count 7322.000000 7322.000000 7.322000e+03\n", "mean 20.662876 10.711914 2.654631e+05\n", "std 29.134818 79.044615 8.287622e+05\n", "min -89.982894 -179.589979 -9.900000e+01\n", "25% -0.324710 -64.788472 1.734425e+04\n", "50% 26.792730 18.617509 6.132275e+04\n", "75% 43.575448 73.103628 2.001726e+05\n", "max 82.483323 179.383304 2.200630e+07" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data[columns_numeric].describe()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": true }, "outputs": [], "source": [ "city_data1 = city_data.sample(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Concatanating data frames" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopcountryiso2iso3province
4857ShebekinoShebekino50.41435036.89437841301.5RussiaRURUSBelgorod
1561BouarBouar5.95001015.59996731476.5Central African RepublicCFCAFNana-Mambéré
6650ScottsbluffScottsbluff41.867508-103.66068620172.0United States of AmericaUSUSANebraska
964JanaubaJanauba-15.799618-43.30997738641.0BrazilBRBRAMinas Gerais
3896AltataAltata24.636045-107.916215750.0MexicoMXMEXSinaloa
7201Tra VinhTra Vinh9.934002106.334002131360.0VietnamVNVNMTrà Vinh
\n", "
" ], "text/plain": [ " city city_ascii lat lng pop \\\n", "4857 Shebekino Shebekino 50.414350 36.894378 41301.5 \n", "1561 Bouar Bouar 5.950010 15.599967 31476.5 \n", "6650 Scottsbluff Scottsbluff 41.867508 -103.660686 20172.0 \n", "964 Janauba Janauba -15.799618 -43.309977 38641.0 \n", "3896 Altata Altata 24.636045 -107.916215 750.0 \n", "7201 Tra Vinh Tra Vinh 9.934002 106.334002 131360.0 \n", "\n", " country iso2 iso3 province \n", "4857 Russia RU RUS Belgorod \n", "1561 Central African Republic CF CAF Nana-Mambéré \n", "6650 United States of America US USA Nebraska \n", "964 Brazil BR BRA Minas Gerais \n", "3896 Mexico MX MEX Sinaloa \n", "7201 Vietnam VN VNM Trà Vinh " ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data2 = city_data.sample(3)\n", "city_data_combine = pd.concat([city_data1,city_data2])\n", "city_data_combine" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df1 = pd.DataFrame({'col1': ['col10', 'col11', 'col12', 'col13'],\n", " 'col2': ['col20', 'col21', 'col22', 'col23'],\n", " 'col3': ['col30', 'col31', 'col32', 'col33'],\n", " 'col4': ['col40', 'col41', 'col42', 'col43']},\n", " index=[0, 1, 2, 3])" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1col2col3col4
0col10col20col30col40
1col11col21col31col41
2col12col22col32col42
3col13col23col33col43
\n", "
" ], "text/plain": [ " col1 col2 col3 col4\n", "0 col10 col20 col30 col40\n", "1 col11 col21 col31 col41\n", "2 col12 col22 col32 col42\n", "3 col13 col23 col33 col43" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1col2col3col4Col4col2col6
0col10col20col30col40NaNNaNNaN
1col11col21col31col41NaNNaNNaN
2col12col22col32col42Col42col22col62
3col13col23col33col43Col43col23col63
6NaNNaNNaNNaNCol46col26col66
7NaNNaNNaNNaNCol47col27col67
\n", "
" ], "text/plain": [ " col1 col2 col3 col4 Col4 col2 col6\n", "0 col10 col20 col30 col40 NaN NaN NaN\n", "1 col11 col21 col31 col41 NaN NaN NaN\n", "2 col12 col22 col32 col42 Col42 col22 col62\n", "3 col13 col23 col33 col43 Col43 col23 col63\n", "6 NaN NaN NaN NaN Col46 col26 col66\n", "7 NaN NaN NaN NaN Col47 col27 col67" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df4 = pd.DataFrame({'col2': ['col22', 'col23', 'col26', 'col27'],\n", " 'Col4': ['Col42', 'Col43', 'Col46', 'Col47'],\n", " 'col6': ['col62', 'col63', 'col66', 'col67']},\n", " index=[2, 3, 6, 7])\n", "\n", "pd.concat([df1,df4], axis=1)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": true }, "outputs": [], "source": [ "country_data = city_data[['iso3','country']].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(223, 2)" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "country_data.shape" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
iso3country
0AFGAfghanistan
33ALDAland
34ALBAlbania
60DZAAlgeria
111ASMAmerican Samoa
\n", "
" ], "text/plain": [ " iso3 country\n", "0 AFG Afghanistan\n", "33 ALD Aland\n", "34 ALB Albania\n", "60 DZA Algeria\n", "111 ASM American Samoa" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "country_data.head()" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": true }, "outputs": [], "source": [ "del(city_data['country'])" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
citycity_asciilatlngpopiso2iso3provincecountry
0Qal eh-ye NowQal eh-ye34.98300063.1333002997.0AFAFGBadghisAfghanistan
1ChaghcharanChaghcharan34.51670165.25000115000.0AFAFGGhorAfghanistan
2Lashkar GahLashkar Gah31.58299864.360000201546.0AFAFGHilmandAfghanistan
3ZaranjZaranj31.11200161.88699849851.0AFAFGNimrozAfghanistan
4Tarin KowtTarin Kowt32.63329865.86669910000.0AFAFGUruzganAfghanistan
\n", "
" ], "text/plain": [ " city city_ascii lat lng pop iso2 iso3 \\\n", "0 Qal eh-ye Now Qal eh-ye 34.983000 63.133300 2997.0 AF AFG \n", "1 Chaghcharan Chaghcharan 34.516701 65.250001 15000.0 AF AFG \n", "2 Lashkar Gah Lashkar Gah 31.582998 64.360000 201546.0 AF AFG \n", "3 Zaranj Zaranj 31.112001 61.886998 49851.0 AF AFG \n", "4 Tarin Kowt Tarin Kowt 32.633298 65.866699 10000.0 AF AFG \n", "\n", " province country \n", "0 Badghis Afghanistan \n", "1 Ghor Afghanistan \n", "2 Hilmand Afghanistan \n", "3 Nimroz Afghanistan \n", "4 Uruzgan Afghanistan " ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "city_data.merge(country_data, 'inner').head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Scikit-learn" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import datasets\n", "diabetes = datasets.load_diabetes()\n", "X = diabetes.data[:10]\n", "y = diabetes.target" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n", " -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613],\n", " [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,\n", " -0.01916334, 0.07441156, -0.03949338, -0.06832974, -0.09220405],\n", " [ 0.08529891, 0.05068012, 0.04445121, -0.00567061, -0.04559945,\n", " -0.03419447, -0.03235593, -0.00259226, 0.00286377, -0.02593034],\n", " [-0.08906294, -0.04464164, -0.01159501, -0.03665645, 0.01219057,\n", " 0.02499059, -0.03603757, 0.03430886, 0.02269202, -0.00936191],\n", " [ 0.00538306, -0.04464164, -0.03638469, 0.02187235, 0.00393485,\n", " 0.01559614, 0.00814208, -0.00259226, -0.03199144, -0.04664087]])" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X[:5]" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "array([ 151., 75., 141., 206., 135., 97., 138., 63., 110., 310.])" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y[:10]" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "collapsed": true }, "outputs": [], "source": [ "feature_names=['age', 'sex', 'bmi', 'bp',\n", " 's1', 's2', 's3', 's4', 's5', 's6']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Scikit example regression" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=None, error_score='raise',\n", " estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n", " normalize=False, positive=False, precompute=False, random_state=0,\n", " selection='cyclic', tol=0.0001, warm_start=False),\n", " fit_params={}, iid=True, n_jobs=1,\n", " param_grid={'alpha': array([ 1.00000e-04, 1.32035e-04, 1.74333e-04, 2.30181e-04,\n", " 3.03920e-04, 4.01281e-04, 5.29832e-04, 6.99564e-04,\n", " 9.23671e-04, 1.21957e-03, 1.61026e-03, 2.12611e-03,\n", " 2.80722e-03, 3.70651e-03, 4.89390e-03, 6.46167e-03,\n", " 8.... 
7.88046e-02, 1.04050e-01, 1.37382e-01, 1.81393e-01,\n", " 2.39503e-01, 3.16228e-01])},\n", " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", " scoring=None, verbose=0)" ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn import datasets\n", "from sklearn.linear_model import Lasso\n", "\n", "from sklearn import linear_model, datasets\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "diabetes = datasets.load_diabetes()\n", "X_train = diabetes.data[:310]\n", "y_train = diabetes.target[:310]\n", "\n", "X_test = diabetes.data[310:]\n", "y_test = diabetes.target[310:]\n", "\n", "lasso = Lasso(random_state=0)\n", "alphas = np.logspace(-4, -0.5, 30)\n", "\n", "scores = list()\n", "scores_std = list()\n", "\n", "estimator = GridSearchCV(lasso,\n", " param_grid = dict(alpha=alphas))\n", "\n", "estimator.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.46540637590235312" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "estimator.best_score_" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Lasso(alpha=0.025929437974046669, copy_X=True, fit_intercept=True,\n", " max_iter=1000, normalize=False, positive=False, precompute=False,\n", " random_state=0, selection='cyclic', tol=0.0001, warm_start=False)" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "estimator.best_estimator_" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ 203.42104984, 177.6595529 , 122.62188598, 212.81136958,\n", " 173.61633075, 114.76145025, 202.36033584, 171.70767813,\n", " 164.28694562, 191.29091477, 191.41279009, 288.2772433 ,\n", " 296.47009002, 234.53378413, 210.61427168, 228.62812055,\n", " 156.74489991, 
225.08834492, 191.75874632, 102.81600989,\n", " 172.373221 , 111.20843429, 290.22242876, 178.64605207,\n", " 78.13722832, 86.35832297, 256.41378529, 165.99622543,\n", " 121.29260976, 153.48718848, 163.09835143, 180.0932902 ,\n", " 161.4330553 , 155.80211635, 143.70181085, 126.13753819,\n", " 181.06471818, 105.03679977, 131.0479936 , 90.50606427,\n", " 252.66486639, 84.84786067, 59.41005358, 184.51368208,\n", " 201.46598714, 129.96333913, 90.65641478, 200.10932516,\n", " 55.2884802 , 171.60459062, 195.40750666, 122.14139787,\n", " 231.72783897, 159.49750022, 160.32104862, 165.53701866,\n", " 260.73217736, 259.77213787, 204.69526082, 185.66480969,\n", " 61.09821961, 209.9214333 , 108.50410841, 141.18424239,\n", " 126.10337002, 174.32819351, 214.4947322 , 162.1789921 ,\n", " 160.57776438, 134.11449594, 171.63076427, 71.71500885,\n", " 263.46782314, 113.73653782, 112.76227977, 134.37721414,\n", " 110.67874472, 98.67153573, 157.2591359 , 78.32019218,\n", " 265.97090212, 57.85502185, 100.38532691, 101.91670102,\n", " 277.13032245, 168.6443445 , 64.75637937, 184.37359745,\n", " 174.74927914, 188.78215433, 181.56001383, 92.74463449,\n", " 145.41037529, 257.78620944, 196.57335354, 276.1920927 ,\n", " 50.66776115, 179.12879963, 200.29366671, 167.29501922,\n", " 158.93206689, 156.08070427, 233.38241229, 125.30241353,\n", " 167.05404644, 171.66748431, 223.17843095, 156.7055944 ,\n", " 103.29063169, 84.08205647, 139.87060658, 189.99648341,\n", " 200.20182211, 143.61906164, 170.00220231, 112.05886847,\n", " 160.76337573, 130.06232976, 261.83022688, 102.24589129,\n", " 115.12771477, 119.14505163, 225.96991263, 63.51874043,\n", " 134.88829709, 120.01764214, 55.32147904, 189.95346987,\n", " 105.8037979 , 120.46197038, 211.35568232, 56.78368048])" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "estimator.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Deep Learning Frameworks" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "### Theano example " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy\n", "import theano.tensor as T\n", "from theano import function\n", "x = T.dscalar('x')\n", "y = T.dscalar('y')\n", "z = x + y" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(10.0)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = function([x, y], z)\n", "f(8, 2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tensorflow example" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'Hello, TensorFlow!'\n" ] } ], "source": [ "import tensorflow as tf\n", "hello = tf.constant('Hello, TensorFlow!')\n", "sess = tf.Session()\n", "print(sess.run(hello))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Building a neural network model with Keras" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "from sklearn.datasets import load_breast_cancer\n", "cancer = load_breast_cancer()\n", "\n", "X_train = cancer.data[:340]\n", "y_train = cancer.target[:340]\n", "\n", "X_test = cancer.data[340:]\n", "y_test = cancer.target[340:]\n", "\n", "import numpy as np\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "model = Sequential()\n", "model.add(Dense(15, input_dim=30, activation='relu'))\n", "model.add(Dense(1, activation='sigmoid'))" ] }, { "cell_type": "code", "execution_count": 151, "metadata": { "collapsed": true }, "outputs": [], "source": [ "model.compile(loss='binary_crossentropy',\n", " optimizer='rmsprop',\n", " 
metrics=['accuracy'])" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 2/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 3/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 4/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 5/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 6/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 7/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 8/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 9/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 10/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 11/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 12/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 13/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 14/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 15/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 16/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 17/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 18/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n", "Epoch 19/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 
0.5382 \n", "Epoch 20/20\n", "340/340 [==============================] - 0s - loss: 7.3616 - acc: 0.5382 \n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 152, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.fit(X_train, y_train,\n", " epochs=20,\n", " batch_size=50)" ] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", " 32/229 [===>..........................] - ETA: 1s" ] } ], "source": [ "predictions = model.predict_classes(X_test)" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.759825327511\n", " precision recall f1-score support\n", "\n", " 0 0.00 0.00 0.00 55\n", " 1 0.76 1.00 0.86 174\n", "\n", "avg / total 0.58 0.76 0.66 229\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\metrics\\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", " 'precision', 'predicted', average, warn_for)\n" ] } ], "source": [ "from sklearn import metrics\n", "\n", "print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))\n", "print(metrics.classification_report(y_true=y_test, y_pred=predictions))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The power of deep learning models" ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/20\n", "340/340 [==============================] - 0s - loss: 3.3799 - acc: 0.3941 \n", "Epoch 2/20\n", "340/340 [==============================] - 0s - loss: 1.3740 - acc: 0.6059 \n", "Epoch 3/20\n", "340/340 [==============================] - 0s - loss: 0.4258 - acc: 0.8471 \n", "Epoch 4/20\n", "340/340 
[==============================] - 0s - loss: 0.2859 - acc: 0.8912 \n", "Epoch 5/20\n", "340/340 [==============================] - 0s - loss: 0.2061 - acc: 0.9206 \n", "Epoch 6/20\n", "340/340 [==============================] - 0s - loss: 0.2407 - acc: 0.8941 \n", "Epoch 7/20\n", "340/340 [==============================] - 0s - loss: 0.2725 - acc: 0.9118 \n", "Epoch 8/20\n", "340/340 [==============================] - 0s - loss: 0.5237 - acc: 0.8676 \n", "Epoch 9/20\n", "340/340 [==============================] - 0s - loss: 0.2165 - acc: 0.9324 \n", "Epoch 10/20\n", "340/340 [==============================] - 0s - loss: 0.2502 - acc: 0.9029 \n", "Epoch 11/20\n", "340/340 [==============================] - 0s - loss: 0.3235 - acc: 0.8853 \n", "Epoch 12/20\n", "340/340 [==============================] - 0s - loss: 0.3115 - acc: 0.8912 \n", "Epoch 13/20\n", "340/340 [==============================] - 0s - loss: 0.2975 - acc: 0.9059 \n", "Epoch 14/20\n", "340/340 [==============================] - 0s - loss: 0.3426 - acc: 0.9118 \n", "Epoch 15/20\n", "340/340 [==============================] - 0s - loss: 0.3763 - acc: 0.9176 \n", "Epoch 16/20\n", "340/340 [==============================] - 0s - loss: 0.2420 - acc: 0.9088 \n", "Epoch 17/20\n", "340/340 [==============================] - 0s - loss: 0.4274 - acc: 0.8618 \n", "Epoch 18/20\n", "340/340 [==============================] - 0s - loss: 0.1885 - acc: 0.9353 \n", "Epoch 19/20\n", "340/340 [==============================] - 0s - loss: 0.2361 - acc: 0.9235 \n", "Epoch 20/20\n", "340/340 [==============================] - 0s - loss: 0.3154 - acc: 0.9000 \n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 155, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = Sequential()\n", "model.add(Dense(15, input_dim=30, activation='relu'))\n", "model.add(Dense(15, activation='relu'))\n", "model.add(Dense(15, activation='relu'))\n", "model.add(Dense(1, activation='sigmoid'))\n", "\n", 
"model.compile(loss='binary_crossentropy',\n", " optimizer='rmsprop',\n", " metrics=['accuracy'])\n", "\n", "model.fit(X_train, y_train,\n", " epochs=20,\n", " batch_size=50)" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\r", " 32/229 [===>..........................] - ETA: 1s" ] } ], "source": [ "predictions = model.predict_classes(X_test)" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.912663755459\n", " precision recall f1-score support\n", "\n", " 0 0.78 0.89 0.83 55\n", " 1 0.96 0.92 0.94 174\n", "\n", "avg / total 0.92 0.91 0.91 229\n", "\n" ] } ], "source": [ "print('Accuracy:', metrics.accuracy_score(y_true=y_test, y_pred=predictions))\n", "print(metrics.classification_report(y_true=y_test, y_pred=predictions))" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }