{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NumPy basics on the white wine quality dataset\n",
    "\n",
    "Basic NumPy operations demonstrated on `winequality-white.csv`: loading the file with the `csv` module and with `np.genfromtxt`, indexing, slicing, in-place assignment, dtype conversion and simple column statistics."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n",
      "1.14.2\n"
     ]
    }
   ],
   "source": [
    "from __future__ import print_function  # the single-argument print(...) calls below run unchanged on Python 2 and 3\n",
    "\n",
    "import csv\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "# record the library versions the stored outputs were produced with\n",
    "print(csv.__version__)\n",
    "print(np.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading the CSV with the `csv` module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Location of the dataset, defined once so every loader below uses the same path\n",
    "DATA_DIR = '../data'\n",
    "WINE_CSV_PATH = os.path.abspath(os.path.join(DATA_DIR, 'day7/winequality-white.csv'))\n",
    "\n",
    "# Read the ';'-delimited file with the csv module and keep every row\n",
    "# (header first, then the records) as a list of string lists in `wine_rows`\n",
    "with open(WINE_CSV_PATH, 'r') as datafile:\n",
    "    reader = csv.reader(datafile, delimiter=';')\n",
    "    wine_rows = list(reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['fixed acidity',\n",
       " 'volatile acidity',\n",
       " 'citric acid',\n",
       " 'residual sugar',\n",
       " 'chlorides',\n",
       " 'free sulfur dioxide',\n",
       " 'total sulfur dioxide',\n",
       " 'density',\n",
       " 'pH',\n",
       " 'sulphates',\n",
       " 'alcohol',\n",
       " 'quality']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the first row is the header holding the column names\n",
    "wine_rows[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total records: 4898\n"
     ]
    }
   ],
   "source": [
    "# -1 so the header row is not counted as a record\n",
    "print('Total records: {}'.format(len(wine_rows) - 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.87790935076\n"
     ]
    }
   ],
   "source": [
    "# average of the quality variable (last column), skipping the header row\n",
    "qualities = [float(row[-1]) for row in wine_rows[1:]]\n",
    "print(sum(qualities) / len(qualities))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Converting the rows to a NumPy array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(4898, 12)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert the records (header skipped) to a NumPy array, casting every cell to float.\n",
    "# The header is sliced off inline instead of rebinding the list, so re-running\n",
    "# this cell never drops more than the header row.\n",
    "wines_np = np.array(wine_rows[1:], dtype='float')\n",
    "wines_np.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 7. , 0.27, 0.36, ..., 0.45, 8.8 , 6. ],\n",
       " [ 6.3 , 0.3 , 0.34, ..., 0.49, 9.5 , 6. ],\n",
       " [ 8.1 , 0.28, 0.4 , ..., 0.44, 10.1 , 6. ],\n",
       " ...,\n",
       " [ 6.5 , 0.24, 0.19, ..., 0.46, 9.4 , 6. ],\n",
       " [ 5.5 , 0.29, 0.3 , ..., 0.38, 12.8 , 7. ],\n",
       " [ 6. , 0.21, 0.38, ..., 0.32, 11.8 , 6. ]])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# display the 2-D wines array\n",
    "wines_np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creating arrays from scratch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0. 0.]\n",
      " [0. 0.]]\n"
     ]
    }
   ],
   "source": [
    "# create a NumPy array in which every element is zero\n",
    "print(np.zeros((2, 2), dtype='float'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.79301112 0.95735914]\n",
      " [0.29987175 0.96456405]]\n"
     ]
    }
   ],
   "source": [
    "# create a NumPy array of random numbers (unseeded, so the values change on every run)\n",
    "print(np.random.rand(2, 2))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading the CSV directly with NumPy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# read the same dataset directly with NumPy, skipping the header row\n",
    "wines = np.genfromtxt(WINE_CSV_PATH, delimiter=';', skip_header=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ True, True, True, ..., True, True, True],\n",
       " [ True, True, True, ..., True, True, True],\n",
       " [ True, True, True, ..., True, True, True],\n",
       " ...,\n",
       " [ True, True, True, ..., True, True, True],\n",
       " [ True, True, True, ..., True, True, True],\n",
       " [ True, True, True, ..., True, True, True]])"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# element-wise comparison of the two loading routes (csv module vs. genfromtxt)\n",
    "wines == wines_np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.28\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "# value in the 3rd row, 2nd column of wines (indices are zero-based)\n",
    "print(wines[2, 1])\n",
    "print(wines[2, 1] == wines_np[2, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Slicing and assignment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.36, 0.34, 0.4 , 0.32])"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# first 4 rows of the 3rd column: wines[start:end, column_index]\n",
    "# indices start at zero and the slice end is exclusive, so :4 selects rows 0, 1, 2, 3\n",
    "wines[:4, 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([10., 10., 10., 10.])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# overwrite the 3rd column (index 2) with 10.0 for all rows\n",
    "# NOTE: this modifies `wines` in place; re-run the np.genfromtxt cell to restore the loaded values\n",
    "wines[:, 2] = 10.0\n",
    "wines[:4, 2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Arrays with other numbers of dimensions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0.28855438, 0.95129591, 0.80747318, 0.89765623, 0.98632739])"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a 1-D NumPy array of random numbers\n",
    "random_1d = np.random.rand(5)\n",
    "random_1d"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a 3-D NumPy array of random numbers\n",
    "random_3d = np.random.rand(2, 4, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2, 4, 3)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# one way to read this shape: 2 years x 4 quarters per year x 3 months per quarter\n",
    "# 2 x 4 x 3 = 24 monthly values\n",
    "random_3d.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data types"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 7, 0, 10, ..., 0, 8, 6],\n",
       " [ 6, 0, 10, ..., 0, 9, 6],\n",
       " [ 8, 0, 10, ..., 0, 10, 6],\n",
       " ...,\n",
       " [ 6, 0, 10, ..., 0, 9, 6],\n",
       " [ 5, 0, 10, ..., 0, 12, 7],\n",
       " [ 6, 0, 10, ..., 0, 11, 6]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Data types in NumPy\n",
    "# astype returns a converted copy of wines; casting to int truncates the fractional parts\n",
    "wines.astype('int')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Arithmetic and aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[6. 6. 6. ... 6. 7. 6.]\n",
      "[7. 7. 7. ... 7. 8. 7.]\n"
     ]
    }
   ],
   "source": [
    "# add a scalar to one column across all rows\n",
    "# the other arithmetic operators can be applied element-wise in the same way\n",
    "print(wines[:, 11])\n",
    "print(wines[:, 11] + 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([36., 36., 36., ..., 36., 49., 36.])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# multiply two columns element-wise\n",
    "# here the 12th column (index 11) is multiplied by itself, i.e. squared\n",
    "wines[:, 11] * wines[:, 11]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "28790.0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sum one column across all rows\n",
    "wines[:, 11].sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5.87790935075541"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# mean of one column; std, min and max are similar methods for fast summary statistics\n",
    "wines[:, 11].mean()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}