{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "################################################################\n", "## ImportingData #1.2\n", "## Atul Singh\n", "## www.datagenx.net\n", "################################################################" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Importing Data #1\n", "### #1.2 Importing data from other file formats" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import \n", "import numpy as np\n", "import pandas as pd\n", "import os\n", "import pickle as pkl" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Running os library funcs" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "C:\\Git\\MachineLearning\\python_DC\n", "['.ipynb_checkpoints', 'dataset', 'ImportingData_#1.1.ipynb', 'ImportingData_#1.2.ipynb']\n" ] } ], "source": [ "wd = os.getcwd() # get working dir\n", "files = os.listdir(wd) # get content of current working dir\n", "print(wd)\n", "print(files)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": false }, "source": [ "### Reading Excel using pandas" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['titanic1', 'titanic2', 'titanic3']\n" ] } ], "source": [ "fh = pd.ExcelFile(\"dataset/titanic.xls\")\n", "print(fh.sheet_names)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 1.1 Allen, Miss. Elisabeth Walton female 29 \\\n", "0 1 1 Allison, Master. Hudson Trevor male 0.9167 \n", "1 1 0 Allison, Miss. Helen Loraine female 2.0000 \n", "2 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 \n", "3 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 \n", "4 1 1 Anderson, Mr. Harry male 48.0000 \n", "\n", " 0 0.1 24160 211.3375 B5 S 2 Unnamed: 12 \\\n", "0 1 2 113781 151.55 C22 C26 S 11 NaN \n", "1 1 2 113781 151.55 C22 C26 S NaN NaN \n", "2 1 2 113781 151.55 C22 C26 S NaN 135.0 \n", "3 1 2 113781 151.55 C22 C26 S NaN NaN \n", "4 0 0 19952 26.55 E12 S 3 NaN \n", "\n", " St Louis, MO \n", "0 Montreal, PQ / Chesterville, ON \n", "1 Montreal, PQ / Chesterville, ON \n", "2 Montreal, PQ / Chesterville, ON \n", "3 Montreal, PQ / Chesterville, ON \n", "4 New York, NY \n" ] } ], "source": [ "# parsing the sheets into pandas\n", "df1 = fh.parse(\"titanic1\") # first line becomes header\n", "print(df1.head())" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pclass survived name sex \\\n", "0 1 1 Allen, Miss. Elisabeth Walton female \n", "1 1 1 Allison, Master. Hudson Trevor male \n", "2 1 0 Allison, Miss. Helen Loraine female \n", "3 1 0 Allison, Mr. Hudson Joshua Creighton male \n", "4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", "\n", " age sibsp parch ticket fare cabin embarked boat body \\\n", "0 29.0000 0 0 24160 211.3375 B5 S 2 NaN \n", "1 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN \n", "2 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n", "3 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 \n", "4 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n", "\n", " home.dest \n", "0 St Louis, MO \n", "1 Montreal, PQ / Chesterville, ON \n", "2 Montreal, PQ / Chesterville, ON \n", "3 Montreal, PQ / Chesterville, ON \n", "4 Montreal, PQ / Chesterville, ON \n" ] } ], "source": [ "df1 = fh.parse(1)\n", "print(df1.head())" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 1.1 Allen, Miss. Elisabeth Walton female 29 \\\n", "0 1 1 Allison, Master. Hudson Trevor male 0.9167 \n", "1 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 \n", "2 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 \n", "3 1 1 Anderson, Mr. Harry male 48.0000 \n", "4 1 1 Andrews, Miss. Kornelia Theodosia female 63.0000 \n", "\n", " 0 0.1 24160 211.3375 B5 S 2 Unnamed: 12 \\\n", "0 1 2 113781 151.5500 C22 C26 S 11 NaN \n", "1 1 2 113781 151.5500 C22 C26 S NaN 135.0 \n", "2 1 2 113781 151.5500 C22 C26 S NaN NaN \n", "3 0 0 19952 26.5500 E12 S 3 NaN \n", "4 1 0 13502 77.9583 D7 S 10 NaN \n", "\n", " St Louis, MO \n", "0 Montreal, PQ / Chesterville, ON \n", "1 Montreal, PQ / Chesterville, ON \n", "2 Montreal, PQ / Chesterville, ON \n", "3 New York, NY \n", "4 Hudson, NY \n" ] } ], "source": [ "# parsing data with more option\n", "df1 = fh.parse(0, skiprows=[2]) #1st line becomes header and after it will skip the 2nd row only, \n", "print(df1.head())" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pclass survived name sex \\\n", "0 1 1 Allison, Master. Hudson Trevor male \n", "1 1 0 Allison, Mr. Hudson Joshua Creighton male \n", "2 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female \n", "3 1 1 Anderson, Mr. Harry male \n", "4 1 1 Andrews, Miss. Kornelia Theodosia female \n", "\n", " age sibsp parch ticket fare cabin embarked boat body \\\n", "0 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN \n", "1 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 \n", "2 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN \n", "3 48.0000 0 0 19952 26.5500 E12 S 3 NaN \n", "4 63.0000 1 0 13502 77.9583 D7 S 10 NaN \n", "\n", " home.dest \n", "0 Montreal, PQ / Chesterville, ON \n", "1 Montreal, PQ / Chesterville, ON \n", "2 Montreal, PQ / Chesterville, ON \n", "3 New York, NY \n", "4 Hudson, NY \n" ] } ], "source": [ "colnames = [\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"home.dest\"]\n", "df1 = fh.parse(0, skiprows=[2], names=colnames) #it will skip the 2nd row only, \n", "print(df1.head()) " ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " pclass survived name sex\n", "0 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female\n", "1 1 1 Anderson, Mr. Harry male\n", "2 1 1 Andrews, Miss. Kornelia Theodosia female\n", "3 1 0 Andrews, Mr. Thomas Jr male\n", "4 1 1 Appleton, Mrs. Edward Dale (Charlotte Lamson) female\n" ] } ], "source": [ "#print(colnames[0:4])\n", "# skip - 1,2,3 rows, cols - 0,1,2,3 and assigning name\n", "df1 = fh.parse(0, skiprows=[1,2,3], parse_cols=[0,1,2,3], names=colnames[0:4]) \n", "print(df1.head()) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading SAS files" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# # Import sas7bdat package\n", "# from sas7bdat import SAS7BDAT\n", "\n", "# # Save file to a DataFrame: df_sas\n", "# with SAS7BDAT('sales.sas7bdat') as file:\n", "# df_sas = file.to_data_frame()\n", "\n", "# # Print head of DataFrame\n", "# print(df_sas.head())\n", "\n", "# # Plot histogram of DataFrame features (pandas and pyplot already imported)\n", "# pd.DataFrame.hist(df_sas[['P']])\n", "# plt.ylabel('count')\n", "# plt.show()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [], "source": [ "## Importing SAS Stata files\n", "\n", "# # Import pandas\n", "# import pandas as pd\n", "\n", "# # Load Stata file into a pandas DataFrame: df\n", "# df = pd.read_stata(\"disarea.dta\")\n", "\n", "# # Print the head of the DataFrame df\n", "# print(df.head())\n", "\n", "# # Plot histogram of one column of the DataFrame\n", "# pd.DataFrame.hist(df[['disa10']])\n", "# plt.xlabel('Extent of disease')\n", "# plt.ylabel('Number of coutries')\n", "# plt.show()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Reading HDFS files" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Import packages\n", "# import numpy as np\n", "# import h5py\n", "\n", "# # Assign filename: file\n", "# file = \"LIGO_data.hdf5\"\n", "\n", "# # Load file: data\n", "# data = h5py.File(file, \"r\")\n", "\n", "# # Print the datatype of the loaded file\n", "# print(type(data))\n", "\n", "# # Print the keys of the file\n", "# for key in data.keys():\n", "# print(key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Get the HDF5 group: group\n", "# group = data['strain']\n", "\n", "# # Check out keys of group\n", "# for key in group.keys():\n", "# print(key)\n", "\n", "# # Set variable equal to time series data: strain\n", "# strain = data['strain']['Strain'].value\n", "\n", "# # Set number of time points to sample: num_samples\n", "# num_samples=10000\n", "\n", "# # Set time vector\n", "# time = np.arange(0, 1, 1/num_samples)\n", "\n", "# # Plot data\n", "# plt.plot(time, strain[:num_samples])\n", "# plt.xlabel('GPS Time (s)')\n", "# plt.ylabel('strain')\n", "# plt.show()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Loading Matlab files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Import package\n", "# import scipy.io\n", "\n", "# # Load MATLAB file: mat\n", "# mat = scipy.io.loadmat('albeck_gene_expression.mat')\n", "\n", "# # Print the datatype type of mat\n", "# print(type(mat))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# # Print the keys of the MATLAB dictionary\n", "# print(mat.keys())\n", "\n", "# # Print the type of the value corresponding to the key 'CYratioCyt'\n", "# print(type(mat['CYratioCyt']))\n", "\n", "# # Print the shape of the value corresponding to the key 'CYratioCyt'\n", "# print(np.shape(mat['CYratioCyt']))\n", "\n", "# # Subset the array and plot it\n", "# data = mat['CYratioCyt'][25, 5:]\n", "# fig = plt.figure()\n", "# plt.plot(data)\n", "# plt.xlabel('time (min.)')\n", "# plt.ylabel('normalized fluorescence (measure of expression)')\n", "# plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "############################################################\n", "## Atul Singh | www.datagenx.net | lnked.in/atulsingh\n", "############################################################" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }