{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "################################################################\n",
    "##  ImportingData #1.2\n",
    "##  Atul Singh\n",
    "##  www.datagenx.net\n",
    "################################################################"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing Data #1\n",
    "### #1.2 Importing data from other file formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import \n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import pickle as pkl"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Running os library funcs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C:\\Git\\MachineLearning\\python_DC\n",
      "['.ipynb_checkpoints', 'dataset', 'ImportingData_#1.1.ipynb', 'ImportingData_#1.2.ipynb']\n"
     ]
    }
   ],
   "source": [
    "wd = os.getcwd()  # get working dir\n",
    "files = os.listdir(wd)  # get content of current working dir\n",
    "print(wd)\n",
    "print(files)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "### Reading Excel using pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['titanic1', 'titanic2', 'titanic3']\n"
     ]
    }
   ],
   "source": [
    "fh = pd.ExcelFile(\"dataset/titanic.xls\")\n",
    "print(fh.sheet_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   1  1.1                    Allen, Miss. Elisabeth Walton  female       29  \\\n",
      "0  1    1                   Allison, Master. Hudson Trevor    male   0.9167   \n",
      "1  1    0                     Allison, Miss. Helen Loraine  female   2.0000   \n",
      "2  1    0             Allison, Mr. Hudson Joshua Creighton    male  30.0000   \n",
      "3  1    0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   \n",
      "4  1    1                              Anderson, Mr. Harry    male  48.0000   \n",
      "\n",
      "   0  0.1   24160  211.3375       B5  S    2  Unnamed: 12  \\\n",
      "0  1    2  113781    151.55  C22 C26  S   11          NaN   \n",
      "1  1    2  113781    151.55  C22 C26  S  NaN          NaN   \n",
      "2  1    2  113781    151.55  C22 C26  S  NaN        135.0   \n",
      "3  1    2  113781    151.55  C22 C26  S  NaN          NaN   \n",
      "4  0    0   19952     26.55      E12  S    3          NaN   \n",
      "\n",
      "                      St Louis, MO  \n",
      "0  Montreal, PQ / Chesterville, ON  \n",
      "1  Montreal, PQ / Chesterville, ON  \n",
      "2  Montreal, PQ / Chesterville, ON  \n",
      "3  Montreal, PQ / Chesterville, ON  \n",
      "4                     New York, NY  \n"
     ]
    }
   ],
   "source": [
    "# parsing the sheets into pandas\n",
    "df1 = fh.parse(\"titanic1\")  # first line becomes header\n",
    "print(df1.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pclass  survived                                             name     sex  \\\n",
      "0       1         1                    Allen, Miss. Elisabeth Walton  female   \n",
      "1       1         1                   Allison, Master. Hudson Trevor    male   \n",
      "2       1         0                     Allison, Miss. Helen Loraine  female   \n",
      "3       1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
      "4       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
      "\n",
      "       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \\\n",
      "0  29.0000      0      0   24160  211.3375       B5        S    2    NaN   \n",
      "1   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   \n",
      "2   2.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   \n",
      "3  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   \n",
      "4  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   \n",
      "\n",
      "                         home.dest  \n",
      "0                     St Louis, MO  \n",
      "1  Montreal, PQ / Chesterville, ON  \n",
      "2  Montreal, PQ / Chesterville, ON  \n",
      "3  Montreal, PQ / Chesterville, ON  \n",
      "4  Montreal, PQ / Chesterville, ON  \n"
     ]
    }
   ],
   "source": [
    "df1 = fh.parse(1)\n",
    "print(df1.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   1  1.1                    Allen, Miss. Elisabeth Walton  female       29  \\\n",
      "0  1    1                   Allison, Master. Hudson Trevor    male   0.9167   \n",
      "1  1    0             Allison, Mr. Hudson Joshua Creighton    male  30.0000   \n",
      "2  1    0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   \n",
      "3  1    1                              Anderson, Mr. Harry    male  48.0000   \n",
      "4  1    1                Andrews, Miss. Kornelia Theodosia  female  63.0000   \n",
      "\n",
      "   0  0.1   24160  211.3375       B5  S    2  Unnamed: 12  \\\n",
      "0  1    2  113781  151.5500  C22 C26  S   11          NaN   \n",
      "1  1    2  113781  151.5500  C22 C26  S  NaN        135.0   \n",
      "2  1    2  113781  151.5500  C22 C26  S  NaN          NaN   \n",
      "3  0    0   19952   26.5500      E12  S    3          NaN   \n",
      "4  1    0   13502   77.9583       D7  S   10          NaN   \n",
      "\n",
      "                      St Louis, MO  \n",
      "0  Montreal, PQ / Chesterville, ON  \n",
      "1  Montreal, PQ / Chesterville, ON  \n",
      "2  Montreal, PQ / Chesterville, ON  \n",
      "3                     New York, NY  \n",
      "4                       Hudson, NY  \n"
     ]
    }
   ],
   "source": [
    "# parsing data with more option\n",
    "df1 = fh.parse(0, skiprows=[2])  #1st line becomes header and after it will skip the 2nd row only, \n",
    "print(df1.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pclass  survived                                             name     sex  \\\n",
      "0       1         1                   Allison, Master. Hudson Trevor    male   \n",
      "1       1         0             Allison, Mr. Hudson Joshua Creighton    male   \n",
      "2       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female   \n",
      "3       1         1                              Anderson, Mr. Harry    male   \n",
      "4       1         1                Andrews, Miss. Kornelia Theodosia  female   \n",
      "\n",
      "       age  sibsp  parch  ticket      fare    cabin embarked boat   body  \\\n",
      "0   0.9167      1      2  113781  151.5500  C22 C26        S   11    NaN   \n",
      "1  30.0000      1      2  113781  151.5500  C22 C26        S  NaN  135.0   \n",
      "2  25.0000      1      2  113781  151.5500  C22 C26        S  NaN    NaN   \n",
      "3  48.0000      0      0   19952   26.5500      E12        S    3    NaN   \n",
      "4  63.0000      1      0   13502   77.9583       D7        S   10    NaN   \n",
      "\n",
      "                         home.dest  \n",
      "0  Montreal, PQ / Chesterville, ON  \n",
      "1  Montreal, PQ / Chesterville, ON  \n",
      "2  Montreal, PQ / Chesterville, ON  \n",
      "3                     New York, NY  \n",
      "4                       Hudson, NY  \n"
     ]
    }
   ],
   "source": [
    "colnames = [\"pclass\",\"survived\",\"name\",\"sex\",\"age\",\"sibsp\",\"parch\",\"ticket\",\"fare\",\"cabin\",\"embarked\",\"boat\",\"body\",\"home.dest\"]\n",
    "df1 = fh.parse(0, skiprows=[2], names=colnames)  #it will skip the 2nd row only, \n",
    "print(df1.head()) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   pclass  survived                                             name     sex\n",
      "0       1         0  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female\n",
      "1       1         1                              Anderson, Mr. Harry    male\n",
      "2       1         1                Andrews, Miss. Kornelia Theodosia  female\n",
      "3       1         0                           Andrews, Mr. Thomas Jr    male\n",
      "4       1         1    Appleton, Mrs. Edward Dale (Charlotte Lamson)  female\n"
     ]
    }
   ],
   "source": [
    "#print(colnames[0:4])\n",
    "# skip - 1,2,3 rows, cols - 0,1,2,3 and assigning name\n",
    "df1 = fh.parse(0, skiprows=[1,2,3], parse_cols=[0,1,2,3], names=colnames[0:4])  \n",
    "print(df1.head()) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading SAS files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# # Import sas7bdat package\n",
    "# from sas7bdat import SAS7BDAT\n",
    "\n",
    "# # Save file to a DataFrame: df_sas\n",
    "# with SAS7BDAT('sales.sas7bdat') as file:\n",
    "#     df_sas = file.to_data_frame()\n",
    "\n",
    "# # Print head of DataFrame\n",
    "# print(df_sas.head())\n",
    "\n",
    "# # Plot histogram of DataFrame features (pandas and pyplot already imported)\n",
    "# pd.DataFrame.hist(df_sas[['P']])\n",
    "# plt.ylabel('count')\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Importing SAS Stata files\n",
    "\n",
    "# # Import pandas\n",
    "# import pandas as pd\n",
    "\n",
    "# # Load Stata file into a pandas DataFrame: df\n",
    "# df = pd.read_stata(\"disarea.dta\")\n",
    "\n",
    "# # Print the head of the DataFrame df\n",
    "# print(df.head())\n",
    "\n",
    "# # Plot histogram of one column of the DataFrame\n",
    "# pd.DataFrame.hist(df[['disa10']])\n",
    "# plt.xlabel('Extent of disease')\n",
    "# plt.ylabel('Number of coutries')\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Reading HDFS files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# # Import packages\n",
    "# import numpy as np\n",
    "# import h5py\n",
    "\n",
    "# # Assign filename: file\n",
    "# file = \"LIGO_data.hdf5\"\n",
    "\n",
    "# # Load file: data\n",
    "# data = h5py.File(file, \"r\")\n",
    "\n",
    "# # Print the datatype of the loaded file\n",
    "# print(type(data))\n",
    "\n",
    "# # Print the keys of the file\n",
    "# for key in data.keys():\n",
    "#     print(key)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# # Get the HDF5 group: group\n",
    "# group = data['strain']\n",
    "\n",
    "# # Check out keys of group\n",
    "# for key in group.keys():\n",
    "#     print(key)\n",
    "\n",
    "# # Set variable equal to time series data: strain\n",
    "# strain = data['strain']['Strain'].value\n",
    "\n",
    "# # Set number of time points to sample: num_samples\n",
    "# num_samples=10000\n",
    "\n",
    "# # Set time vector\n",
    "# time = np.arange(0, 1, 1/num_samples)\n",
    "\n",
    "# # Plot data\n",
    "# plt.plot(time, strain[:num_samples])\n",
    "# plt.xlabel('GPS Time (s)')\n",
    "# plt.ylabel('strain')\n",
    "# plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Loading Matlab files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# # Import package\n",
    "# import scipy.io\n",
    "\n",
    "# # Load MATLAB file: mat\n",
    "# mat = scipy.io.loadmat('albeck_gene_expression.mat')\n",
    "\n",
    "# # Print the datatype type of mat\n",
    "# print(type(mat))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# # Print the keys of the MATLAB dictionary\n",
    "# print(mat.keys())\n",
    "\n",
    "# # Print the type of the value corresponding to the key 'CYratioCyt'\n",
    "# print(type(mat['CYratioCyt']))\n",
    "\n",
    "# # Print the shape of the value corresponding to the key 'CYratioCyt'\n",
    "# print(np.shape(mat['CYratioCyt']))\n",
    "\n",
    "# # Subset the array and plot it\n",
    "# data = mat['CYratioCyt'][25, 5:]\n",
    "# fig = plt.figure()\n",
    "# plt.plot(data)\n",
    "# plt.xlabel('time (min.)')\n",
    "# plt.ylabel('normalized fluorescence (measure of expression)')\n",
    "# plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "############################################################\n",
    "## Atul Singh  | www.datagenx.net | lnked.in/atulsingh\n",
    "############################################################"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}