{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Data Loading, Storage, " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc('figure', figsize=(10, 6))\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Reading and Writing Data in Text Format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/ex1.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df = pd.read_csv('examples/ex1.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.read_table('examples/ex1.csv', sep=',')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/ex2.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.read_csv('examples/ex2.csv', header=None)\n", "pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "names = ['a', 'b', 'c', 'd', 'message']\n", "pd.read_csv('examples/ex2.csv', names=names, index_col='message')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/csv_mindex.csv\n", "parsed = pd.read_csv('examples/csv_mindex.csv',\n", " index_col=['key1', 'key2'])\n", "parsed" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "list(open('examples/ex3.txt'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "result = pd.read_table('examples/ex3.txt', sep='\\s+')\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/ex4.csv\n", "pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/ex5.csv\n", "result = pd.read_csv('examples/ex5.csv')\n", "result\n", "pd.isnull(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n", "pd.read_csv('examples/ex5.csv', na_values=sentinels)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Reading Text Files in Pieces" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.options.display.max_rows = 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "result = pd.read_csv('examples/ex6.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.read_csv('examples/ex6.csv', nrows=5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n", "chunker" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n", "\n", "tot = pd.Series([])\n", "for piece in chunker:\n", " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", "\n", "tot = tot.sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "tot[:10]" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Writing Data to Text Format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = pd.read_csv('examples/ex5.csv')\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data.to_csv('examples/out.csv')\n", "!cat examples/out.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import sys\n", "data.to_csv(sys.stdout, sep='|')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data.to_csv(sys.stdout, na_rep='NULL')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, header=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "dates = pd.date_range('1/1/2000', periods=7)\n", "ts = pd.Series(np.arange(7), index=dates)\n", "ts.to_csv('examples/tseries.csv')\n", "!cat examples/tseries.csv" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Working with Delimited Formats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/ex7.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import csv\n", "f = open('examples/ex7.csv')\n", "\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "for line in reader:\n", " print(line)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "with open('examples/ex7.csv') as f:\n", " lines = list(csv.reader(f))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "header, values = lines[0], lines[1:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", "data_dict" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "class my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = ';'\n", " quotechar = '\"'\n", " quoting = csv.QUOTE_MINIMAL" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "reader = csv.reader(f, dialect=my_dialect)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "reader = csv.reader(f, delimiter='|')" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "with open('mydata.csv', 'w') as f:\n", " writer = csv.writer(f, dialect=my_dialect)\n", " writer.writerow(('one', 'two', 'three'))\n", " writer.writerow(('1', '2', '3'))\n", " writer.writerow(('4', '5', '6'))\n", " writer.writerow(('7', '8', '9'))" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### JSON Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "obj = \"\"\"\n", "{\"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n", " {\"name\": \"Katie\", \"age\": 38,\n", " \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n", "}\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import json\n", "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "asjson = json.dumps(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])\n", "siblings" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!cat examples/example.json" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = pd.read_json('examples/example.json')\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "print(data.to_json())\n", "print(data.to_json(orient='records'))" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### XML and HTML: Web Scraping" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "conda install lxml\n", "pip install beautifulsoup4 html5lib" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "tables = pd.read_html('examples/fdic_failed_bank_list.html')\n", "len(tables)\n", "failures = tables[0]\n", "failures.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "close_timestamps = pd.to_datetime(failures['Closing Date'])\n", "close_timestamps.dt.year.value_counts()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### Parsing XML with lxml.objectify" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "\n", " 373889\n", " \n", " Metro-North Railroad\n", " Escalator Availability\n", " Percent of the time that escalators are operational\n", " systemwide. The availability rate is based on physical observations performed\n", " the morning of regular business days only. This is a new indicator the agency\n", " began reporting in 2009.\n", " 2011\n", " 12\n", " Service Indicators\n", " M\n", " U\n", " %\n", " 1\n", " 97.00\n", " \n", " 97.00\n", " \n", "" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from lxml import objectify\n", "\n", "path = 'datasets/mta_perf/Performance_MNR.xml'\n", "parsed = objectify.parse(open(path))\n", "root = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = []\n", "\n", "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", "\n", "for elt in root.INDICATOR:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "perf = pd.DataFrame(data)\n", "perf.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from io import StringIO\n", "tag = 'Google'\n", "root = objectify.parse(StringIO(tag)).getroot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "root\n", "root.get('href')\n", "root.text" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Binary Data Formats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame = pd.read_csv('examples/ex1.csv')\n", "frame\n", "frame.to_pickle('examples/frame_pickle')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.read_pickle('examples/frame_pickle')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!rm examples/frame_pickle" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Using HDF5 Format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame = pd.DataFrame({'a': np.random.randn(100)})\n", "store = pd.HDFStore('mydata.h5')\n", "store['obj1'] = frame\n", "store['obj1_col'] = frame['a']\n", "store" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "store['obj1']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "store.put('obj2', frame, format='table')\n", "store.select('obj2', where=['index >= 10 and index <= 15'])\n", "store.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame.to_hdf('mydata.h5', 'obj3', format='table')\n", "pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "os.remove('mydata.h5')" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Reading Microsoft Excel Files" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "xlsx = pd.ExcelFile('examples/ex1.xlsx')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.read_excel(xlsx, 'Sheet1')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')\n", "frame" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "writer = pd.ExcelWriter('examples/ex2.xlsx')\n", "frame.to_excel(writer, 'Sheet1')\n", "writer.save()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "frame.to_excel('examples/ex2.xlsx')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!rm examples/ex2.xlsx" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Interacting with Web APIs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import requests\n", "url = 'https://api.github.com/repos/pandas-dev/pandas/issues'\n", "resp = requests.get(url)\n", "resp" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = resp.json()\n", "data[0]['title']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "issues = pd.DataFrame(data, columns=['number', 'title',\n", " 'labels', 'state'])\n", "issues" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Interacting with Databases" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import sqlite3\n", "query = \"\"\"\n", "CREATE TABLE test\n", "(a VARCHAR(20), b VARCHAR(20),\n", " c REAL, d INTEGER\n", ");\"\"\"\n", "con = sqlite3.connect('mydata.sqlite')\n", "con.execute(query)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "data = [('Atlanta', 'Georgia', 1.25, 6),\n", " ('Tallahassee', 'Florida', 2.6, 3),\n", " ('Sacramento', 'California', 1.7, 5)]\n", "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", "con.executemany(stmt, data)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cursor = con.execute('select * from test')\n", "rows = cursor.fetchall()\n", "rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cursor.description\n", "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import sqlalchemy as sqla\n", "db = sqla.create_engine('sqlite:///mydata.sqlite')\n", "pd.read_sql('select * from test', db)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "!rm mydata.sqlite" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Conclusion" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.0" } }, "nbformat": 4, "nbformat_minor": 0 }