{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data loading, storage, and file formats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division\n", "from numpy.random import randn\n", "import numpy as np\n", "import os\n", "import sys\n", "import matplotlib.pyplot as plt\n", "np.random.seed(12345)\n", "plt.rc('figure', figsize=(10, 6))\n", "from pandas import Series, DataFrame\n", "import pandas as pd\n", "np.set_printoptions(precision=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading and Writing Data in Text Format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/ex1.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = pd.read_csv('ch06/ex1.csv')\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_table('ch06/ex1.csv', sep=',')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/ex2.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_csv('ch06/ex2.csv', header=None)\n", "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "names = ['a', 'b', 'c', 'd', 'message']\n", "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/csv_mindex.csv\n", "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])\n", "parsed" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "list(open('ch06/ex3.txt'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = pd.read_table('ch06/ex3.txt', sep='\\s+')\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/ex4.csv\n", "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/ex5.csv\n", "result = pd.read_csv('ch06/ex5.csv')\n", "result\n", "pd.isnull(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n", "pd.read_csv('ch06/ex5.csv', na_values=sentinels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading text files in pieces" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "result = pd.read_csv('ch06/ex6.csv')\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_csv('ch06/ex6.csv', nrows=5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", "chunker" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", "\n", "tot = Series([])\n", "for piece in chunker:\n", " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", "\n", "tot = tot.order(ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tot[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Writing data out to text format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = pd.read_csv('ch06/ex5.csv')\n", "data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv('ch06/out.csv')\n", "!cat ch06/out.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv(sys.stdout, sep='|')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv(sys.stdout, na_rep='NULL')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, header=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dates = pd.date_range('1/1/2000', periods=7)\n", "ts = Series(np.arange(7), index=dates)\n", "ts.to_csv('ch06/tseries.csv')\n", "!cat ch06/tseries.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "Series.from_csv('ch06/tseries.csv', parse_dates=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Manually working with delimited formats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!cat ch06/ex7.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import csv\n", "f = open('ch06/ex7.csv')\n", "\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for line in reader:\n", " print(line)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lines = list(csv.reader(open('ch06/ex7.csv')))\n", "header, values = lines[0], lines[1:]\n", "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", "data_dict" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "class my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = ';'\n", " quotechar = '\"'\n", " quoting = csv.QUOTE_MINIMAL" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "with open('mydata.csv', 'w') as f:\n", " writer = csv.writer(f, dialect=my_dialect)\n", " writer.writerow(('one', 'two', 'three'))\n", " writer.writerow(('1', '2', '3'))\n", " writer.writerow(('4', '5', '6'))\n", " writer.writerow(('7', '8', '9'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%cat mydata.csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### JSON data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "obj = \"\"\"\n", "{\"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", " {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n", "}\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import json\n", "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "asjson = json.dumps(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "siblings = DataFrame(result['siblings'], columns=['name', 'age'])\n", "siblings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### XML and HTML, Web scraping\n", "\n", "**NB. The Yahoo! Finance API has changed and this example no longer works**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from lxml.html import parse\n", "from urllib2 import urlopen\n", "\n", "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", "\n", "doc = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "links = doc.findall('.//a')\n", "links[15:20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lnk = links[28]\n", "lnk\n", "lnk.get('href')\n", "lnk.text_content()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n", "urls[-10:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tables = doc.findall('.//table')\n", "calls = tables[9]\n", "puts = tables[13]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "rows = calls.findall('.//tr')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def _unpack(row, kind='td'):\n", " elts = row.findall('.//%s' % kind)\n", " return [val.text_content() for val in elts]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "_unpack(rows[0], kind='th')\n", "_unpack(rows[1], kind='td')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from pandas.io.parsers import TextParser\n", "\n", "def parse_options_data(table):\n", " rows = table.findall('.//tr')\n", " header = _unpack(rows[0], kind='th')\n", " data = [_unpack(r) for r in rows[1:]]\n", " return TextParser(data, names=header).get_chunk()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "call_data = parse_options_data(calls)\n", "put_data = parse_options_data(puts)\n", "call_data[:10]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Parsing XML with lxml.objectify" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%cd ch06/mta_perf/Performance_XML_Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!head -21 Performance_MNR.xml" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from lxml import objectify\n", "\n", "path = 'Performance_MNR.xml'\n", "parsed = objectify.parse(open(path))\n", "root = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = []\n", "\n", "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", "\n", "for elt in root.INDICATOR:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "perf = DataFrame(data)\n", "perf" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "root" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "root.get('href')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "root.text" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binary data formats" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cd ../.." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "frame = pd.read_csv('ch06/ex1.csv')\n", "frame\n", "frame.to_pickle('ch06/frame_pickle')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.read_pickle('ch06/frame_pickle')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using HDF5 format" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "store = pd.HDFStore('mydata.h5')\n", "store['obj1'] = frame\n", "store['obj1_col'] = frame['a']\n", "store" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "store['obj1']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "store.close()\n", "os.remove('mydata.h5')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Interacting with HTML and Web APIs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import requests\n", "url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'\n", "resp = requests.get(url)\n", "resp" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "issue_labels = DataFrame(data)\n", "issue_labels" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interacting with databases" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import sqlite3\n", "\n", "query = \"\"\"\n", "CREATE TABLE test\n", "(a VARCHAR(20), b VARCHAR(20),\n", " c REAL, d INTEGER\n", ");\"\"\"\n", "\n", "con = sqlite3.connect(':memory:')\n", "con.execute(query)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = [('Atlanta', 'Georgia', 1.25, 6),\n", " ('Tallahassee', 'Florida', 2.6, 3),\n", " ('Sacramento', 'California', 1.7, 5)]\n", "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", "\n", "con.executemany(stmt, data)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cursor = con.execute('select * from test')\n", "rows = cursor.fetchall()\n", "rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cursor.description" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "DataFrame(rows, columns=zip(*cursor.description)[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas.io.sql as sql\n", "sql.read_sql('select * from test', con)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }