{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Data Loading, Storage, "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"np.random.seed(12345)\n",
"import matplotlib.pyplot as plt\n",
"plt.rc('figure', figsize=(10, 6))\n",
"np.set_printoptions(precision=4, suppress=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Reading and Writing Data in Text Format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/ex1.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"df = pd.read_csv('examples/ex1.csv')\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.read_table('examples/ex1.csv', sep=',')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/ex2.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.read_csv('examples/ex2.csv', header=None)\n",
"pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"names = ['a', 'b', 'c', 'd', 'message']\n",
"pd.read_csv('examples/ex2.csv', names=names, index_col='message')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/csv_mindex.csv\n",
"parsed = pd.read_csv('examples/csv_mindex.csv',\n",
" index_col=['key1', 'key2'])\n",
"parsed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"list(open('examples/ex3.txt'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"result = pd.read_table('examples/ex3.txt', sep='\\s+')\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/ex4.csv\n",
"pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/ex5.csv\n",
"result = pd.read_csv('examples/ex5.csv')\n",
"result\n",
"pd.isnull(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n",
"pd.read_csv('examples/ex5.csv', na_values=sentinels)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Reading Text Files in Pieces"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.options.display.max_rows = 10"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"result = pd.read_csv('examples/ex6.csv')\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.read_csv('examples/ex6.csv', nrows=5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n",
"chunker"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n",
"\n",
"tot = pd.Series([])\n",
"for piece in chunker:\n",
" tot = tot.add(piece['key'].value_counts(), fill_value=0)\n",
"\n",
"tot = tot.sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"tot[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Writing Data to Text Format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = pd.read_csv('examples/ex5.csv')\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data.to_csv('examples/out.csv')\n",
"!cat examples/out.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sys\n",
"data.to_csv(sys.stdout, sep='|')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, na_rep='NULL')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, index=False, header=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"dates = pd.date_range('1/1/2000', periods=7)\n",
"ts = pd.Series(np.arange(7), index=dates)\n",
"ts.to_csv('examples/tseries.csv')\n",
"!cat examples/tseries.csv"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Working with Delimited Formats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/ex7.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import csv\n",
"f = open('examples/ex7.csv')\n",
"\n",
"reader = csv.reader(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"for line in reader:\n",
" print(line)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"with open('examples/ex7.csv') as f:\n",
" lines = list(csv.reader(f))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"header, values = lines[0], lines[1:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
"data_dict"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"class my_dialect(csv.Dialect):\n",
" lineterminator = '\\n'\n",
" delimiter = ';'\n",
" quotechar = '\"'\n",
" quoting = csv.QUOTE_MINIMAL"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"reader = csv.reader(f, dialect=my_dialect)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"reader = csv.reader(f, delimiter='|')"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"with open('mydata.csv', 'w') as f:\n",
" writer = csv.writer(f, dialect=my_dialect)\n",
" writer.writerow(('one', 'two', 'three'))\n",
" writer.writerow(('1', '2', '3'))\n",
" writer.writerow(('4', '5', '6'))\n",
" writer.writerow(('7', '8', '9'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### JSON Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"obj = \"\"\"\n",
"{\"name\": \"Wes\",\n",
" \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
" \"pet\": null,\n",
" \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n",
" {\"name\": \"Katie\", \"age\": 38,\n",
" \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n",
"}\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import json\n",
"result = json.loads(obj)\n",
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"asjson = json.dumps(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])\n",
"siblings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!cat examples/example.json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = pd.read_json('examples/example.json')\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(data.to_json())\n",
"print(data.to_json(orient='records'))"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### XML and HTML: Web Scraping"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"conda install lxml\n",
"pip install beautifulsoup4 html5lib"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"tables = pd.read_html('examples/fdic_failed_bank_list.html')\n",
"len(tables)\n",
"failures = tables[0]\n",
"failures.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"close_timestamps = pd.to_datetime(failures['Closing Date'])\n",
"close_timestamps.dt.year.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"#### Parsing XML with lxml.objectify"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"\n",
" 373889\n",
" \n",
" Metro-North Railroad\n",
" Escalator Availability\n",
" Percent of the time that escalators are operational\n",
" systemwide. The availability rate is based on physical observations performed\n",
" the morning of regular business days only. This is a new indicator the agency\n",
" began reporting in 2009.\n",
" 2011\n",
" 12\n",
" Service Indicators\n",
" M\n",
" U\n",
" %\n",
" 1\n",
" 97.00\n",
" \n",
" 97.00\n",
" \n",
""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from lxml import objectify\n",
"\n",
"path = 'datasets/mta_perf/Performance_MNR.xml'\n",
"parsed = objectify.parse(open(path))\n",
"root = parsed.getroot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = []\n",
"\n",
"skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n",
" 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n",
"\n",
"for elt in root.INDICATOR:\n",
" el_data = {}\n",
" for child in elt.getchildren():\n",
" if child.tag in skip_fields:\n",
" continue\n",
" el_data[child.tag] = child.pyval\n",
" data.append(el_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"perf = pd.DataFrame(data)\n",
"perf.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from io import StringIO\n",
"tag = 'Google'\n",
"root = objectify.parse(StringIO(tag)).getroot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"root\n",
"root.get('href')\n",
"root.text"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Binary Data Formats"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"frame = pd.read_csv('examples/ex1.csv')\n",
"frame\n",
"frame.to_pickle('examples/frame_pickle')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.read_pickle('examples/frame_pickle')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!rm examples/frame_pickle"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Using HDF5 Format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"frame = pd.DataFrame({'a': np.random.randn(100)})\n",
"store = pd.HDFStore('mydata.h5')\n",
"store['obj1'] = frame\n",
"store['obj1_col'] = frame['a']\n",
"store"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"store['obj1']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"store.put('obj2', frame, format='table')\n",
"store.select('obj2', where=['index >= 10 and index <= 15'])\n",
"store.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"frame.to_hdf('mydata.h5', 'obj3', format='table')\n",
"pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"os.remove('mydata.h5')"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Reading Microsoft Excel Files"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"xlsx = pd.ExcelFile('examples/ex1.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"pd.read_excel(xlsx, 'Sheet1')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')\n",
"frame"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"writer = pd.ExcelWriter('examples/ex2.xlsx')\n",
"frame.to_excel(writer, 'Sheet1')\n",
"writer.save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"frame.to_excel('examples/ex2.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!rm examples/ex2.xlsx"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Interacting with Web APIs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import requests\n",
"url = 'https://api.github.com/repos/pandas-dev/pandas/issues'\n",
"resp = requests.get(url)\n",
"resp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = resp.json()\n",
"data[0]['title']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"issues = pd.DataFrame(data, columns=['number', 'title',\n",
" 'labels', 'state'])\n",
"issues"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Interacting with Databases"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sqlite3\n",
"query = \"\"\"\n",
"CREATE TABLE test\n",
"(a VARCHAR(20), b VARCHAR(20),\n",
" c REAL, d INTEGER\n",
");\"\"\"\n",
"con = sqlite3.connect('mydata.sqlite')\n",
"con.execute(query)\n",
"con.commit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"data = [('Atlanta', 'Georgia', 1.25, 6),\n",
" ('Tallahassee', 'Florida', 2.6, 3),\n",
" ('Sacramento', 'California', 1.7, 5)]\n",
"stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n",
"con.executemany(stmt, data)\n",
"con.commit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"cursor = con.execute('select * from test')\n",
"rows = cursor.fetchall()\n",
"rows"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"cursor.description\n",
"pd.DataFrame(rows, columns=[x[0] for x in cursor.description])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import sqlalchemy as sqla\n",
"db = sqla.create_engine('sqlite:///mydata.sqlite')\n",
"pd.read_sql('select * from test', db)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"!rm mydata.sqlite"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Conclusion"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}