{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc(\"figure\", figsize=(10, 6))\n", "pd.options.display.max_colwidth = 75\n", "pd.options.display.max_columns = 20\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex1.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"examples/ex1.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex2.csv" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "pd.read_csv(\"examples/ex2.csv\", header=None)\n", "pd.read_csv(\"examples/ex2.csv\", names=[\"a\", \"b\", \"c\", \"d\", \"message\"])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "names = [\"a\", \"b\", \"c\", \"d\", \"message\"]\n", "pd.read_csv(\"examples/ex2.csv\", names=names, index_col=\"message\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "!cat examples/csv_mindex.csv\n", "parsed = pd.read_csv(\"examples/csv_mindex.csv\",\n", " index_col=[\"key1\", \"key2\"])\n", "parsed" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex3.txt" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "result = pd.read_csv(\"examples/ex3.txt\", sep=\"\\s+\")\n", "result" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex4.csv\n", "pd.read_csv(\"examples/ex4.csv\", skiprows=[0, 2, 3])" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex5.csv\n", "result = pd.read_csv(\"examples/ex5.csv\")\n", "result" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "pd.isna(result)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "result = pd.read_csv(\"examples/ex5.csv\", na_values=[\"NULL\"])\n", "result" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "result2 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False)\n", "result2\n", "result2.isna()\n", "result3 = pd.read_csv(\"examples/ex5.csv\", keep_default_na=False,\n", " na_values=[\"NA\"])\n", "result3\n", "result3.isna()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "sentinels = {\"message\": [\"foo\", \"NA\"], \"something\": [\"two\"]}\n", "pd.read_csv(\"examples/ex5.csv\", na_values=sentinels,\n", " keep_default_na=False)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "pd.options.display.max_rows = 10" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "result = pd.read_csv(\"examples/ex6.csv\")\n", "result" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "pd.read_csv(\"examples/ex6.csv\", nrows=5)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n", "type(chunker)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "chunker = pd.read_csv(\"examples/ex6.csv\", chunksize=1000)\n", "\n", "tot = pd.Series([], dtype='int64')\n", "for piece in chunker:\n", " tot = tot.add(piece[\"key\"].value_counts(), fill_value=0)\n", "\n", "tot = tot.sort_values(ascending=False)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "tot[:10]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv(\"examples/ex5.csv\")\n", "data" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "data.to_csv(\"examples/out.csv\")\n", "!cat examples/out.csv" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "import sys\n", "data.to_csv(sys.stdout, sep=\"|\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "data.to_csv(sys.stdout, na_rep=\"NULL\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, header=False)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "data.to_csv(sys.stdout, index=False, columns=[\"a\", \"b\", \"c\"])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "!cat examples/ex7.csv" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "import csv\n", "f = open(\"examples/ex7.csv\")\n", "reader = csv.reader(f)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "for line in reader:\n", " print(line)\n", "f.close()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "with open(\"examples/ex7.csv\") as f:\n", " lines = list(csv.reader(f))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "header, values = lines[0], lines[1:]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", "data_dict" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "obj = \"\"\"\n", "{\"name\": \"Wes\",\n", " \"cities_lived\": [\"Akron\", \"Nashville\", \"New York\", \"San Francisco\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 34, \"hobbies\": [\"guitars\", \"soccer\"]},\n", " {\"name\": \"Katie\", \"age\": 42, \"hobbies\": [\"diving\", \"art\"]}]\n", "}\n", "\"\"\"" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "import json\n", "result = json.loads(obj)\n", "result" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "asjson = json.dumps(result)\n", "asjson" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "siblings = pd.DataFrame(result[\"siblings\"], columns=[\"name\", \"age\"])\n", "siblings" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "!cat examples/example.json" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "data = pd.read_json(\"examples/example.json\")\n", "data" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "data.to_json(sys.stdout)\n", "data.to_json(sys.stdout, orient=\"records\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "tables = pd.read_html(\"examples/fdic_failed_bank_list.html\")\n", "len(tables)\n", "failures = tables[0]\n", "failures.head()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "close_timestamps = pd.to_datetime(failures[\"Closing Date\"])\n", "close_timestamps.dt.year.value_counts()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "from lxml import objectify\n", "\n", "path = \"datasets/mta_perf/Performance_MNR.xml\"\n", "with open(path) as f:\n", " parsed = objectify.parse(f)\n", "root = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "data = []\n", "\n", "skip_fields = [\"PARENT_SEQ\", \"INDICATOR_SEQ\",\n", " \"DESIRED_CHANGE\", \"DECIMAL_PLACES\"]\n", "\n", "for elt in root.INDICATOR:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "perf = pd.DataFrame(data)\n", "perf.head()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "perf2 = pd.read_xml(path)\n", "perf2.head()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "frame = pd.read_csv(\"examples/ex1.csv\")\n", "frame\n", "frame.to_pickle(\"examples/frame_pickle\")" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "pd.read_pickle(\"examples/frame_pickle\")" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "!rm examples/frame_pickle" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "fec = pd.read_parquet('datasets/fec/fec.parquet')" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "xlsx = pd.ExcelFile(\"examples/ex1.xlsx\")" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "xlsx.sheet_names" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "xlsx.parse(sheet_name=\"Sheet1\")" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [], "source": [ "xlsx.parse(sheet_name=\"Sheet1\", index_col=0)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "frame = pd.read_excel(\"examples/ex1.xlsx\", sheet_name=\"Sheet1\")\n", "frame" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "writer = pd.ExcelWriter(\"examples/ex2.xlsx\")\n", "frame.to_excel(writer, \"Sheet1\")\n", "writer.close()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "frame.to_excel(\"examples/ex2.xlsx\")" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "!rm examples/ex2.xlsx" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "!rm -f examples/mydata.h5" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "frame = pd.DataFrame({\"a\": np.random.standard_normal(100)})\n", "store = pd.HDFStore(\"examples/mydata.h5\")\n", "store[\"obj1\"] = frame\n", "store[\"obj1_col\"] = frame[\"a\"]\n", "store" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "store[\"obj1\"]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "store.put(\"obj2\", frame, format=\"table\")\n", "store.select(\"obj2\", where=[\"index >= 10 and index <= 15\"])\n", "store.close()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "frame.to_hdf(\"examples/mydata.h5\", \"obj3\", format=\"table\")\n", "pd.read_hdf(\"examples/mydata.h5\", \"obj3\", where=[\"index < 5\"])" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.remove(\"examples/mydata.h5\")" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "import requests\n", "url = \"https://api.github.com/repos/pandas-dev/pandas/issues\"\n", "resp = requests.get(url)\n", "resp.raise_for_status()\n", "resp" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "data = resp.json()\n", "data[0][\"title\"]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "issues = pd.DataFrame(data, columns=[\"number\", \"title\",\n", " \"labels\", \"state\"])\n", "issues" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "\n", "query = \"\"\"\n", "CREATE TABLE test\n", "(a VARCHAR(20), b VARCHAR(20),\n", " c REAL, d INTEGER\n", ");\"\"\"\n", "\n", "con = sqlite3.connect(\"mydata.sqlite\")\n", "con.execute(query)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "data = [(\"Atlanta\", \"Georgia\", 1.25, 6),\n", " (\"Tallahassee\", \"Florida\", 2.6, 3),\n", " (\"Sacramento\", \"California\", 1.7, 5)]\n", "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", "\n", "con.executemany(stmt, data)\n", "con.commit()" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "cursor = con.execute(\"SELECT * FROM test\")\n", "rows = cursor.fetchall()\n", "rows" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "cursor.description\n", "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "import sqlalchemy as sqla\n", "db = sqla.create_engine(\"sqlite:///mydata.sqlite\")\n", "pd.read_sql(\"SELECT * FROM test\", db)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "!rm mydata.sqlite" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }