{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import itertools as it\n", "import sys\n", "import xlrd\n", "\n", "from os import path as p, chdir\n", "from io import StringIO\n", "from array import array\n", "\n", "if 'examples' in p.abspath('.'):\n", " chdir('..')\n", " \n", "from meza import io, process as pr, convert as cv, fntools as ft, stats" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load data, type cast, and write to a CSV file" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col1': 'hello', 'col2': '5/4/82', 'col3': '1'}" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a csv file\n", "text = (\n", " 'col1,col2,col3\\n'\n", " 'hello,5/4/82,1\\n'\n", " 'one,1/1/15,2\\n'\n", " 'happy,7/4/92,3\\n')\n", "\n", "f = StringIO(text)\n", "f.seek(0)\n", "\n", "# Load the file\n", "records = io.read_csv(f)\n", "\n", "# Records are an iterator over the rows\n", "row = next(records)\n", "row" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'id': 'col1', 'type': 'text'},\n", " {'id': 'col2', 'type': 'date'},\n", " {'id': 'col3', 'type': 'int'}]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Replace first row so as not to loose any data\n", "records = pr.prepend(records, row)\n", "\n", "# Guess column types. Note: `detect_types` returns a new `records`\n", "# generator since it consumes rows during type detection\n", "records, result = pr.detect_types(records)\n", "result['types']" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col1': 'hello', 'col2': datetime.date(1982, 5, 4), 'col3': 1}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now type cast the records. Note: most `meza.process` functions return\n", "# generators, so lets wrap the result in a list to view the data\n", "casted = list(pr.type_cast(records, **result))\n", "casted[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### now run some operations on the type casted data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ ". at 0x103d0adb0>" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Cut out the first column of data\n", "cut_recs = pr.cut(casted, ['col1'], exclude=True)\n", "cut_recs" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col2': datetime.date(2015, 1, 1), 'col3': 3}" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# merge the cut rows to and get the max value of the remaining columns. \n", "# Note: since `merge` (by definition) will always contain just one row,\n", "# it is returned as is (not wrapped in a generator)\n", "merged = pr.merge(cut_recs, pred=bool, op=max)\n", "merged" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'col2,col3\\r\\n2015-01-01,3\\r\\n'" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now write merged data back to a new csv file.\n", "f = StringIO()\n", "io.write(f, cv.records2csv([merged]))\n", "f.getvalue()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "meza can read both filepaths and file-like objects. Additionally, all readers return equivalent records iterators, i.e., a generator of dictionaries with keys corresponding to the column names. Please see [Readers](https://github.com/reubano/meza#readers) for a complete list of available readers and recognized file types." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col': 'hello', 'col_2': 'world'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read a file like object and de-duplicate the header\n", "f = StringIO('col,col\\nhello,world\\n')\n", "records = io.read_csv(f, dedupe=True)\n", "\n", "# View the first row\n", "next(records) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Processing data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Numerical analysis (à la [pandas](http://pandas.pydata.org/pandas-docs/stable/10min.html#min))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following example, pandas equivalent methods are preceded by `-->`." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create some data in the same structure as what the various `read...`\n", "# functions output\n", "header = ['A', 'B', 'C', 'D']\n", "data = [\n", " [0.5607, 0.9338, 0.4769, 0.7804],\n", " [0.8227, 0.2844, 0.8166, 0.7323],\n", " [0.4627, 0.8633, 0.3283, 0.1909],\n", " [0.3932, 0.5454, 0.9604, 0.6376],\n", " [0.3685, 0.9166, 0.9457, 0.8066],\n", " [0.7584, 0.6981, 0.5625, 0.3578],\n", " [0.8959, 0.6932, 0.2565, 0.3378]]\n", "\n", "df = [dict(zip(header, d)) for d in data]\n", "df[0]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sort records by the value of column `B` --> df.sort_values(by='B')\n", "next(pr.sort(df, 'B')) " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'A': 0.5607}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select column `A` --> df['A']\n", "next(pr.cut(df, ['A'])) " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804},\n", " {'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323},\n", " {'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select the first 3 rows of data --> df[0:3]\n", "list(it.islice(df, 3))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select all data whose value for column `A` is less than 0.5 --> df[df.A < 0.5]\n", "next(pr.tfilter(df, 'A', lambda x: x < 0.5)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: since `aggregate` and `merge` (by definition) return just one row,\n", "they return them as is (not wrapped in a generator)." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0.6088714285714286" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calculate the mean of column `A` across all data --> df.mean()['A']\n", "pr.aggregate(df, 'A', stats.mean)['A']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'A': 4.2621, 'B': 4.9348, 'C': 4.3469, 'D': 3.8434}" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calculate the sum of each column across all data --> df.sum()\n", "pr.merge(df, pred=bool, op=sum) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Text processing (à la [csvkit](https://csvkit.readthedocs.org/en/0.9.1/cli.html#processing)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following example, csvkit equivalent commands are preceded by `-->`." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First create a few simple csv files\n", "f1 = StringIO('col_1,col_2,col_3\\n1,dill,male\\n2,bob,male\\n3,jane,female\\n')\n", "f2 = StringIO('col_1,col_2,col_3\\n4,tom,male\\n5,dick,male\\n6,jill,female\\n')\n", "f1.seek(0)\n", "f2.seek(0) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can read the files, manipulate the data, convert the manipulated data to json, and write the json back to a new file. Also, note that since all readers return equivalent records iterators, you can use them interchangeably (in place of read_csv) to open any supported file. E.g., read_xls, read_sqlite, etc." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col_1': '1', 'col_2': 'dill', 'col_3': 'male'}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First, let's combine the files into one iterator --> csvstack file1.csv file2.csv\n", "records = io.join(f1, f2, ext='csv')\n", "next(records)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col_1': '6', 'col_2': 'jill', 'col_3': 'female'}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(it.islice(records, 4, None)) " ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Now let's create a persistant records list so we can do some simple manipulations\n", "f1.seek(0)\n", "records = list(io.read_csv(f1))" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col_1': '2', 'col_2': 'bob', 'col_3': 'male'}" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sort records by the value of column `col_2` --> csvsort -c col_2 file1.csv\n", "next(pr.sort(records, 'col_2'))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col_2': 'dill'}" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select column `col_2` --> csvcut -c col_2 file1.csv\n", "next(pr.cut(records, ['col_2'])) " ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col_1': '3', 'col_2': 'jane', 'col_3': 'female'}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select all data whose value for column `col_2` contains `jan`\n", "# --> csvgrep -c col_2 -m jan file1.csv\n", "next(pr.grep(records, [{'pattern': 'jan'}], ['col_2']))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'[{\"col_2\": \"dill\", \"col_3\": \"male\", \"col_1\": \"1\"}, {\"col_2\": \"bob\", \"col_3\": \"male\", \"col_1\": \"2\"}, {\"col_2\": \"jane\", \"col_3\": \"female\", \"col_1\": \"3\"}]'" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert a csv file to json --> csvjson -i 4 file1.csv\n", "f_json = StringIO()\n", "io.write(f_json, cv.records2json(records))\n", "f_json.getvalue()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Geo processing (à la [mapbox](https://github.com/mapbox?utf8=%E2%9C%93&query=geojson)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following example, mapbox equivalent commands are preceded by `-->`." ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First create a geojson file\n", "f = StringIO(\n", " '{\"type\": \"FeatureCollection\",\"features\": ['\n", " '{\"type\": \"Feature\", \"id\": 11, \"geometry\": '\n", " '{\"type\": \"Point\", \"coordinates\": [10, 20]}},'\n", " '{\"type\": \"Feature\", \"id\": 12, \"geometry\": '\n", " '{\"type\": \"Point\", \"coordinates\": [5, 15]}}]}')\n", "\n", "f.seek(0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can open the file, split the data by id, and finally convert the split data to a new geojson file-like object." ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'id': 11, 'lat': 20, 'lon': 10, 'type': 'Point'}" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the geojson file and peek at the results\n", "records, peek = pr.peek(io.read_geojson(f))\n", "peek[0] " ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "11" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Split the records by feature ``id`` and select the first feature --> geojsplit -k id file.geojson\n", "splits = pr.split(records, 'id')\n", "feature_records, name = next(splits)\n", "name" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'{\"type\": \"FeatureCollection\", \"bbox\": [10, 20, 10, 20], \"features\": [{\"type\": \"Feature\", \"id\": 11, \"geometry\": {\"type\": \"Point\", \"coordinates\": [10, 20]}, \"properties\": {\"id\": 11}}], \"crs\": {\"type\": \"name\", \"properties\": {\"name\": \"urn:ogc:def:crs:OGC:1.3:CRS84\"}}}'" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert the feature records into a GeoJSON file-like object\n", "geojson = cv.records2geojson(feature_records)\n", "geojson.readline()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: you can also write back to a file as shown previously `io.write('file.geojson', geojson)`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Writing Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "meza can persist records to disk via the following functions:\n", "\n", " - meza.convert.records2csv\n", " - meza.convert.records2json\n", " - meza.convert.records2geojson\n", "\n", "Each function returns a file-like object that you can write to disk via `meza.io.write('/path/to/file', result)`." ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'col1': 'hello', 'col2': 'world'}" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First let's create a simple tsv file like object\n", "f = StringIO('col1\\tcol2\\nhello\\tworld\\n')\n", "f.seek(0)\n", "\n", "# Next create a records list so we can reuse it\n", "records = list(io.read_tsv(f))\n", "records[0] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we're ready to write the records data to file" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'col1,col2'" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a csv file like object\n", "f_out = cv.records2csv(records)\n", "f_out.readline().strip()" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'[{\"col1\": \"hello\", \"col2\": \"world\"}]'" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a json file like object\n", "f_out = cv.records2json(records)\n", "f_out.readline()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Interoperability" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "meza plays nicely with native and NumPy arrays out of the box" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[{'id': 'b', 'type': 'int'},\n", " {'id': 'c', 'type': 'float'},\n", " {'id': 'a', 'type': 'text'}]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First create some records and types. Also, convert the records to a list\n", "# so we can reuse them.\n", "records = [{'a': 'one', 'b': 2}, {'a': 'five', 'b': 10, 'c': 20.1}]\n", "records, result = pr.detect_types(records)\n", "records, types = list(records), result['types']\n", "types" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[array('u', 'b'), array('u', 'c'), array('u', 'a')],\n", " array('i', [2, 10]),\n", " array('f', [0.0, 20.100000381469727]),\n", " [array('u', 'one'), array('u', 'five')]]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert records to a list of arrays\n", "narray = cv.records2array(records, types, native=True)\n", "narray" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'a': 'one', 'b': 2, 'c': 0.0}" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert list of arrays back to records\n", "next(cv.array2records(narray, native=True)) " ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[array('i', [1, 4]), array('i', [2, 5]), array('i', [3, 6])]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert a 2-D array to records\n", "arr = [(1, 4), (2, 5), (3, 6)]\n", "data = [array('i', a) for a in arr]\n", "data" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'column_1': 1, 'column_2': 2, 'column_3': 3}" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(cv.array2records(data, native=True)) " ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }