{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Presentation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Naive" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Data and code behind the stories and interactives at FiveThirtyEight'" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from urllib.request import urlopen\n", "from json import loads\n", "\n", "BASE = 'https://api.github.com/search'\n", "_url1 = '{}/repositories?q={}'\n", "q = 'data&per_page=100'\n", "url1 = _url1.format(BASE, q)\n", "\n", "# The whole body is read eagerly here, so close the response right away\n", "with urlopen(url1) as f:\n", "    data = loads(f.read().decode('utf-8'))\n", "\n", "repos = data['items']\n", "repos[0]['description']" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fivethirtyeight/data'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repos[0]['full_name']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Processing data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def rate(repos):\n", "    \"\"\"Return a list holding twice the watcher count of every repo.\n", "\n", "    The list is built eagerly: all of `repos` is consumed before returning,\n", "    so an endless iterable never produces a result.\n", "    \"\"\"\n", "    return [repo['watchers'] * 2 for repo in repos]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[11142, 5556, 396, 438, 128]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rate(repos)[:5]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Infinite data\n", "from itertools import count\n", "\n", "inf_repos = ({'watchers': c} for c in count())\n", "\n", "# Don't actually run the below code since it will hang forever\n", "# rate(inf_repos)" ] }, { 
"cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Expensive data\n", "from time import sleep\n", "\n", "def exp_rate(repos):\n", " rated = []\n", "\n", " for repo in repos:\n", " sleep(1)\n", " rated.append(repo['watchers'] * 2)\n", "\n", " return rated" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[11142, 5556, 396, 438, 128]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exp_rate(repos)[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lazy evaluation" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[0, 1, 2, 3, 4]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "eager_list = list(range(5))\n", "eager_list" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lazy_list = iter(eager_list)\n", "lazy_list" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(lazy_list)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "[1, 2, 3, 4]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(lazy_list)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "ename": "StopIteration", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in 
\u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlazy_list\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mStopIteration\u001b[0m: " ] } ], "source": [ "next(lazy_list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading data" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from ijson import items\n", "\n", "f = urlopen(url1)\n", "repos = items(f, 'items.item')\n", "repos" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fivethirtyeight/data'" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repo = next(repos)\n", "repo['full_name']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Processing data" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def gen_rates(repos):\n", "    \"\"\"Lazily yield twice the watcher count of each repo.\n", "\n", "    Nothing is pulled from `repos` until the caller asks for a value.\n", "    \"\"\"\n", "    yield from (repo['watchers'] * 2 for repo in repos)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gen_rates(repos)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5556" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rates = gen_rates(repos)\n", "next(rates)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "396" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(rates)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, 
"outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Infinite data\n", "rates = gen_rates(inf_repos)\n", "next(rates)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Expensive data\n", "def gen_exp_rates(repos):\n", " for repo in repos:\n", " sleep(1)\n", " yield repo['watchers'] * 2" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[438, 128, 684, 348, 1356]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from itertools import islice\n", "\n", "rates = gen_exp_rates(repos)\n", "result = islice(rates, 5)\n", "list(result)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "648" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(rates)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping data" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['id', 'name', 'full_name', 'owner', 'private', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 
'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'forks_count', 'mirror_url', 'open_issues_count', 'forks', 'open_issues', 'watchers', 'default_branch', 'score'])" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "f = urlopen(url1)\n", "repos = items(f, 'items.item')\n", "repo = next(repos)\n", "repo.keys()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repo['has_issues']" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(False, 3)" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import itertools as it\n", "from operator import itemgetter\n", "\n", "keyfunc = itemgetter('has_issues')\n", "\n", "# `repo` was already pulled off the stream to inspect its keys; chain it\n", "# back in so the group counts cover every repo, not all but the first\n", "all_repos = it.chain([repo], repos)\n", "\n", "# groupby only merges adjacent items, hence the sort on the same key\n", "sorted_repos = sorted(all_repos, key=keyfunc)\n", "grouped = it.groupby(sorted_repos, keyfunc)\n", "group_sizes = ((key, sum(1 for _ in group)) for key, group in grouped)\n", "next(group_sizes)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(True, 96)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(group_sizes)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Memoization" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Processing data" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def calc_rate(watchers):\n", "    \"\"\"Return twice `watchers` after a one-second pause (simulated expensive work).\"\"\"\n", "    sleep(1)\n", "    return watchers * 2\n", "\n", "def gen_exp_rates(repos):\n", "    \"\"\"Lazily yield `calc_rate` of each repo's watcher count.\"\"\"\n", "    yield from (calc_rate(repo['watchers']) for repo in repos)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[10, 10, 10, 10, 10]" ] }, "execution_count": 45, "metadata": {}, 
"output_type": "execute_result" } ], "source": [ "repos = it.repeat({'watchers': 5})\n", "rates = gen_exp_rates(repos)\n", "result = islice(rates, 5)\n", "list(result)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from functools import lru_cache\n", "\n", "def _calc_rate(watchers):\n", " sleep(1)\n", " return watchers * 2\n", "\n", "cacher = lru_cache()\n", "calc_rate = cacher(_calc_rate)\n", "\n", "def gen_exp_rates(repos):\n", " for repo in repos:\n", " yield calc_rate(repo['watchers'])" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[10, 10, 10, 10, 10]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repos = it.repeat({'watchers': 5})\n", "rates = gen_exp_rates(repos)\n", "result = islice(rates, 5)\n", "list(result)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": true }, "outputs": [], "source": [ "@lru_cache()\n", "def calc_rate(watchers):\n", " sleep(1)\n", " return watchers * 2\n", "\n", "def gen_exp_rates(repos):\n", " for repo in repos:\n", " yield calc_rate(repo['watchers'])" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[10, 10, 10, 10, 10]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repos = it.repeat({'watchers': 5})\n", "rates = gen_exp_rates(repos)\n", "result = islice(rates, 5)\n", "list(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Introducing meza" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading data" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fivethirtyeight/data'" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from urllib.request import urlopen\n", "from meza.io 
import read_json\n", "\n", "url2 = BASE + '/repositories?q=data'\n", "f = urlopen(url2)\n", "\n", "# Stream the search hits one record at a time\n", "records = read_json(f, path='items.item')\n", "repo = next(records)\n", "repo['full_name']" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "29" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(list(records))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'greeting': 'hello', 'location': 'world'}" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from io import StringIO\n", "from meza.io import read_csv\n", "\n", "# An in-memory file is enough to show the CSV reader\n", "f = StringIO('greeting,location\\nhello,world\\n')\n", "csv_records = read_csv(f)\n", "next(csv_records)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'fivethirtyeight/data'" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from os import path as p\n", "from meza.io import join\n", "\n", "url3 = url2 + '&page=2'\n", "\n", "# One response per result page, read as a single stream of records\n", "files = (urlopen(url) for url in (url2, url3))\n", "records = join(*files, ext='json', path='items.item')\n", "repo = next(records)\n", "repo['full_name']" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Jupyter Notebook'" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "repo['language']" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "59" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(list(records))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Transforming data" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a': 200, 'b': 300, 'c': 400}" ] }, 
"execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from meza.process import merge\n", "\n", "records = [{'a': 200}, {'b': 300}, {'c': 400}]\n", "merge(records)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a'" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from meza.process import group\n", "\n", "records = [\n", " {'item': 'a', 'amount': 200},\n", " {'item': 'a', 'amount': 200},\n", " {'item': 'b', 'amount': 400}]\n", "\n", "grouped = group(records, 'item')\n", "key, _group = next(grouped)\n", "key" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'a'}]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "_group" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ ". 
at 0x11020ae08>" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from meza import process as pr\n", "\n", "f = urlopen(url2)\n", "\n", "# Split the stream so that previewing `cut` does not eat records from `raw`,\n", "# which the pivot further down still needs in full\n", "raw, raw_for_cut = it.tee(read_json(f, path='items.item'))\n", "fields = ['full_name', 'language', 'watchers', 'score', 'has_wiki']\n", "cut = pr.cut(raw_for_cut, fields)\n", "cut" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cut, preview = pr.peek(cut)\n", "cut" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(preview)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'full_name': 'fivethirtyeight/data',\n", " 'has_wiki': True,\n", " 'language': 'Jupyter Notebook',\n", " 'score': Decimal('120.396454'),\n", " 'watchers': 5572}" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preview[0]" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'HTML': Decimal('73.19426'),\n", " 'JavaScript': Decimal('54.46375'),\n", " 'Python': Decimal('50.188396'),\n", " 'has_wiki': False}" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# `raw` is its own copy of the stream, so the previewed records are included here\n", "filled = pr.fillempty(raw, value='', fields=['language'])\n", "pivoted = pr.pivot(filled, 'score', 'language', rows=['has_wiki'], op=min)\n", "next(pivoted)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'': Decimal('44.635494'),\n", " 'C#': Decimal('47.918125'),\n", " 'HTML': Decimal('68.96914'),\n", " 'JavaScript': Decimal('44.16988'),\n", " 'PHP': Decimal('44.0172'),\n", " 'Python': Decimal('44.73296'),\n", " 'R': 
Decimal('45.959583'),\n", " 'has_wiki': True}" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "next(pivoted)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 1 }