# Presentation

## Naive

### Reading data

In [5]:
from urllib.request import urlopen
from json import loads 

BASE = 'https://api.github.com/search'
_url1 = '{}/repositories?q={}'
q = 'data&per_page=100'
url1 = _url1.format(BASE, q)
f = urlopen(url1)
data = loads(f.read().decode('utf-8'))
repos = data['items']
repos[0]['description']

'Data and code behind the stories and interactives at FiveThirtyEight'

In [6]:
repos[0]['full_name']

'fivethirtyeight/data'

### Processing data

In [7]:
def rate(repos):
 rated = []
 
 for repo in repos:
 rated.append(repo['watchers'] * 2)

 return rated

In [8]:
rate(repos)[:5]

[11142, 5556, 396, 438, 128]

In [9]:
# Infinite data
from itertools import count

inf_repos = ({'watchers': c} for c in count())

# Don't actually run the below code since it will hang forever
# rate(inf_repos)

In [10]:
# Expensive data
from time import sleep

def exp_rate(repos):
 rated = []

 for repo in repos:
 sleep(1)
 rated.append(repo['watchers'] * 2)

 return rated

In [11]:
exp_rate(repos)[:5]

[11142, 5556, 396, 438, 128]

## Lazy evaluation

In [12]:
eager_list = list(range(5))
eager_list

[0, 1, 2, 3, 4]

In [13]:
lazy_list = iter(eager_list)
lazy_list



In [14]:
next(lazy_list)

0

In [15]:
list(lazy_list)

[1, 2, 3, 4]

In [16]:
next(lazy_list)

StopIteration: 

### Reading data

In [17]:
from ijson import items

f = urlopen(url1)
repos = items(f, 'items.item')
repos



In [18]:
repo = next(repos)
repo['full_name']

'fivethirtyeight/data'

### Processing data

In [19]:
def gen_rates(repos):
 for repo in repos:
 yield repo['watchers'] * 2

In [20]:
gen_rates(repos)



In [21]:
rates = gen_rates(repos)
next(rates)

5556

In [22]:
next(rates)

396

In [23]:
# Infinite data
rates = gen_rates(inf_repos)
next(rates)

0

In [25]:
# Expensive data
def gen_exp_rates(repos):
 for repo in repos:
 sleep(1)
 yield repo['watchers'] * 2

In [26]:
from itertools import islice

rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

[438, 128, 684, 348, 1356]

In [27]:
next(rates)

648

## Grouping data

In [39]:
f = urlopen(url1)
repos = items(f, 'items.item')
repo = next(repos)
repo.keys()

dict_keys(['id', 'name', 'full_name', 'owner', 'private', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'forks_count', 'mirror_url', 'open_issues_count', 'forks', 'open_issues', 'watchers', 'de

In [40]:
repo['has_issues']

True

In [41]:
import itertools as it
from operator import itemgetter

keyfunc = itemgetter('has_issues')
sorted_repos = sorted(repos, key=keyfunc)
grouped = it.groupby(sorted_repos, keyfunc)
data = ((key, len(list(group))) for key, group in grouped)
next(data)

(False, 3)

In [42]:
next(data)

(True, 96)

## Memoization

### Processing data

In [43]:
def calc_rate(watchers):
 sleep(1)
 return watchers * 2

def gen_exp_rates(repos):
 for repo in repos:
 yield calc_rate(repo['watchers'])

In [45]:
repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

[10, 10, 10, 10, 10]

In [48]:
from functools import lru_cache

def _calc_rate(watchers):
 sleep(1)
 return watchers * 2

cacher = lru_cache()
calc_rate = cacher(_calc_rate)

def gen_exp_rates(repos):
 for repo in repos:
 yield calc_rate(repo['watchers'])

In [51]:
repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

[10, 10, 10, 10, 10]

In [52]:
@lru_cache()
def calc_rate(watchers):
 sleep(1)
 return watchers * 2

def gen_exp_rates(repos):
 for repo in repos:
 yield calc_rate(repo['watchers'])

In [53]:
repos = it.repeat({'watchers': 5})
rates = gen_exp_rates(repos)
result = islice(rates, 5)
list(result)

[10, 10, 10, 10, 10]

## Introducing meza

### Reading data

In [61]:
from urllib.request import urlopen
from meza.io import read_json

url2 = '{}/repositories?q=data'.format(BASE) 
f = urlopen(url2)
records = read_json(f, path='items.item')
repo = next(records)
repo['full_name']

'fivethirtyeight/data'

In [62]:
len(list(records))

29

In [56]:
from io import StringIO
from meza.io import read_csv

f = StringIO('greeting,location\nhello,world\n')
next(read_csv(f))

{'greeting': 'hello', 'location': 'world'}

In [57]:
from os import path as p
from meza.io import join

url3 = '{}&page=2'.format(url2)
files = map(urlopen, [url2, url3])
records = join(*files, ext='json', path='items.item')
repo = next(records) 
repo['full_name']

'fivethirtyeight/data'

In [58]:
repo['language']

'Jupyter Notebook'

In [59]:
len(list(records))

59

### Transforming data

In [63]:
from meza.process import merge

records = [{'a': 200}, {'b': 300}, {'c': 400}]
merge(records)

{'a': 200, 'b': 300, 'c': 400}

In [64]:
from meza.process import group

records = [
 {'item': 'a', 'amount': 200},
 {'item': 'a', 'amount': 200},
 {'item': 'b', 'amount': 400}]

grouped = group(records, 'item')
key, _group = next(grouped)
key

'a'

In [65]:
_group

[{'amount': 200, 'item': 'a'}, {'amount': 200, 'item': 'a'}]

In [66]:
from meza import process as pr

f = urlopen(url2)
raw = read_json(f, path='items.item')
fields = ['full_name', 'language', 'watchers', 'score', 'has_wiki']
cut = pr.cut(raw, fields)
cut

. at 0x11020ae08>

In [67]:
cut, preview = pr.peek(cut)
cut



In [68]:
len(preview)

5

In [69]:
preview[0]

{'full_name': 'fivethirtyeight/data',
 'has_wiki': True,
 'language': 'Jupyter Notebook',
 'score': Decimal('120.396454'),
 'watchers': 5572}

In [70]:
filled = pr.fillempty(raw, value='', fields=['language'])
pivoted = pr.pivot(filled, 'score', 'language', rows=['has_wiki'], op=min)
next(pivoted)

{'HTML': Decimal('73.19426'),
 'JavaScript': Decimal('54.46375'),
 'Python': Decimal('50.188396'),
 'has_wiki': False}

In [71]:
next(pivoted)

{'': Decimal('44.635494'),
 'C#': Decimal('47.918125'),
 'HTML': Decimal('68.96914'),
 'JavaScript': Decimal('44.16988'),
 'PHP': Decimal('44.0172'),
 'Python': Decimal('44.73296'),
 'R': Decimal('45.959583'),
 'has_wiki': True}