In [None]:
#default_exp page

# Pagination

> Parallel and serial pagination

In [None]:
#export
from fastcore.utils import *
from fastcore.foundation import *
from ghapi.core import *

import re
from urllib.parse import parse_qs,urlsplit

### Paged operations

Some GitHub API operations return their results one page at a time. For instance, there are many thousands of [gists](https://docs.github.com/en/free-pro-team@latest/github/writing-on-github/creating-gists), but if we call `list_public` we only see the first 30:

In [None]:
api = GhApi()

In [None]:
gists = api.gists.list_public()
len(gists)

30

That's because this operation takes two optional parameters, `per_page`, and `page`:

In [None]:
api.gists.list_public

[gists.list_public](https://docs.github.com/v3/gists/#list-public-gists)(since, per_page, page): *List public gists*

This is a common pattern for `list_*` operations in the GitHub API. One way to get more results is to increase `per_page`:

In [None]:
len(api.gists.list_public(per_page=100))

100

However, `per_page` has a maximum of `100`, so if you want more, you'll have to pass `page=` to get pages beyond the first. An easy way to iterate through all pages is to use `paged`.

In [None]:
#export
def paged(oper, *args, per_page=30, max_pages=9999, **kwargs):
    "Lazily yield pages 1, 2, ... of `oper(*args,**kwargs)`, ending at the first empty page or after `max_pages`"
    for page_num in range(1, max_pages+1):
        res = oper(*args, per_page=per_page, page=page_num, **kwargs)
        # A falsy (e.g. empty) page means there is nothing further to fetch
        if not res: return
        yield res

We'll demonstrate this using the `repos.list_for_org` method:

In [None]:
api.repos.list_for_org

[repos.list_for_org](https://docs.github.com/v3/repos/#list-organization-repositories)(org, type, sort, direction, per_page, page): *List organization repositories*

In [None]:
repos = api.repos.list_for_org('fastai')
len(repos),repos[0].name

(30, 'fast-image')

To convert this operation into a Python iterator, pass the operation itself, along with any arguments (either keyword or positional) to `paged`:

In [None]:
repos = paged(api.repos.list_for_org, 'fastai')

You can now iterate through `repos` using Python, e.g:

In [None]:
for page in repos: print(len(page), page[0].name)

30 fast-image
30 fastforest
30 .github
3 tweetrel


### Link header (RFC 5988)

GitHub tells us how many pages are available using the [link header](https://tools.ietf.org/html/rfc5988). Unfortunately the pypi [LinkHeader](https://pypi.org/project/LinkHeader/) library appears to no longer be maintained, so we've put a refactored version of it here.

In [None]:
#export
class _Scanner:
 def __init__(self, buf): self.buf,self.match = buf,None
 def __getitem__(self, key): return self.match.group(key)
 def scan(self, pattern):
 self.match = re.compile(pattern).match(self.buf)
 if self.match: self.buf = self.buf[self.match.end():]
 return self.match

# Double-quoted string; the group captures its contents, which may include backslash-escaped characters
_QUOTED = r'"((?:[^"\\]|\\.)*)"'
# Unquoted token: a run of characters that are neither whitespace nor one of the listed separator characters
_TOKEN = r'([^()<>@,;:\"\[\]?={}\s]+)'
# Optional comma, then `<target>` with optional surrounding spaces; group 1 captures the target inside the angle brackets
_RE_COMMA_HREF = r' *,? *< *([^>]*) *> *'
# `name` or `name=value`: group 1 is the name, group 3 an unquoted token value, group 4 the contents of a quoted value
_RE_ATTR = rf'{_TOKEN} *(?:= *({_TOKEN}|{_QUOTED}))? *'

In [None]:
#export
def _parse_link_hdr(header):
    "Split an RFC 5988 link header into a `list` of `(url, attrs)` tuples, where `attrs` is a `dict` of that link's attributes"
    sc,res = _Scanner(header),[]
    while sc.scan(_RE_COMMA_HREF):
        url,params = sc[1],{}
        while sc.scan('; *'):
            # Only record an attribute when one actually follows the `;`
            if not sc.scan(_RE_ATTR): continue
            name,token,quoted = sc[1],sc[3],sc[4]
            # A quoted value has its escaped quotes restored; otherwise use the token, which is `None` for a bare name
            params[name] = token if quoted is None else quoted.replace(r'\"', '"')
        res.append((url,params))
    # Leftover text means part of the header could not be parsed
    if sc.buf: raise Exception(f"parse() failed at {sc.buf!r}")
    return res

In [None]:
#export
def parse_link_hdr(header):
    "Parse an RFC 5988 link header into a `dict` keyed by each link's `rel`, with values of `(url, remaining attrs)`"
    by_rel = {}
    for url,attrs in _parse_link_hdr(header):
        # `rel` becomes the key, so it is removed from the attrs stored alongside the URL
        rel = attrs.pop('rel')
        by_rel[rel] = (url,attrs)
    return by_rel

Here's an example of a link header with just one link:

In [None]:
# The link target is written in angle brackets, followed by its `;`-separated attributes
parse_link_hdr('<http://example.com>; rel="foo bar"; type=text/html')

{'foo bar': ('http://example.com', {'type': 'text/html'})}

In [None]:
# The header needs its `<...>` target; without it no link is found and parsing fails
links = parse_link_hdr('<http://example.com>; rel="foo bar"; type=text/html')
link = links['foo bar']
test_eq(link[0], 'http://example.com')
test_eq(link[1]['type'], 'text/html')

Let's test it on the headers we received on our last call to GitHub. You can access the last call's headers in `recv_hdrs`:

In [None]:
api.recv_hdrs['Link']

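'<https://api.github.com/organizations/20547620/repos?per_page=30&page=4>; rel="prev", <https://api.github.com/organizations/20547620/repos?per_page=30&page=4>; rel="last", <https://api.github.com/organizations/20547620/repos?per_page=30&page=1>; rel="first"'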

Here's what happens when we parse that:

In [None]:
parse_link_hdr(api.recv_hdrs['Link'])

{'prev': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=4',
 {}),
 'last': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=4',
 {}),
 'first': ('https://api.github.com/organizations/20547620/repos?per_page=30&page=1',
 {})}

### Getting pages in parallel

Rather than requesting each page one at a time, we can save some time by getting all the pages we need in parallel.

In [None]:
#export
@patch
def last_page(self:GhApi):
    "Page number from the `last` link of the most recent response's RFC 5988 `Link` header (0 when no page number is found)"
    links = parse_link_hdr(self.recv_hdrs.get('Link', ''))
    last_url = nested_idx(links, 'last', 0) or ''
    query = parse_qs(urlsplit(last_url).query)
    # `parse_qs` maps each parameter to a list of values, hence the index 0
    page = nested_idx(query, 'page', 0)
    return int(page or 0)

To help us know the number of pages needed, we can use `last_page`, which uses the link header we just looked at to grab the last page from GitHub.

We will need multiple pages to get all the repos in the `github` organization, even if we get 100 at a time:

In [None]:
api.repos.list_for_org('github', per_page=100)
api.last_page()

4

In [None]:
#export
def _call_page(i, oper, args, kwargs, per_page):
 return oper(*args, per_page=per_page, page=i, **kwargs)

In [None]:
#export
def pages(oper, n_pages, *args, n_workers=None, per_page=100, **kwargs):
    "Fetch pages 1 to `n_pages` of `oper(*args,**kwargs)` using `parallel`; `n_workers` defaults to `n_pages`"
    page_nums = range(1, n_pages+1)
    workers = ifnone(n_workers, n_pages)
    return parallel(_call_page, page_nums, oper=oper, per_page=per_page, args=args, kwargs=kwargs,
                    progress=False, n_workers=workers, threadpool=True)

`pages` by default passes `per_page=100` to the operation.

Let's look at some examples. To get all the pages for the repos in the `github` organization in parallel, we can use this:

In [None]:
gh_repos = pages(api.repos.list_for_org, api.last_page(), 'github').concat()
len(gh_repos)

367

If you already know ahead of time the number of pages required, there's no need to call `last_page`. For instance, the GitHub docs specify that we can get at most 3000 gists:

In [None]:
gists = pages(api.gists.list_public, 30).concat()
len(gists)

3000

GitHub ignores the `per_page` parameter for some API calls, such as listing public events, which it limits to 8 pages of 30 items per page. To retrieve all pages in these cases, you need to explicitly pass the lower per page limit:

In [None]:
api.activity.list_public_events()
api.last_page()

8

In [None]:
evts = pages(api.activity.list_public_events, api.last_page(), per_page=30).concat()
len(evts)

232

## Export -

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 01_actions.ipynb.
Converted 02_auth.ipynb.
Converted 03_page.ipynb.
Converted 04_event.ipynb.
Converted 10_cli.ipynb.
Converted 50_fullapi.ipynb.
Converted 80_tutorial_actions.ipynb.
Converted 90_build_lib.ipynb.
Converted Untitled.ipynb.
Converted ghapi demo.ipynb.
Converted index.ipynb.
