{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Downloading Gist data from the Github API" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ " from toolz.curried import *\n", " from pandas import Series, DataFrame, concat, get_dummies, TimeGrouper" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "cache the `pandas.read_json` function because that is how we will download the results. Be aware, resetting `read_json` will clear the cache." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ " read_json = memoize(__import__('pandas').read_json)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Begin downloading the data through the Github user information." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def get_info(user=\"tonyfast\"):\n", "    \"\"\"Fetch the GitHub profile of `user` as a one-row frame indexed by the user name.\"\"\"\n", "    profile = read_json(f\"https://api.github.com/users/{user}\", typ=\"series\")\n", "    return concat({user: profile}).unstack()\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "From the info we can determine the location and quantity of the user's gists."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_gists(info, max=3):\n", "    \"\"\"Download a user's public gists as a frame with one row per gist file.\n", "\n", "    Parameters\n", "    ----------\n", "    info : pandas.Series\n", "        One user's row of `get_info`; `gists_url`, `public_gists` and the\n", "        series name (used to name the feather file) are read from it.\n", "    max : int\n", "        Upper bound on the number of 30-gist pages requested.\n", "\n", "    Returns\n", "    -------\n", "    pandas.DataFrame\n", "        The frame read back from `{info.name}_gists.feather`.\n", "\n", "    Raises\n", "    ------\n", "    ValueError\n", "        If there is no page to download.\n", "    \"\"\"\n", "    # Fill the `{/gist_id}` placeholder of `gists_url` with a page query; the\n", "    # bound `format` then builds the url of a given page number.\n", "    gist_page_url = info.loc[\"gists_url\"].format(**{\"/gist_id\": \"?page={}\"}).format\n", "\n", "    # Ceiling division, so a partially filled last page is fetched too.\n", "    n_pages = min(max, -(-int(info.loc[\"public_gists\"]) // 30))\n", "    if n_pages < 1:\n", "        raise ValueError(f\"no gist pages to download for {info.name}\")\n", "\n", "    return (\n", "        concat(\n", "            [read_json(gist_page_url(page)) for page in range(1, n_pages + 1)],\n", "            # Each page is labelled from 0 again; with repeated labels the\n", "            # join below would pair files with gists of other pages.\n", "            ignore_index=True,\n", "        )\n", "        .pipe(\n", "            lambda df: df[\"files\"]\n", "            .apply(compose(Series, list, dict.values))\n", "            .stack()\n", "            .apply(Series)\n", "            .reset_index(-1, drop=True)\n", "            .join(df)\n", "        )\n", "        .pipe(do(cleanse))\n", "        .pipe(convert_to_feather, f\"{info.name}_gists.feather\")\n", "    )\n", "\n", "\n", "def cleanse(df):\n", "    \"\"\"Delete the `files` and `owner` columns in place; they hold dicts that feather cannot serialize.\"\"\"\n", "    del df[\"files\"], df[\"owner\"]\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Storing the data in [feather](https://github.com/wesm/feather) makes it more reusable." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def convert_to_feather(df, dest):\n", "    \"\"\"Write `df` to the feather file `dest` and return the frame read back from it.\"\"\"\n", "    import feather\n", "\n", "    feather.write_dataframe(df, dest)\n", "    return feather.read_dataframe(dest)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `main` function" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def main(user=\"tonyfast\", max=3):\n", "    \"\"\"Download the gists of `user`, requesting at most `max` pages.\"\"\"\n", "    return get_gists(get_info(user).loc[user], max)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Demonstrate how the functions are used." ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
filenametypelanguageraw_urlsizecommentscomments_urlcommits_urlcreated_atdescription...git_pull_urlgit_push_urlhtml_urlidnode_idpublictruncatedupdated_aturluser
2554Untitled279.ipynbtext/plainJupyter Notebookhttps://gist.githubusercontent.com/tonyfast/a4...19280https://api.github.com/gists/748a91e1769d24979...https://api.github.com/gists/748a91e1769d24979...2015-12-29 13:51:19Underlay an svg element as a <div> background...https://gist.github.com/748a91e1769d24979393.githttps://gist.github.com/748a91e1769d24979393.githttps://gist.github.com/748a91e1769d24979393748a91e1769d24979393MDQ6R2lzdDc0OGE5MWUxNzY5ZDI0OTc5MzkzTrueFalse2015-12-29 13:51:19https://api.github.com/gists/748a91e1769d24979393NaN
8969Untitled144.ipynbtext/plainJupyter Notebookhttps://gist.githubusercontent.com/tonyfast/0c...37530https://api.github.com/gists/b0125d860d5ffe74d...https://api.github.com/gists/b0125d860d5ffe74d...2016-02-03 17:58:58A simple dropdown widget...https://gist.github.com/b0125d860d5ffe74dbdb.githttps://gist.github.com/b0125d860d5ffe74dbdb.githttps://gist.github.com/b0125d860d5ffe74dbdbb0125d860d5ffe74dbdbMDQ6R2lzdGIwMTI1ZDg2MGQ1ZmZlNzRkYmRiTrueFalse2016-02-03 17:58:59https://api.github.com/gists/b0125d860d5ffe74dbdbNaN
\n", "

2 rows × 21 columns

\n", "
" ], "text/plain": [ " filename type language \\\n", "2554 Untitled279.ipynb text/plain Jupyter Notebook \n", "8969 Untitled144.ipynb text/plain Jupyter Notebook \n", "\n", " raw_url size comments \\\n", "2554 https://gist.githubusercontent.com/tonyfast/a4... 1928 0 \n", "8969 https://gist.githubusercontent.com/tonyfast/0c... 3753 0 \n", "\n", " comments_url \\\n", "2554 https://api.github.com/gists/748a91e1769d24979... \n", "8969 https://api.github.com/gists/b0125d860d5ffe74d... \n", "\n", " commits_url created_at \\\n", "2554 https://api.github.com/gists/748a91e1769d24979... 2015-12-29 13:51:19 \n", "8969 https://api.github.com/gists/b0125d860d5ffe74d... 2016-02-03 17:58:58 \n", "\n", " description ... \\\n", "2554 Underlay an svg element as a
background ... \n", "8969 A simple dropdown widget ... \n", "\n", " git_pull_url \\\n", "2554 https://gist.github.com/748a91e1769d24979393.git \n", "8969 https://gist.github.com/b0125d860d5ffe74dbdb.git \n", "\n", " git_push_url \\\n", "2554 https://gist.github.com/748a91e1769d24979393.git \n", "8969 https://gist.github.com/b0125d860d5ffe74dbdb.git \n", "\n", " html_url id \\\n", "2554 https://gist.github.com/748a91e1769d24979393 748a91e1769d24979393 \n", "8969 https://gist.github.com/b0125d860d5ffe74dbdb b0125d860d5ffe74dbdb \n", "\n", " node_id public truncated \\\n", "2554 MDQ6R2lzdDc0OGE5MWUxNzY5ZDI0OTc5Mzkz True False \n", "8969 MDQ6R2lzdGIwMTI1ZDg2MGQ1ZmZlNzRkYmRi True False \n", "\n", " updated_at url \\\n", "2554 2015-12-29 13:51:19 https://api.github.com/gists/748a91e1769d24979393 \n", "8969 2016-02-03 17:58:59 https://api.github.com/gists/b0125d860d5ffe74dbdb \n", "\n", " user \n", "2554 NaN \n", "8969 NaN \n", "\n", "[2 rows x 21 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "if 0:  # disabled demo; switch the guard on to download and plot\n", "    df = main(max=1000)\n", "    %matplotlib inline\n", "    # The mean of a dummy column is the share of rows with that language.\n", "    language_share = (\n", "        df.set_index(\"created_at\")\n", "        .language.pipe(get_dummies)\n", "        .describe()\n", "        .sort_values(\"mean\", axis=1, ascending=False)\n", "        .loc[\"mean\"]\n", "    )\n", "    language_share.plot.pie()\n", "    __import__(\"IPython\").display.display(df.sample(2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }