{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download Pushshift Reddit Data\n",
"See this on [Github](https://github.com/yinleon/doppler_tutorials/blob/master/1-download-data.ipynb), [NbViewer](https://nbviewer.jupyter.org/github/yinleon/doppler_tutorials/blob/master/1-download-data.ipynb)
\n",
"By Leon Yin 2019-04-05
\n",
"This Notebook collects subreddit metadata from PushShift's REST API, and downloads images from Reddit using requests."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import warnings\n",
"\n",
"import pandas as pd\n",
"from tqdm import tqdm\n",
"\n",
"import config\n",
"import context_manager\n",
"import doppler.data_sources.pushshift as ps\n",
"from doppler.image_utils import download_media_and_return_dhash, read_image\n",
"\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A trick in data engineering is creating a dictionary (called the _context_) that defines the destination of all output files. Here we illustrate an example of what it looks like:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_subbreddit_context(subreddit):\n",
" '''\n",
" Where will data be saved?\n",
" '''\n",
" sub_dir = os.path.join(config.data_dir, subreddit)\n",
" media_dir = os.path.join(config.data_dir, 'media')\n",
" file_subreddit = os.path.join(sub_dir, 'posts.csv.gz')\n",
" file_subreddit_media = os.path.join(sub_dir, 'media.csv.gz')\n",
" \n",
" for _dir in [config.data_dir, sub_dir, media_dir]:\n",
" os.makedirs(_dir, exist_ok=True)\n",
" \n",
" context = {\n",
" 'data_dir' : config.data_dir,\n",
" 'sub_dir' : sub_dir,\n",
" 'media_dir' : media_dir,\n",
" 'file_subreddit' : file_subreddit,\n",
" 'file_subreddit_media' : file_subreddit_media\n",
" }\n",
" \n",
" return context"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" We will use functions like `get_subreddit_context` throughout this article. Rather than define these within each notebook, they are all writtein in the `context_manager.py` file."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Query PushShift\n",
"Pushshift has an open restful API that returns JSON records. \n",
"\n",
"For your convenience, we have written a simple Python wrapper found in in `doppler/data_sources/pushshift.py`."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0mps\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_subreddit_posts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'subreddit'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'size=5000'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'ascending=False'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'start_date=False'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'seen_ids=set()'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'display_every_x_iterations=20'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'verbose=True'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mSource:\u001b[0m \n",
"\u001b[0;32mdef\u001b[0m \u001b[0mdownload_subreddit_posts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubreddit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5000\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mstart_date\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseen_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdisplay_every_x_iterations\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m'''\u001b[0m\n",
"\u001b[0;34m Queries the pushshift.io API for a given `subreddit`.\u001b[0m\n",
"\u001b[0;34m \u001b[0m\n",
"\u001b[0;34m To go back in time from `start_date`, set ascending to False.\u001b[0m\n",
"\u001b[0;34m To go forward from time `state_date`, set ascneding to True.\u001b[0m\n",
"\u001b[0;34m Skips ids in set `seen_ids`.\u001b[0m\n",
"\u001b[0;34m Displays the http request every `display_every_x_iterations` if `verbose`=True.\u001b[0m\n",
"\u001b[0;34m '''\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseen_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0;34m\"seen_ids needs to be a set!\"\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrecords\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mlast_record_date\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mstart_date\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# Buld the url\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0murl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbuild_api_endpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msubreddit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlast_record_date\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mverbose\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mdisplay_every_x_iterations\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# make the HTTP request to the API\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjson\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# check which records were returned by the API and which are new?\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# if there are no new IDs, then we're done!\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mpaginated_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"id\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnew_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpaginated_ids\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mseen_ids\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_ids\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# add new records to existing records. \u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mnew_records\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mrow\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'id'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnew_ids\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_records\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;31m# collect all records created before the last record's date.\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mlast_record\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mlast_record_date\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlast_record\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'created_utc'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cancelled early\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mrecords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFile:\u001b[0m ~/code/doppler/doppler_tutorials/doppler/data_sources/pushshift.py\n",
"\u001b[0;31mType:\u001b[0m function\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"??ps.download_subreddit_posts"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"