{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "05cba5c5-235c-439a-ba7d-96944877f000",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:24:02) \n",
      "[Clang 11.1.0 ]\n"
     ]
    }
   ],
   "source": [
    "# Use svg graphics, display inline\n",
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format = 'svg'\n",
    "\n",
    "import sys\n",
    "\n",
    "# Basic scientific computing imports\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# display config\n",
    "pd.set_option('display.float_format', lambda x: '%.3f' % x)\n",
    "np.set_printoptions(suppress=True)\n",
    "\n",
    "# ad hoc imports\n",
    "import requests\n",
    "import json\n",
    "import os\n",
    "import praw\n",
    "\n",
    "print(sys.version)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "87c2ffab-4488-4d46-9fda-f2ef31399336",
   "metadata": {},
   "source": [
    "# Subreddit Discovery with PRAW\n",
    "\n",
    "Reddit is the Wild West of the internet. Unlike many modern social platforms, it's structured into communities that each have their own purpose and standards. Instead of adding people to your personal network, you can explore and join these communitites to get a taste of what they're about.\n",
    "\n",
    "**Thought Experiment**: How would you go about finding new communities of people irl, without Reddit. Or the internet. Like if you actually had to get out of the house and meet people. Imagine you're dropped into a new city in 1985 and you don't know a single soul.\n",
    "\n",
    "The first thing you might do is find the community you're most familiar with. Somewhere where you're already familiar with the customs and know how to maneuver. For me, that community is probably `/r/bjj`, but we can use `/r/datascience` (I know you nerds love data science). Let's use PRAW to get the titles of the 10 hottest posts in that community."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b8dd40e7-271d-4dd2-bb04-eb51f7851eea",
   "metadata": {},
   "outputs": [],
   "source": [
    "session_params = {\n",
    "    'user_agent': '',\n",
    "    'client_id': '', \n",
    "    'client_secret': '',\n",
    "}\n",
    "\n",
    "reddit = praw.Reddit(**session_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bc73d35e-abbb-417e-ae05-2447357f611a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>hour</th>\n",
       "      <th>score</th>\n",
       "      <th>upvote_ratio</th>\n",
       "      <th>comment_count</th>\n",
       "      <th>post_id</th>\n",
       "      <th>url</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2021-11-07</td>\n",
       "      <td>Weekly Entering &amp; Transitioning Thread | 07 No...</td>\n",
       "      <td>datascience-bot</td>\n",
       "      <td>Sunday</td>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>0.750</td>\n",
       "      <td>7</td>\n",
       "      <td>qon4ml</td>\n",
       "      <td>https://www.reddit.com/r/datascience/comments/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2021-11-07</td>\n",
       "      <td>What is something you took the time to learn t...</td>\n",
       "      <td>THE_REAL_ODB</td>\n",
       "      <td>Sunday</td>\n",
       "      <td>15</td>\n",
       "      <td>124</td>\n",
       "      <td>0.980</td>\n",
       "      <td>46</td>\n",
       "      <td>qore0i</td>\n",
       "      <td>https://www.reddit.com/r/datascience/comments/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2021-11-07</td>\n",
       "      <td>I start working among a team of Data Scientist...</td>\n",
       "      <td>Abdullah_super</td>\n",
       "      <td>Sunday</td>\n",
       "      <td>4</td>\n",
       "      <td>72</td>\n",
       "      <td>0.920</td>\n",
       "      <td>21</td>\n",
       "      <td>qohgdl</td>\n",
       "      <td>https://www.reddit.com/r/datascience/comments/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2021-11-07</td>\n",
       "      <td>What is your go-to resources for learning new ...</td>\n",
       "      <td>aero_gsr</td>\n",
       "      <td>Sunday</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>0.720</td>\n",
       "      <td>4</td>\n",
       "      <td>qox4rs</td>\n",
       "      <td>https://www.reddit.com/r/datascience/comments/...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2021-11-07</td>\n",
       "      <td>Use of data science in new small start-ups</td>\n",
       "      <td>powermed2404</td>\n",
       "      <td>Sunday</td>\n",
       "      <td>16</td>\n",
       "      <td>6</td>\n",
       "      <td>0.690</td>\n",
       "      <td>10</td>\n",
       "      <td>qos332</td>\n",
       "      <td>https://www.reddit.com/r/datascience/comments/...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         date                                              title  \\\n",
       "0  2021-11-07  Weekly Entering & Transitioning Thread | 07 No...   \n",
       "1  2021-11-07  What is something you took the time to learn t...   \n",
       "2  2021-11-07  I start working among a team of Data Scientist...   \n",
       "3  2021-11-07  What is your go-to resources for learning new ...   \n",
       "4  2021-11-07         Use of data science in new small start-ups   \n",
       "\n",
       "            author day_of_week  hour  score  upvote_ratio  comment_count  \\\n",
       "0  datascience-bot      Sunday    12      2         0.750              7   \n",
       "1     THE_REAL_ODB      Sunday    15    124         0.980             46   \n",
       "2   Abdullah_super      Sunday     4     72         0.920             21   \n",
       "3         aero_gsr      Sunday    20      3         0.720              4   \n",
       "4     powermed2404      Sunday    16      6         0.690             10   \n",
       "\n",
       "  post_id                                                url  \n",
       "0  qon4ml  https://www.reddit.com/r/datascience/comments/...  \n",
       "1  qore0i  https://www.reddit.com/r/datascience/comments/...  \n",
       "2  qohgdl  https://www.reddit.com/r/datascience/comments/...  \n",
       "3  qox4rs  https://www.reddit.com/r/datascience/comments/...  \n",
       "4  qos332  https://www.reddit.com/r/datascience/comments/...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "column_names = (\n",
    "    'date', 'title', 'author', 'day_of_week', 'hour', \n",
    "    'score', 'upvote_ratio', 'comment_count', 'post_id', 'url'\n",
    ")\n",
    "rows = []\n",
    "new_posts_gen = reddit.subreddit('DataScience').hot(limit=10)\n",
    "for post in new_posts_gen:\n",
    "    date = str(pd.to_datetime(post.created * 1e9)).split(' ')[0]\n",
    "    title = post.title\n",
    "    author = post.author\n",
    "    dow = pd.to_datetime(post.created * 1e9).day_name()\n",
    "    hour = pd.to_datetime(post.created * 1e9).hour\n",
    "    score = post.score\n",
    "    upvote_ratio = post.upvote_ratio\n",
    "    comment_count = post.num_comments\n",
    "    post_id = post.id\n",
    "    url = post.url\n",
    "    \n",
    "    row = (date, title, author, dow, hour, score, upvote_ratio, comment_count, post_id, url)\n",
    "    rows.append(row)\n",
    "\n",
    "df = pd.DataFrame(data=rows, columns=column_names)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f462777b-6fa5-4a8f-b28d-775ce163bff7",
   "metadata": {},
   "source": [
    "I create a new `Reddit` object and pass in some log in credentials. Check out [this page](https://praw.readthedocs.io/en/latest/getting_started/authentication.html#auth-url) to determine how to get `client_id` and `client_secret` keys. You can generate the pair [here](https://www.reddit.com/prefs/apps) by creating a new application.\n",
    "\n",
    "The code above should give you a basic idea of how to use the API. From the `Reddit` object, we can access basic Reddit entities like subreddits, redditors, and comments. Here, we access /r/DataScience and use the `hot()` method to get the top 10 hottest posts. Note that `hot()` returns a generator.\n",
    "\n",
    "The API's structure is actually really nice to work with. To get a post's title, we can just use the `post.title` property. The author is accessible through the `post.author` property. Very intuitive and ✨pythonic✨. Here, we iterate through the hottests posts, extract the relevant info, then dump it all into a dataframe so it's easy to deal with.\n",
    "\n",
    "Ok, so we've found our main community. We have a home base from which we can branch out and find new communities and hobbies to take part in. Let's look around and meet some people. On reddit, communities interact primarily through posting and commenting. Lets aggregate a list of potentially interesting friends. Maybe they'll be able to point us to other cool communities."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a77db536-3fc7-43ad-b1ea-e8e640107254",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "79 potentially interesting redditors total\n"
     ]
    }
   ],
   "source": [
    "cool_people = []\n",
    "hot_posts = reddit.subreddit('DataScience').hot(limit=10)\n",
    "\n",
    "for post in hot_posts:\n",
    "    if post.author:\n",
    "        cool_people.append(post.author)\n",
    "    for comment in post.comments:\n",
    "        if comment.author:\n",
    "            commentor = comment.author\n",
    "            cool_people.append(commentor)\n",
    "\n",
    "cool_people = list(set(cool_people))   # remove duplicates\n",
    "\n",
    "print(f'{len(cool_people)} potentially interesting redditors total')\n",
    "# print([redditor.name for redditor in cool_people[:10]])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "082a4451-669b-4e07-8d1a-bc3f79f59ca5",
   "metadata": {},
   "source": [
    "Boom. Now we have a set of people who can potentially point us to new communities. Let's examine the other communities one of our friends is interacting with."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "80a28df1-fb88-4d48-8bf9-e991b6eeb0ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Emailmarketing' 'Entrepreneur' 'EntrepreneurRideAlong' 'ProgrammerHumor'\n",
      " 'b2bmarketing' 'datascience' 'email' 'marketing' 'sales' 'smallbusiness']\n"
     ]
    }
   ],
   "source": [
    "redditor = np.random.choice(cool_people)\n",
    "# print(redditor.name)\n",
    "\n",
    "other_subreddits = []\n",
    "for post in redditor.submissions.new(limit=10):\n",
    "    other_subreddits.append(post.subreddit.display_name)\n",
    "\n",
    "print(np.unique(other_subreddits))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dee1c2ea-a019-4bb4-a4fc-841df87a5375",
   "metadata": {},
   "source": [
    "Oooooo this redditor is into some cool stuff. While their \"recommendations\" are interesting, they're all sort of biased towards their food. Instead of looking at this one person's interests, I want to leverage the wisdom of the crowd here. It would be interesting to ask all of my new friends for recommendations and see what the most common answers are. Let's do that and see which communities are mentioned the most."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "1f920168-2b5b-45a1-820b-a5539c1b9ad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "cool_communities = []\n",
    "for redditor in cool_people:\n",
    "    communities = set()\n",
    "    for post in redditor.submissions.new(limit=20):\n",
    "        communities.add(post.subreddit.display_name)\n",
    "        \n",
    "    # append each distinct community to dedup\n",
    "    for community in communities:\n",
    "        if community != 'datascience':\n",
    "            cool_communities.append(community)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "b5c902db-7f58-498b-aad6-d6c8f3e29e86",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MachineLearning         57\n",
       "AskReddit               48\n",
       "statistics              35\n",
       "learnmachinelearning    29\n",
       "learnpython             28\n",
       "dataengineering         27\n",
       "Showerthoughts          21\n",
       "personalfinance         18\n",
       "cscareerquestions       18\n",
       "buildapc                18\n",
       "dataisbeautiful         17\n",
       "NoStupidQuestions       16\n",
       "Python                  16\n",
       "rstats                  15\n",
       "AskStatistics           14\n",
       "wallstreetbets          14\n",
       "datasets                12\n",
       "askscience              12\n",
       "CryptoCurrency          12\n",
       "aws                     12\n",
       "dtype: int64"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cool_communities)[:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf00973a-d2b4-4818-be85-a028bcd4da61",
   "metadata": {},
   "source": [
    "Ok, now we're talking! we're starting to flesh out the picture of other potentially cool communities. Some of them, like AskReddit or Showerthoughts, are generally popular—they probably aren't anything new. There are ways we could potentially filter those out (Something like TF-idf), but it might be more trouble than it works. No need to be elegant here, we can simply look at the list and filter them out.\n",
    "\n",
    "Further down the list we start to see some really interesting communities like r/CryptoCurrency and r/datasets. So far so good. Finally, we'll make this a bit more robust by increasing the number of redditors and subreddits we look through for suggestions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "250b73fd-3d54-48d7-91d9-8a65ad3b0218",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "cool_people = []\n",
    "hot_posts = reddit.subreddit('datascience').hot(limit=30)\n",
    "\n",
    "for post in hot_posts:\n",
    "    if post.author:\n",
    "        cool_people.append(post.author)\n",
    "    # only look at the first 20 comments\n",
    "    for i, comment in enumerate(post.comments):\n",
    "        if i > 20: break\n",
    "        if comment.author:\n",
    "            commentor = comment.author\n",
    "            cool_people.append(commentor)\n",
    "            \n",
    "cool_people = list(set(cool_people))   # remove duplicates\n",
    "\n",
    "\n",
    "cool_communities = []\n",
    "for redditor in cool_people:\n",
    "    communities = set()\n",
    "    \n",
    "    # sometimes getting a redditors new posts throws error 403\n",
    "    # have to manually call next() and catch it\n",
    "    \n",
    "    posts = redditor.submissions.new(limit=20)\n",
    "    while posts:\n",
    "        try:\n",
    "            post = next(posts)\n",
    "        except Exception as e:\n",
    "            continue\n",
    "            \n",
    "        communities.add(post.subreddit.display_name)\n",
    "        # append each distinct community to dedup\n",
    "        for community in communities:\n",
    "            cool_communities.append(community)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "36630984-a443-4ba3-ac07-d1cfafc4aee7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "flightsim           4\n",
       "sffpc               3\n",
       "algotrading         2\n",
       "heroesofthestorm    1\n",
       "dtype: int64"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.value_counts(cool_communities)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}