import sys
import requests
from collections import defaultdict
from datetime import datetime, timedelta, timezone
import time
import os
import json

# IMPORTANT READ THIS FIRST
# The pushshift service that this script uses is only available to moderators.
# Go through this guide to get a token and update the token field below
# https://api.pushshift.io/guide
token = ""

subreddits = ['PKA', 'bayarea']
ignored_users = ['[deleted]', 'automoderator']
lookback_days = 30
min_comments_per_sub = 1
file_name = "users.txt"
# If true, print users that occur in the first subreddit and any one of the
# following ones. Otherwise just find the most overlap between all subs.
require_first_subreddit = False

url = "https://api.pushshift.io/reddit/comment/search?&limit=1000&order=desc&subreddit={}&before="

# Directory holding one progress/cache file per subreddit, named
# "<subreddit>_<YYYY-MM-DD>.txt".
cache_dir = "overlap_subreddits"
# Seconds to wait on a single HTTP request before treating it as a timeout.
request_timeout_seconds = 30

# Use an aware UTC datetime: calling .timestamp() on the naive value returned
# by datetime.utcnow() interprets it as local time and shifts the epoch by the
# machine's UTC offset.
# To pin a fixed start instead, use e.g.
# datetime.strptime("22-02-25 00:00:00", '%y-%m-%d %H:%M:%S')
startTime = datetime.now(timezone.utc)
startEpoch = int(startTime.timestamp())
endTime = startTime - timedelta(days=lookback_days)
endEpoch = int(endTime.timestamp())
totalSeconds = startEpoch - endEpoch

os.makedirs(cache_dir, exist_ok=True)


def _cacheFilesFor(subreddit):
    """Return the cache file names in ``cache_dir`` that belong to exactly ``subreddit``.

    The subreddit is everything before the last underscore of the file stem,
    so "news" does not match "newsokur_2022-01-01.txt".
    """
    matches = []
    for filename in os.listdir(cache_dir):
        stem, extension = os.path.splitext(filename)
        name, separator, _ = stem.rpartition("_")
        if extension == ".txt" and separator and name == subreddit:
            matches.append(filename)
    return matches


def loadSubredditCommenters(subreddit):
    """Load the cached per-user comment counts for ``subreddit``, if any.

    Returns a tuple ``(commenters, epoch, count_comments)`` where
    ``commenters`` maps user name to comment count, ``epoch`` is the timestamp
    of local midnight of the date in the cache file name, and
    ``count_comments`` is the sum of all loaded counts. Returns
    ``(None, None, 0)`` when no cache file exists for the subreddit.
    """
    for filename in _cacheFilesFor(subreddit):
        count_comments = 0
        commenters = defaultdict(int)
        with open(os.path.join(cache_dir, filename), 'r') as inputFile:
            for line in inputFile:
                # Split on any whitespace so the trailing newline is dropped
                # and both space- and tab-separated files load.
                items = line.split()
                if len(items) != 2:
                    print(f"Error loading line for {subreddit}: {line}")
                    continue
                try:
                    user_comments = int(items[1])
                except ValueError:
                    print(f"Error loading line for {subreddit}: {line}")
                    continue
                commenters[items[0]] = user_comments
                count_comments += user_comments
        dateString = filename.split("_")[-1][:-4]
        print(f"Loaded {len(commenters)} commenters for subreddit r/{subreddit} through {dateString}")
        dateThrough = datetime.strptime(dateString, '%Y-%m-%d')
        return commenters, int(dateThrough.timestamp()), count_comments
    return None, None, 0


def saveSubredditCommenters(subreddit, commenters, dateThrough):
    """Replace the cache file of ``subreddit`` with the given counts.

    ``dateThrough`` is a ``date`` or ``datetime`` used for the file name; when
    it is ``None`` nothing is written. Any previous cache file of the same
    subreddit is removed first so only one file per subreddit remains.
    """
    if dateThrough is None:
        return
    for filename in _cacheFilesFor(subreddit):
        os.remove(os.path.join(cache_dir, filename))
    target = os.path.join(cache_dir, f"{subreddit}_{dateThrough.strftime('%Y-%m-%d')}.txt")
    with open(target, 'w') as outputFile:
        for commenter, countComments in commenters.items():
            outputFile.write(commenter)
            outputFile.write(" ")
            outputFile.write(str(countComments))
            outputFile.write("\n")


def countCommenters(subreddit):
    """Count comments per user in ``subreddit`` over the lookback window.

    Resumes from the cache file when one exists; a cache dated before the end
    of the window is returned as-is. Otherwise comments are paged from
    pushshift, newest first, until the window end is reached or the API
    returns no more comments. Progress is checkpointed to the cache whenever a
    multiple of 1000 comments is reached on a new calendar date.

    Returns a mapping of user name to comment count. Exits the process with
    status 1 when pushshift reports that the token is not authenticated.
    """
    commenters, previousEpoch, count = loadSubredditCommenters(subreddit)
    if previousEpoch is not None and previousEpoch < endEpoch:
        print(f"Full subreddit loaded: {subreddit}")
        return commenters
    if commenters is None:
        commenters = defaultdict(int)
        previousEpoch = startEpoch
    print(f"Counting commenters in: {subreddit}")

    headers = {'User-Agent': "Overlap counter by /u/Watchful1", 'Authorization': f"Bearer {token}"}
    reachedEnd = False
    currentDate = None
    while not reachedEnd:
        newUrl = url.format(subreddit) + str(previousEpoch)
        try:
            # Without a timeout a stalled connection would block forever and
            # the retry branch below could never run.
            response = requests.get(newUrl, headers=headers, timeout=request_timeout_seconds)
        except (requests.exceptions.ReadTimeout, requests.exceptions.ChunkedEncodingError, requests.exceptions.ConnectionError):
            print(f"Pushshift timeout, this usually means pushshift is down. Waiting 5 seconds and trying again: {newUrl}")
            time.sleep(5)
            continue

        try:
            result_data = response.json()
        except ValueError:
            # Every JSON decode error requests can raise is a ValueError,
            # whichever json backend is in use.
            print(f"Decoding error, this usually means pushshift is down. Waiting 5 seconds and trying again: {newUrl}")
            time.sleep(5)
            continue
        if "detail" in result_data and result_data['detail'] == "Not authenticated":
            print("You are not authenticated, read the comment at the top and update the token")
            sys.exit(1)
        objects = result_data['data']

        time.sleep(1)  # pushshift is ratelimited. If we go too fast we'll get errors

        if not objects:
            # Nothing older is available. Record how far back we actually got
            # so the final save is not skipped or written under a stale date,
            # which would make the next run count the same comments again.
            currentDate = datetime.fromtimestamp(previousEpoch).date()
            break

        for comment in objects:
            previousEpoch = comment['created_utc'] - 1
            if previousEpoch < endEpoch:
                # This comment is already outside the lookback window, so stop
                # before counting it.
                reachedEnd = True
                currentDate = datetime.fromtimestamp(previousEpoch).date()
                break
            if comment['author'].lower() not in ignored_users:
                commenters[comment['author']] += 1
            count += 1
            if count % 1000 == 0:
                currentDatetime = datetime.fromtimestamp(previousEpoch)
                print("r/{0} comments: {1}, {2}, {3:.2f}%".format(
                    subreddit,
                    count,
                    currentDatetime.strftime("%Y-%m-%d"),
                    ((startEpoch - previousEpoch) / totalSeconds) * 100))
                # Checkpoint at most once per calendar date.
                if currentDatetime.date() != currentDate:
                    saveSubredditCommenters(subreddit, commenters, currentDatetime)
                    currentDate = currentDatetime.date()

    saveSubredditCommenters(subreddit, commenters, currentDate)
    print(f"Comments: {count}, commenters: {len(commenters)}")
    return commenters


def _writeUserGroup(txt, users, label):
    """Write one section of the overlap report: a heading, the sorted users, and a blank line."""
    if not users:
        txt.write(f"No commenters in {label} subreddits\n")
    else:
        txt.write(f"{len(users)} commenters in {label} subreddits\n")
        for user in sorted(users, key=str.lower):
            txt.write(f"{user}\n")
    txt.write("\n")


def main():
    """Count commenters for every configured subreddit and write the overlap report to ``file_name``."""
    # Maps user name to the number of subreddits in which the user reached
    # min_comments_per_sub comments.
    commenterSubreddits = defaultdict(int)
    for index, subreddit in enumerate(subreddits):
        is_first = index == 0
        commenters = countCommenters(subreddit)
        for commenter, commentCount in commenters.items():
            # In require_first_subreddit mode only users already seen in the
            # first subreddit are tracked for the remaining ones.
            if require_first_subreddit and not is_first and commenter not in commenterSubreddits:
                continue
            if commentCount >= min_comments_per_sub:
                commenterSubreddits[commenter] += 1

    if require_first_subreddit:
        # List only the *other* subreddits after "at least one of".
        summary = f"in r/{subreddits[0]} and at least one of r/{(', '.join(subreddits[1:]))}"
        matched = [commenter for commenter, countSubreddits in commenterSubreddits.items() if countSubreddits >= 2]
        with open(file_name, 'w') as txt:
            txt.write(f"Commenters {summary}\n")
            for commenter in matched:
                txt.write(f"{commenter}\n")
        print(f"{len(matched)} commenters {summary}")
        return

    totalSubreddits = len(subreddits)
    sharedCommenters = defaultdict(list)
    for commenter, countSubreddits in commenterSubreddits.items():
        if countSubreddits >= totalSubreddits - 2:
            sharedCommenters[countSubreddits].append(commenter)

    inAll = sharedCommenters[totalSubreddits]
    inAllButOne = sharedCommenters[totalSubreddits - 1]
    inAllButTwo = sharedCommenters[totalSubreddits - 2]
    commentersAll = len(inAll)
    commentersMinusOne = len(inAllButOne)
    commentersMinusTwo = len(inAllButTwo)
    print(f"{commentersAll} commenters in all subreddits, {commentersMinusOne} in all but one, {commentersMinusTwo} in all but 2. Writing output to {file_name}")

    with open(file_name, 'w') as txt:
        _writeUserGroup(txt, inAll, "all")
        # Only fall back to looser overlap groups when the stricter group is small.
        if commentersAll < 10 and totalSubreddits > 2:
            _writeUserGroup(txt, inAllButOne, "all but one")
            if commentersMinusOne < 10:
                _writeUserGroup(txt, inAllButTwo, "all but two")


if __name__ == "__main__":
    main()