import requests
from datetime import datetime
import traceback
import time
import json
import sys
import csv

# IMPORTANT: READ THIS FIRST
# The pushshift service that this script uses is only available to moderators. Go through
# this guide to get a token and update the token field below:
# https://api.pushshift.io/guide
token = ""

username = ""  # put the username you want to download in the quotes
subreddit = ""  # put the subreddit you want to download in the quotes
thread_id = ""  # put the id of the thread you want to download in the quotes; it's the first 5 to 7 character string of letters and numbers in the url, like 107xayi
# leave either one blank to download an entire user's or subreddit's history
# or fill in both to download a specific user's history from a specific subreddit

# change this to one of "human", "csv" or "json"
# - human: the score, creation date, author, link and then the comment/submission body on a second line. Objects are separated by lines of dashes
# - csv: a comma separated value file with the fields score, date, title, author, link and then body or url
# - json: the full json object
output_format = "human"

# the default start time is the current time and the default end time is all history
# you can swap in the commented-out lines below to set a custom start and end date. The
# script works backwards, so the end date has to be before the start date
start_time = datetime.utcnow()  # datetime.strptime("10/05/2021", "%m/%d/%Y")
end_time = None  # datetime.strptime("09/25/2021", "%m/%d/%Y")

convert_to_ascii = False  # don't touch this unless you know what you're doing
convert_thread_id_to_base_ten = True  # don't touch this unless you know what you're doing


def write_human_line(handle, obj, is_submission, convert_to_ascii):
    handle.write(str(obj['score']))
    handle.write(" : ")
    handle.write(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        handle.write(" : ")
        if convert_to_ascii:
            handle.write(obj['title'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['title'])
    handle.write(" : u/")
    handle.write(obj['author'])
    handle.write(" : ")
    handle.write(f"https://www.reddit.com{obj['permalink']}")
    handle.write("\n")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                if convert_to_ascii:
                    handle.write(obj['selftext'].encode(encoding='ascii', errors='ignore').decode())
                else:
                    handle.write(obj['selftext'])
        else:
            handle.write(obj['url'])
    else:
        if convert_to_ascii:
            handle.write(obj['body'].encode(encoding='ascii', errors='ignore').decode())
        else:
            handle.write(obj['body'])
    handle.write("\n-------------------------------\n")


def write_csv_line(writer, obj, is_submission):
    output_list = []
    output_list.append(str(obj['score']))
    output_list.append(datetime.fromtimestamp(obj['created_utc']).strftime("%Y-%m-%d"))
    if is_submission:
        output_list.append(obj['title'])
    output_list.append(f"u/{obj['author']}")
    output_list.append(f"https://www.reddit.com{obj['permalink']}")
    if is_submission:
        if obj['is_self']:
            if 'selftext' in obj:
                output_list.append(obj['selftext'])
            else:
                output_list.append("")
        else:
            output_list.append(obj['url'])
    else:
        output_list.append(obj['body'])
    writer.writerow(output_list)


def write_json_line(handle, obj):
    handle.write(json.dumps(obj))
    handle.write("\n")


# Pages backwards through pushshift results, starting at start_datetime and stopping
# once results pass end_datetime or run out entirely.
def download_from_url(filename, url_base, output_format, start_datetime, end_datetime, is_submission, convert_to_ascii):
    print(f"Saving to {filename}")

    count = 0
    writer = None
    if output_format == "human" or output_format == "json":
        if convert_to_ascii:
            handle = open(filename, 'w', encoding='ascii')
        else:
            handle = open(filename, 'w', encoding='UTF-8')
    else:
        handle = open(filename, 'w', encoding='UTF-8', newline='')
        writer = csv.writer(handle)

    previous_epoch = int(start_datetime.timestamp())
    break_out = False
    while True:
        new_url = url_base + str(previous_epoch)
        response = requests.get(new_url, headers={
            'User-Agent': "Post downloader by /u/Watchful1",
            'Authorization': f"Bearer {token}"})
        time.sleep(1)  # pushshift has a rate limit; if we send requests too fast it will start returning error messages
        try:
            json_data = response.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break

        for obj in objects:
            # step the cursor back past this object so the next request starts just before it
            previous_epoch = obj['created_utc'] - 1
            if end_datetime is not None and datetime.utcfromtimestamp(previous_epoch) < end_datetime:
                break_out = True
                break
            count += 1
            try:
                if output_format == "human":
                    write_human_line(handle, obj, is_submission, convert_to_ascii)
                elif output_format == "csv":
                    write_csv_line(writer, obj, is_submission)
                elif output_format == "json":
                    write_json_line(handle, obj)
            except Exception as err:
                if 'permalink' in obj:
                    print(f"Couldn't print object: https://www.reddit.com{obj['permalink']}")
                else:
                    print(f"Couldn't print object, missing permalink: {obj['id']}")
                print(err)
                print(traceback.format_exc())

        if break_out:
            break
        print(f"Saved {count} through {datetime.fromtimestamp(previous_epoch).strftime('%Y-%m-%d')}")

    print(f"Saved {count}")
    handle.close()


if __name__ == "__main__":
    if username == "" and subreddit == "" and thread_id == "":
        print("Fill in username, subreddit or thread id")
        sys.exit(0)

    if output_format not in ("human", "csv", "json"):
        print("Output format must be one of human, csv, json")
        sys.exit(0)

    filters = []
    if username:
        filters.append(f"author={username}")
    if subreddit:
        filters.append(f"subreddit={subreddit}")
    if thread_id:
        if convert_thread_id_to_base_ten:
            # reddit thread ids are base 36; this converts them to the base 10 form
            filters.append(f"link_id={int(thread_id, 36)}")
        else:
            filters.append(f"link_id=t3_{thread_id}")
    filter_string = '&'.join(filters)

    url_template = "https://api.pushshift.io/reddit/{}/search?limit=1000&order=desc&{}&before="

    # submissions are only fetched for user/subreddit downloads; a thread download gets just its comments
    if not thread_id:
        download_from_url("posts.txt", url_template.format("submission", filter_string), output_format, start_time, end_time, True, convert_to_ascii)
    download_from_url("comments.txt", url_template.format("comment", filter_string), output_format, start_time, end_time, False, convert_to_ascii)
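
# Usage note (a sketch, not part of the downloader itself): with output_format = "json",
# write_json_line emits one complete JSON object per line, so the results can be loaded
# back afterwards like this (assumes the default "comments.txt" filename):
#
#     import json
#
#     with open("comments.txt", encoding="UTF-8") as file_handle:
#         comments = [json.loads(line) for line in file_handle]
#     print(f"loaded {len(comments)} comments")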