#!/usr/bin/env python3
# Usage:
# `tmuxnewsh2 r_rational indirm ~dl/subreddits/rational subreddit2org.py rational 1000000000`
# `tmuxnewsh2 r_HPfanfiction indirm ~dl/subreddits/HPfanfiction subreddit2org.py HPfanfiction 1000000000`
#
# Old usage:
# `reddit_sub_posts_urls_pushshift.py rational 100000000 > rational.txt`
#
# Docs:
# - https://github.com/praw-dev/praw
# - https://github.com/dmarx/psaw
##
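# `embed` is imported for the interactive post-mortem shell used at the bottom when processing a post fails.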
from IPython import embed

import sys
import os
import traceback
import time

link_output = False
limit = 10
subreddit = "rational"

al = len(sys.argv)
if al >= 2:
    subreddit = sys.argv[1]
if al >= 3:
    limit = int(sys.argv[2])

from psaw import PushshiftAPI
import json

if link_output:
    api = PushshiftAPI()
else:
    import praw
    from praw.exceptions import DuplicateReplaceException
    from praw.models.reddit.more import MoreComments

    r = praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
        # password=os.environ["REDDIT_PASSWORD"],
        # username=os.environ["REDDIT_USERNAME"],
    )

    api = PushshiftAPI(r)
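    # Since the praw instance is passed in above, psaw yields full praw Submission objects
    # (fetched from Reddit by ID), so `.comments`, `.selftext_html`, etc. are available below.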

print(f"Getting the submissions from the subreddit {subreddit} (limit={limit}) ...\n", file=sys.stderr, flush=True)
gen = api.search_submissions(subreddit=subreddit, limit=limit)
results = list(gen)

##
# embed()
# print(json.dumps(results))
##


if link_output:
    for result in results:
        print(result.permalink)
else:
    # @todo3 refactor this into a standalone reddit2org (we need a way to get the submission object from the URL)
    ##
    from brish import z, zp, bsh

    def stars(lv):
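        """Return an org-mode heading prefix of the given depth, e.g., stars(2) == '** '."""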
        return "*" * lv + " "

    def html2org(html):
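        """Convert an HTML fragment to org-mode markup by piping it through the external `html2org` command (via brish and a temp file)."""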
        # tmp = "tmp.html"
        tmp = z("mktemp").outrs

        res = z("cat > {tmp}", cmd_stdin=html)
        assert res

        res = z("html2org {tmp}")
        assert res

        z("command rm {tmp}")

        return res.outrs

    # def meta_get(c):
    #     meta = ""
    #     meta+=f"{c.author.name}"
    #     if hasattr(c, "score"):
    #         meta+=f" ({c.score})"

    #     return meta

    def meta_get_props(c):
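        """Build an org-mode :PROPERTIES: drawer (author, score, date, flair) for a submission or comment."""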
        meta = ":PROPERTIES:"

        if c.author: # can be None when they are deleted
            meta+=f"\n:Author: {c.author.name}"

        if hasattr(c, "score"):
            meta+=f"\n:Score: {c.score}"

        if hasattr(c, "created_utc") and c.created_utc:
            meta+=f"\n:DateUnix: {c.created_utc}"
            res = z('gdate -d "@"{c.created_utc} +"%Y-%b-%d"')
            if res:
                meta+=f"\n:DateShort: {res.outrs}"


        if getattr(c, "link_flair_text", None):
            meta+=f"\n:FlairText: {c.link_flair_text}"

        meta+="\n:END:\n"
        return meta
        
    def process_comment(f, comments, lv, shortname):
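        """Recursively write the comment forest `comments` to `f` as org headings at depth `lv`, also saving each comment to its own file under indices/<shortname>/."""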
        while True:
            try:
                comments.replace_more(limit=None)
                break
            except DuplicateReplaceException:
                print(traceback.format_exc())
                break
            except Exception:
                print(traceback.format_exc())
                time.sleep(1)

        l = len(comments) - 1

        shortname_orig = shortname

        for i, c in enumerate(comments):
            lv_c = lv
            shortname = shortname_orig

            # if isinstance(c, MoreComments):
            #     pass
            ##
            # meta = meta_get(c)
            # meta+=": "
            ##
            meta = meta_get_props(c)
            # Per the Org manual ([[https://orgmode.org/manual/Drawers.html#Drawers][Drawers]]), properties are key--value pairs; when associated with a single entry or a tree, they must be placed in a special drawer named :PROPERTIES: located right below the headline (and its planning line, see [[https://orgmode.org/manual/Deadlines-and-Scheduling.html#Deadlines-and-Scheduling][Deadlines and Scheduling]], when applicable).
            #
            # Still, putting the props right after the heading is not ideal; we could rename our drawer to :METADATA:, but why bother?
            ##

            head = "EMPTY_COMMENT"

            # @todo3 Using comment IDs creates overly long paths. It would be better to just use a counter that goes from 0 to N.
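            # Fall back to a short random ID (via `uuidm`) if the comment somehow has no ID.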
            c_id_old = c.id or z("uuidm").outrs[0:6]

            ##
            if lv_c <= 4:
                c_id = f"{i}_{c_id_old}"
            else:
                c_id = i
                # Using the index across different runs is unreliable, as the comment ordering can change. Since we use the comments' IDs as their filenames, this won't result in data loss, but it can cause data duplication and a flawed comment hierarchy.
                # Workarounds:
                # - delete the indices directory and re-run the whole scraping from scratch on every update
                # - @done use the first-n-level comments' IDs as well
            ##

            shortname += f"/{c_id}"

            if c.body_html:
                head = (html2org(c.body_html) or "EMPTY_COMMENT")
                index_file = f'indices/{shortname}/{c_id_old}.org'
                z("ensure-dir {index_file}")
                with open(index_file, "w") as f2:
                    f2.write(f"{meta}\n{head}")

                if head.startswith("#+"):
                    # do not put blocks in headings (e.g., #+begin_quote)

                    author = "deleted"
                    if c.author:
                        author = c.author.name or author

                    head = f"u/{author}:\n{head}"

                head = head or "_"  # empty headlines are invalid in org-mode

            f.write("\n" + stars(lv_c) + head + "\n" + meta)
            lv_c += 1
            process_comment(f, c.replies, lv_c, shortname)
            if l != i:
                f.write("\n")

    def utf8len(s):
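        """Return the length of `s` in UTF-8 bytes (filesystem name limits are in bytes, not characters)."""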
        return len(s.encode('utf-8'))

    for result in results:
        try:
            # embed() ; exit()

            # if not "looking at this sub" in result.title:
            #     continue

            f_name = z("ecn {result.title} | str2filename").outrs
            f_name = f_name[0:230]
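            # Shrink further if the UTF-8 byte length still exceeds the limit (multi-byte titles).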
            # [[id:a36bb01f-9b9b-40c9-816a-c762281c43c3][filesystem/filenames.org:maximum allowed length for filenames and paths]]
            if utf8len(f_name) > 240:
                f_name = f_name[0:100]

            if utf8len(f_name) > 240:
                f_name = f_name[0:60]

            shortname = f"{f_name}.{result.id}"
            f_name = f"posts/{shortname}.org"
            z("ensure-dir {f_name}")
            with open(f_name, "w") as f:
                lv = 1
                f.write(f"#+TITLE: {result.title}\n\n")


                if getattr(result, "url_overridden_by_dest", None):
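                    # Link posts carry their external URL in `url_overridden_by_dest`; use it as the heading's link target.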
                    f.write(f"{stars(lv)}[[{result.url_overridden_by_dest}][{result.title}]]\n")
                    lv += 1
                else:
                    f.write(f"{stars(lv)}{result.title}\n")
                    lv += 1

                meta = meta_get_props(result)
                f.write(meta)

                if result.selftext_html:
                    f.write(html2org(result.selftext_html) + "\n\n")

                process_comment(f, result.comments, lv, shortname)
            print(f"wrote {f_name}\n", file=sys.stderr, flush=True)

        except Exception:
            print(traceback.format_exc())
            embed()