"""Source code for `llms_txt` Python module, containing helpers to create and use llms.txt files"""

# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_core.ipynb.

# %% auto #0
__all__ = ['opt_re', 'named_re', 'search', 'parse_link', 'parse_llms_file', 'get_doc_content', 'mk_ctx', 'get_sizes',
           'create_ctx', 'llms_txt2ctx']

# %% ../nbs/01_core.ipynb #484b1085
import re

# %% ../nbs/01_core.ipynb #38bed7c7
from fastcore.utils import *
from fastcore.xml import *
from fastcore.script import *
import httpx
from urllib.parse import urlparse

# %% ../nbs/01_core.ipynb #2cbf4527
def opt_re(s):
    "Wrap pattern `s` in a non-capturing group that may match zero or one time"
    return '(?:{})?'.format(s)

def named_re(nm, pat):
    "Wrap pattern `pat` in a capture group named `nm`"
    return '(?P<{}>{})'.format(nm, pat)

def search(pat, txt, flags=0):
    "Named groups of the first match of `pat` in `txt` as a dict, or `None` when nothing matches"
    match = re.search(pat, txt, flags=flags)
    if match is None: return None
    return match.groupdict()

# %% ../nbs/01_core.ipynb #5e8cbd7d
def parse_link(txt):
    "Split a `- [title](url): desc` list item into a dict with `title`, `url` and `desc` keys"
    # The `: desc` tail is optional, so `desc` is `None` when the line has no description.
    tail = opt_re(r":\s*" + named_re('desc', r'.*'))
    label = named_re('title', r'[^\]]+')
    target = named_re('url', r'[^\)]+')
    link_pat = r'-\s*\[' + label + r'\]\(' + target + r'\)' + tail
    # `re.search` gives `None` for a line that is not a link, so the call below raises `AttributeError`.
    found = re.search(link_pat, txt)
    return found.groupdict()

# %% ../nbs/01_core.ipynb #23dee0c8
def _parse_links(links):
    "Run `parse_link` over every non-blank line of `links`, keeping the original order"
    parsed = []
    for line in re.split(r'\n+', links.strip()):
        # Whitespace-only lines survive the split on newline runs; skip them.
        if not line.strip(): continue
        parsed.append(parse_link(line))
    return parsed

# %% ../nbs/01_core.ipynb #60a080c3
def _parse_llms(txt):
    "Split `txt` at its `##` heading lines into (stripped leading text, dict of heading text -> parsed links)"
    # With one capture group, `re.split` returns the leading text followed by
    # alternating heading text and section body.
    head, *pieces = re.split(r'^##\s*(.*?$)', txt, flags=re.MULTILINE)
    bodies_by_name = dict(zip(pieces[::2], pieces[1::2]))
    parsed = {}
    for name, body in bodies_by_name.items():
        parsed[name] = _parse_links(body)
    return head.strip(), parsed

# %% ../nbs/01_core.ipynb #32b8641d
def parse_llms_file(txt):
    """Parse llms.txt file contents in `txt` to an `AttrDict`.

    The result has `title`, `summary`, `info` and `sections` entries. Only the
    `# Title` line is required: `summary` is `None` when no `> ...` blockquote
    follows the title, and `info` is `''` when no free text follows.
    `sections` maps each `##` heading to its list of parsed links.

    Raises `ValueError` when the text before the first `##` heading contains no
    `#` title line.
    """
    start,sects = _parse_llms(txt)
    title = named_re('title', r'.+?$')
    summ = named_re('summary', '.+?$')
    info = named_re('info', '.*')
    # Each trailing part carries its own leading newlines and is optional on its own,
    # so a file with only a title, or a title plus summary, still parses and a lone
    # blockquote is captured as the summary rather than as info.
    summ_pat = opt_re(fr"\n+^>\s*{summ}$")
    info_pat = opt_re(fr"\n+{info}")
    pat = fr'^#\s*{title}{summ_pat}{info_pat}'
    d = search(pat, start, (re.MULTILINE|re.DOTALL))
    if d is None:
        raise ValueError("llms.txt content must contain an H1 title line ('# Title') before any '##' section")
    # Keep `info` a string so it can always be used as a text child downstream.
    if d['info'] is None: d['info'] = ''
    d['sections'] = sects
    return dict2obj(d)

# %% ../nbs/01_core.ipynb #891efae3
from fastcore.xml import Sections,Project,Doc

# %% ../nbs/01_core.ipynb #39c2321a
def _local_docs_pth(path):
    "The `_proc` entry directly under `path`"
    proc_name = '_proc'
    return path / proc_name
def _get_config():
    "Result of `find_file_parents` for 'pyproject.toml'"
    # NOTE(review): presumably the project directory holding pyproject.toml, falsy if none is found -- confirm against fastcore.
    cfg_name = 'pyproject.toml'
    return find_file_parents(cfg_name)

def get_doc_content(url):
    """Return the text for `url`, preferring a local copy under the project's `_proc` directory.

    When `_get_config` finds a project, the path component of `url` is looked up
    relative to that project's `_proc` directory and, if it names a regular file,
    that file's text is returned. Otherwise the URL is fetched with `httpx.get`
    and the response text is returned.
    """
    if (path:=_get_config()):
        relative_path = urlparse(url).path.lstrip('/')
        local_path = _local_docs_pth(path) / relative_path
        # `is_file` rather than `exists`: a URL whose path is empty or names a directory
        # (e.g. `https://example.com/`) resolves to a directory, which `read_text` cannot read.
        if local_path.is_file(): return local_path.read_text()
    return httpx.get(url).text

# %% ../nbs/01_core.ipynb #7639dab2
def _doc(kw):
    """Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.

    `kw` is a mapping with a `url` entry; every other entry is passed to `Doc` as a
    keyword argument. `kw` itself is left unmodified. Lines of the fetched text that
    are a whole-line HTML comment or contain an `<img>` tag with a base64 `data:image`
    source are dropped.
    """
    # Pop from a copy so the caller's parsed link keeps its `url` and can be reused.
    kw = dict(kw)
    url = kw.pop('url')
    txt = get_doc_content(url)
    re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
    re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>')
    txt = '\n'.join([o for o in txt.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)])
    return Doc(txt, **kw)

# %% ../nbs/01_core.ipynb #3e0c2ff6
def _section(nm, items, n_workers=None):
    "Build an `nm` element whose children are the `_doc` result for each entry of `items`."
    # `_doc` is mapped over the items on a thread pool.
    docs = parallel(_doc, items, n_workers=n_workers, threadpool=True)
    return ft(nm, *docs)

# %% ../nbs/01_core.ipynb #c0541f10
def mk_ctx(d, optional=True, n_workers=None):
    "Build a `Project` from `d.info` plus one section per entry of `d.sections`, dropping the 'Optional' section when `optional` is falsy."
    # When `optional` is truthy the excluded name is '', so only an empty-named section is skipped.
    excluded = 'Optional' if not optional else ''
    built = []
    for name, links in d.sections.items():
        if name == excluded: continue
        built.append(_section(name, links, n_workers=n_workers))
    project = Project(title=d.title, summary=d.summary)
    return project(d.info, *built)

# %% ../nbs/01_core.ipynb #52143a58
def get_sizes(ctx):
    "Map each tagged child of `ctx` to a dict of its children's `title` -> length of that child's first child"
    sizes = {}
    for section in ctx.children:
        # Children without a `tag` attribute (e.g. plain text) are not sections.
        if not hasattr(section, 'tag'): continue
        tag = section.tag
        per_doc = {}
        for doc in section.children:
            per_doc[doc.title] = len(doc.children[0])
        sizes[tag] = per_doc
    return sizes

# %% ../nbs/01_core.ipynb #fd3e7ce4
def create_ctx(txt, optional=False, n_workers=None):
    "Parse llms.txt contents `txt`, build its `Project` with `mk_ctx`, and return the `to_xml` rendering with escaping turned off."
    return to_xml(
        mk_ctx(parse_llms_file(txt), optional=optional, n_workers=n_workers),
        do_escape=False,
    )

# %% ../nbs/01_core.ipynb #d636436d
@call_parse
def llms_txt2ctx(
    fname:str, # File name to read
    optional:bool_arg=False, # Include 'optional' section?
    n_workers:int=None, # Number of threads to use for parallel downloading
    save_nbdev_fname:str=None #save output to nbdev `{docs_path}` instead of emitting to stdout
):
    "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
    source = Path(fname).read_text()
    rendered = create_ctx(source, optional=optional, n_workers=n_workers)
    # The project is only looked up when a save name was given; without a name,
    # or without a project, the output goes to stdout instead.
    cfg = _get_config() if save_nbdev_fname else None
    if cfg:
        target = _local_docs_pth(cfg) / save_nbdev_fname
        target.mk_write(rendered)
    else:
        print(rendered)