{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "from local.torch_basics import *\n",
    "from local.test import *\n",
    "from local.core import *\n",
    "from local.data.all import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from local.notebook.showdoc import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#default_exp text.core\n",
    "#default_cls_lvl 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text core\n",
    "\n",
    "> Basic function to preprocess text before assembling it in a `DataBunch`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export \n",
    "import spacy,html\n",
    "from spacy.symbols import ORTH"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing rules"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following are rules applied to texts before or after it's tokenized."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "#special tokens\n",
    "UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ = \"xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj\".split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_all_ = [\"UNK\", \"PAD\", \"BOS\", \"EOS\", \"FLD\", \"TK_REP\", \"TK_WREP\", \"TK_UP\", \"TK_MAJ\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_spec = re.compile(r'([/#\\\\])')\n",
    "\n",
    "def spec_add_spaces(t):\n",
    "    \"Add spaces around / and #\"\n",
    "    return _re_spec.sub(r' \\1 ', t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(spec_add_spaces('#fastai'), ' # fastai')\n",
    "test_eq(spec_add_spaces('/fastai'), ' / fastai')\n",
    "test_eq(spec_add_spaces('\\\\fastai'), ' \\\\ fastai')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_space = re.compile(' {2,}')\n",
    "\n",
    "def rm_useless_spaces(t):\n",
    "    \"Remove multiple spaces\"\n",
    "    return _re_space.sub(' ', t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(rm_useless_spaces('a  b   c'), 'a b c')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_rep = re.compile(r'(\\S)(\\1{2,})')\n",
    "\n",
    "def replace_rep(t):\n",
    "    \"Replace repetitions at the character level: cccc -- TK_REP 4 c\"\n",
    "    def _replace_rep(m):\n",
    "        c,cc = m.groups()\n",
    "        return f' {TK_REP} {len(cc)+1} {c} '\n",
    "    return _re_rep.sub(_replace_rep, t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It starts replacing at 3 repetitions of the same character or more."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(replace_rep('aa'), 'aa')\n",
    "test_eq(replace_rep('aaaa'), f' {TK_REP} 4 a ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_wrep = re.compile(r'(?:\\s|^)(\\w+)\\s+((?:\\1\\s+)+)\\1(\\s|\\W|$)')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide\n",
    "\"\"\"\n",
    "Matches any word repeated at least four times with spaces between them\n",
    "(?:\\s|^)       Non-Capture either a whitespace character or the beginning of text\n",
    "(\\w+)          Capture any alphanumeric character\n",
    "\\s+            One or more whitespace\n",
    "((?:\\1\\s+)+)   Capture a repetition of one or more times \\1 followed by one or more whitespace\n",
    "\\1             Occurence of \\1\n",
    "(\\s|\\W|$)      Capture last whitespace, non alphanumeric character or end of text\n",
    "\"\"\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def replace_wrep(t):\n",
    "    \"Replace word repetitions: word word word word -- TK_WREP 4 word\"\n",
    "    def _replace_wrep(m):\n",
    "        c,cc,e = m.groups()\n",
    "        return f' {TK_WREP} {len(cc.split())+2} {c} {e}'\n",
    "    return _re_wrep.sub(_replace_wrep, t)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It starts replacing at 3 repetitions of the same word or more."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(replace_wrep('ah ah'), 'ah ah')\n",
    "test_eq(replace_wrep('ah ah ah'), f' {TK_WREP} 3 ah ')\n",
    "test_eq(replace_wrep('ah ah   ah  ah'), f' {TK_WREP} 4 ah ')\n",
    "test_eq(replace_wrep('ah ah ah ah '), f' {TK_WREP} 4 ah  ')\n",
    "test_eq(replace_wrep('ah ah ah ah.'), f' {TK_WREP} 4 ah .')\n",
    "test_eq(replace_wrep('ah ah ahi'), f'ah ah ahi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def fix_html(x):\n",
    "    \"Various messy things we've seen in documents\"\n",
    "    x = x.replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace('nbsp;', ' ').replace(\n",
    "        '#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace('<br />', \"\\n\").replace(\n",
    "        '\\\\\"', '\"').replace('<unk>',UNK).replace(' @.@ ','.').replace(' @-@ ','-').replace('...',' …')\n",
    "    return html.unescape(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(fix_html('#39;bli#146;'), \"'bli'\")\n",
    "test_eq(fix_html('Sarah amp; Duck...'), 'Sarah & Duck …')\n",
    "test_eq(fix_html('a nbsp; #36;'), 'a   $')\n",
    "test_eq(fix_html('\\\\\" <unk>'), f'\" {UNK}')\n",
    "test_eq(fix_html('quot;  @.@  @-@ '), \"' .-\")\n",
    "test_eq(fix_html('<br />text\\\\n'), '\\ntext\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_all_caps = re.compile(r'(\\s|^)([A-Z]+[^a-z\\s]*)(?=(\\s|$))')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide\n",
    "\"\"\"\n",
    "Catches any word in all caps, even with ' or - inside\n",
    "(\\s|^)        Capture either a whitespace or the beginning of text\n",
    "([A-Z]+       Capture one capitalized letter or more...\n",
    "[^a-z\\s]*)    ...followed by anything that's non lowercase or whitespace\n",
    "(?=(\\s|$))    Look ahead for a space or end of text\n",
    "\"\"\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def replace_all_caps(t):\n",
    "    \"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before.\"\n",
    "    def _replace_all_caps(m):\n",
    "        tok = f'{TK_UP} ' if len(m.groups()[1]) > 1 else ''\n",
    "        return f\"{m.groups()[0]}{tok}{m.groups()[1].lower()}\"\n",
    "    return _re_all_caps.sub(_replace_all_caps, t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(replace_all_caps(\"I'M SHOUTING\"), f\"{TK_UP} i'm {TK_UP} shouting\")\n",
    "test_eq(replace_all_caps(\"I'm speaking normally\"), \"I'm speaking normally\")\n",
    "test_eq(replace_all_caps(\"I am speaking normally\"), \"i am speaking normally\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "_re_maj = re.compile(r'(\\s|^)([A-Z][^A-Z\\s]*)(?=(\\s|$))')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide\n",
    "\"\"\"\n",
    "Catches any capitalized word\n",
    "(\\s|^)       Capture either a whitespace or the beginning of text\n",
    "([A-Z]       Capture exactly one capitalized letter...\n",
    "[^A-Z\\s]*)   ...followed by anything that's not uppercase or whitespace\n",
    "(?=(\\s|$))   Look ahead for a space of end of text\n",
    "\"\"\";"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def replace_maj(t):\n",
    "    \"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before.\"\n",
    "    def _replace_maj(m):\n",
    "        tok = f'{TK_MAJ} ' if len(m.groups()[1]) > 1 else ''\n",
    "        return f\"{m.groups()[0]}{tok}{m.groups()[1].lower()}\"\n",
    "    return _re_maj.sub(_replace_maj, t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(replace_maj(\"Jeremy Howard\"), f'{TK_MAJ} jeremy {TK_MAJ} howard')\n",
    "test_eq(replace_maj(\"I don't think there is any maj here\"), (\"i don't think there is any maj here\"),)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def lowercase(t, add_bos=True, add_eos=False):\n",
    "    \"Converts `t` to lowercase\"\n",
    "    return (f'{BOS} ' if add_bos else '') + t.lower().strip() + (f' {EOS}' if add_eos else '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def replace_space(t):\n",
    "    \"Replace embedded spaces in a token with unicode line char to allow for split/join\"\n",
    "    return t.replace(' ', '▁')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]\n",
    "defaults.text_proc_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces,\n",
    "                            replace_all_caps, replace_maj, lowercase]\n",
    "defaults.text_postproc_rules = [replace_space]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenizing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A tokenizer is a class that must implement a `pipe` method. This `pipe` method receives a generator of texts and must return a generator with their tokenized versions. Here is the most basic example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class BaseTokenizer():\n",
    "    \"Basic tokenizer that just splits on spaces\"\n",
    "    def __init__(self, split_char=' ', **kwargs): self.split_char=split_char\n",
    "    def __call__(self, items): return (t.split(self.split_char) for t in items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = BaseTokenizer()\n",
    "for t in tok([\"This is a text\"]): test_eq(t, [\"This\", \"is\", \"a\", \"text\"])\n",
    "tok = BaseTokenizer('x')\n",
    "for t in tok([\"This is a text\"]): test_eq(t, [\"This is a te\", \"t\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class SpacyTokenizer():\n",
    "    \"Spacy tokenizer for `lang`\"\n",
    "    def __init__(self, lang='en', special_toks=None, buf_sz=5000):\n",
    "        special_toks = ifnone(special_toks, defaults.text_spec_tok)\n",
    "        nlp = spacy.blank(lang, disable=[\"parser\", \"tagger\", \"ner\"])\n",
    "        for w in special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])\n",
    "        self.pipe,self.buf_sz = nlp.pipe,buf_sz\n",
    "\n",
    "    def __call__(self, items):\n",
    "        return (L(doc).attrgot('text') for doc in self.pipe(items, batch_size=self.buf_sz))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tok = SpacyTokenizer()\n",
    "inp,exp = \"This isn't the easiest text.\",[\"This\", \"is\", \"n't\", \"the\", \"easiest\", \"text\", \".\"]\n",
    "test_eq(L(tok([inp]*5)), [exp]*5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class TokenizeBatch:\n",
    "    \"A wrapper around `tok_func` to apply `rules` and tokenize in parallel\"\n",
    "    def __init__(self, tok_func=SpacyTokenizer, rules=None, post_rules=None, **tok_kwargs ):\n",
    "        self.rules = L(ifnone(rules, defaults.text_proc_rules))\n",
    "        self.post_f = compose(*L(ifnone(post_rules, defaults.text_postproc_rules)))\n",
    "        self.tok = tok_func(**tok_kwargs)\n",
    "\n",
    "    def __call__(self, batch):\n",
    "        return (L(o).map(self.post_f) for o in self.tok(maps(*self.rules, batch)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = TokenizeBatch()\n",
    "test_eq(f([\"This isn't a problem\"]), [[BOS, TK_MAJ, 'this', 'is', \"n't\", 'a', 'problem']])\n",
    "f = TokenizeBatch(BaseTokenizer, rules=[], split_char=\"'\")\n",
    "test_eq(f([\"This isn't a problem\"]), [['This▁isn', 't▁a▁problem']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The main function that will be called during one of the processes handling tokenization. It will create an instance of a tokenizer with `tok_func` and `tok_kwargs` at init, then iterate through the `batch` of texts, apply them `rules` and tokenize them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = [\"this is a text\", \"this is another text\"]\n",
    "tok = TokenizeBatch(BaseTokenizer, texts.__getitem__)\n",
    "test_eq([t for t in tok([0,1])],[['this', 'is', 'a', 'text'], ['this', 'is', 'another', 'text']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def tokenize1(text, tok_func=SpacyTokenizer, rules=None, post_rules=None, **tok_kwargs):\n",
    "    \"Tokenize one `text` with an instance of `tok_func` and some `rules`\"\n",
    "    return first(TokenizeBatch(tok_func, rules, post_rules, **tok_kwargs)([text]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_eq(tokenize1(\"This isn't a problem\"),\n",
    "        [BOS, TK_MAJ, 'this', 'is', \"n't\", 'a', 'problem'])\n",
    "test_eq(tokenize1(\"This isn't a problem\", BaseTokenizer, rules=[], split_char=\"'\"),\n",
    "        ['This▁isn', 't▁a▁problem'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def parallel_tokenize(items, tok_func, rules, as_gen=False, n_workers=defaults.cpus, **tok_kwargs):\n",
    "    \"Calls a potential setup on `tok_func` before launching `TokenizeBatch` in parallel\"\n",
    "    if hasattr(tok_func, 'setup'): tok_kwargs = tok_func(**tok_kwargs).setup(items, rules)\n",
    "    return parallel_gen(TokenizeBatch, items, as_gen=as_gen, tok_func=tok_func,\n",
    "                        rules=rules, n_workers=n_workers, **tok_kwargs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenize texts in files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Preprocessing function for texts in filenames. Tokenized texts will be saved in a similar fashion in a directory suffixed with `_tok` in the parent folder of `path` (override with `output_dir`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "fn_counter_pkl = 'counter.pkl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def tokenize_folder(path, extensions=None, folders=None, output_dir=None, n_workers=defaults.cpus,\n",
    "                    rules=None, tok_func=SpacyTokenizer, encoding='utf8', **tok_kwargs):\n",
    "    \"Tokenize text files in `path` in parallel using `n_workers`\"\n",
    "    path,extensions = Path(path),ifnone(extensions, ['.txt'])\n",
    "    fnames = get_files(path, extensions=extensions, recurse=True, folders=folders)\n",
    "    output_dir = Path(ifnone(output_dir, path.parent/f'{path.name}_tok'))\n",
    "    rules = partial(Path.read, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))\n",
    "\n",
    "    counter = Counter()\n",
    "    for i,tok in parallel_tokenize(fnames, tok_func, rules, as_gen=True, n_workers=n_workers, **tok_kwargs):\n",
    "        out = output_dir/fnames[i].relative_to(path)\n",
    "        out.write(' '.join(tok))\n",
    "        counter.update(tok)\n",
    "\n",
    "    (output_dir/fn_counter_pkl).save(counter)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The result will be in `output_dir` (defaults to a folder in the same parent directory as `path`, with `_tok` added to `path.name`) with the same structure as in `path`. Tokenized texts for a given file will be in the file having the same name in `output_dir`. Additionally, a file with a .len suffix contains the number of tokens and the count of all words is stored in `output_dir/counter.pkl`.\n",
    "\n",
    "`extensions` will default to `['.txt']` and all text files in `path` are treated unless you specify a list of folders in `include`. `tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenize texts in a dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def _join_texts(df, mark_fields=False):\n",
    "    \"Join texts in row `idx` of `df`, marking each field with `FLD` if `mark_fields=True`\"\n",
    "    text_col = (f'{FLD} {1} ' if mark_fields else '' ) + df.iloc[:,0].astype(str)\n",
    "    for i in range(1,len(df.columns)):\n",
    "        text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df.iloc[:,i].astype(str)\n",
    "    return text_col.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#hide\n",
    "texts = [f\"This is an example of text {i}\" for i in range(10)]\n",
    "df = pd.DataFrame({'text': texts, 'text1': texts}, columns=['text', 'text1'])\n",
    "col = _join_texts(df, mark_fields=True)    \n",
    "\n",
    "for i in range(len(df)):\n",
    "    test_eq(col[i], f'{FLD} 1 This is an example of text {i} {FLD} 2 This is an example of text {i}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def tokenize_df(df, text_cols, n_workers=defaults.cpus, rules=None, mark_fields=None,\n",
    "                tok_func=SpacyTokenizer, **tok_kwargs):\n",
    "    \"Tokenize texts in `df[text_cols]` in parallel using `n_workers`\"\n",
    "    text_cols = L(text_cols)\n",
    "    #mark_fields defaults to False if there is one column of texts, True if there are multiple\n",
    "    if mark_fields is None: mark_fields = len(text_cols)>1\n",
    "    rules = L(ifnone(rules, defaults.text_proc_rules.copy()))\n",
    "    texts = _join_texts(df[text_cols], mark_fields=mark_fields)\n",
    "    outputs = L(parallel_tokenize(texts, tok_func, rules, n_workers=n_workers, **tok_kwargs)\n",
    "               ).sorted().itemgot(1)\n",
    "\n",
    "    other_cols = df.columns[~df.columns.isin(text_cols)]\n",
    "    res = df[other_cols].copy()\n",
    "    res['text'] = outputs\n",
    "    return res,Counter(outputs.concat())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This function returns a new dataframe with the same non-text columns, a colum named text that contains the tokenized texts and a column named text_lengths that contains their respective length. It also returns a counter of all words see to quickly build a vocabulary afterward.\n",
    "\n",
    "`tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markes followed by the number of the field."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def tokenize_csv(fname, text_cols, outname=None, n_workers=4, rules=None, mark_fields=None,\n",
    "                 tok_func=SpacyTokenizer, header='infer', chunksize=50000, **tok_kwargs):\n",
    "    \"Tokenize texts in the `text_cols` of the csv `fname` in parallel using `n_workers`\"\n",
    "    df = pd.read_csv(fname, header=header, chunksize=chunksize)\n",
    "    outname = Path(ifnone(outname, fname.parent/f'{fname.stem}_tok.csv'))\n",
    "    cnt = Counter()\n",
    "\n",
    "    for i,dfp in enumerate(df):\n",
    "        out,c = tokenize_df(dfp, text_cols, n_workers=n_workers, rules=rules,\n",
    "                            mark_fields=mark_fields, tok_func=tok_func, **tok_kwargs)\n",
    "        out.text = out.text.str.join(' ')\n",
    "        out.to_csv(outname, header=(None,header)[i==0], index=False, mode=('a','w')[i==0])\n",
    "        cnt.update(c)\n",
    "\n",
    "    outname.with_suffix('.pkl').save(cnt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def load_tokenized_csv(fname):\n",
    "    \"Utility function to quickly load a tokenized csv ans the corresponding counter\"\n",
    "    fname = Path(fname)\n",
    "    out = pd.read_csv(fname)\n",
    "    for txt_col in out.columns[1:-1]:\n",
    "        out[txt_col] = out[txt_col].str.split(' ')\n",
    "    return out,fname.with_suffix('.pkl').load()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The result will be written in a new csv file in `outname` (defaults to the same as `fname` with the suffix `_tok.csv`) and will have the same header as the original file, the same non-text columns, a text and a text_lengths column as described in `tokenize_df`.\n",
    "\n",
    "`tok_func` is instantiated in each process with `tok_kwargs`, and `rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markes followed by the number of the field.\n",
    "\n",
    "The csv file is opened with `header` and optionally with blocks of `chunksize` at a time. If this argument is passed, each chunk is processed independtly and saved in the output file to save memory usage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _prepare_texts(tmp_d):\n",
    "    \"Prepare texts in a folder struct in tmp_d, a csv file and returns a dataframe\"\n",
    "    path = Path(tmp_d)/'tmp'\n",
    "    path.mkdir()\n",
    "    for d in ['a', 'b', 'c']: \n",
    "        (path/d).mkdir()\n",
    "        for i in range(5):\n",
    "            with open(path/d/f'text{i}.txt', 'w') as f: f.write(f\"This is an example of text {d} {i}\")\n",
    "    \n",
    "    texts = [f\"This is an example of text {d} {i}\" for i in range(5) for d in ['a', 'b', 'c']]\n",
    "    df = pd.DataFrame({'text': texts, 'label': list(range(15))}, columns=['text', 'label'])\n",
    "    csv_fname = tmp_d/'input.csv'\n",
    "    df.to_csv(csv_fname, index=False)\n",
    "    return path,df,csv_fname"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with tempfile.TemporaryDirectory() as tmp_d:\n",
    "    path,df,csv_fname = _prepare_texts(Path(tmp_d))\n",
    "    #Tokenize as folders\n",
    "    tokenize_folder(path)\n",
    "    outp = Path(tmp_d)/'tmp_tok'\n",
    "    for d in ['a', 'b', 'c']: \n",
    "        p = outp/d\n",
    "        for i in range(5):\n",
    "            test_eq((p/f'text{i}.txt').read(), ' '.join([\n",
    "                BOS, TK_MAJ, 'this', 'is', 'an', 'example', 'of', 'text', d, str(i) ]))\n",
    "    cnt_a = (outp/fn_counter_pkl).load()\n",
    "    test_eq(cnt_a['this'], 15)\n",
    "    test_eq(cnt_a['a'], 5)\n",
    "    test_eq(cnt_a['0'], 3)\n",
    "    \n",
    "    #Tokenize as a dataframe\n",
    "    out,cnt_b = tokenize_df(df, text_cols='text')\n",
    "    test_eq(list(out.columns), ['label', 'text'])\n",
    "    test_eq(out['label'].values, df['label'].values)\n",
    "    test_eq(out['text'], [(outp/d/f'text{i}.txt').read().split(' ') for i in range(5) for d in ['a', 'b', 'c']])\n",
    "    test_eq(cnt_a, cnt_b)\n",
    "    \n",
    "    #Tokenize as a csv \n",
    "    out_fname = Path(tmp_d)/'output.csv'\n",
    "    tokenize_csv(csv_fname, text_cols='text', outname=out_fname)\n",
    "    test_eq((out,cnt_b), load_tokenized_csv(out_fname))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sentencepiece"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eu_langs = [\"bg\", \"cs\", \"da\", \"de\", \"el\", \"en\", \"es\", \"et\", \"fi\", \"fr\", \"ga\", \"hr\", \"hu\",\n",
    "            \"it\",\"lt\",\"lv\",\"mt\",\"nl\",\"pl\",\"pt\",\"ro\",\"sk\",\"sl\",\"sv\"] # all European langs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp\n",
    "    \"Spacy tokenizer for `lang`\"\n",
    "    def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,\n",
    "                 model_type='unigram', char_coverage=None, cache_dir='tmp'):\n",
    "        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor\n",
    "        except ImportError:\n",
    "            raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')\n",
    "        self.sp_model,self.cache_dir = sp_model,Path(cache_dir)\n",
    "        self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type\n",
    "        self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)\n",
    "        self.special_toks = ifnone(special_toks, defaults.text_spec_tok)\n",
    "        if sp_model is None: self.tok = None\n",
    "        else:\n",
    "            self.tok = SentencePieceProcessor()\n",
    "            self.tok.Load(str(sp_model))\n",
    "        os.makedirs(self.cache_dir, exist_ok=True)\n",
    "\n",
    "    def _get_vocab_sz(self, raw_text_path):\n",
    "        cnt = Counter()\n",
    "        with open(raw_text_path, 'r') as f:\n",
    "            for line in f.readlines():\n",
    "                cnt.update(line.split())\n",
    "                if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz\n",
    "        res = len(cnt)//4\n",
    "        while res%8 != 0: res+=1\n",
    "        return res\n",
    "\n",
    "    def train(self, raw_text_path):\n",
    "        \"Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`\"\n",
    "        from sentencepiece import SentencePieceTrainer\n",
    "        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz\n",
    "        spec_tokens = ['\\u2581'+s for s in self.special_toks]\n",
    "        SentencePieceTrainer.Train(\" \".join([\n",
    "            f\"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}\",\n",
    "            f\"--character_coverage={self.char_coverage} --model_type={self.model_type}\",\n",
    "            f\"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1\",\n",
    "            f\"--user_defined_symbols={','.join(spec_tokens)}\"]))\n",
    "        raw_text_path.unlink()\n",
    "        return self.cache_dir/'spm.model'\n",
    "\n",
    "    def setup(self, items, rules):\n",
    "        if self.tok is not None: return {'sp_model': self.sp_model}\n",
    "        raw_text_path = self.cache_dir/'texts.out'\n",
    "        with open(raw_text_path, 'w') as f:\n",
    "            for t in progress_bar(maps(*rules, items), total=len(items), leave=False):\n",
    "                f.write(f'{t}\\n')\n",
    "        return {'sp_model': self.train(raw_text_path)}\n",
    "\n",
    "    def __call__(self, items):\n",
    "        for t in items: yield self.tok.EncodeAsPieces(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = [f\"This is an example of text {i}\" for i in range(10)]\n",
    "df = pd.DataFrame({'text': texts, 'label': list(range(10))}, columns=['text', 'label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "out,cnt = tokenize_df(df, text_cols='text', tok_func=SentencePieceTokenizer, vocab_sz=34)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Export -"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted 00_test.ipynb.\n",
      "Converted 01_core_foundation.ipynb.\n",
      "Converted 01a_core_utils.ipynb.\n",
      "Converted 01b_core_dispatch.ipynb.\n",
      "Converted 01c_core_transform.ipynb.\n",
      "Converted 02_core_script.ipynb.\n",
      "Converted 03_torchcore.ipynb.\n",
      "Converted 03a_layers.ipynb.\n",
      "Converted 04_data_load.ipynb.\n",
      "Converted 05_data_core.ipynb.\n",
      "Converted 06_data_transforms.ipynb.\n",
      "Converted 07_data_block.ipynb.\n",
      "Converted 08_vision_core.ipynb.\n",
      "Converted 09_vision_augment.ipynb.\n",
      "Converted 09a_vision_data.ipynb.\n",
      "Converted 10_pets_tutorial.ipynb.\n",
      "Converted 11_vision_models_xresnet.ipynb.\n",
      "Converted 12_optimizer.ipynb.\n",
      "Converted 13_learner.ipynb.\n",
      "Converted 13a_metrics.ipynb.\n",
      "Converted 14_callback_schedule.ipynb.\n",
      "Converted 14a_callback_data.ipynb.\n",
      "Converted 15_callback_hook.ipynb.\n",
      "Converted 15a_vision_models_unet.ipynb.\n",
      "Converted 16_callback_progress.ipynb.\n",
      "Converted 17_callback_tracker.ipynb.\n",
      "Converted 18_callback_fp16.ipynb.\n",
      "Converted 19_callback_mixup.ipynb.\n",
      "Converted 20_interpret.ipynb.\n",
      "Converted 20a_distributed.ipynb.\n",
      "Converted 21_vision_learner.ipynb.\n",
      "Converted 22_tutorial_imagenette.ipynb.\n",
      "Converted 23_tutorial_transfer_learning.ipynb.\n",
      "Converted 30_text_core.ipynb.\n",
      "Converted 31_text_data.ipynb.\n",
      "Converted 32_text_models_awdlstm.ipynb.\n",
      "Converted 33_text_models_core.ipynb.\n",
      "Converted 34_callback_rnn.ipynb.\n",
      "Converted 35_tutorial_wikitext.ipynb.\n",
      "Converted 36_text_models_qrnn.ipynb.\n",
      "Converted 37_text_learner.ipynb.\n",
      "Converted 38_tutorial_ulmfit.ipynb.\n",
      "Converted 40_tabular_core.ipynb.\n",
      "Converted 41_tabular_model.ipynb.\n",
      "Converted 42_tabular_rapids.ipynb.\n",
      "Converted 50_data_block_examples.ipynb.\n",
      "Converted 60_medical_imaging.ipynb.\n",
      "Converted 65_medical_text.ipynb.\n",
      "Converted 70_callback_wandb.ipynb.\n",
      "Converted 71_callback_tensorboard.ipynb.\n",
      "Converted 90_notebook_core.ipynb.\n",
      "Converted 91_notebook_export.ipynb.\n",
      "Converted 92_notebook_showdoc.ipynb.\n",
      "Converted 93_notebook_export2html.ipynb.\n",
      "Converted 94_notebook_test.ipynb.\n",
      "Converted 95_index.ipynb.\n",
      "Converted 96_data_external.ipynb.\n",
      "Converted 97_utils_test.ipynb.\n",
      "Converted notebook2jekyll.ipynb.\n"
     ]
    }
   ],
   "source": [
    "#hide\n",
    "from local.notebook.export import notebook2script\n",
    "notebook2script(all_fs=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}