[ -e /content ] && pip install -Uqq fastai # upgrade fastai on colab" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "from fastai.torch_basics import *\n", "from fastai.data.all import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "from nbdev.showdoc import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#default_exp text.core\n", "#default_cls_lvl 3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Text core\n", "\n", "> Basic functions to preprocess text before assembling it in a `DataLoaders`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export \n", "import spacy,html\n", "from spacy.symbols import ORTH" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing rules" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The following are rules applied to texts before or after they are tokenized."
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "#special tokens\n", "UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ = \"xxunk xxpad xxbos xxeos xxfld xxrep xxwrep xxup xxmaj\".split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_all_ = [\"UNK\", \"PAD\", \"BOS\", \"EOS\", \"FLD\", \"TK_REP\", \"TK_WREP\", \"TK_UP\", \"TK_MAJ\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_spec = re.compile(r'([/#\\\\])')\n", "\n", "def spec_add_spaces(t):\n", " \"Add spaces around / and #\"\n", " return _re_spec.sub(r' \\1 ', t)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(spec_add_spaces('#fastai'), ' # fastai')\n", "test_eq(spec_add_spaces('/fastai'), ' / fastai')\n", "test_eq(spec_add_spaces('\\\\fastai'), ' \\\\ fastai')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_space = re.compile(' {2,}')\n", "\n", "def rm_useless_spaces(t):\n", " \"Remove multiple spaces\"\n", " return _re_space.sub(' ', t)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(rm_useless_spaces('a b c'), 'a b c')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_rep = re.compile(r'(\\S)(\\1{2,})')\n", "\n", "def replace_rep(t):\n", " \"Replace repetitions at the character level: cccc -- TK_REP 4 c\"\n", " def _replace_rep(m):\n", " c,cc = m.groups()\n", " return f' {TK_REP} {len(cc)+1} {c} '\n", " return _re_rep.sub(_replace_rep, t)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It starts replacing at 3 repetitions of the same character or more." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(replace_rep('aa'), 'aa')\n", "test_eq(replace_rep('aaaa'), f' {TK_REP} 4 a ')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_wrep = re.compile(r'(?:\\s|^)(\\w+)\\s+((?:\\1\\s+)+)\\1(\\s|\\W|$)')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "\"\"\"\n", "Matches any word repeated at least four times with spaces between them\n", "(?:\\s|^) Non-Capture either a whitespace character or the beginning of text\n", "(\\w+) Capture any alphanumeric character\n", "\\s+ One or more whitespace\n", "((?:\\1\\s+)+) Capture a repetition of one or more times \\1 followed by one or more whitespace\n", "\\1 Occurrence of \\1\n", "(\\s|\\W|$) Capture last whitespace, non alphanumeric character or end of text\n", "\"\"\";" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def replace_wrep(t):\n", " \"Replace word repetitions: word word word word -- TK_WREP 4 word\"\n", " def _replace_wrep(m):\n", " c,cc,e = m.groups()\n", " return f' {TK_WREP} {len(cc.split())+2} {c} {e}'\n", " return _re_wrep.sub(_replace_wrep, t)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It starts replacing at 3 repetitions of the same word or more." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(replace_wrep('ah ah'), 'ah ah')\n", "test_eq(replace_wrep('ah ah ah'), f' {TK_WREP} 3 ah ')\n", "test_eq(replace_wrep('ah ah ah ah'), f' {TK_WREP} 4 ah ')\n", "test_eq(replace_wrep('ah ah ah ah '), f' {TK_WREP} 4 ah ')\n", "test_eq(replace_wrep('ah ah ah ah.'), f' {TK_WREP} 4 ah .')\n", "test_eq(replace_wrep('ah ah ahi'), f'ah ah ahi')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def fix_html(x):\n", " \"Various messy things we've seen in documents\"\n", " x = x.replace('#39;', \"'\").replace('amp;', '&').replace('#146;', \"'\").replace('nbsp;', ' ').replace(\n", " '#36;', '$').replace('\\\\n', \"\\n\").replace('quot;', \"'\").replace('<br />', \"\\n\").replace(\n", " '\\\\\"', '\"').replace('<unk>',UNK).replace(' @.@ ','.').replace(' @-@ ','-').replace('...',' …')\n", " return html.unescape(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(fix_html('#39;bli#146;'), \"'bli'\")\n", "test_eq(fix_html('Sarah amp; Duck...'), 'Sarah & Duck …')\n", "test_eq(fix_html('a nbsp; #36;'), 'a $')\n", "test_eq(fix_html('\\\\\" <unk>'), f'\" {UNK}')\n", "test_eq(fix_html('quot; @.@ @-@ '), \"' .-\")\n", "test_eq(fix_html('<br />text\\n'), '\ntext\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_all_caps = re.compile(r'(\\s|^)([A-Z]+[^a-z\\s]*)(?=(\\s|$))')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "\"\"\"\n", "Catches any word in all caps, even with ' or - inside\n", "(\\s|^) Capture either a whitespace or the beginning of text\n", "([A-Z]+ Capture one capitalized letter or more...\n", "[^a-z\\s]*) ...followed by anything that's non lowercase or whitespace\n", "(?=(\\s|$)) Look ahead for a space or end of text\n", "\"\"\";" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def replace_all_caps(t):\n", " \"Replace tokens in ALL CAPS by their lower version and add `TK_UP` before.\"\n", " def _replace_all_caps(m):\n", " tok = f'{TK_UP} ' if len(m.groups()[1]) > 1 else ''\n", " return f\"{m.groups()[0]}{tok}{m.groups()[1].lower()}\"\n", " return _re_all_caps.sub(_replace_all_caps, t)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(replace_all_caps(\"I'M SHOUTING\"), f\"{TK_UP} i'm {TK_UP} shouting\")\n", "test_eq(replace_all_caps(\"I'm speaking normally\"), \"I'm speaking normally\")\n", "test_eq(replace_all_caps(\"I am speaking normally\"), \"i am speaking normally\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "_re_maj = re.compile(r'(\\s|^)([A-Z][^A-Z\\s]*)(?=(\\s|$))')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "\"\"\"\n", "Catches any capitalized word\n", "(\\s|^) Capture either a whitespace or the beginning of text\n", "([A-Z] Capture exactly one capitalized letter...\n", "[^A-Z\\s]*) ...followed by anything that's not uppercase or whitespace\n", "(?=(\\s|$)) Look ahead for a space or end of text\n",
"\"\"\";" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def replace_maj(t):\n", " \"Replace tokens in Sentence Case by their lower version and add `TK_MAJ` before.\"\n", " def _replace_maj(m):\n", " tok = f'{TK_MAJ} ' if len(m.groups()[1]) > 1 else ''\n", " return f\"{m.groups()[0]}{tok}{m.groups()[1].lower()}\"\n", " return _re_maj.sub(_replace_maj, t)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(replace_maj(\"Jeremy Howard\"), f'{TK_MAJ} jeremy {TK_MAJ} howard')\n", "test_eq(replace_maj(\"I don't think there is any maj here\"), (\"i don't think there is any maj here\"),)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def lowercase(t, add_bos=True, add_eos=False):\n", " \"Converts `t` to lowercase\"\n", " return (f'{BOS} ' if add_bos else '') + t.lower().strip() + (f' {EOS}' if add_eos else '')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def replace_space(t):\n", " \"Replace embedded spaces in a token with unicode line char to allow for split/join\"\n", " return t.replace(' ', '▁')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "defaults.text_spec_tok = [UNK, PAD, BOS, EOS, FLD, TK_REP, TK_WREP, TK_UP, TK_MAJ]\n", "defaults.text_proc_rules = [fix_html, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces,\n", " replace_all_caps, replace_maj, lowercase]\n", "defaults.text_postproc_rules = [replace_space]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Tokenizing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A tokenizer is a class that must implement `__call__`. This method receives a iterator of texts and must return a generator with their tokenized versions. 
Here is the most basic example:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class BaseTokenizer():\n", " \"Basic tokenizer that just splits on spaces\"\n", " def __init__(self, split_char=' ', **kwargs): self.split_char=split_char\n", " def __call__(self, items): return (t.split(self.split_char) for t in items)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tok = BaseTokenizer()\n", "test_eq(tok([\"This is a text\"]), [[\"This\", \"is\", \"a\", \"text\"]])\n", "tok = BaseTokenizer('x')\n", "test_eq(tok([\"This is a text\"]), [[\"This is a te\", \"t\"]])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class SpacyTokenizer():\n", " \"Spacy tokenizer for `lang`\"\n", " def __init__(self, lang='en', special_toks=None, buf_sz=5000):\n", " self.special_toks = ifnone(special_toks, defaults.text_spec_tok)\n", " nlp = spacy.blank(lang)\n", " for w in self.special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])\n", " self.pipe,self.buf_sz = nlp.pipe,buf_sz\n", "\n", " def __call__(self, items):\n", " return (L(doc).attrgot('text') for doc in self.pipe(map(str,items), batch_size=self.buf_sz))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "WordTokenizer = SpacyTokenizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tok = SpacyTokenizer()\n", "inp,exp = \"This isn't the easiest text.\",[\"This\", \"is\", \"n't\", \"the\", \"easiest\", \"text\", \".\"]\n", "test_eq(L(tok([inp,inp])), [exp,exp])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class TokenizeWithRules:\n", " \"A wrapper around `tok` which applies `rules`, then tokenizes, then applies `post_rules`\"\n", " def __init__(self, tok, rules=None, 
post_rules=None):\n", " self.rules = L(ifnone(rules, defaults.text_proc_rules))\n", " self.post_f = compose(*L(ifnone(post_rules, defaults.text_postproc_rules)))\n", " self.tok = tok\n", "\n", " def __call__(self, batch):\n", " return (L(o).map(self.post_f) for o in self.tok(maps(*self.rules, batch)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f = TokenizeWithRules(BaseTokenizer(),rules=[replace_all_caps])\n", "test_eq(f([\"THIS isn't a problem\"]), [[TK_UP, 'this', \"isn't\", 'a', 'problem']])\n", "f = TokenizeWithRules(SpacyTokenizer())\n", "test_eq(f([\"This isn't a problem\"]), [[BOS, TK_MAJ, 'this', 'is', \"n't\", 'a', 'problem']])\n", "f = TokenizeWithRules(BaseTokenizer(split_char=\"'\"), rules=[])\n", "test_eq(f([\"This isn't a problem\"]), [['This▁isn', 't▁a▁problem']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The main function that will be called during one of the processes handling tokenization. It will iterate through the `batch` of texts, apply them `rules` and tokenize them." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "texts = [\"this is a text\", \"this is another text\"]\n", "tok = TokenizeWithRules(BaseTokenizer(), texts.__getitem__)\n", "test_eq(tok([0,1]), [['this', 'is', 'a', 'text'],['this', 'is', 'another', 'text']])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "@delegates(TokenizeWithRules)\n", "def tokenize1(text, tok, **kwargs):\n", " \"Call `TokenizeWithRules` with a single text\"\n", " return first(TokenizeWithRules(tok=tok, **kwargs)([text]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_eq(tokenize1(\"This isn't a problem\", SpacyTokenizer()),\n", " [BOS, TK_MAJ, 'this', 'is', \"n't\", 'a', 'problem'])\n", "test_eq(tokenize1(\"This isn't a problem\", tok=BaseTokenizer(), rules=[]),\n", " ['This',\"isn't\",'a','problem'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def parallel_tokenize(items, tok=None, rules=None, n_workers=defaults.cpus, **kwargs):\n", " \"Calls optional `setup` on `tok` before launching `TokenizeWithRules` using `parallel_gen\"\n", " if tok is None: tok = WordTokenizer()\n", " if hasattr(tok, 'setup'): tok.setup(items, rules)\n", " return parallel_gen(TokenizeWithRules, items, tok=tok, rules=rules, n_workers=n_workers, **kwargs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that since this uses `parallel_gen` behind the scenes, the generator returned contains tuples of indices and results. There is no guarantee that the results are returned in order, so you should sort by the first item of the tuples (the indices) if you need them ordered." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "res = parallel_tokenize(['0 1', '1 2'], rules=[], n_workers=2)\n", "idxs,toks = zip(*L(res).sorted(itemgetter(0)))\n", "test_eq(toks, [['0','1'],['1','2']])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "res1 = parallel_tokenize(['0 1', '1 2'], tok=BaseTokenizer(), rules=[], n_workers=0)\n", "idxs1,toks1 = zip(*L(res1).sorted(itemgetter(0)))\n", "test_eq(toks, toks1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tokenize texts in files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Preprocessing function for texts in filenames. Tokenized texts will be saved in a similar fashion in a directory suffixed with `_tok` in the parent folder of `path` (override with `output_dir`). This directory is the return value." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "fn_counter_pkl = 'counter.pkl'\n", "fn_lengths_pkl = 'lengths.pkl'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def _tokenize_files(func, files, path, output_dir=None, output_names=None, n_workers=defaults.cpus, rules=None, tok=None,\n", " encoding='utf8', skip_if_exists=False):\n", " \"Tokenize text `files` in parallel using `n_workers`\"\n", " if tok is None: tok = WordTokenizer()\n", " output_dir = Path(ifnone(output_dir, path.parent/f'{path.name}_tok'))\n", " if skip_if_exists and output_dir.exists(): return output_dir\n", " output_dir.mkdir(exist_ok=True)\n", " if output_names is None: output_names = L(output_dir/f.relative_to(path) for f in files)\n", " rules = partial(Path.read_text, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))\n", "\n", " lengths,counter = {},Counter()\n", " for i,tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):\n", " out = func(i,output_dir)\n", " out.mk_write(' '.join(tok), encoding=encoding)\n", " lengths[str(files[i].relative_to(path))] = len(tok)\n", " counter.update(tok)\n", "\n", " save_pickle(output_dir/fn_lengths_pkl, lengths)\n", " save_pickle(output_dir/fn_counter_pkl, counter)\n", " return output_dir" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "@delegates(_tokenize_files)\n", "def tokenize_folder(path, extensions=None, folders=None, output_dir=None, skip_if_exists=True, **kwargs):\n", " \"Tokenize text files in `path` in parallel using `n_workers`\"\n", " path,extensions = Path(path),ifnone(extensions, ['.txt'])\n", " files = get_files(path, extensions=extensions, recurse=True, folders=folders)\n", " def _f(i,output_dir): return output_dir/files[i].relative_to(path)\n", " return _tokenize_files(_f, files, path, skip_if_exists=skip_if_exists, 
**kwargs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result will be in `output_dir` (defaults to a folder in the same parent directory as `path`, with `_tok` added to `path.name`) with the same structure as in `path`. Tokenized texts for a given file will be in the file having the same name in `output_dir`. Additionally, the number of tokens of each file is stored in `output_dir/lengths.pkl` (a dictionary keyed by the file's path relative to `path`) and the count of all words is stored in `output_dir/counter.pkl`.\n", "\n", "`extensions` will default to `['.txt']` and all text files in `path` are processed unless you restrict the search to a list of folders with `folders`. `rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "@delegates(_tokenize_files)\n", "def tokenize_files(files, path, output_dir, output_names=None, **kwargs):\n", " \"Tokenize text `files` in parallel using `n_workers`\"\n", " if output_names is None: output_names = L(output_dir/f.relative_to(path) for f in files)\n", " def _f(i,output_dir): return output_dir/output_names[i]\n", " return _tokenize_files(_f, files, path, output_dir=output_dir, **kwargs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Tokenize texts in a dataframe" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def _join_texts(df, mark_fields=False):\n", " \"Join texts in row `idx` of `df`, marking each field with `FLD` if `mark_fields=True`\"\n", " text_col = (f'{FLD} {1} ' if mark_fields else '' ) + df.iloc[:,0].astype(str)\n", " for i in range(1,len(df.columns)):\n", " text_col += (f' {FLD} {i+1} ' if mark_fields else ' ') + df.iloc[:,i].astype(str)\n", " return text_col.values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#hide\n", "texts = [f\"This is an example of text {i}\" for i in range(10)]\n",
"df = pd.DataFrame({'text': texts, 'text1': texts}, columns=['text', 'text1'])\n", "col = _join_texts(df, mark_fields=True) \n", "\n", "for i in range(len(df)):\n", " test_eq(col[i], f'{FLD} 1 This is an example of text {i} {FLD} 2 This is an example of text {i}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def tokenize_texts(texts, n_workers=defaults.cpus, rules=None, tok=None):\n", " \"Tokenize `texts` in parallel using `n_workers`\"\n", " rules = L(ifnone(rules, defaults.text_proc_rules.copy()))\n", " outputs = L(parallel_tokenize(texts, tok=tok, rules=rules, n_workers=n_workers)\n", " ).sorted().itemgot(1)\n", " return outputs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def tokenize_df(df, text_cols, n_workers=defaults.cpus, rules=None, mark_fields=None,\n", " tok=None, tok_text_col=\"text\"):\n", " \"Tokenize texts in `df[text_cols]` in parallel using `n_workers` and stores them in `df[tok_text_col]`\"\n", " text_cols = [df.columns[c] if isinstance(c, int) else c for c in L(text_cols)]\n", " #mark_fields defaults to False if there is one column of texts, True if there are multiple\n", " if mark_fields is None: mark_fields = len(text_cols)>1\n", " rules = L(ifnone(rules, defaults.text_proc_rules.copy()))\n", " texts = _join_texts(df[text_cols], mark_fields=mark_fields)\n", " outputs = L(parallel_tokenize(texts, tok, rules, n_workers=n_workers)\n", " ).sorted().itemgot(1)\n", "\n", " other_cols = df.columns[~df.columns.isin(text_cols)]\n", " res = df[other_cols].copy()\n", " res[tok_text_col] = outputs\n", " res[f'{tok_text_col}_length'] = [len(o) for o in outputs]\n", " return res,Counter(outputs.concat())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This function returns a new dataframe with the same non-text columns, a column named text that contains the tokenized texts and a column named text_lengths that 
contains their respective length. It also returns a counter of all seen words to quickly build a vocabulary afterward.\n", "\n", "`rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markers followed by the number of the field." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def tokenize_csv(fname, text_cols, outname=None, n_workers=4, rules=None, mark_fields=None,\n", " tok=None, header='infer', chunksize=50000):\n", " \"Tokenize texts in the `text_cols` of the csv `fname` in parallel using `n_workers`\"\n", " df = pd.read_csv(fname, header=header, chunksize=chunksize)\n", " outname = Path(ifnone(outname, fname.parent/f'{fname.stem}_tok.csv'))\n", " cnt = Counter()\n", "\n", " for i,dfp in enumerate(df):\n", " out,c = tokenize_df(dfp, text_cols, n_workers=n_workers, rules=rules,\n", " mark_fields=mark_fields, tok=tok)\n", " out.text = out.text.str.join(' ')\n", " out.to_csv(outname, header=(None,header)[i==0], index=False, mode=('a','w')[i==0])\n", " cnt.update(c)\n", "\n", " save_pickle(outname.with_suffix('.pkl'), cnt)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "def load_tokenized_csv(fname):\n", " \"Utility function to quickly load a tokenized csv and the corresponding counter\"\n", " fname = Path(fname)\n", " out = pd.read_csv(fname)\n", " for txt_col in out.columns[1:-1]:\n", " out[txt_col] = tuple(out[txt_col].str.split(' '))\n", " return out,load_pickle(fname.with_suffix('.pkl'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result will be written in a new csv file in `outname` (defaults to the same as `fname` with the suffix `_tok.csv`) and will have the same
header as the original file, the same non-text columns, a `text` and a `text_length` column as produced by `tokenize_df`.\n", "\n", "`rules` (that defaults to `defaults.text_proc_rules`) are applied to each text before going in the tokenizer. If `mark_fields` isn't specified, it defaults to `False` when there is a single text column, `True` when there are several. In that case, the texts in each of those columns are joined with `FLD` markers followed by the number of the field.\n", "\n", "The csv file is opened with `header` and optionally with blocks of `chunksize` at a time. If this argument is passed, each chunk is processed independently and saved in the output file to save memory usage." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def _prepare_texts(tmp_d):\n", " \"Prepare texts in a folder struct in tmp_d, a csv file and returns a dataframe\"\n", " path = Path(tmp_d)/'tmp'\n", " path.mkdir()\n", " for d in ['a', 'b', 'c']: \n", " (path/d).mkdir()\n", " for i in range(5):\n", " with open(path/d/f'text{i}.txt', 'w') as f: f.write(f\"This is an example of text {d} {i}\")\n", " \n", " texts = [f\"This is an example of text {d} {i}\" for i in range(5) for d in ['a', 'b', 'c']]\n", " df = pd.DataFrame({'text': texts, 'label': list(range(15))}, columns=['text', 'label'])\n", " csv_fname = tmp_d/'input.csv'\n", " df.to_csv(csv_fname, index=False)\n", " return path,df,csv_fname" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] test_eq(cnt_a['a'], 5)\n", " test_eq(cnt_a['0'], 3)\n", " \n", " #Tokenize as files\n", " files = get_text_files(path)\n", " tokenize_files(files, path, output_dir=path/'d')\n", " for f in files: \n", " test_eq((path/'d'/f.relative_to(path)).read_text(), ' '.join([\n", " BOS, TK_MAJ, 'this', 'is', 'an', 'example', 'of', 'text', f.parent.name, f.name[4]]))\n", " \n", " #Tokenize as individual texts\n", " out =
tokenize_texts(df['text'].values)\n", " test_eq(out, [(outp/d/f'text{i}.txt').read_text().split(' ') for i in range(5) for d in ['a', 'b', 'c']])\n", " \n", " #Tokenize as a dataframe\n", " out,cnt_b = tokenize_df(df, text_cols='text')\n", " test_eq(list(out.columns), ['label', 'text', 'text_length'])\n", " test_eq(out['label'].values, df['label'].values)\n", " test_eq(list(out['text']), [(outp/d/f'text{i}.txt').read_text().split(' ') for i in range(5) for d in ['a', 'b', 'c']])\n", " test_eq(cnt_a, cnt_b)\n", " \n", " #Tokenize as a csv \n", " out_fname = Path(tmp_d)/'output.csv'\n", " tokenize_csv(csv_fname, text_cols='text', outname=out_fname)\n", " a,b = load_tokenized_csv(out_fname)\n", " test_eq((out,cnt_b), load_tokenized_csv(out_fname))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## `Tokenizer`-" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class Tokenizer(Transform):\n", " \"Provides a consistent `Transform` interface to tokenizers operating on `DataFrame`s and folders\"\n", " input_types = (str, list, L, tuple, Path)\n", " def __init__(self, tok, rules=None, counter=None, lengths=None, mode=None, sep=' '):\n", " if isinstance(tok,type): tok=tok()\n", " store_attr('tok,counter,lengths,mode,sep')\n", " self.rules = defaults.text_proc_rules if rules is None else rules\n", "\n", " @classmethod\n", " @delegates(tokenize_df, keep=True)\n", " def from_df(cls, text_cols, tok=None, rules=None, sep=' ', **kwargs):\n", " if tok is None: tok = WordTokenizer()\n", " res = cls(tok, rules=rules, mode='df')\n", " res.kwargs,res.train_setup = merge({'tok': tok}, kwargs),False\n", " res.text_cols,res.sep = text_cols,sep\n", " return res\n", "\n", " @classmethod\n", " @delegates(tokenize_folder, keep=True)\n", " def from_folder(cls, path, tok=None, rules=None, **kwargs):\n", " path = Path(path)\n", " if tok is None: tok = WordTokenizer()\n", " output_dir = tokenize_folder(path, tok=tok, 
rules=rules, **kwargs)\n", " res = cls(tok, counter=load_pickle(output_dir/fn_counter_pkl),\n", " lengths=load_pickle(output_dir/fn_lengths_pkl), rules=rules, mode='folder')\n", " res.path,res.output_dir = path,output_dir\n", " return res\n", "\n", " def setups(self, dsets):\n", " if not self.mode == 'df' or not isinstance(dsets.items, pd.DataFrame): return\n", " dsets.items,count = tokenize_df(dsets.items, self.text_cols, rules=self.rules, **self.kwargs)\n", " if self.counter is None: self.counter = count\n", " return dsets\n", "\n", " def 'this is another test'])\n", "test_eq(tst, [(['xxbos', 'xxmaj', 'this','is','a','test'],), \n", " (['xxbos','this','is','another','test'],)])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sentencepiece" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "eu_langs = [\"bg\", \"cs\", \"da\", \"de\", \"el\", \"en\", \"es\", \"et\", \"fi\", \"fr\", \"ga\", \"hr\", \"hu\",\n", " \"it\",\"lt\",\"lv\",\"mt\",\"nl\",\"pl\",\"pt\",\"ro\",\"sk\",\"sl\",\"sv\"] # all European langs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "class SentencePieceTokenizer():#TODO: pass the special tokens symbol to sp\n", " \"SentencePiece tokenizer for `lang`\"\n", " def __init__(self, lang='en', special_toks=None, sp_model=None, vocab_sz=None, max_vocab_sz=30000,\n", " model_type='unigram', char_coverage=None, cache_dir='tmp'):\n", " try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor\n", " except ImportError:\n", " raise Exception('sentencepiece module is missing: run `pip install sentencepiece!=0.1.90,!=0.1.91`')\n", " self.sp_model,self.cache_dir = sp_model,Path(cache_dir)\n", " self.vocab_sz,self.max_vocab_sz,self.model_type = vocab_sz,max_vocab_sz,model_type\n", " self.char_coverage = ifnone(char_coverage, 0.99999 if lang in eu_langs else 0.9998)\n", " self.special_toks = 
ifnone(special_toks, defaults.text_spec_tok)\n", " if sp_model is None: self.tok = None\n", " else:\n", " self.tok = SentencePieceProcessor()\n", " self.tok.Load(str(sp_model))\n", " os.makedirs(self.cache_dir, exist_ok=True)\n", "\n", " def _get_vocab_sz(self, raw_text_path):\n", " cnt = Counter()\n", " with open(raw_text_path, 'r') as f:\n", " for line in f.readlines():\n", " cnt.update(line.split())\n", " if len(cnt)//4 > self.max_vocab_sz: return self.max_vocab_sz\n", " res = len(cnt)//4\n", " while res%8 != 0: res+=1\n", " return max(res,29)\n", "\n", " def train(self, raw_text_path):\n", " \"Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`\"\n", " from sentencepiece import SentencePieceTrainer\n", " vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz\n", " spec_tokens = ['\\u2581'+s for s in self.special_toks]\n", " SentencePieceTrainer.Train(\" \".join([\n", " f\"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}\",\n", " f\"--character_coverage={self.char_coverage} --model_type={self.model_type}\",\n", " f\"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1 --minloglevel=2\",\n", " f\"--user_defined_symbols={','.join(spec_tokens)} --hard_vocab_limit=false\"]))\n", " raw_text_path.unlink()\n", " return self.cache_dir/'spm.model'\n", "\n", " def setup(self, items, rules=None):\n", " from sentencepiece import SentencePieceProcessor\n", " if rules is None: rules = []\n", " if self.tok is not None: return {'sp_model': self.sp_model}\n", " raw_text_path = self.cache_dir/'texts.out'\n", " with open(raw_text_path, 'w') as f:\n", " for t in progress_bar(maps(*rules, items), total=len(items), leave=False):\n", " f.write(f'{t}\\n')\n", " sp_model = self.train(raw_text_path)\n", " self.tok = SentencePieceProcessor()\n", " self.tok.Load(str(sp_model))\n", " return {'sp_model': sp_model}\n", "\n", " def __call__(self, items):\n", " if self.tok is None: 
self.setup(items)\n", " for t in items: yield self.tok.EncodeAsPieces(t)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#export\n", "SubwordTokenizer = SentencePieceTokenizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "texts = [f\"This is an example of text with tempfile.TemporaryDirectory() as tmp_d:
    path,df,csv_fname = _prepare_texts(Path(tmp_d))
    items = get_text_files(path)
    splits = RandomSplitter()(items)
    tok = SentencePieceTokenizer(special_toks=[])
    dsets = Datasets(items, [Tokenizer.from_folder(path, tok=tok)], splits=splits)
    print(dsets.train[0][0])
    
# Second half of the demo: build the `Datasets` from the dataframe instead of the
# folder. It reuses `df`, `tok` and `splits` that were assigned inside the preceding
# `with tempfile.TemporaryDirectory()` block, so that block must have run first.
# NOTE(review): `splits` was computed from the list of text files, not from `df`;
# this presumably relies on `df` having as many rows as there are files — confirm
# against `_prepare_texts`.
# NOTE(review): no filter (e.g. `warnings.simplefilter`) is installed inside
# `catch_warnings()`, so as written it only saves and restores the warnings state and
# does not silence anything — confirm whether suppression was intended.
with warnings.catch_warnings():
    dsets = Datasets(df, [Tokenizer.from_df('text', tok=tok)], splits=splits)
    # Show the `text` field of the first item of the training split.
    print(dsets.train[0][0].text)