{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Benchmarking different tokening approaches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### fastai v1.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastai.text import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = untar_data(URLs.IMDB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "il = (TextList.from_folder(path, processor=[OpenFileProcessor(), TokenizeProcessor()])\n",
    "              .filter_by_folder(include=['train', 'test', 'unsup']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "opener = OpenFileProcessor()\n",
    "opener.process(il)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = TokenizeProcessor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time tokenizer.process(il)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### dev_course nb 12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from exp.nb_12 import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = datasets.untar_data(datasets.URLs.IMDB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "il = TextList.from_files(path, include=['train', 'test', 'unsup'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tp = TokenizeProcessor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = [read_file(f) for f in il.items]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time tokens = tp(texts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Doesn't kill process each time."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Other"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from exp.nb_12 import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = datasets.untar_data(datasets.URLs.IMDB)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "il = TextList.from_files(path, include=['train', 'test', 'unsup'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from multiprocessing import Process, Queue, cpu_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_gen(fnames):\n",
    "    for fn in fnames:\n",
    "        with open(fn, 'r') as r:\n",
    "            txt = r.read()\n",
    "            for fn in default_pre_rules:\n",
    "                txt = fn(txt)\n",
    "            yield txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_files(fnames, data_queue, progress_queue, lang='en', batch_size=5000):\n",
    "    nlp = spacy.blank(lang, disable=[\"parser\", \"tagger\", \"ner\"])\n",
    "    for w in default_spec_tok: nlp.tokenizer.add_special_case(w, [{ORTH: w}])\n",
    "    tokens = []\n",
    "    for docs in nlp.pipe(text_gen(fnames), batch_size=batch_size):\n",
    "        toks = [t.text for t in docs]\n",
    "        for fn in default_post_rules: toks = fn(toks)\n",
    "        tokens.append(toks)\n",
    "        progress_queue.put(1)\n",
    "    data_queue.put(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(fnames, lang='en', n_workers=4, chunk_size=5000):\n",
    "    progress_queue,data_queue = Queue(maxsize=n_workers),Queue(maxsize=n_workers)\n",
    "    processes = [Process(target=process_files,\n",
    "                         args=(batch, data_queue, progress_queue, lang, chunk_size))\n",
    "                 for i,batch in enumerate(np.array_split(fnames, n_workers))]\n",
    "    for p in processes: p.start()\n",
    "    tokens = []\n",
    "    for _ in progress_bar(fnames): _ = progress_queue.get()  \n",
    "    for _ in processes: tokens += data_queue.get()\n",
    "    for p in processes: p.join()\n",
    "    return tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%time t = tokenize(il.items)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}