{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Benchmarking different tokening approaches" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### fastai v1.0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from fastai.text import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = untar_data(URLs.IMDB)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "il = (TextList.from_folder(path, processor=[OpenFileProcessor(), TokenizeProcessor()])\n", " .filter_by_folder(include=['train', 'test', 'unsup']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "opener = OpenFileProcessor()\n", "opener.process(il)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokenizer = TokenizeProcessor()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time tokenizer.process(il)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### dev_course nb 12" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from exp.nb_12 import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = datasets.untar_data(datasets.URLs.IMDB)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "il = TextList.from_files(path, include=['train', 'test', 'unsup'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tp = TokenizeProcessor()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "texts = [read_file(f) for f in il.items]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time tokens = tp(texts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Doesn't kill process each time." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Other" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from exp.nb_12 import *" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "path = datasets.untar_data(datasets.URLs.IMDB)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "il = TextList.from_files(path, include=['train', 'test', 'unsup'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from multiprocessing import Process, Queue, cpu_count" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def text_gen(fnames):\n", " for fn in fnames:\n", " with open(fn, 'r') as r:\n", " txt = r.read()\n", " for fn in default_pre_rules:\n", " txt = fn(txt)\n", " yield txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def process_files(fnames, data_queue, progress_queue, lang='en', batch_size=5000):\n", " nlp = spacy.blank(lang, disable=[\"parser\", \"tagger\", \"ner\"])\n", " for w in default_spec_tok: nlp.tokenizer.add_special_case(w, [{ORTH: w}])\n", " tokens = []\n", " for docs in nlp.pipe(text_gen(fnames), batch_size=batch_size):\n", " toks = [t.text for t in docs]\n", " for fn in default_post_rules: toks = fn(toks)\n", " tokens.append(toks)\n", " progress_queue.put(1)\n", " data_queue.put(tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize(fnames, lang='en', n_workers=4, chunk_size=5000):\n", " progress_queue,data_queue = Queue(maxsize=n_workers),Queue(maxsize=n_workers)\n", " processes = [Process(target=process_files,\n", " args=(batch, data_queue, progress_queue, lang, chunk_size))\n", " for i,batch in enumerate(np.array_split(fnames, n_workers))]\n", " for p in processes: p.start()\n", " tokens = []\n", " for _ in progress_bar(fnames): _ = progress_queue.get() \n", " for _ in processes: tokens += data_queue.get()\n", " for p in processes: p.join()\n", " return tokens" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%time t = tokenize(il.items)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }