{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Sentence Tokenization.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "metadata": {
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "# Sentence tokenization: gensim vs. NLTK vs. spaCy\n",
        "\n",
        "Downloads a sample text (`the_matrix_synopsis.txt`) and uses `%%timeit` to measure how long each library takes to split it into sentences.\n",
        "\n",
        "**Notes**\n",
        "\n",
        "- The install cell caps the library versions because the notebook relies on `gensim.summarization` (gensim 3.x only), the spaCy `en` shortcut (spaCy 2.x only) and the `punkt` resource (NLTK before 3.9).\n",
        "- The stored outputs were recorded in an earlier session whose cells were run out of order (see the execution counts). Use *Restart & Run All* to refresh them."
      ]
    },
    {
      "metadata": {
        "id": "WImZC8UYPfD6",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# Upper bounds keep the APIs used below available:\n",
        "#   - NLTK 3.9+ looks for `punkt_tab` rather than the `punkt` resource downloaded below\n",
        "#   - gensim 4 removed the `gensim.summarization` package\n",
        "#   - spaCy 3 removed shortcut names such as `en`\n",
        "# `%pip` (rather than `!pip`) installs into the environment of the running kernel.\n",
        "%pip install \"nltk<3.9\"\n",
        "%pip install \"gensim<4\"\n",
        "%pip install \"spacy<3\"\n",
        "\n",
        "!python -m spacy download en"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "ZHXC4CGcWBpc",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 68
        },
        "outputId": "98dc2b8e-e546-4d24-a916-2991bbfc805e"
      },
      "cell_type": "code",
      "source": [
        "import nltk\n",
        "import gensim\n",
        "import spacy\n",
        "\n",
        "import requests\n",
        "\n",
        "nltk.download('punkt')"
      ],
      "execution_count": 26,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data] Package punkt is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 26
        }
      ]
    },
    {
      "metadata": {
        "id": "wemsv33GX5s-",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "text_url = 'http://rare-technologies.com/the_matrix_synopsis.txt'\n",
        "\n",
        "# Bound the wait and fail loudly on an HTTP error; otherwise an error page\n",
        "# would be measured and timed below as if it were the sample text.\n",
        "response = requests.get(text_url, timeout=30)\n",
        "response.raise_for_status()\n",
        "text = response.text"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "_y0QPfN1bxae",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## Text length"
      ]
    },
    {
      "metadata": {
        "id": "d476-dHXbyQ7",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "Approximate word count (the text split on single spaces)"
      ]
    },
    {
      "metadata": {
        "id": "joVu9MzIbrdL",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "194510ef-c040-4a02-8cd7-adb2e25ddd5e"
      },
      "cell_type": "code",
      "source": [
        "len(text.split(' '))"
      ],
      "execution_count": 33,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "6348"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 33
        }
      ]
    },
    {
      "metadata": {
        "id": "ccN2VGF_bzfB",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "Line count (the text split on `\\n`)"
      ]
    },
    {
      "metadata": {
        "id": "BXR3bQUlbojs",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "74f677de-6753-4a00-b2f4-d088075c51f2"
      },
      "cell_type": "code",
      "source": [
        "len(text.split('\\n'))"
      ],
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "195"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 31
        }
      ]
    },
    {
      "metadata": {
        "id": "tyLd3SPZbaLz",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## gensim"
      ]
    },
    {
      "metadata": {
        "id": "plbbPDgFWYqG",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "73541c69-e549-48e2-d916-08468b2621b3"
      },
      "cell_type": "code",
      "source": [
        "%%timeit -r 10\n",
        "\n",
        "gensim.summarization.textcleaner.split_sentences(text)"
      ],
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100 loops, best of 10: 4.27 ms per loop\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "Mzcwf3W-bcMj",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## NLTK"
      ]
    },
    {
      "metadata": {
        "id": "Tb7UUajLW6xh",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "95d129fd-e123-4658-d231-0ef2d5fdebdc"
      },
      "cell_type": "code",
      "source": [
        "%%timeit -r 10\n",
        "\n",
        "nltk.sent_tokenize(text)"
      ],
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "100 loops, best of 10: 14.8 ms per loop\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "id": "z3QeC5zibekj",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "## spaCy\n",
        "\n",
        "`Doc.sents` is a lazy generator, so the timed cell wraps it in `list(...)` to actually produce every sentence, matching the list results of the gensim and NLTK calls above. It also uses `-r 10` like the other two benchmarks.\n",
        "\n",
        "An earlier run that timed the bare `nlp(text).sents` with the default repeat count printed `1 loop, best of 3: 1.96 s per loop`. Re-run the cell below to get a figure for the current code."
      ]
    },
    {
      "metadata": {
        "id": "Y1Ai1qxyYGds",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "nlp = spacy.load('en')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "sy483b2ZanbC",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "%%timeit -r 10\n",
        "\n",
        "list(nlp(text).sents)"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}