{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Spacy.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "# Prepare spaCy's trained English pipelines (e.g. en_core_web_sm, en_core_web_md, en_core_web_lg); install one with `python -m spacy download en_core_web_sm` before loading it below." ], "metadata": { "id": "_4GVZJMkqpyK" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "source": [ "# import libraries\n", "import re\n", "import pandas as pd\n", "import bs4\n", "import requests\n", "import spacy\n", "from spacy import displacy\n", "\n", "from spacy.matcher import Matcher \n", "from spacy.tokens import Span \n", "\n", "import networkx as nx\n", "\n", "import matplotlib.pyplot as plt\n", "from tqdm import tqdm\n", "\n", "pd.set_option('display.max_colwidth', 200)\n", "%matplotlib inline" ], "metadata": { "id": "99zFA2FishIy" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "# extract one example sentence from spaCy's bundled English examples: index 3 (0-based), i.e. the 4th sentence in the list\n", "\n", "from spacy.lang.en.examples import sentences \n", "\n", "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", "doc = nlp(sentences[3])\n", "\n", "print(doc.text)\n", "\n", "for token in doc:\n", " print(token.text, token.pos_, token.dep_)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "B7Au8A32riVo", "outputId": "b6e5ac80-755d-4db8-f01e-42442785a0ef" }, "execution_count": 30, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "London is a big city in the United Kingdom.\n", "London PROPN nsubj\n", "is AUX ROOT\n", "a DET det\n", "big ADJ amod\n", "city NOUN attr\n", "in ADP prep\n", "the DET det\n", "United PROPN compound\n", "Kingdom PROPN pobj\n", ". 
PUNCT punct\n" ] } ] }, { "cell_type": "code", "source": [ "sentences" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NL05RF4YeO9v", "outputId": "dac6e7b0-333a-4982-8710-97940d0c22de" }, "execution_count": 31, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "['Apple is looking at buying U.K. startup for $1 billion',\n", " 'Autonomous cars shift insurance liability toward manufacturers',\n", " 'San Francisco considers banning sidewalk delivery robots',\n", " 'London is a big city in the United Kingdom.',\n", " 'Where are you?',\n", " 'Who is the president of France?',\n", " 'What is the capital of the United States?',\n", " 'When was Barack Obama born?']" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "code", "source": [ "# Download the sentences CSV from https://drive.google.com/file/d/1yuEUhkVFIYfMVfpA_crFGfSeJLgbPUxu/view and save it next to this notebook as wiki_sentences_v2.csv (the file name the loading cell reads)." ], "metadata": { "id": "vnUTUwAJshMC" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# load the Wikipedia sentences (4318 rows, a single `sentence` column)\n", "candidate_sentences = pd.read_csv(\"wiki_sentences_v2.csv\")\n", "candidate_sentences.shape" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ToC3f1xfshPD", "outputId": "8e13a9a6-f62f-4a46-f5ec-9c73ad821c98" }, "execution_count": 32, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(4318, 1)" ] }, "metadata": {}, "execution_count": 32 } ] }, { "cell_type": "code", "source": [ "candidate_sentences" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 419 }, "id": "qS2JOftMawmH", "outputId": "f8f09dbc-5c87-46cd-b172-986a70dec0b6" }, "execution_count": 33, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " sentence\n", "0 confused and frustrated, connie decides to leave on her own.\n", "1 later, a woman’s scream is heard in the distance.\n", "2 christian is then paralyzed by an elder.\n", "3 the temple 
is set on fire.\n", "4 outside, the cult wails with him.\n", "... ...\n", "4313 confidencial also responded negatively, calling the film a barren drama, unsubtle and self-indulgent.\n", "4314 and le parisien gave the film their highest five-star rating.\n", "4315 the museum collection includes 37,000 film titles, 60,000 posters, 700,000 photographs and 20,000 books.\n", "4316 its predecessor was the dutch historical film archive, founded in 1946.\n", "4317 , 1920'sfilmstar greta garbo by alexander binder,\n", "\n", "[4318 rows x 1 columns]" ], "text/html": [ "\n", "
\n", " | sentence | \n", "
---|---|
0 | \n", "confused and frustrated, connie decides to leave on her own. | \n", "
1 | \n", "later, a woman’s scream is heard in the distance. | \n", "
2 | \n", "christian is then paralyzed by an elder. | \n", "
3 | \n", "the temple is set on fire. | \n", "
4 | \n", "outside, the cult wails with him. | \n", "
... | \n", "... | \n", "
4313 | \n", "confidencial also responded negatively, calling the film a barren drama, unsubtle and self-indulgent. | \n", "
4314 | \n", "and le parisien gave the film their highest five-star rating. | \n", "
4315 | \n", "the museum collection includes 37,000 film titles, 60,000 posters, 700,000 photographs and 20,000 books. | \n", "
4316 | \n", "its predecessor was the dutch historical film archive, founded in 1946. | \n", "
4317 | \n", ", 1920'sfilmstar greta garbo by alexander binder, | \n", "
4318 rows × 1 columns
\n", "