{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "notebook-title",
   "metadata": {},
   "source": [
    "# Word vectors, similarity and pipelines with spaCy\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. loads `en_core_web_md` and reads the text in `data/wiki_us.txt`,\n",
    "2. looks up the entries of the vector table that are most similar to a word,\n",
    "3. compares documents and spans with `.similarity()`,\n",
    "4. contrasts a blank pipeline (sentencizer only) with `en_core_web_sm` using `analyze_pipes()`.\n",
    "\n",
    "The path `data/wiki_us.txt` is relative to the directory the notebook is run from."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "minus-gibson",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "alternative-flashing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install each model only when it is missing, and do it through the kernel's\n",
    "# own interpreter so the package lands in the environment this notebook uses.\n",
    "for model_name in (\"en_core_web_md\", \"en_core_web_sm\"):\n",
    "    if not spacy.util.is_package(model_name):\n",
    "        !\"{sys.executable}\" -m spacy download {model_name}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-load",
   "metadata": {},
   "source": [
    "## Load the model and the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "shaped-detector",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load(\"en_core_web_md\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "third-genome",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Explicit encoding: the platform default differs between operating systems.\n",
    "# NOTE(review): assumes the file is UTF-8 encoded - confirm.\n",
    "with open(\"data/wiki_us.txt\", \"r\", encoding=\"utf-8\") as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "disturbed-practice",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(text)\n",
    "# Only the first sentence is needed, so do not materialise all of them.\n",
    "sentence1 = next(iter(doc.sents))\n",
    "print(sentence1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-vectors",
   "metadata": {},
   "source": [
    "## Most similar entries in the vector table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "creative-robin",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['country', 'COUNTRY', 'NATION', 'nation', 'COUNTIRES', 'nations', 'member-states', 'worLd', 'World', 'world']\n"
     ]
    }
   ],
   "source": [
    "your_word = \"country\"\n",
    "\n",
    "query_vector = nlp.vocab.vectors[nlp.vocab.strings[your_word]]\n",
    "\n",
    "# most_similar returns (keys, best_rows, scores), one row per query vector.\n",
    "# The third element holds similarity scores (higher = closer), not distances.\n",
    "keys, _, scores = nlp.vocab.vectors.most_similar(\n",
    "    np.asarray([query_vector]), n=10\n",
    ")\n",
    "words = [nlp.vocab.strings[key] for key in keys[0]]\n",
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-similarity",
   "metadata": {},
   "source": [
    "## Document and span similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "medical-party",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc1 = nlp(\"I like salty fries and hamburgers.\")\n",
    "doc2 = nlp(\"Fast food tastes very good.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "logical-connectivity",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I like salty fries and hamburgers. <-> Fast food tastes very good. 0.7799485853415737\n"
     ]
    }
   ],
   "source": [
    "print(doc1, \"<->\", doc2, doc1.similarity(doc2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "severe-advocacy",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc3 = nlp(\"The Empire State Building is in New York.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "working-appearance",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I like salty fries and hamburgers. <-> The Empire State Building is in New York. 0.5196037639243649\n"
     ]
    }
   ],
   "source": [
    "print(doc1, \"<->\", doc3, doc1.similarity(doc3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "graphic-start",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc4 = nlp(\"I enjoy oranges.\")\n",
    "doc5 = nlp(\"I enjoy apples.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "pregnant-efficiency",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I enjoy oranges. <-> I enjoy apples. 0.9607558420297302\n"
     ]
    }
   ],
   "source": [
    "print(doc4, \"<->\", doc5, doc4.similarity(doc5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "enclosed-questionnaire",
   "metadata": {},
   "outputs": [],
   "source": [
    "doc6 = nlp(\"I enjoy burgers.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "confused-coral",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I enjoy oranges. <-> I enjoy burgers. 0.8755329986251214\n"
     ]
    }
   ],
   "source": [
    "print(doc4, \"<->\", doc6, doc4.similarity(doc6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "sensitive-concrete",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "salty fries <-> hamburgers 0.7304624\n"
     ]
    }
   ],
   "source": [
    "# A two-token span compared against a single token of the same document.\n",
    "french_fries = doc1[2:4]\n",
    "burgers = doc1[5]\n",
    "print(french_fries, \"<->\", burgers, french_fries.similarity(burgers))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "section-pipelines",
   "metadata": {},
   "source": [
    "## Inspecting pipelines\n",
    "\n",
    "The blank pipeline gets its own name so that the `en_core_web_md` pipeline in `nlp` stays available if earlier cells are re-run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "informational-roberts",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_blank = spacy.blank(\"en\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cloudy-qatar",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trailing semicolon hides the component repr, which is not informative here.\n",
    "nlp_blank.add_pipe(\"sentencizer\");"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "practical-carter",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],\n",
       " 'requires': [],\n",
       " 'scores': ['sents_f', 'sents_p', 'sents_r'],\n",
       " 'retokenizes': False}},\n",
       " 'problems': {'sentencizer': []},\n",
       " 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},\n",
       " 'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp_blank.analyze_pipes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "received-proposition",
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp2 = spacy.load(\"en_core_web_sm\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "closed-phenomenon",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'summary': {'tok2vec': {'assigns': ['doc.tensor'],\n",
       " 'requires': [],\n",
       " 'scores': [],\n",
       " 'retokenizes': False},\n",
       " 'tagger': {'assigns': ['token.tag'],\n",
       " 'requires': [],\n",
       " 'scores': ['tag_acc'],\n",
       " 'retokenizes': False},\n",
       " 'parser': {'assigns': ['token.dep',\n",
       " 'token.head',\n",
       " 'token.is_sent_start',\n",
       " 'doc.sents'],\n",
       " 'requires': [],\n",
       " 'scores': ['dep_uas',\n",
       " 'dep_las',\n",
       " 'dep_las_per_type',\n",
       " 'sents_p',\n",
       " 'sents_r',\n",
       " 'sents_f'],\n",
       " 'retokenizes': False},\n",
       " 'attribute_ruler': {'assigns': [],\n",
       " 'requires': [],\n",
       " 'scores': [],\n",
       " 'retokenizes': False},\n",
       " 'lemmatizer': {'assigns': ['token.lemma'],\n",
       " 'requires': [],\n",
       " 'scores': ['lemma_acc'],\n",
       " 'retokenizes': False},\n",
       " 'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],\n",
       " 'requires': [],\n",
       " 'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],\n",
       " 'retokenizes': False}},\n",
       " 'problems': {'tok2vec': [],\n",
       " 'tagger': [],\n",
       " 'parser': [],\n",
       " 'attribute_ruler': [],\n",
       " 'lemmatizer': [],\n",
       " 'ner': []},\n",
       " 'attrs': {'token.ent_type': {'assigns': ['ner'], 'requires': []},\n",
       " 'doc.ents': {'assigns': ['ner'], 'requires': []},\n",
       " 'token.dep': {'assigns': ['parser'], 'requires': []},\n",
       " 'token.head': {'assigns': ['parser'], 'requires': []},\n",
       " 'token.tag': {'assigns': ['tagger'], 'requires': []},\n",
       " 'token.lemma': {'assigns': ['lemmatizer'], 'requires': []},\n",
       " 'token.ent_iob': {'assigns': ['ner'], 'requires': []},\n",
       " 'doc.tensor': {'assigns': ['tok2vec'], 'requires': []},\n",
       " 'doc.sents': {'assigns': ['parser'], 'requires': []},\n",
       " 'token.is_sent_start': {'assigns': ['parser'], 'requires': []}}}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nlp2.analyze_pipes()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}