{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Greek Spacy.ipynb", "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "metadata": { "id": "yIARIXVzyqZe", "colab_type": "code", "outputId": "0fe8fd77-c0fd-4699-96cb-4ecae7317b14", "colab": { "base_uri": "https://localhost:8080/", "height": 845 } }, "source": [ "!pip install -U spacy" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Collecting spacy\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/47/13/80ad28ef7a16e2a86d16d73e28588be5f1085afd3e85e4b9b912bd700e8a/spacy-2.2.3-cp36-cp36m-manylinux1_x86_64.whl (10.4MB)\n", "\u001b[K |████████████████████████████████| 10.4MB 4.2MB/s \n", "\u001b[?25hRequirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.0.3)\n", "Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=0.1.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.2.0)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.17.4)\n", "Collecting preshed<3.1.0,>=3.0.2\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/db/6b/e07fad36913879757c90ba03d6fb7f406f7279e11dcefc105ee562de63ea/preshed-3.0.2-cp36-cp36m-manylinux1_x86_64.whl (119kB)\n", "\u001b[K |████████████████████████████████| 122kB 42.2MB/s \n", "\u001b[?25hCollecting thinc<7.4.0,>=7.3.0\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/07/59/6bb553bc9a5f072d3cd479fc939fea0f6f682892f1f5cff98de5c9b615bb/thinc-7.3.1-cp36-cp36m-manylinux1_x86_64.whl (2.2MB)\n", "\u001b[K |████████████████████████████████| 2.2MB 30.3MB/s \n", "\u001b[?25hRequirement already 
satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.2)\n", "Collecting catalogue<1.1.0,>=0.0.7\n", " Downloading https://files.pythonhosted.org/packages/4f/d5/46ff975f0d7d055cf95557b944fd5d29d9dfb37a4341038e070f212b24fe/catalogue-0.0.8-py2.py3-none-any.whl\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy) (41.6.0)\n", "Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.4.0)\n", "Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.9.6)\n", "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.21.0)\n", "Collecting blis<0.5.0,>=0.4.0\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/41/19/f95c75562d18eb27219df3a3590b911e78d131b68466ad79fdf5847eaac4/blis-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)\n", "\u001b[K |████████████████████████████████| 3.7MB 41.9MB/s \n", "\u001b[?25hRequirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.10.0 in /usr/local/lib/python3.6/dist-packages (from thinc<7.4.0,>=7.3.0->spacy) (4.28.1)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (0.23)\n", "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2019.9.11)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) 
(1.24.3)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.8)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy) (0.6.0)\n", "Requirement already satisfied, skipping upgrade: more-itertools in /usr/local/lib/python3.6/dist-packages (from zipp>=0.5->importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy) (7.2.0)\n", "Installing collected packages: preshed, blis, thinc, catalogue, spacy\n", " Found existing installation: preshed 2.0.1\n", " Uninstalling preshed-2.0.1:\n", " Successfully uninstalled preshed-2.0.1\n", " Found existing installation: blis 0.2.4\n", " Uninstalling blis-0.2.4:\n", " Successfully uninstalled blis-0.2.4\n", " Found existing installation: thinc 7.0.8\n", " Uninstalling thinc-7.0.8:\n", " Successfully uninstalled thinc-7.0.8\n", " Found existing installation: spacy 2.1.9\n", " Uninstalling spacy-2.1.9:\n", " Successfully uninstalled spacy-2.1.9\n", "Successfully installed blis-0.4.1 catalogue-0.0.8 preshed-3.0.2 spacy-2.2.3 thinc-7.3.1\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "LXpmvaPijt4w", "colab_type": "code", "outputId": "8b9850b6-c137-4982-fb82-697212c64d06", "colab": { "base_uri": "https://localhost:8080/", "height": 717 } }, "source": [ "!python -m spacy download el" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Collecting el_core_news_sm==2.2.5\n", "\u001b[?25l Downloading https://github.com/explosion/spacy-models/releases/download/el_core_news_sm-2.2.5/el_core_news_sm-2.2.5.tar.gz (11.4MB)\n", "\u001b[K |████████████████████████████████| 11.4MB 790kB/s \n", "\u001b[?25hRequirement already satisfied: spacy>=2.2.2 in /usr/local/lib/python3.6/dist-packages (from el_core_news_sm==2.2.5) (2.2.3)\n", 
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (2.0.3)\n", "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (1.17.4)\n", "Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (0.0.8)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (41.6.0)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (3.0.2)\n", "Requirement already satisfied: thinc<7.4.0,>=7.3.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (7.3.1)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (1.0.2)\n", "Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (0.4.0)\n", "Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (0.9.6)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (2.21.0)\n", "Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (0.4.1)\n", "Requirement already satisfied: srsly<1.1.0,>=0.1.0 in /usr/local/lib/python3.6/dist-packages (from spacy>=2.2.2->el_core_news_sm==2.2.5) (0.2.0)\n", "Requirement already satisfied: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->el_core_news_sm==2.2.5) 
(0.23)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /usr/local/lib/python3.6/dist-packages (from thinc<7.4.0,>=7.3.0->spacy>=2.2.2->el_core_news_sm==2.2.5) (4.28.1)\n", "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->el_core_news_sm==2.2.5) (1.24.3)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->el_core_news_sm==2.2.5) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->el_core_news_sm==2.2.5) (2019.9.11)\n", "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->el_core_news_sm==2.2.5) (2.8)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->el_core_news_sm==2.2.5) (0.6.0)\n", "Requirement already satisfied: more-itertools in /usr/local/lib/python3.6/dist-packages (from zipp>=0.5->importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->el_core_news_sm==2.2.5) (7.2.0)\n", "Building wheels for collected packages: el-core-news-sm\n", " Building wheel for el-core-news-sm (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for el-core-news-sm: filename=el_core_news_sm-2.2.5-cp36-none-any.whl size=11422786 sha256=a2e4fd3c86575b7c8ae7a4ec4211ae68a7c27a748573477b9b43a831742e2b44\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-634wxpxz/wheels/70/a1/c5/6690d6b524d87e287a8070cf957f834fb1b1665b9ede11348b\n", "Successfully built el-core-news-sm\n", "Installing collected packages: el-core-news-sm\n", "Successfully installed el-core-news-sm-2.2.5\n", "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the model via spacy.load('el_core_news_sm')\n", "\u001b[38;5;2m✔ Linking successful\u001b[0m\n", "/usr/local/lib/python3.6/dist-packages/el_core_news_sm -->\n", "/usr/local/lib/python3.6/dist-packages/spacy/data/el\n", "You can now load the model via spacy.load('el')\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "Gc4rd8JJyS9Y", "colab_type": "code", "colab": {} }, "source": [ "import spacy\n", "\n", "# Load the Greek pipeline by its package name first: the \"el\" shortcut link only\n", "# exists on spaCy 2.x, and spaCy 3 raises OSError when asked to load a shortcut.\n", "try:\n", "    nlp = spacy.load(\"el_core_news_sm\")\n", "except OSError:\n", "    # Fall back to the \"el\" link that `spacy download el` created (spaCy 2.x), in case\n", "    # the freshly installed package cannot be resolved by name in this session.\n", "    nlp = spacy.load(\"el\")" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "colab_type": "code", "id": "p8dDdyS8kgXO", "colab": {} }, "source": [ "sample_text=\"Αυτό είναι ένα παράδειγμα για την επεξεργασία κειμένου. Δημιουργήθηκε από το Δημήτρη Παναγόπουλο τον Νoέμβριο του 2019 στην Αθήνα. 
Μπορείτε να το τρέξετε στο Colab της Google\"\n", "\n", "# Run the loaded pipeline over the sample text.\n", "doc = nlp(sample_text)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ge3FRXSA0pQ0", "colab_type": "code", "outputId": "bfc246c1-736d-47fb-d4cc-651c333f1654", "colab": { "base_uri": "https://localhost:8080/", "height": 550 } }, "source": [ "# One row per token: surface text, lemma and part-of-speech tag.\n", "for tok in doc:\n", "    print(f\"{tok.text} {tok.lemma_} {tok.pos_}\")" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "Αυτό αυτό PRON\n", "είναι είναι AUX\n", "ένα ένα DET\n", "παράδειγμα παράδειγμα NOUN\n", "για για ADP\n", "την την DET\n", "επεξεργασία επεξεργασίας NOUN\n", "κειμένου κειμένο NOUN\n", ". . PUNCT\n", "Δημιουργήθηκε δημιουργήθηκε VERB\n", "από από ADP\n", "το το DET\n", "Δημήτρη δημήτρη NOUN\n", "Παναγόπουλο παναγόπουλο NOUN\n", "τον τον DET\n", "Νoέμβριο νoέμβριο NOUN\n", "του του DET\n", "2019 2019 NUM\n", "στην στην ADJ\n", "Αθήνα Αθήνα PROPN\n", ". . PUNCT\n", "Μπορείτε μπορείτε VERB\n", "να να PART\n", "το το PRON\n", "τρέξετε τρέξω VERB\n", "στο στο ADV\n", "Colab colab X\n", "της της DET\n", "Google google X\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "uwp80jJtlYUE", "colab_type": "code", "outputId": "a7270266-1e2d-4680-e732-eaf99c825f8b", "colab": { "base_uri": "https://localhost:8080/", "height": 122 } }, "source": [ "from spacy import displacy\n", "\n", "# Render the document's named entities inline in the notebook.\n", "displacy.render(doc, jupyter=True, style=\"ent\")" ], "execution_count": 0, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "
Αυτό είναι ένα παράδειγμα για την επεξεργασία κειμένου. Δημιουργήθηκε από το \n", "\n", " Δημήτρη Παναγόπουλο\n", " ORG\n", "\n", " τον Νoέμβριο του 2019 στην \n", "\n", " Αθήνα\n", " GPE\n", "\n", ". Μπορείτε να το τρέξετε στο Colab της \n", "\n", " Google\n", " ORG\n", "\n", "
" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] }, { "cell_type": "code", "metadata": { "id": "IZXd10Tnmefz", "colab_type": "code", "colab": {} }, "source": [ "# Three words (dog, cat, king) whose tokens are compared pairwise below.\n", "sample_words = \"σκύλος γάτα βασιλιάς\"" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Msv3wlgbpk_y", "colab_type": "code", "colab": {} }, "source": [ "tokens = nlp(sample_words)" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-hacpD7rppfw", "colab_type": "code", "outputId": "ff897f43-b591-4b97-caa0-ca402550c2d1", "colab": { "base_uri": "https://localhost:8080/", "height": 35 } }, "source": [ "print(tokens)" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "σκύλος γάτα βασιλιάς\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "0uNtAMTNpq_i", "colab_type": "code", "outputId": "14f4ee3e-3bff-40aa-b488-be3cc7f93c3e", "colab": { "base_uri": "https://localhost:8080/", "height": 92 } }, "source": [ "# Similarity of the first and second word.\n", "# NOTE: the recorded run emitted W007 (this model has no word vectors loaded), so the\n", "# score may not be a useful similarity judgement; see the warning text in the output.\n", "print(tokens[0].similarity(tokens[1]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "/usr/lib/python3.6/runpy.py:193: ModelsWarning: [W007] The model you're using has no word vectors loaded, so the result of the Token.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. 
You can always add your own word vectors, or use one of the larger models instead if available.\n", " \"__main__\", mod_spec)\n" ], "name": "stderr" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.69062674" ] }, "metadata": { "tags": [] }, "execution_count": 31 } ] }, { "cell_type": "code", "metadata": { "id": "6nWtntI7puNh", "colab_type": "code", "outputId": "cf44e0eb-8f76-45db-bd80-97a1716c0214", "colab": { "base_uri": "https://localhost:8080/", "height": 92 } }, "source": [ "# Similarity of the first and third word (the W007 caveat in the output applies here too).\n", "print(tokens[0].similarity(tokens[2]))" ], "execution_count": 0, "outputs": [ { "output_type": "stream", "text": [ "0.4917702\n" ], "name": "stdout" }, { "output_type": "stream", "text": [ "/usr/lib/python3.6/runpy.py:193: ModelsWarning: [W007] The model you're using has no word vectors loaded, so the result of the Token.similarity method will be based on the tagger, parser and NER, which may not give useful similarity judgements. This may happen if you're using one of the small models, e.g. `en_core_web_sm`, which don't ship with word vectors and only use context-sensitive tensors. You can always add your own word vectors, or use one of the larger models instead if available.\n", " \"__main__\", mod_spec)\n" ], "name": "stderr" } ] } ] }