{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Demo 1 - Distanz\n",
    "\n",
    "Dies ist ein Jupyter Notebook, welches auf mybinder.org gehostet wird. Alle Eingaben werden nach Beendigung gelöscht. \n",
    "\n",
    "Jupyter Notebooks enthalten Text- oder Codeblöcke. Codeblöcke werden durch SHIFT-ENTER ausgeführt (Dreiecksymbol oben rechts auf Mobil).\n",
    "\n",
    "Solange ein Block ausgeführt wird, wird nicht mehr die Nr [3] des Blocks, sondern [*] angezeigt.\n",
    "\n",
    "**<a href=\"./Demo2-Objekterkennung.ipynb\" target=\"_blank\">Link zu Demo 2</a>**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Initialisierung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports: standard library first, then gensim (the standard Word2Vec implementation)\n",
    "import warnings\n",
    "\n",
    "import gensim\n",
    "\n",
    "# Print the gensim version in use; releases often come with substantial changes.\n",
    "print(gensim.__version__)\n",
    "\n",
    "# Load the model holding the top 5000 German words (about 6 MB)\n",
    "model = gensim.models.KeyedVectors.load_word2vec_format(\n",
    "    \"min5000.model\",\n",
    "    binary=True,\n",
    ")\n",
    "\n",
    "# Silence warnings in the following cells. The intent is to hide FutureWarning output;\n",
    "# note that 'ignore' without a category hides every warning.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Nächste Nachbarn anzeigen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch the 10 words closest to \"Muenchen\" together with their similarity scores\n",
    "\n",
    "matches = model.most_similar(\n",
    "    positive=[\"Muenchen\"],\n",
    "    negative=None,\n",
    "    topn=10,\n",
    ")\n",
    "\n",
    "matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Wort-Mathematik"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word analogy: \"Muenchen\" is to \"Bayern\" as \"Hannover\" is to ???\n",
    "\n",
    "matches = model.most_similar(\n",
    "    positive=[\"Hannover\", \"Bayern\"],\n",
    "    negative=[\"Muenchen\"],\n",
    "    topn=10,\n",
    ")\n",
    "\n",
    "matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
"collapsed": true
   },
   "source": [
    "### Visualisierung von Wordvektoren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "\n",
    "print('geladen')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Definieren der Anzeigefunktion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# By: https://github.com/devmount/GermanWordEmbeddings/blob/master/visualize.py\n",
    "\n",
    "def draw_words(model, words, pca=False, alternate=True, arrows=True, x1=3, x2=3, y1=3, y2=3, title=''):\n",
    "    \"\"\"Reduce the vectors of the given words to 2-D and draw the words into a diagram.\n",
    "\n",
    "    The vector of every word is looked up as ``model[word]`` and reduced to two\n",
    "    dimensions either with PCA or with t-SNE; each word is then drawn as a\n",
    "    labelled point.\n",
    "\n",
    "    Args:\n",
    "        model: word2vec model to visualize vectors from.\n",
    "        words: list of word strings to visualize.\n",
    "        pca: use PCA (True) or t-SNE (False) to reduce dimensionality.\n",
    "        alternate: use different color and label align for every second word.\n",
    "        arrows: use arrows to connect related words (items that are next to\n",
    "            each other in the list: 0 -> 1, 2 -> 3, ...).\n",
    "        x1: x axis range (from); only used when ``pca`` is True.\n",
    "        x2: x axis range (to); only used when ``pca`` is True.\n",
    "        y1: y axis range (from); only used when ``pca`` is True.\n",
    "        y2: y axis range (to); only used when ``pca`` is True.\n",
    "        title: title for the diagram; nothing is drawn when it is empty.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if ``words`` is empty.\n",
    "    \"\"\"\n",
    "    if len(words) == 0:\n",
    "        raise ValueError('draw_words needs at least one word')\n",
    "\n",
    "    # get vectors for given words from model\n",
    "    vectors = [model[word] for word in words]\n",
    "\n",
    "    # reduce to 2-D; the estimator gets its own name so the boolean pca flag is not overwritten\n",
    "    if pca:\n",
    "        reducer = PCA(n_components=2, whiten=True)\n",
    "        vectors2d = reducer.fit(vectors).transform(vectors)\n",
    "    else:\n",
    "        # NOTE(review): newer scikit-learn releases reportedly require perplexity < number of\n",
    "        # samples, so short word lists may be rejected here - confirm against the installed version\n",
    "        reducer = TSNE(n_components=2, random_state=0)\n",
    "        vectors2d = reducer.fit_transform(vectors)\n",
    "\n",
    "    # draw image\n",
    "    plt.figure(figsize=(6,6))\n",
    "    if pca:\n",
    "        # A limit pair with from == to (such as the 3/3 defaults) would collapse the axis,\n",
    "        # so it is skipped and matplotlib's autoscaling is kept for that axis.\n",
    "        if x1 != x2:\n",
    "            plt.xlim(x1, x2)\n",
    "        if y1 != y2:\n",
    "            plt.ylim(y1, y2)\n",
    "\n",
    "    first = True  # color alternation to divide given groups\n",
    "    for point, word in zip(vectors2d, words):\n",
    "        # plot points\n",
    "        plt.scatter(point[0], point[1], c='r' if first else 'g')\n",
    "        # plot word annotations\n",
    "        plt.annotate(\n",
    "            word,\n",
    "            xy=(point[0], point[1]),\n",
    "            xytext=(-7, -6) if first else (7, -6),\n",
    "            textcoords='offset points',\n",
    "            ha='right' if first else 'left',\n",
    "            va='bottom',\n",
    "            size=\"x-large\"\n",
    "        )\n",
    "        first = not first if alternate else first\n",
    "\n",
    "    # draw arrows between consecutive pairs of words: 0 -> 1, 2 -> 3, ...\n",
    "    if arrows:\n",
    "        for i in range(0, len(words)-1, 2):\n",
    "            # start point is shifted right and end point left by 0.04 on the x axis\n",
    "            start_x = vectors2d[i][0] + 0.04\n",
    "            start_y = vectors2d[i][1]\n",
    "            end_x = vectors2d[i+1][0] - 0.04\n",
    "            end_y = vectors2d[i+1][1]\n",
    "            plt.arrow(start_x, start_y, end_x-start_x, end_y-start_y,\n",
    "                shape='full',\n",
    "                lw=0.1,\n",
    "                edgecolor='#bbbbbb',\n",
    "                facecolor='#bbbbbb',\n",
    "                length_includes_head=True,\n",
    "                head_width=0.08,\n",
    "                width=0.01\n",
    "            )\n",
    "\n",
    "    # draw diagram title\n",
    "    if title:\n",
    "        plt.title(title)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2-D Visualisierung von Nachbarn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the nearest neighbours and display them\n",
    "matches = model.most_similar(positive=[\"denken\"], negative=[], topn=10)\n",
    "words = [match[0] for match in matches]\n",
    "draw_words(model, words, True, False, False, -3, 3, -3, 3, r'$PCA\\ Visualisierung:\\ Nachbarn$')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}