{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Demo 1 - Distanz\n",
    "\n",
    "Dies ist ein Jupyter Notebook, welches auf mybinder.org gehostet wird. Alle Eingaben werden nach Beendigung gelöscht. \n",
    "\n",
    "Jupyter Notebooks enthalten Text- oder Codeblöcke. Codeblöcke werden durch SHIFT-ENTER ausgeführt (Dreiecksymbol oben rechts auf Mobil).\n",
    "\n",
    "Solange ein Block ausgeführt wird, wird nicht mehr die Nr [3] des Blocks, sondern [*] angezeigt.\n",
    "\n",
    "**<a href=\"./Demo2-Objekterkennung.ipynb\" target=\"_blank\">Link zu Demo 2</a>**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##### Initialisierung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gensim ist die Standard Word2Vec-Implementation\n",
    "import gensim\n",
    "\n",
    "# Check, welche Version genutzt wird. Ändert sich häufig mit starken Änderungen.\n",
    "print(gensim.__version__)\n",
    "\n",
    "# Modell mit Top 5000 deutschen Worten laden, 6MB\n",
    "model = gensim.models.KeyedVectors.load_word2vec_format(\"min5000.model\", binary=True)\n",
    "\n",
    "# FutureWarning ignorieren\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Nächste Nachbarn anzeigen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Die 10 Wörter laden, die München am Nächsten sind und Wahrscheinlichkeiten dazu ausgegen.\n",
    "\n",
    "matches = model.most_similar(positive=[\"Muenchen\"], negative=None, topn=10)\n",
    "\n",
    "matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Wort-Mathematik"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Muenchen ist zu Bayern wie Hannover zu ???\n",
    "\n",
    "matches = model.most_similar(positive=[\"Hannover\", \"Bayern\"], negative=[\"Muenchen\"], topn=10)\n",
    "\n",
    "matches"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "### Visualisierung von Wordvektoren"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.manifold import TSNE\n",
    "\n",
    "print('geladen')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Definieren der Anzeigefunktion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# By: https://github.com/devmount/GermanWordEmbeddings/blob/master/visualize.py\n",
    "\n",
    "# function draw_words\n",
    "# ... reduces dimensionality of vectors of given words either with PCA or with t-SNE and draws the words into a diagram\n",
    "# @param word2vec model     to visualize vectors from\n",
    "# @param list     words     list of word strings to visualize\n",
    "# @param bool     pca       use PCA (True) or t-SNE (False) to reduce dimensionality \n",
    "# @param bool     alternate use different color and label align for every second word\n",
    "# @param bool     arrows    use arrows to connect related words (items that are next to each other in list)\n",
    "# @param float    x1        x axis range (from)\n",
    "# @param float    x2        x axis range (to)\n",
    "# @param float    y1        y axis range (from)\n",
    "# @param float    y2        y axis range (to)\n",
    "# @param string   title     for diagram\n",
    "def draw_words(model, words, pca=False, alternate=True, arrows=True, x1=3, x2=3, y1=3, y2=3, title=''):\n",
    "    # get vectors for given words from model\n",
    "    vectors = [model[word] for word in words]\n",
    "\n",
    "    if pca:\n",
    "        pca = PCA(n_components=2, whiten=True)\n",
    "        vectors2d = pca.fit(vectors).transform(vectors)\n",
    "    else:\n",
    "        tsne = TSNE(n_components=2, random_state=0)\n",
    "        vectors2d = tsne.fit_transform(vectors)\n",
    "\n",
    "    # draw image\n",
    "    plt.figure(figsize=(6,6))\n",
    "    if pca:\n",
    "        plt.axis([x1, x2, y1, y2])\n",
    "\n",
    "    first = True # color alternation to divide given groups\n",
    "    for point, word in zip(vectors2d , words):\n",
    "        # plot points\n",
    "        plt.scatter(point[0], point[1], c='r' if first else 'g')\n",
    "        # plot word annotations\n",
    "        plt.annotate(\n",
    "            word, \n",
    "            xy = (point[0], point[1]),\n",
    "            xytext = (-7, -6) if first else (7, -6),\n",
    "            textcoords = 'offset points',\n",
    "            ha = 'right' if first else 'left',\n",
    "            va = 'bottom',\n",
    "            size = \"x-large\"\n",
    "        )\n",
    "        first = not first if alternate else first\n",
    "\n",
    "    # draw arrows\n",
    "    if arrows:\n",
    "        for i in range(0, len(words)-1, 2):\n",
    "            a = vectors2d[i][0] + 0.04\n",
    "            b = vectors2d[i][1]\n",
    "            c = vectors2d[i+1][0] - 0.04\n",
    "            d = vectors2d[i+1][1]\n",
    "            plt.arrow(a, b, c-a, d-b,\n",
    "                shape='full',\n",
    "                lw=0.1,\n",
    "                edgecolor='#bbbbbb',\n",
    "                facecolor='#bbbbbb',\n",
    "                length_includes_head=True,\n",
    "                head_width=0.08,\n",
    "                width=0.01\n",
    "            )\n",
    "\n",
    "    # draw diagram title\n",
    "    if title:\n",
    "        plt.title(title)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2-D Visualisierung von Nachbarn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nachbarn suchen und anzeigen\n",
    "matches = model.most_similar(positive=[\"denken\"], negative=[], topn=10)\n",
    "words = [match[0] for match in matches]\n",
    "draw_words(model, words, True, False, False, -3, 3, -3, 3, r'$PCA\\ Visualisierung:\\ Nachbarn$')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}