{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import KeyedVectors\n",
    "from gensim.scripts.glove2word2vec import glove2word2vec\n",
    "from gensim.test.utils import get_tmpfile"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will load only the first 1000 (i.e. the top 1000) vectors from the Python fastText model (128-dimensional vectors)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = KeyedVectors.load_word2vec_format(\"../model.vec\", binary=False, limit=1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you are loading GloVe embeddings instead, they need to be converted to the word2vec format first. Set `USE_GLOVE = True` in the next cell to do that; with the default (`False`) the cell does nothing and the model loaded above is kept."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional alternative to the previous cell: flip the flag to load GloVe vectors instead.\n",
    "USE_GLOVE = False\n",
    "\n",
    "if USE_GLOVE:\n",
    "    # glove2word2vec writes a word2vec-format copy of the GloVe file (the same text\n",
    "    # plus the header line that load_word2vec_format expects) into a temporary file.\n",
    "    tmpfile = get_tmpfile(\"source2vec\")\n",
    "    # A plain relative path is used here, like for \"../model.vec\"; gensim's datapath()\n",
    "    # helper only resolves names inside gensim's own bundled test-data directory.\n",
    "    glove2word2vec(\"../glove_model.txt\", tmpfile)\n",
    "    model = KeyedVectors.load_word2vec_format(tmpfile, binary=False, limit=1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now we can do fancy stuff"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('in', 0.8999497890472412),\n",
       " ('enumerate', 0.7851556539535522),\n",
       " ('and', 0.6600955724716187),\n",
       " ('set', 0.6515539884567261),\n",
       " ('range', 0.6467443704605103),\n",
       " ('the', 0.6438981294631958),\n",
       " ('are', 0.6360925436019897),\n",
       " ('len', 0.6327202916145325),\n",
       " ('or', 0.625091552734375),\n",
       " ('list', 0.6236062049865723)]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar(\"for\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('in', 0.949974000453949),\n",
       " ('enumerate', 0.8925769925117493),\n",
       " ('and', 0.8300470113754272),\n",
       " ('set', 0.825776219367981),\n",
       " ('range', 0.823371410369873),\n",
       " ('the', 0.8219482898712158),\n",
       " ('are', 0.8180454969406128),\n",
       " ('len', 0.8163593411445618),\n",
       " ('or', 0.8125450015068054),\n",
       " ('list', 0.811802327632904)]"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.most_similar_cosmul(\"for\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'a'"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.doesnt_match([\"for\", \"i\", \"a\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6013880741598091"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.similarity(\"for\", \"i\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('in', 0.8999497890472412),\n",
       " ('enumerate', 0.7851556539535522),\n",
       " ('and', 0.6600955724716187),\n",
       " ('set', 0.6515539884567261),\n",
       " ('range', 0.6467443704605103),\n",
       " ('the', 0.6438981294631958),\n",
       " ('are', 0.6360925436019897),\n",
       " ('len', 0.6327202916145325),\n",
       " ('or', 0.625091552734375),\n",
       " ('list', 0.6236062049865723)]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.similar_by_word(\"for\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Raw vector values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "</s> : [-0.0040119 -0.2772 0.39069 -0.11142 -0.064213 0.031526\n",
      " 0.072355 0.28111 0.098242 0.44947 0.0033427 0.059818\n",
      " 0.10572 0.033005 -0.16825 0.027287 -0.014041 -0.13167\n",
      " 0.20144 0.097083 0.13253 0.09556 -0.12805 0.10373\n",
      " -0.12057 0.36752 -0.13177 -0.070997 -0.079466 0.29838\n",
      " -0.066887 -0.069284 -0.26501 0.21408 0.020991 -0.34294\n",
      " -0.3189 -0.1705 0.11337 -0.22872 -0.024095 0.069369\n",
      " -0.31733 0.63158 0.084219 -0.23931 -0.17847 -0.38957\n",
      " -0.038808 -0.046805 -0.20444 -0.15775 -0.12279 -0.014646\n",
      " -0.10996 -0.060379 -0.16898 -0.0048211 -0.57151 0.18944\n",
      " 0.11457 -0.2425 -0.08871 -0.054677 -0.2549 -0.15642\n",
      " 0.12891 -0.27773 0.10004 -0.46064 0.25698 0.039099\n",
      " 0.24376 -0.14525 -0.27021 0.018427 0.046646 -0.090066\n",
      " 0.1492 0.0032186 -0.15175 -0.11093 0.35132 -0.068802\n",
      " 0.0021299 0.29755 -0.19092 0.0321 -0.086515 0.36746\n",
      " -0.15456 -0.051887 0.63347 0.02882 0.3993 -0.20558\n",
      " 0.08532 0.10247 -0.056457 -0.12951 -0.28994 0.15222\n",
      " 0.16311 -0.22158 0.032566 -0.38924 -0.20935 -0.12184\n",
      " 0.064111 -0.11226 0.10365 0.065956 -0.064537 0.072354\n",
      " -0.029869 0.016191 0.22993 0.03368 -0.074305 0.080369\n",
      " 0.062322 0.14384 0.059349 0.25721 -0.016504 0.034721\n",
      " -0.35689 -0.20129 ]\n"
     ]
    }
   ],
   "source": [
    "for word in model.vocab:\n",
    "    print(word, \":\", model[word])\n",
    "    break  # only first one now"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}