{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 1: Introduction\n",
    "\n",
    "Let's build a term-frequency vector for an input text, e.g., for document `doc1`. The first dimension counts occurrences of *management*, the second counts occurrences of *meeting*:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3, 5]\n"
     ]
    }
   ],
   "source": [
    "# Raw text of doc1; kept under its own name so that `doc1` below can hold the vector\n",
    "doc1_text = \"meeting ... management ... meeting ... management ... meeting \"\n",
    "doc1_text += \"... management ... meeting ... meeting\"\n",
    "\n",
    "# Term-frequency vector: [count of \"management\", count of \"meeting\"]\n",
    "vector = [0, 0]\n",
    "\n",
    "# split() without arguments splits on any run of whitespace,\n",
    "# so doubled spaces never produce empty tokens\n",
    "for word in doc1_text.split():\n",
    "    if word == \"management\":\n",
    "        vector[0] += 1\n",
    "    elif word == \"meeting\":\n",
    "        vector[1] += 1\n",
    "\n",
    "print(vector)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is how you can calculate *Euclidean distance* between a document and a query:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.47213595499958\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "\n",
    "query = [1, 1]\n",
    "doc1 = [3, 5]\n",
    "\n",
    "# Sum of squared per-dimension differences; its square root is the distance\n",
    "sq_length = sum((d - q) ** 2 for d, q in zip(doc1, query))\n",
    "\n",
    "print(math.sqrt(sq_length))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, let's calculate *cosine similarity*:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9701425001453319\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "\n",
    "query = [1, 1]\n",
    "doc1 = [3, 5]\n",
    "\n",
    "\n",
    "def length(vector):\n",
    "    \"\"\"Return the Euclidean (L2) norm of `vector`.\"\"\"\n",
    "    return math.sqrt(sum(component ** 2 for component in vector))\n",
    "\n",
    "\n",
    "def dot_product(vector1, vector2):\n",
    "    \"\"\"Return the dot product of two vectors of equal dimensionality.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the vectors have different lengths.\n",
    "    \"\"\"\n",
    "    if len(vector1) != len(vector2):\n",
    "        # Fail loudly: returning an error string here would only surface\n",
    "        # later as a confusing TypeError in the division below.\n",
    "        raise ValueError(\n",
    "            f\"Unmatching dimensionality: {len(vector1)} vs {len(vector2)}\"\n",
    "        )\n",
    "    return sum(a * b for a, b in zip(vector1, vector2))\n",
    "\n",
    "\n",
    "cosine = dot_product(query, doc1) / (length(query) * length(doc1))\n",
    "print(cosine)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}