{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Linear Algebra and Embeddings\n",
    "\n",
    "(C) 2023-2024 by [Damir Cavar](http://damir.cavar.me/)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Prerequisites:**\n",
    "\n",
    "The install cells use `%pip` so that packages are installed into the environment of the running kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U pip setuptools wheel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following installation of spaCy is ideal for my environment, i.e., using a GPU and CUDA 12.x. See the [spaCy homepage](https://spacy.io/usage) for detailed installation instructions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install -U 'spacy[cuda12x,transformers,lookups,ja]'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the language model that is loaded below. `en_core_web_lg` ships with static word vectors, which `doc.vector` and `token.vector` rely on:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m spacy download en_core_web_lg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using spaCy Word Embeddings\n",
    "\n",
    "Load the large English pipeline and inspect the vector that spaCy assigns to a short document."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp = spacy.load(\"en_core_web_lg\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-1.0969249 , 3.5422752 , 0.23759246, -0.555475 , 5.2427 ,\n",
       " -2.4753752 , 2.3339748 , 2.8237414 , 0.33422732, -0.3434676 ,\n",
       " 12.083975 , 0.33232486, -3.362825 , -1.320285 , 0.388 ,\n",
       " 4.3468122 , 3.9268746 , 6.310725 , 1.81281 , 3.459995 ,\n",
       " -0.60847497, 0.6960525 , -3.12215 , 1.7629642 , -3.78505 ,\n",
       " -3.615975 , -2.401775 , -3.773425 , -4.586575 , -0.17040491,\n",
       " -2.2923126 , 0.20159996, -2.092505 , -0.081205 , -3.5820775 ,\n",
       " 1.7506775 , -0.04733169, -1.336425 , 6.71595 , -1.0187502 ,\n",
       " -3.2500327 , 1.653998 , -1.67977 , 0.19409996, -1.7241253 ,\n",
       " 3.2048974 , 5.43085 , -2.63674 , -3.197685 , -0.8757375 ,\n",
       " -1.1576226 , -1.9054224 , 4.32505 , -3.5928502 , -0.06439996,\n",
       " -0.49245 , 3.9352 , 0.5091 , -1.5124725 , -1.2777625 ,\n",
       " 3.9715326 , 1.292665 , -1.6772001 , -0.33407497, -3.6834948 ,\n",
       " 4.0337377 , -2.8430524 , -8.60705 , 3.8568497 , 3.188225 ,\n",
       " -1.6995806 , 5.7394476 , -5.68779 , 0.86530495, -2.75266 ,\n",
       " 3.5973 , -2.9996974 , 4.636125 , -4.753975 , -0.2553249 ,\n",
       " -5.2018423 , -4.1078873 , 1.892025 , -2.5739925 , 3.290225 ,\n",
       " 0.91276246, 2.585525 , 0.35443497, 7.3732 , -2.6650624 ,\n",
       " -0.56189257, -5.0054746 , 4.42545 , -6.362225 , 1.3437551 ,\n",
       " -5.9611998 , -0.04044253, 1.149175 , -2.2931252 , -5.1349 ,\n",
       " 0.47136992, 0.9424001 , 6.57765 , 5.74815 , 1.3653026 ,\n",
       " 6.008925 , 2.159755 , -0.08589995, -3.4470825 , 4.2521124 ,\n",
       " 0.10369492, 4.1392746 , -1.749475 , 7.122425 , 0.04479751,\n",
       " -2.5704076 , -5.2203474 , 0.983475 , 0.13849998, -0.45529997,\n",
       " -1.576725 , -2.256875 , 1.4163951 , 7.4344 , -1.99145 ,\n",
       " -8.1637745 , -1.7708502 , -1.64675 , 2.368075 , -4.5596 ,\n",
       " -4.775775 , -0.41967502, 5.3922496 , -8.33895 , 0.11092576,\n",
       " 0.98969454, 4.9369574 , -2.4393501 , 6.700625 , -3.98615 ,\n",
       " -1.87852 , 0.36134487, 4.5063353 , 2.10782 , 3.9655325 ,\n",
       " 1.5841775 , -3.2904477 , 1.2405727 , -0.65805 , 7.336225 ,\n",
       " 1.882325 , 7.14828 , 0.38635895, 3.0701375 , 0.289495 ,\n",
       " 1.954175 , 4.4154773 , 4.4448624 , 0.21266496, -0.7630849 ,\n",
       " -1.7062174 , -1.5508088 , 3.17329 , 0.08830002, -4.14565 ,\n",
       " -3.20905 , -5.540575 , 7.433675 , 1.93855 , -0.79538745,\n",
       " 2.7830474 , -1.2273326 , 2.6722 , 3.1045501 , 1.9526999 ,\n",
       " -3.207857 , 0.6988225 , -0.47439998, -4.4026175 , -2.2394 ,\n",
       " 0.28175497, 2.4338498 , 2.2315974 , -4.188975 , -1.8792049 ,\n",
       " -1.2445525 , -4.4771748 , -3.99115 , -0.74327505, 3.2435255 ,\n",
       " -3.2351851 , -2.325425 , -1.6298001 , -0.8707749 , 4.8065248 ,\n",
       " 2.1342926 , -3.81695 , -0.85352504, 4.0910277 , -1.7153752 ,\n",
       " 0.04182005, -1.887125 , -5.3869495 , -5.5109253 , 2.2279 ,\n",
       " 2.9002626 , -7.1083503 , 0.454875 , -2.0965447 , -2.765925 ,\n",
       " 4.333975 , 2.377125 , 0.51845 , 5.481675 , 3.5761576 ,\n",
       " -0.91882753, 0.7577725 , -2.49445 , -1.6242576 , 1.6717949 ,\n",
       " -2.1889749 , -2.0408049 , 2.3667 , -1.1684225 , -0.07747507,\n",
       " -2.3084002 , 3.493225 , 0.72685254, 3.7181323 , -3.602925 ,\n",
       " 0.64301497, -8.118751 , 1.93061 , 3.2603297 , -5.6989703 ,\n",
       " 0.41267496, 2.078825 , 0.18630004, 2.070035 , 1.38596 ,\n",
       " 1.1946499 , 0.78668505, 2.7850251 , 5.969575 , -1.6403899 ,\n",
       " -2.9643674 , -4.727984 , 1.2468748 , 1.7687201 , 4.2653275 ,\n",
       " 2.348125 , -1.56483 , -4.87081 , -6.1311 , -1.8386024 ,\n",
       " -0.61717516, 4.7199774 , 5.5335 , 1.6379997 , -1.7111676 ,\n",
       " -0.9985751 , 4.558325 , 3.0915775 , 7.3542247 , -2.38635 ,\n",
       " 1.1966676 , -2.45665 , 1.8378799 , -6.0176997 , -2.69588 ,\n",
       " 4.44735 , -3.6373749 , -0.8115974 , -5.8442 , 0.4275999 ,\n",
       " 1.28295 , 3.026945 , 3.9658577 , -1.0834899 , 0.150175 ,\n",
       " -0.14967 , -5.245175 , -3.9003997 , 2.0858262 , 10.94865 ,\n",
       " -0.87130004, 5.7962255 , -1.77914 , -2.003475 , 4.728225 ,\n",
       " 2.3407001 , 1.0157275 , 5.048585 , 0.57595 , -0.27465004,\n",
       " 4.7957 , 2.306925 , -0.85118246, -3.8584175 , 2.6602824 ],\n",
       " dtype=float32)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc = nlp(\"This is a text\")\n",
    "doc.vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One vector per token of the document.\n",
    "doc_v = [token.vector for token in doc]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Averaging vectors\n",
    "\n",
    "By default, `doc.vector` is the average of the token vectors. The cells below work through element-wise averaging on two small NumPy vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.5, 2.5])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a = np.array([1.0, 2.0])\n",
    "b = np.array([2.0, 3.0])\n",
    "(a + b) / 2.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.5, 2.5])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "c = [a, b]\n",
    "sum(c) / len(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that `np.mean` without an `axis` argument averages over all elements and returns a single scalar. Passing `axis=0` yields the element-wise mean across the vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean([a, b], dtype=float)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1.5, 2.5])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.mean([a, b], dtype=float, axis=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}