{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Approximate k-NN search on dense vectors with an NMSLIB HNSW index\n",
    "\n",
    "This notebook builds an HNSW index over dense vectors in the `l2` space, measures query time and recall against a brute-force scikit-learn baseline, then saves the index (without the data points), re-loads it into a fresh index object and repeats the evaluation.\n",
    "\n",
    "Stored outputs were cleared when the code was revised; restart the kernel and run all cells to regenerate them. An earlier, unseeded run printed a kNN recall of 0.993460 for both the original and the re-loaded index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy\n",
    "import sys\n",
    "import nmslib\n",
    "import time\n",
    "import math\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from sklearn.model_selection import train_test_split\n",
    "print(sys.version)\n",
    "print(\"NMSLIB version:\", nmslib.__version__)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives in this cell.\n",
    "DATA_FILE = '../../sample_data/final128_10K.txt'  # text matrix read with numpy.loadtxt\n",
    "INDEX_FILE = 'dense_index_optim.bin'              # file the index is saved to and re-loaded from\n",
    "SEED = 0                                          # fixes the data/query split\n",
    "\n",
    "# Number of neighbors\n",
    "K = 100\n",
    "\n",
    "# Space name should correspond to the space name\n",
    "# used for brute-force search\n",
    "space_name = 'l2'\n",
    "\n",
    "# Set index parameters\n",
    "# These are the most important ones\n",
    "M = 15\n",
    "efC = 100\n",
    "\n",
    "num_threads = 4\n",
    "# Single definition used both for display and for createIndex below.\n",
    "# 'post': 0 asks for no post-processing (documented as the HNSW default in the NMSLIB manual).\n",
    "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}\n",
    "print('Index-time parameters', index_time_params)\n",
    "\n",
    "# Query-time parameters (applied to every index before querying)\n",
    "efS = 100\n",
    "query_time_params = {'efSearch': efS}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def timed_knn_query(idx, queries, k, threads):\n",
    "    \"\"\"Run a batch kNN query, print timing statistics and return the neighbours.\n",
    "\n",
    "    idx     -- an index object exposing knnQueryBatch\n",
    "    queries -- 2-D array with one query vector per row\n",
    "    k       -- number of neighbours requested per query\n",
    "    threads -- number of threads passed to knnQueryBatch\n",
    "\n",
    "    Returns the value of idx.knnQueryBatch unchanged (indexable per query;\n",
    "    element 0 of each entry holds the neighbour ids).\n",
    "    \"\"\"\n",
    "    qty = queries.shape[0]\n",
    "    # perf_counter is the clock intended for measuring short elapsed intervals.\n",
    "    start = time.perf_counter()\n",
    "    result = idx.knnQueryBatch(queries, k=k, num_threads=threads)\n",
    "    elapsed = time.perf_counter() - start\n",
    "    print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %\n",
    "          (elapsed, elapsed / qty, threads * elapsed / qty))\n",
    "    return result\n",
    "\n",
    "\n",
    "def compute_recall(gold_ids, results):\n",
    "    \"\"\"Return the mean per-query recall of `results` against `gold_ids`.\n",
    "\n",
    "    gold_ids -- per-query sequences of true neighbour ids (e.g. gs[1])\n",
    "    results  -- per-query entries whose element 0 holds the retrieved ids\n",
    "\n",
    "    For each query, recall is |gold & retrieved| / |gold|. Returns 0.0 when\n",
    "    there are no queries instead of dividing by zero.\n",
    "    \"\"\"\n",
    "    qty = len(gold_ids)\n",
    "    if qty == 0:\n",
    "        return 0.0\n",
    "    total = 0.0\n",
    "    for true_ids, found in zip(gold_ids, results):\n",
    "        correct_set = set(true_ids)\n",
    "        ret_set = set(found[0])\n",
    "        total += float(len(correct_set.intersection(ret_set))) / len(correct_set)\n",
    "    return total / qty"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data and hold out a query set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just read the data\n",
    "all_data_matrix = numpy.loadtxt(DATA_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a held-out query data set. The fixed random_state makes the split,\n",
    "# and therefore the recall figures below, reproducible across runs.\n",
    "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size=0.1, random_state=SEED)\n",
    "query_qty = query_matrix.shape[0]\n",
    "print(\"# of queries %d, # of data points %d\" % (query_qty, data_matrix.shape[0]) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the HNSW index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the library, specify the space, the type of the vector and add data points\n",
    "index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)\n",
    "index.addDataPointBatch(data_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an index with the parameters from the configuration cell\n",
    "start = time.perf_counter()\n",
    "index.createIndex(index_time_params)\n",
    "end = time.perf_counter()\n",
    "print('Index-time parameters', index_time_params)\n",
    "print('Indexing time = %f' % (end-start))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting query-time parameters\n",
    "print('Setting query-time parameters', query_time_params)\n",
    "index.setQueryTimeParams(query_time_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Querying\n",
    "nbrs = timed_knn_query(index, query_matrix, K, num_threads)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Brute-force gold standard and recall"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Computing gold-standard data\n",
    "print('Computing gold-standard data')\n",
    "\n",
    "start = time.perf_counter()\n",
    "# The metric here must describe the same distance as space_name above.\n",
    "sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)\n",
    "end = time.perf_counter()\n",
    "\n",
    "print('Brute-force preparation time %f' % (end - start))\n",
    "\n",
    "start = time.perf_counter()\n",
    "gs = sindx.kneighbors(query_matrix)\n",
    "end = time.perf_counter()\n",
    "\n",
    "print('brute-force kNN time total=%f (sec), per query=%f (sec)' %\n",
    "      (end-start, float(end-start)/query_qty) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finally computing recall (gs[1] holds the brute-force neighbour indices)\n",
    "recall = compute_recall(gs[1], nbrs)\n",
    "print('kNN recall %f' % recall)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save, re-load and re-evaluate the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save a meta index, but no data!\n",
    "index.saveIndex(INDEX_FILE, save_data=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-initialize the library, specify the space, the type of the vector.\n",
    "newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)\n",
    "# For an optimized L2 index, there's no need to re-load data points, but this would be required for\n",
    "# non-optimized index or any other methods different from HNSW (other methods can save only meta indices)\n",
    "#newIndex.addDataPointBatch(data_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-load the index and re-run queries\n",
    "newIndex.loadIndex(INDEX_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting query-time parameters and querying\n",
    "print('Setting query-time parameters', query_time_params)\n",
    "newIndex.setQueryTimeParams(query_time_params)\n",
    "\n",
    "new_nbrs = timed_knn_query(newIndex, query_matrix, K, num_threads)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finally computing recall for the new result set\n",
    "recall = compute_recall(gs[1], new_nbrs)\n",
    "print('kNN recall %f' % recall)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}