{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy \n",
    "import sys \n",
    "import nmslib \n",
    "import time \n",
    "import math \n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from sklearn.model_selection import train_test_split\n",
    "print(sys.version)\n",
    "print(\"NMSLIB version:\", nmslib.__version__)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Just read the data\n",
    "all_data_matrix = numpy.loadtxt('../../sample_data/final128_10K.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a held-out query data set\n",
    "(data_matrix, query_matrix) = train_test_split(all_data_matrix, test_size = 0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# of queries 1000, # of data points 9000\n"
     ]
    }
   ],
   "source": [
    "print(\"# of queries %d, # of data points %d\"  % (query_matrix.shape[0], data_matrix.shape[0]) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}\n"
     ]
    }
   ],
   "source": [
    "# Set index parameters\n",
    "# These are the most important onese\n",
    "M = 15\n",
    "efC = 100\n",
    "\n",
    "num_threads = 4\n",
    "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}\n",
    "print('Index-time parameters', index_time_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of neighbors \n",
    "K=100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Space name should correspond to the space name \n",
    "# used for brute-force search\n",
    "space_name='l2'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9000"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Intitialize the library, specify the space, the type of the vector and add data points \n",
    "index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n",
    "index.addDataPointBatch(data_matrix) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index-time parameters {'M': 15, 'indexThreadQty': 4, 'efConstruction': 100}\n",
      "Indexing time = 0.246434\n"
     ]
    }
   ],
   "source": [
    "# Create an index\n",
    "start = time.time()\n",
    "index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}\n",
    "index.createIndex(index_time_params) \n",
    "end = time.time() \n",
    "print('Index-time parameters', index_time_params)\n",
    "print('Indexing time = %f' % (end-start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Setting query-time parameters {'efSearch': 100}\n"
     ]
    }
   ],
   "source": [
    "# Setting query-time parameters\n",
    "efS = 100\n",
    "query_time_params = {'efSearch': efS}\n",
    "print('Setting query-time parameters', query_time_params)\n",
    "index.setQueryTimeParams(query_time_params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kNN time total=0.026962 (sec), per query=0.000027 (sec), per query adjusted for thread number=0.000108 (sec)\n"
     ]
    }
   ],
   "source": [
    "# Querying\n",
    "query_qty = query_matrix.shape[0]\n",
    "start = time.time() \n",
    "nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n",
    "end = time.time() \n",
    "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n",
    "      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Computing gold-standard data\n",
      "Brute-force preparation time 0.001275\n",
      "brute-force kNN time total=0.242228 (sec), per query=0.000242 (sec)\n"
     ]
    }
   ],
   "source": [
    "# Computing gold-standard data \n",
    "print('Computing gold-standard data')\n",
    "\n",
    "start = time.time()\n",
    "sindx = NearestNeighbors(n_neighbors=K, metric='l2', algorithm='brute').fit(data_matrix)\n",
    "end = time.time()\n",
    "\n",
    "print('Brute-force preparation time %f' % (end - start))\n",
    "\n",
    "start = time.time() \n",
    "gs = sindx.kneighbors(query_matrix)\n",
    "end = time.time()\n",
    "\n",
    "print('brute-force kNN time total=%f (sec), per query=%f (sec)' % \n",
    "      (end-start, float(end-start)/query_qty) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kNN recall 0.993460\n"
     ]
    }
   ],
   "source": [
    "# Finally computing recall\n",
    "recall=0.0\n",
    "for i in range(0, query_qty):\n",
    "  correct_set = set(gs[1][i])\n",
    "  ret_set = set(nbrs[i][0])\n",
    "  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n",
    "recall = recall / query_qty\n",
    "print('kNN recall %f' % recall)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save a meta index, but no data!\n",
    "index.saveIndex('dense_index_optim.bin', save_data=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-intitialize the library, specify the space, the type of the vector.\n",
    "newIndex = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) \n",
    "# For an optimized L2 index, there's no need to re-load data points, but this would be required for\n",
    "# non-optimized index or any other methods different from HNSW (other methods can save only meta indices)\n",
    "#newIndex.addDataPointBatch(data_matrix) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-load the index and re-run queries\n",
    "newIndex.loadIndex('dense_index_optim.bin')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Setting query-time parameters {'efSearch': 100}\n",
      "kNN time total=0.026182 (sec), per query=0.000026 (sec), per query adjusted for thread number=0.000105 (sec)\n"
     ]
    }
   ],
   "source": [
    "# Setting query-time parameters and querying\n",
    "print('Setting query-time parameters', query_time_params)\n",
    "newIndex.setQueryTimeParams(query_time_params)\n",
    "\n",
    "query_qty = query_matrix.shape[0]\n",
    "start = time.time() \n",
    "new_nbrs = newIndex.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)\n",
    "end = time.time() \n",
    "print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % \n",
    "      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "kNN recall 0.993460\n"
     ]
    }
   ],
   "source": [
    "# Finally computing recall for the new result set\n",
    "recall=0.0\n",
    "for i in range(0, query_qty):\n",
    "  correct_set = set(gs[1][i])\n",
    "  ret_set = set(new_nbrs[i][0])\n",
    "  recall = recall + float(len(correct_set.intersection(ret_set))) / len(correct_set)\n",
    "recall = recall / query_qty\n",
    "print('kNN recall %f' % recall)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}