{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from bisect import bisect_left\n", "from timeit import timeit\n", "from tf.app import use\n", "from pack import deepSize" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "chapters = [[1, 1, 10], [2, 11, 20], [3, 21, 30]]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "lastMs = []\n", "ids = []\n", "\n", "for record in chapters:\n", " lastMs.append(record[2])\n", " ids.append(record[0])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 1 => 0 => 1\n", " 2 => 0 => 1\n", " 3 => 0 => 1\n", " 4 => 0 => 1\n", " 5 => 0 => 1\n", " 6 => 0 => 1\n", " 7 => 0 => 1\n", " 8 => 0 => 1\n", " 9 => 0 => 1\n", "10 => 0 => 1\n", "11 => 1 => 2\n", "12 => 1 => 2\n", "13 => 1 => 2\n", "14 => 1 => 2\n", "15 => 1 => 2\n", "16 => 1 => 2\n", "17 => 1 => 2\n", "18 => 1 => 2\n", "19 => 1 => 2\n", "20 => 1 => 2\n", "21 => 2 => 3\n", "22 => 2 => 3\n", "23 => 2 => 3\n", "24 => 2 => 3\n", "25 => 2 => 3\n", "26 => 2 => 3\n", "27 => 2 => 3\n", "28 => 2 => 3\n", "29 => 2 => 3\n", "30 => 2 => 3\n", "31 => 3 => x\n", "32 => 3 => x\n", "33 => 3 => x\n", "34 => 3 => x\n" ] } ], "source": [ "for i in range(1, 35):\n", " ind = bisect_left(lastMs, i)\n", " cid = ids[ind] if ind < len(ids) else \"x\"\n", " print(f\"{i:>2} => {ind:>2} => {cid:>2}\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[10, 20, 30]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lastMs" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "TF-app: ~/github/annotation/app-bhsa/code" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/bhsa/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/phono/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/text-fabric-data/etcbc/parallels/tf/c" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "A = use('bhsa:clone', silent='deep')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Given a list of valid indices and a list of all values, we can look up all values by means of bisect.\n", "\n", "The get function is surprisingly simple and quite fast." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# nametype\n", "# sp\n", "\n", "def testPerformance(feat):\n", " Fs = A.api.Fs\n", " featObj = Fs(feat)\n", " data = featObj.data\n", " \n", " indices = []\n", " values = []\n", " for (k, v) in sorted(data.items()):\n", " indices.append(k)\n", " values.append(v)\n", " # print(indices[0:10])\n", " # print(values[0:10])\n", "\n", " indSize = deepSize(indices)\n", " valSize = deepSize(values)\n", " totSize = indSize + valSize\n", "\n", " print(f\"{len(data)} items\")\n", " print(f\"Size = {deepSize(data)}\")\n", " print(f\"Size = {indSize} + {valSize} = {totSize}\")\n", " \n", " tfLookup = featObj.v\n", " \n", " def bsLookup(i):\n", " j = bisect_left(indices, i)\n", " if j >= len(indices):\n", " return None\n", " k = indices[j]\n", " return values[j] if k == i else None\n", " \n", " maxIndex = max(indices)\n", " \n", " def execute(v):\n", " upperIndex = maxIndex + 10\n", " key0 = 739 # not in the data\n", " key1 = 740 # in the data\n", " \n", " def w():\n", " for i in range(700, 1700):\n", " x = v(i)\n", " \n", " def a():\n", " n = 0\n", " for i in range(upperIndex):\n", " if v(i) is not None:\n", " n += 1\n", " \n", " times1 = 1000000\n", " times2 = 1000\n", " times3 = 1\n", " \n", " t1 = timeit(\"v(key0)\", globals=locals(), number=times1)\n", " t2 = timeit(\"v(key1)\", globals=locals(), number=times1)\n", " t3 = timeit(\"w()\", globals=locals(), number=times2)\n", " t4 = timeit(\"a()\", globals=locals(), number=times3)\n", " print(f\"{t1:>.3f} {t2:>.3f} {t3:>.3f} {t4 * 1000000 / upperIndex:>.3f}\")\n", " \n", " execute(tfLookup)\n", " execute(bsLookup)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "38184 items\n", "Size = 2380461\n", "Size = 1390248 + 321597 = 1711845\n", "0.154 0.216 0.159 0.164\n", "0.523 0.537 0.512 0.515\n" ] } ], "source": [ "testPerformance('nametype')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "435817 items\n", "Size = 33175225\n", "Size = 16016172 + 3814037 = 19830209\n", "0.210 0.214 0.210 0.186\n", "0.637 0.591 0.618 0.634\n" ] } ], "source": [ "testPerformance('sp')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Observation\n", "\n", "The performance degradation is a factor of **3-4**, but no more memory is used (rather a bit less).\n", "More over, instead of a dict we have two lists, which we can manage in a separate process by means of SharableList." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }