{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from bisect import bisect_left\n",
"from timeit import timeit\n",
"from tf.app import use\n",
"from pack import deepSize"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"chapters = [[1, 1, 10], [2, 11, 20], [3, 21, 30]]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Split the chapter records into two parallel lists:\n",
"# column 0 of each record goes to ids, column 2 to lastMs\n",
"# (lastMs is the list that bisect searches in the next cell).\n",
"ids = [record[0] for record in chapters]\n",
"lastMs = [record[2] for record in chapters]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1 => 0 => 1\n",
" 2 => 0 => 1\n",
" 3 => 0 => 1\n",
" 4 => 0 => 1\n",
" 5 => 0 => 1\n",
" 6 => 0 => 1\n",
" 7 => 0 => 1\n",
" 8 => 0 => 1\n",
" 9 => 0 => 1\n",
"10 => 0 => 1\n",
"11 => 1 => 2\n",
"12 => 1 => 2\n",
"13 => 1 => 2\n",
"14 => 1 => 2\n",
"15 => 1 => 2\n",
"16 => 1 => 2\n",
"17 => 1 => 2\n",
"18 => 1 => 2\n",
"19 => 1 => 2\n",
"20 => 1 => 2\n",
"21 => 2 => 3\n",
"22 => 2 => 3\n",
"23 => 2 => 3\n",
"24 => 2 => 3\n",
"25 => 2 => 3\n",
"26 => 2 => 3\n",
"27 => 2 => 3\n",
"28 => 2 => 3\n",
"29 => 2 => 3\n",
"30 => 2 => 3\n",
"31 => 3 => x\n",
"32 => 3 => x\n",
"33 => 3 => x\n",
"34 => 3 => x\n"
]
}
],
"source": [
"# For every number 1..34: find the first position in lastMs whose value is >= that number\n",
"# and show the id stored at that position; positions past the end are shown as x.\n",
"nIds = len(ids)\n",
"\n",
"for m in range(1, 35):\n",
"    pos = bisect_left(lastMs, m)\n",
"    if pos < nIds:\n",
"        cid = ids[pos]\n",
"    else:\n",
"        cid = \"x\"\n",
"    print(f\"{m:>2} => {pos:>2} => {cid:>2}\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[10, 20, 30]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lastMs"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"TF-app: ~/github/annotation/app-bhsa/code"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"data: ~/text-fabric-data/etcbc/bhsa/tf/c"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"data: ~/text-fabric-data/etcbc/phono/tf/c"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"data: ~/text-fabric-data/etcbc/parallels/tf/c"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A = use('bhsa:clone', silent='deep')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Given a sorted list of the valid indices and a parallel list of their values, we can look up any value by means of bisect.\n",
"\n",
"The get function is surprisingly simple and quite fast."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# nametype\n",
"# sp\n",
"\n",
"def testPerformance(feat, absentKey=739, presentKey=740):\n",
"    \"\"\"Compare a feature's own lookup with a bisect lookup over two parallel lists.\n",
"\n",
"    The data of feature `feat` (read from `A.api.Fs(feat).data`) is split into\n",
"    a sorted list of keys and a parallel list of values.\n",
"    First the item count and the sizes are printed: the size of the original data,\n",
"    then the sizes of both lists and their sum.\n",
"    Sizes are computed by `deepSize` from the local `pack` module\n",
"    (NOTE(review): presumably a recursive size in bytes; confirm).\n",
"\n",
"    Then one line of four timings is printed for the feature's own lookup (`featObj.v`)\n",
"    and one line for the bisect lookup. Each number is in seconds per million lookups:\n",
"\n",
"    1. one million lookups of `absentKey`;\n",
"    2. one million lookups of `presentKey`;\n",
"    3. 1000 sweeps over the keys 700 up to and including 1699;\n",
"    4. one sweep over all keys from 0 up to 10 past the largest key,\n",
"       scaled to one million lookups.\n",
"\n",
"    If the feature has no data, only the size report is printed and the timings are skipped.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    feat: string\n",
"        Name of the feature to test.\n",
"    absentKey: integer, optional 739\n",
"        Key used for the first timing, meant to be a key without a value.\n",
"        NOTE(review): the default was chosen by hand; whether it is really absent\n",
"        depends on the feature, so confirm for each feature.\n",
"    presentKey: integer, optional 740\n",
"        Key used for the second timing, meant to be a key with a value.\n",
"        NOTE(review): same caveat as for `absentKey`.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        All results are printed.\n",
"    \"\"\"\n",
"    featObj = A.api.Fs(feat)\n",
"    data = featObj.data\n",
"\n",
"    # Two parallel lists: the keys in sorted order and their values in the same order.\n",
"    indices = []\n",
"    values = []\n",
"    for k, v in sorted(data.items()):\n",
"        indices.append(k)\n",
"        values.append(v)\n",
"\n",
"    indSize = deepSize(indices)\n",
"    valSize = deepSize(values)\n",
"    totSize = indSize + valSize\n",
"\n",
"    print(f\"{len(data)} items\")\n",
"    print(f\"Size = {deepSize(data)}\")\n",
"    print(f\"Size = {indSize} + {valSize} = {totSize}\")\n",
"\n",
"    if not indices:\n",
"        # Without keys there is no largest key and nothing to time.\n",
"        print(\"no data, timings skipped\")\n",
"        return\n",
"\n",
"    tfLookup = featObj.v\n",
"\n",
"    def bsLookup(i):\n",
"        \"\"\"Return the value stored for key `i`, or None if `i` is not a key.\"\"\"\n",
"        j = bisect_left(indices, i)\n",
"        if j >= len(indices):\n",
"            return None\n",
"        k = indices[j]\n",
"        return values[j] if k == i else None\n",
"\n",
"    # indices is sorted, so its last element is the largest key.\n",
"    maxIndex = indices[-1]\n",
"\n",
"    def execute(v):\n",
"        \"\"\"Time lookup function `v` and print four timings, in seconds per million lookups.\"\"\"\n",
"        upperIndex = maxIndex + 10\n",
"        # The timed statements find key0, key1, v, w and a through globals=locals().\n",
"        key0 = absentKey\n",
"        key1 = presentKey\n",
"\n",
"        def w():\n",
"            # The result is discarded on purpose: only the lookup time matters.\n",
"            for i in range(700, 1700):\n",
"                x = v(i)\n",
"\n",
"        def a():\n",
"            # Sweeps all keys, including keys without a value and keys past the largest one.\n",
"            n = 0\n",
"            for i in range(upperIndex):\n",
"                if v(i) is not None:\n",
"                    n += 1\n",
"\n",
"        times1 = 1000000\n",
"        times2 = 1000\n",
"        times3 = 1\n",
"\n",
"        t1 = timeit(\"v(key0)\", globals=locals(), number=times1)\n",
"        t2 = timeit(\"v(key1)\", globals=locals(), number=times1)\n",
"        t3 = timeit(\"w()\", globals=locals(), number=times2)\n",
"        t4 = timeit(\"a()\", globals=locals(), number=times3)\n",
"        # t4 covers upperIndex lookups; scale it to one million lookups like the other three.\n",
"        print(f\"{t1:>.3f} {t2:>.3f} {t3:>.3f} {t4 * 1000000 / upperIndex:>.3f}\")\n",
"\n",
"    execute(tfLookup)\n",
"    execute(bsLookup)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"38184 items\n",
"Size = 2380461\n",
"Size = 1390248 + 321597 = 1711845\n",
"0.154 0.216 0.159 0.164\n",
"0.523 0.537 0.512 0.515\n"
]
}
],
"source": [
"testPerformance('nametype')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"435817 items\n",
"Size = 33175225\n",
"Size = 16016172 + 3814037 = 19830209\n",
"0.210 0.214 0.210 0.186\n",
"0.637 0.591 0.618 0.634\n"
]
}
],
"source": [
"testPerformance('sp')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Observation\n",
"\n",
"The performance degradation is a factor of **3-4**, but no more memory is used (rather a bit less).\n",
"Moreover, instead of a dict we have two lists, which we can manage in a separate process by means of `ShareableList` (from `multiprocessing.shared_memory`)."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}