{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Reading node feature in Python\n", "\n", "We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences\n", "in the Hebrew Bible.\n", "\n", "We measure the execution time of a second run of the last cell, so that we do not count warming up effects." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:47:56.225513Z", "start_time": "2018-07-19T11:47:56.218956Z" } }, "outputs": [], "source": [ "import os\n", "import sys\n", "from typing import Dict" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Choice of test feature" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:47:58.922497Z", "start_time": "2018-07-19T11:47:58.918414Z" } }, "outputs": [], "source": [ "base = f'~/text-fabric-data/etcbc/bhsa/tf/c'\n", "feature = 'g_word_utf8'\n", "featurePath = f'{os.path.expanduser(base)}/{feature}.tf'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Auxiliary functions for reading a TF feature" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:00.082742Z", "start_time": "2018-07-19T11:48:00.078974Z" } }, "outputs": [], "source": [ "def error(msg):\n", " sys.stderr.write(f'{msg}\\n')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:00.591342Z", "start_time": "2018-07-19T11:48:00.586803Z" } }, "outputs": [], "source": [ "def showResults(errors, data):\n", " if errors == 0:\n", " maxNode = max(data.keys()) if type(data) is dict else len(data)\n", " print(f'{len(data)} results, last node {maxNode}')\n", " print(data[1])\n", " print(data[2])\n", " print(data[maxNode if type(data) is dict else maxNode - 1])\n", " else:\n", " print(f'{errors} errors')" ] }, { "cell_type": "code", "execution_count": 5, 
"metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:00.983864Z", "start_time": "2018-07-19T11:48:00.978419Z" } }, "outputs": [], "source": [ "def valueFromTf(tf):\n", " return '\\\\'.join(x.replace('\\\\t', '\\t').replace('\\\\n', '\\n') for x in tf.split('\\\\\\\\'))\n", "\n", "def setFromSpec(spec):\n", " covered = set()\n", " for r_str in spec.split(','):\n", " bounds = r_str.split('-')\n", " if len(bounds) == 1:\n", " covered.add(int(r_str))\n", " else:\n", " b = int(bounds[0])\n", " e = int(bounds[1])\n", " if (e < b):\n", " (b, e) = (e, b)\n", " for n in range(b, e + 1):\n", " covered.add(n)\n", " return covered" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Just reading a TF feature from disk, get through the metadata, en deliver all lines in memory, plus the starting line for the data.\n", "\n", "The whole file gets slurped." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:01.633852Z", "start_time": "2018-07-19T11:48:01.627609Z" } }, "outputs": [], "source": [ "def readFile(path):\n", " if not os.path.exists(path):\n", " error('TF reading: feature file \"{}\" does not exist'.format(path))\n", " return False\n", " with open(path, encoding='utf8') as fh:\n", " contents = fh.read()\n", " lines = contents.split('\\n')\n", " if lines[-1] == '':\n", " lines.pop()\n", " i = 0\n", " for line in lines:\n", " i += 1\n", " if line.startswith('@'):\n", " continue\n", " else:\n", " if line != '':\n", " error('Line {}: missing blank line after metadata'.format(i))\n", " return False\n", " else:\n", " break\n", " return (lines, i)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The readTf function as done in Text-Fabric." 
] },
 { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:02.236340Z", "start_time": "2018-07-19T11:48:02.231003Z" } }, "outputs": [], "source": [
  "def readTf(path):\n",
  "    '''Read a TF feature file line by line into a dict.\n",
  "\n",
  "    The metadata lines (starting with `@`) are skipped, the blank line after\n",
  "    them is checked, and the rest of the file is handed to `readDataTf()`.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple | False\n",
  "        What `readDataTf()` returns, or `False` if the file does not exist or\n",
  "        the blank line after the metadata is missing.\n",
  "    '''\n",
  "    if not os.path.exists(path):\n",
  "        error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
  "        return False\n",
  "    # the with-statement closes the file on every exit, also when reading the data raises\n",
  "    with open(path, encoding='utf8') as fh:\n",
  "        i = 0\n",
  "        for line in fh:\n",
  "            i += 1\n",
  "            text = line.rstrip()\n",
  "            if text.startswith('@'):\n",
  "                continue\n",
  "            if text != '':\n",
  "                error('Line {}: missing blank line after metadata'.format(i))\n",
  "                return False\n",
  "            break\n",
  "        return readDataTf(fh, i)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "Reading the data part of a feature and storing it in a dict." ] },
 { "cell_type": "code", "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:02.702087Z", "start_time": "2018-07-19T11:48:02.695497Z" } }, "outputs": [], "source": [
  "def readDataTf(fh, firstI):\n",
  "    '''Read the data lines of a TF feature into a dict keyed by node.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    fh: iterable of str\n",
  "        Delivers the remaining lines of the file, each possibly ending in a newline.\n",
  "    firstI: int\n",
  "        Number of lines that have been read already; only used for the\n",
  "        line numbers in error messages.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple\n",
  "        `(errors, data)`: the number of lines with too many fields, and the dict of values.\n",
  "    '''\n",
  "    i = firstI\n",
  "    implicit_node = 1\n",
  "    data = {}\n",
  "    normFields = 2\n",
  "    isNum = False\n",
  "    errors = 0\n",
  "    for line in fh:\n",
  "        i += 1\n",
  "        fields = line.rstrip('\\n').split('\\t')\n",
  "        lfields = len(fields)\n",
  "        if lfields > normFields:\n",
  "            error(f'{i}: wrongFields')\n",
  "            errors += 1\n",
  "            continue\n",
  "        if lfields == normFields:\n",
  "            nodes = setFromSpec(fields[0])\n",
  "        else:\n",
  "            # a line with a value only belongs to the node after the previous one\n",
  "            nodes = {implicit_node}\n",
  "        # the value is always the last field: split() never returns an empty list\n",
  "        valTf = fields[-1]\n",
  "        implicit_node = max(nodes) + 1\n",
  "        if isNum:\n",
  "            value = int(valTf) if valTf != '' else None\n",
  "        else:\n",
  "            value = '' if valTf == '' else valueFromTf(valTf)\n",
  "        if value is not None:\n",
  "            for n in nodes:\n",
  "                data[n] = value\n",
  "    return (errors, data)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "A variant: read a TF feature and 
store it in a list." ] },
 { "cell_type": "code", "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:03.069068Z", "start_time": "2018-07-19T11:48:03.063652Z" } }, "outputs": [], "source": [
  "def readTfList(path):\n",
  "    '''Read a TF feature file line by line into a list.\n",
  "\n",
  "    The metadata lines (starting with `@`) are skipped, the blank line after\n",
  "    them is checked, and the rest of the file is handed to `readDataTfList()`.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple | False\n",
  "        What `readDataTfList()` returns, or `False` if the file does not exist or\n",
  "        the blank line after the metadata is missing.\n",
  "    '''\n",
  "    if not os.path.exists(path):\n",
  "        error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
  "        return False\n",
  "    # the with-statement closes the file on every exit, also when reading the data raises\n",
  "    with open(path, encoding='utf8') as fh:\n",
  "        i = 0\n",
  "        for line in fh:\n",
  "            i += 1\n",
  "            text = line.rstrip()\n",
  "            if text.startswith('@'):\n",
  "                continue\n",
  "            if text != '':\n",
  "                error('Line {}: missing blank line after metadata'.format(i))\n",
  "                return False\n",
  "            break\n",
  "        return readDataTfList(fh, i)"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:03.229306Z", "start_time": "2018-07-19T11:48:03.218625Z" } }, "outputs": [], "source": [
  "def readDataTfList(fh, firstI):\n",
  "    '''Read the data lines of a TF feature into a list.\n",
  "\n",
  "    Values are appended in the order in which they are read, once for every\n",
  "    node of the line; the node numbers themselves are not stored.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    fh: iterable of str\n",
  "        Delivers the remaining lines of the file, each possibly ending in a newline.\n",
  "    firstI: int\n",
  "        Number of lines that have been read already; only used for the\n",
  "        line numbers in error messages.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple\n",
  "        `(errors, data)`: the number of lines with too many fields, and the list of values.\n",
  "    '''\n",
  "    i = firstI\n",
  "    implicit_node = 1\n",
  "    data = []\n",
  "    normFields = 2\n",
  "    isNum = False\n",
  "    errors = 0\n",
  "    for line in fh:\n",
  "        i += 1\n",
  "        fields = line.rstrip('\\n').split('\\t')\n",
  "        lfields = len(fields)\n",
  "        if lfields > normFields:\n",
  "            error(f'{i}: wrongFields')\n",
  "            errors += 1\n",
  "            continue\n",
  "        if lfields == normFields:\n",
  "            nodes = setFromSpec(fields[0])\n",
  "        else:\n",
  "            # a line with a value only belongs to the node after the previous one\n",
  "            nodes = {implicit_node}\n",
  "        # the value is always the last field: split() never returns an empty list\n",
  "        valTf = fields[-1]\n",
  "        implicit_node = max(nodes) + 1\n",
  "        if isNum:\n",
  "            value = int(valTf) if valTf != '' else None\n",
  "        else:\n",
  "            value = '' if valTf == '' else valueFromTf(valTf)\n",
  "        if value is not None:\n",
  "            data.extend([value] * len(nodes))\n",
  "    return (errors, data)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "Read a TF feature by slurping."
] },
 { "cell_type": "code", "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:03.579472Z", "start_time": "2018-07-19T11:48:03.569737Z" } }, "outputs": [], "source": [
  "def readTfSlurp(path):\n",
  "    '''Read a TF feature file in one go into a dict.\n",
  "\n",
  "    The file is slurped and its metadata is skipped by `readFile()`;\n",
  "    the data lines are then processed by `readDataTfSlurp()`.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple | False\n",
  "        What `readDataTfSlurp()` returns, or `False` if `readFile()` fails.\n",
  "    '''\n",
  "    slurped = readFile(path)\n",
  "    if slurped is False:\n",
  "        return False\n",
  "    (lines, firstI) = slurped\n",
  "    return readDataTfSlurp(lines, firstI)"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:03.753557Z", "start_time": "2018-07-19T11:48:03.746903Z" } }, "outputs": [], "source": [
  "def readDataTfSlurp(lines, firstI):\n",
  "    '''Turn the data lines of a slurped TF feature into a dict keyed by node.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    lines: list of str\n",
  "        All lines of the file, without newlines.\n",
  "    firstI: int\n",
  "        1-based number of the blank line after the metadata; the data is `lines[firstI:]`.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    tuple\n",
  "        `(errors, data)`: the number of lines with too many fields, and the dict of values.\n",
  "    '''\n",
  "    implicit_node = 1\n",
  "    data = {}\n",
  "    normFields = 2\n",
  "    isNum = False\n",
  "    errors = 0\n",
  "    # lines[firstI] is line firstI + 1 of the file, so error messages count from there\n",
  "    for (i, line) in enumerate(lines[firstI:], start=firstI + 1):\n",
  "        fields = line.split('\\t')\n",
  "        lfields = len(fields)\n",
  "        if lfields > normFields:\n",
  "            error(f'{i}: wrongFields')\n",
  "            errors += 1\n",
  "            continue\n",
  "        if lfields == normFields:\n",
  "            nodes = setFromSpec(fields[0])\n",
  "        else:\n",
  "            # a line with a value only belongs to the node after the previous one\n",
  "            nodes = {implicit_node}\n",
  "        # the value is always the last field: split() never returns an empty list\n",
  "        valTf = fields[-1]\n",
  "        implicit_node = max(nodes) + 1\n",
  "        if isNum:\n",
  "            value = int(valTf) if valTf != '' else None\n",
  "        else:\n",
  "            value = '' if valTf == '' else valueFromTf(valTf)\n",
  "        if value is not None:\n",
  "            for n in nodes:\n",
  "                data[n] = value\n",
  "    return (errors, data)"
 ] },
 { "cell_type": "code", "execution_count": 27, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:59:05.067131Z", "start_time":
"2018-07-19T11:59:05.056167Z" } }, "outputs": [], "source": [ "def readDataTfSlurpOpt(lines, firstI):\n", " i = firstI - 1\n", " implicit_node = 1\n", " data: Dict[int, str] = dict()\n", " normFields = 2\n", " isNum = False\n", " errors = 0\n", " for line in lines[firstI:]:\n", " i += 1\n", " fields = line.split('\\t')\n", " lfields = len(fields)\n", " if lfields > normFields:\n", " error(f'{i}: wrongFields')\n", " errors += 1\n", " continue\n", " if lfields == normFields:\n", " nodes = setFromSpec(fields[0])\n", " valTf = fields[-1]\n", " else:\n", " nodes = {implicit_node}\n", " if lfields == 1:\n", " valTf = fields[0]\n", " else:\n", " valTf = ''\n", " implicit_node = max(nodes) + 1\n", " value = (\n", " int(valTf) if isNum and valTf != '' else None if isNum else ''\n", " if valTf == '' else valueFromTf(valTf)\n", " )\n", " for n in nodes:\n", " if value is not None:\n", " data[n] = value\n", " return (errors, data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test: straight TF reading" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:06.426847Z", "start_time": "2018-07-19T11:48:05.262865Z" } }, "outputs": [], "source": [ "(errors, data) = readTf(featurePath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execution time: around 1.2s" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:08.891030Z", "start_time": "2018-07-19T11:48:08.869834Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "426584 results, last node 426584\n", "בְּ\n", "רֵאשִׁ֖ית\n", "יָֽעַל\n" ] } ], "source": [ "showResults(errors, data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test: TF reading as list" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:17.065267Z", "start_time": "2018-07-19T11:48:15.855416Z" } }, 
"outputs": [], "source": [ "(errors, data) = readTfList(featurePath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execution time: around 1.2s" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:20.257274Z", "start_time": "2018-07-19T11:48:20.250622Z" }, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "426584 results, last node 426584\n", "רֵאשִׁ֖ית\n", "בָּרָ֣א\n", "יָֽעַל\n" ] } ], "source": [ "showResults(errors, data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test: TF slurping" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:23.825878Z", "start_time": "2018-07-19T11:48:22.693180Z" } }, "outputs": [], "source": [ "(errors, data) = readTfSlurp(featurePath)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:48:26.234900Z", "start_time": "2018-07-19T11:48:26.214816Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "426584 results, last node 426584\n", "בְּ\n", "רֵאשִׁ֖ית\n", "יָֽעַל\n" ] } ], "source": [ "showResults(errors, data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execution time: around 1.1s" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Test: slurping and then optimized TF processing" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:59:23.449149Z", "start_time": "2018-07-19T11:59:23.368331Z" } }, "outputs": [], "source": [ "(lines, first) = readFile(featurePath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execution time: around 0.1s" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:59:26.987024Z", "start_time": "2018-07-19T11:59:25.944259Z" }, "scrolled": true }, "outputs": [], "source": [ "(errors, 
data) = readDataTfSlurpOpt(lines, first)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Execution time: around 1.0s" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2018-07-19T11:59:20.380376Z", "start_time": "2018-07-19T11:59:20.363503Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "426584 results, last node 426584\n", "בְּ\n", "רֵאשִׁ֖ית\n", "יָֽעַל\n" ] } ], "source": [ "showResults(errors, data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }