{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reading node feature in Python\n",
    "\n",
    "We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences\n",
    "in the Hebrew Bible.\n",
    "\n",
    "We measure the execution time of a second run of the last cell, so that we do not count warming up effects."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:47:56.225513Z",
     "start_time": "2018-07-19T11:47:56.218956Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "from typing import Dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choice of test feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:47:58.922497Z",
     "start_time": "2018-07-19T11:47:58.918414Z"
    }
   },
   "outputs": [],
   "source": [
    "base = f'~/text-fabric-data/etcbc/bhsa/tf/c'\n",
    "feature = 'g_word_utf8'\n",
    "featurePath = f'{os.path.expanduser(base)}/{feature}.tf'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Auxiliary functions for reading a TF feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:00.082742Z",
     "start_time": "2018-07-19T11:48:00.078974Z"
    }
   },
   "outputs": [],
   "source": [
    "def error(msg):\n",
    "    sys.stderr.write(f'{msg}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:00.591342Z",
     "start_time": "2018-07-19T11:48:00.586803Z"
    }
   },
   "outputs": [],
   "source": [
    "def showResults(errors, data):\n",
    "  if errors == 0:\n",
    "    maxNode = max(data.keys()) if type(data) is dict else len(data)\n",
    "    print(f'{len(data)} results, last node {maxNode}')\n",
    "    print(data[1])\n",
    "    print(data[2])\n",
    "    print(data[maxNode if type(data) is dict else maxNode - 1])\n",
    "  else:\n",
    "    print(f'{errors} errors')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:00.983864Z",
     "start_time": "2018-07-19T11:48:00.978419Z"
    }
   },
   "outputs": [],
   "source": [
    "def valueFromTf(tf):\n",
    "  return '\\\\'.join(x.replace('\\\\t', '\\t').replace('\\\\n', '\\n') for x in tf.split('\\\\\\\\'))\n",
    "\n",
    "def setFromSpec(spec):\n",
    "  covered = set()\n",
    "  for r_str in spec.split(','):\n",
    "    bounds = r_str.split('-')\n",
    "    if len(bounds) == 1:\n",
    "      covered.add(int(r_str))\n",
    "    else:\n",
    "      b = int(bounds[0])\n",
    "      e = int(bounds[1])\n",
    "      if (e < b):\n",
    "        (b, e) = (e, b)\n",
    "      for n in range(b, e + 1):\n",
    "        covered.add(n)\n",
    "  return covered"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just reading a TF feature from disk, get through the metadata, en deliver all lines in memory, plus the starting line for the data.\n",
    "\n",
    "The whole file gets slurped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:01.633852Z",
     "start_time": "2018-07-19T11:48:01.627609Z"
    }
   },
   "outputs": [],
   "source": [
    "def readFile(path):\n",
    "  if not os.path.exists(path):\n",
    "    error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
    "    return False\n",
    "  with open(path, encoding='utf8') as fh:\n",
    "    contents = fh.read()\n",
    "  lines = contents.split('\\n')\n",
    "  if lines[-1] == '':\n",
    "    lines.pop()\n",
    "  i = 0\n",
    "  for line in lines:\n",
    "    i += 1\n",
    "    if line.startswith('@'):\n",
    "      continue\n",
    "    else:\n",
    "      if line != '':\n",
    "        error('Line {}: missing blank line after metadata'.format(i))\n",
    "        return False\n",
    "      else:\n",
    "        break\n",
    "  return (lines, i)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The readTf function as done in Text-Fabric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:02.236340Z",
     "start_time": "2018-07-19T11:48:02.231003Z"
    }
   },
   "outputs": [],
   "source": [
    "def readTf(path):\n",
    "  if not os.path.exists(path):\n",
    "    error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
    "    return False\n",
    "  fh = open(path, encoding='utf8')\n",
    "  i = 0\n",
    "  for line in fh:\n",
    "    i += 1\n",
    "    text = line.rstrip()\n",
    "    if text.startswith('@'):\n",
    "      continue\n",
    "    else:\n",
    "      if text != '':\n",
    "        error('Line {}: missing blank line after metadata'.format(i))\n",
    "        fh.close()\n",
    "        return False\n",
    "      else:\n",
    "        break\n",
    "  result = readDataTf(fh, i)\n",
    "  fh.close()\n",
    "  return result"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading the data part pf a feature and storing it in a dict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:02.702087Z",
     "start_time": "2018-07-19T11:48:02.695497Z"
    }
   },
   "outputs": [],
   "source": [
    "def readDataTf(fh, firstI):\n",
    "  i = firstI\n",
    "  implicit_node = 1\n",
    "  data = {}\n",
    "  normFields = 2\n",
    "  isNum = False\n",
    "  errors = 0\n",
    "  for line in fh:\n",
    "    i += 1\n",
    "    fields = line.rstrip('\\n').split('\\t')\n",
    "    lfields = len(fields)\n",
    "    if lfields > normFields:\n",
    "      error(f'{i}: wrongFields')\n",
    "      errors += 1\n",
    "      continue\n",
    "    if lfields == normFields:\n",
    "      nodes = setFromSpec(fields[0])\n",
    "      valTf = fields[-1]\n",
    "    else:\n",
    "      nodes = {implicit_node}\n",
    "      if lfields == 1:\n",
    "        valTf = fields[0]\n",
    "      else:\n",
    "        valTf = ''\n",
    "    implicit_node = max(nodes) + 1\n",
    "    value = (\n",
    "        int(valTf) if isNum and valTf != '' else None if isNum else ''\n",
    "        if valTf == '' else valueFromTf(valTf)\n",
    "    )\n",
    "    for n in nodes:\n",
    "      if value is not None:\n",
    "        data[n] = value\n",
    "  return (errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A variant: read a TF feature and store it in a list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:03.069068Z",
     "start_time": "2018-07-19T11:48:03.063652Z"
    }
   },
   "outputs": [],
   "source": [
    "def readTfList(path):\n",
    "  if not os.path.exists(path):\n",
    "    error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
    "    return False\n",
    "  fh = open(path, encoding='utf8')\n",
    "  i = 0\n",
    "  for line in fh:\n",
    "    i += 1\n",
    "    text = line.rstrip()\n",
    "    if text.startswith('@'):\n",
    "      continue\n",
    "    else:\n",
    "      if text != '':\n",
    "        error('Line {}: missing blank line after metadata'.format(i))\n",
    "        fh.close()\n",
    "        return False\n",
    "      else:\n",
    "        break\n",
    "  result = readDataTfList(fh, i)\n",
    "  fh.close()\n",
    "  return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:03.229306Z",
     "start_time": "2018-07-19T11:48:03.218625Z"
    }
   },
   "outputs": [],
   "source": [
    "def readDataTfList(fh, firstI):\n",
    "  i = firstI\n",
    "  implicit_node = 1\n",
    "  data = []\n",
    "  normFields = 2\n",
    "  isNum = False\n",
    "  errors = 0\n",
    "  for line in fh:\n",
    "    i += 1\n",
    "    fields = line.rstrip('\\n').split('\\t')\n",
    "    lfields = len(fields)\n",
    "    if lfields > normFields:\n",
    "      error(f'{i}: wrongFields')\n",
    "      errors += 1\n",
    "      continue\n",
    "    if lfields == normFields:\n",
    "      nodes = setFromSpec(fields[0])\n",
    "      valTf = fields[-1]\n",
    "    else:\n",
    "      nodes = {implicit_node}\n",
    "      if lfields == 1:\n",
    "        valTf = fields[0]\n",
    "      else:\n",
    "        valTf = ''\n",
    "    implicit_node = max(nodes) + 1\n",
    "    value = (\n",
    "        int(valTf) if isNum and valTf != '' else None if isNum else ''\n",
    "        if valTf == '' else valueFromTf(valTf)\n",
    "    )\n",
    "    for n in nodes:\n",
    "      if value is not None:\n",
    "        data.append(value)\n",
    "  return (errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read a TF feature by slurping."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:03.579472Z",
     "start_time": "2018-07-19T11:48:03.569737Z"
    }
   },
   "outputs": [],
   "source": [
    "def readTfSlurp(path):\n",
    "  if not os.path.exists(path):\n",
    "    error('TF reading: feature file \"{}\" does not exist'.format(path))\n",
    "    return False\n",
    "  with open(path, encoding='utf8') as fh:\n",
    "    contents = fh.read()\n",
    "  lines = contents.split('\\n')\n",
    "  if lines[-1] == '':\n",
    "    lines.pop()\n",
    "  i = 0\n",
    "  for line in lines:\n",
    "    i += 1\n",
    "    if line.startswith('@'):\n",
    "      continue\n",
    "    else:\n",
    "      if line != '':\n",
    "        error('Line {}: missing blank line after metadata'.format(i))\n",
    "        return False\n",
    "      else:\n",
    "        break\n",
    "  result = readDataTfSlurp(lines, i)\n",
    "  return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:03.753557Z",
     "start_time": "2018-07-19T11:48:03.746903Z"
    }
   },
   "outputs": [],
   "source": [
    "def readDataTfSlurp(lines, firstI):\n",
    "  i = firstI - 1\n",
    "  implicit_node = 1\n",
    "  data = {}\n",
    "  normFields = 2\n",
    "  isNum = False\n",
    "  errors = 0\n",
    "  for line in lines[firstI:]:\n",
    "    i += 1\n",
    "    fields = line.split('\\t')\n",
    "    lfields = len(fields)\n",
    "    if lfields > normFields:\n",
    "      error(f'{i}: wrongFields')\n",
    "      errors += 1\n",
    "      continue\n",
    "    if lfields == normFields:\n",
    "      nodes = setFromSpec(fields[0])\n",
    "      valTf = fields[-1]\n",
    "    else:\n",
    "      nodes = {implicit_node}\n",
    "      if lfields == 1:\n",
    "        valTf = fields[0]\n",
    "      else:\n",
    "        valTf = ''\n",
    "    implicit_node = max(nodes) + 1\n",
    "    value = (\n",
    "        int(valTf) if isNum and valTf != '' else None if isNum else ''\n",
    "        if valTf == '' else valueFromTf(valTf)\n",
    "    )\n",
    "    for n in nodes:\n",
    "      if value is not None:\n",
    "        data[n] = value\n",
    "  return (errors, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:59:05.067131Z",
     "start_time": "2018-07-19T11:59:05.056167Z"
    }
   },
   "outputs": [],
   "source": [
    "def readDataTfSlurpOpt(lines, firstI):\n",
    "  i = firstI - 1\n",
    "  implicit_node = 1\n",
    "  data: Dict[int, str] = dict()\n",
    "  normFields = 2\n",
    "  isNum = False\n",
    "  errors = 0\n",
    "  for line in lines[firstI:]:\n",
    "    i += 1\n",
    "    fields = line.split('\\t')\n",
    "    lfields = len(fields)\n",
    "    if lfields > normFields:\n",
    "      error(f'{i}: wrongFields')\n",
    "      errors += 1\n",
    "      continue\n",
    "    if lfields == normFields:\n",
    "      nodes = setFromSpec(fields[0])\n",
    "      valTf = fields[-1]\n",
    "    else:\n",
    "      nodes = {implicit_node}\n",
    "      if lfields == 1:\n",
    "        valTf = fields[0]\n",
    "      else:\n",
    "        valTf = ''\n",
    "    implicit_node = max(nodes) + 1\n",
    "    value = (\n",
    "        int(valTf) if isNum and valTf != '' else None if isNum else ''\n",
    "        if valTf == '' else valueFromTf(valTf)\n",
    "    )\n",
    "    for n in nodes:\n",
    "      if value is not None:\n",
    "        data[n] = value\n",
    "  return (errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: straight TF reading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:06.426847Z",
     "start_time": "2018-07-19T11:48:05.262865Z"
    }
   },
   "outputs": [],
   "source": [
    "(errors, data) = readTf(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 1.2s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:08.891030Z",
     "start_time": "2018-07-19T11:48:08.869834Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: TF reading as list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:17.065267Z",
     "start_time": "2018-07-19T11:48:15.855416Z"
    }
   },
   "outputs": [],
   "source": [
    "(errors, data) = readTfList(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 1.2s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:20.257274Z",
     "start_time": "2018-07-19T11:48:20.250622Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "רֵאשִׁ֖ית\n",
      "בָּרָ֣א\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: TF slurping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:23.825878Z",
     "start_time": "2018-07-19T11:48:22.693180Z"
    }
   },
   "outputs": [],
   "source": [
    "(errors, data) = readTfSlurp(featurePath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:48:26.234900Z",
     "start_time": "2018-07-19T11:48:26.214816Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 1.1s"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: slurping and then optimized TF processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:59:23.449149Z",
     "start_time": "2018-07-19T11:59:23.368331Z"
    }
   },
   "outputs": [],
   "source": [
    "(lines, first) = readFile(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 0.1s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:59:26.987024Z",
     "start_time": "2018-07-19T11:59:25.944259Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "(errors, data) = readDataTfSlurpOpt(lines, first)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 1.0s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-19T11:59:20.380376Z",
     "start_time": "2018-07-19T11:59:20.363503Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}