{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reading node feature in Julia\n",
    "\n",
    "We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences\n",
    "in the Hebrew Bible.\n",
    "\n",
    "We measure the execution time of a second run of the last cell, so that we do not count warm-up effects."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choice of test feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:42.097000+02:00",
     "start_time": "2018-07-20T20:44:39.474Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"/Users/dirk/text-fabric-data/etcbc/bhsa/tf/c/g_word_utf8.tf\""
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Directory with the BHSA feature files, below the user's home directory.\n",
    "base = \"$(homedir())/text-fabric-data/etcbc/bhsa/tf/c\"\n",
    "# Name of the feature that the test cells below read.\n",
    "feature = \"g_word_utf8\"\n",
    "# Full path of the feature's .tf file; as the last expression it is also the displayed cell result.\n",
    "featurePath = \"$base/$feature.tf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Auxiliary functions for reading a TF feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:43.173000+02:00",
     "start_time": "2018-07-20T20:44:42.870Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "error (generic function with 1 method)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Report `msg` on standard error, followed by a newline.\n",
    "# NOTE: in this notebook this one-method function is what the name `error` refers to;\n",
    "# it only writes and does not throw, which is why callers return explicitly after it.\n",
    "error(msg) = write(STDERR, string(msg, \"\\n\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:44.286000+02:00",
     "start_time": "2018-07-20T20:44:44.149Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "showResults (generic function with 1 method)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Print a short summary of one feature-reading run.\n",
    "#\n",
    "# errors: the number of errors counted while reading\n",
    "# data:   the values that were read, indexable by node number\n",
    "#\n",
    "# Without errors the output is the number of values and the highest key, followed by\n",
    "# the values at keys 1, 2 and the highest key; otherwise it is only the error count.\n",
    "function showResults(errors, data)\n",
    "  if errors != 0\n",
    "    print(\"$errors errors\")\n",
    "    return\n",
    "  end\n",
    "  lastNode = maximum(keys(data))\n",
    "  println(\"$(length(data)) results, last node $lastNode\")\n",
    "  println(data[1])\n",
    "  println(data[2])\n",
    "  println(data[lastNode])\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:45.282000+02:00",
     "start_time": "2018-07-20T20:44:45.136Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "valueFromTf (generic function with 1 method)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Turn the escaped value text of a TF data line back into the real value:\n",
    "# `\\t` becomes a tab, `\\n` a newline and `\\\\` a single backslash.\n",
    "# The text is cut at the doubled backslashes first, so that a backslash that is itself\n",
    "# escaped is never taken for the start of `\\t` or `\\n`.\n",
    "function valueFromTf(tf)\n",
    "  pieces = map(split(tf, \"\\\\\\\\\")) do piece\n",
    "    withTabs = replace(piece, \"\\\\t\", \"\\t\")\n",
    "    replace(withTabs, \"\\\\n\", \"\\n\")\n",
    "  end\n",
    "  join(pieces, \"\\\\\")\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:46.052000+02:00",
     "start_time": "2018-07-20T20:44:45.938Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "setFromSpec (generic function with 1 method)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Expand a TF node specification into the set of node numbers it stands for.\n",
    "#\n",
    "# spec: comma separated items; an item is a single number (`7`) or a range written\n",
    "#       with a dash (`3-5`). The two ends of a range may come in either order.\n",
    "# Returns the node numbers as a Set{UInt32}.\n",
    "function setFromSpec(spec)::Set{UInt32}\n",
    "  nodes = Set{UInt32}()\n",
    "  for item in split(spec, \",\")\n",
    "    ends = split(item, \"-\")\n",
    "    if length(ends) == 1\n",
    "      push!(nodes, parse(UInt32, item))\n",
    "      continue\n",
    "    end\n",
    "    lo = parse(UInt32, ends[1])\n",
    "    hi = parse(UInt32, ends[2])\n",
    "    # min/max put the ends in ascending order, whichever way they were written\n",
    "    union!(nodes, min(lo, hi):max(lo, hi))\n",
    "  end\n",
    "  nodes\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read a TF feature file from disk in one go, skip past the metadata, and deliver all lines in memory, plus\n",
    "the index of the first data line.\n",
    "\n",
    "The whole file gets slurped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:47.409000+02:00",
     "start_time": "2018-07-20T20:44:47.307Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readFile (generic function with 1 method)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Slurp a TF feature file and find out where its data lines start.\n",
    "#\n",
    "# Returns `(lines, i)`: all lines of the file (an empty last line is dropped) and,\n",
    "# as a UInt32, the index of the line after the blank line that closes the `@`\n",
    "# metadata lines. When the file holds nothing but metadata lines, that index is\n",
    "# one past the last line.\n",
    "# Returns `false`, after reporting the problem, when the file does not exist or\n",
    "# when the metadata lines are followed by a non-blank line.\n",
    "function readFile(path)\n",
    "  if !isfile(path)\n",
    "    error(\"TF reading: feature file '$path' does not exist\")\n",
    "    return false\n",
    "  end\n",
    "  contents = open(fh -> read(fh, String), path)\n",
    "  lines = split(contents, \"\\n\")\n",
    "  lines[end] == \"\" && pop!(lines)\n",
    "  i::UInt32 = 0\n",
    "  for line in lines\n",
    "    i += 1\n",
    "    startswith(line, \"@\") && continue\n",
    "    line == \"\" && break\n",
    "    error(\"Line $i: missing blank line after metadata\")\n",
    "    return false\n",
    "  end\n",
    "  # step from the blank line to the first data line\n",
    "  i += 1\n",
    "  (lines, i)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The readTf function as done in Text-Fabric."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:48.827000+02:00",
     "start_time": "2018-07-20T20:44:48.731Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readTf (generic function with 1 method)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read a TF feature file line by line, the way Text-Fabric does it.\n",
    "#\n",
    "# The `@` metadata lines at the top are skipped; a blank line has to close them,\n",
    "# after which the rest of the file is handed to `readDataTf`.\n",
    "# Returns the `(errors, data)` result of `readDataTf`, or `false` after reporting\n",
    "# a missing file or a missing blank line after the metadata.\n",
    "function readTf(path)\n",
    "  if !isfile(path)\n",
    "    error(\"TF reading: feature file '$path' does not exist\")\n",
    "    return false\n",
    "  end\n",
    "  # The do-block form closes the file on every way out, also when reading the\n",
    "  # data lines throws (on a node number that cannot be parsed, for instance).\n",
    "  open(path) do fh\n",
    "    i = 0\n",
    "    for line in eachline(fh)\n",
    "      i += 1\n",
    "      text = rstrip(line)\n",
    "      if startswith(text, \"@\")\n",
    "        continue\n",
    "      elseif text != \"\"\n",
    "        error(\"Line $i: missing blank line after metadata\")\n",
    "        # `return` leaves the do-block only; `open` passes the `false` on as its own result\n",
    "        return false\n",
    "      else\n",
    "        break\n",
    "      end\n",
    "    end\n",
    "    # `i` is now the number of the blank line; the data lines follow it\n",
    "    readDataTf(fh, i)\n",
    "  end\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Reading the data part of a feature and storing it in a dict."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:50.260000+02:00",
     "start_time": "2018-07-20T20:44:50.146Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readDataTf (generic function with 1 method)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the data lines of a TF feature from the open file `fh` into a Dict.\n",
    "#\n",
    "# fh:     the open feature file; reading continues at its current position\n",
    "# firstI: the number of the last line read so far, so that error messages can\n",
    "#         mention line numbers\n",
    "#\n",
    "# A data line holds either a value only, or a node spec, a tab and a value.\n",
    "# A value without node spec goes to the node after the highest node of the\n",
    "# previous accepted line (node 1 on the first line). Lines with more than one\n",
    "# tab are reported, counted and skipped.\n",
    "# Returns `(errors, data)`: the count of skipped lines and the node => value Dict.\n",
    "function readDataTf(fh, firstI)\n",
    "  lineNo = firstI\n",
    "  nextNode = 1\n",
    "  data = Dict{Integer, String}()\n",
    "  maxFields = 2\n",
    "  isNum = false  # values are treated as strings throughout\n",
    "  errors = 0\n",
    "  for line in eachline(fh)\n",
    "    lineNo += 1\n",
    "    fields = split(rstrip(line, '\\n'), \"\\t\")\n",
    "    nFields = length(fields)\n",
    "    if nFields > maxFields\n",
    "      error(\"$(lineNo) : wrongFields\")\n",
    "      errors += 1\n",
    "      continue\n",
    "    end\n",
    "    if nFields == maxFields\n",
    "      nodes = setFromSpec(fields[1])\n",
    "      valTf = fields[end]\n",
    "    else\n",
    "      nodes = Set([nextNode])\n",
    "      valTf = nFields == 1 ? fields[1] : \"\"\n",
    "    end\n",
    "    nextNode = maximum(nodes) + 1\n",
    "    # A non-empty value is unescaped; an empty one stays empty, or is left out\n",
    "    # altogether when the feature is numeric.\n",
    "    if valTf != \"\"\n",
    "      value = valueFromTf(valTf)\n",
    "    elseif isNum\n",
    "      value = nothing\n",
    "    else\n",
    "      value = \"\"\n",
    "    end\n",
    "    value === nothing && continue\n",
    "    for n in nodes\n",
    "      data[n] = value\n",
    "    end\n",
    "  end\n",
    "  (errors, data)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A variant: read a TF feature and store it in a list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:53.072000+02:00",
     "start_time": "2018-07-20T20:44:52.972Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readTfList (generic function with 1 method)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read a TF feature file line by line and collect its values in a list.\n",
    "#\n",
    "# The `@` metadata lines at the top are skipped; a blank line has to close them,\n",
    "# after which the rest of the file is handed to `readDataTfList`.\n",
    "# Returns the `(errors, data)` result of `readDataTfList`, or `false` after\n",
    "# reporting a missing file or a missing blank line after the metadata.\n",
    "function readTfList(path)\n",
    "  if !isfile(path)\n",
    "    error(\"TF reading: feature file '$path' does not exist\")\n",
    "    return false\n",
    "  end\n",
    "  # With the do-block form the file is closed on every way out, also when\n",
    "  # reading the data lines throws (on an unparsable node number, for instance).\n",
    "  open(path) do fh\n",
    "    i = 0\n",
    "    for line in eachline(fh)\n",
    "      i += 1\n",
    "      text = rstrip(line)\n",
    "      if startswith(text, \"@\")\n",
    "        continue\n",
    "      elseif text != \"\"\n",
    "        error(\"Line $i: missing blank line after metadata\")\n",
    "        # `return` leaves the do-block only; `open` passes the `false` on as its own result\n",
    "        return false\n",
    "      else\n",
    "        break\n",
    "      end\n",
    "    end\n",
    "    # `i` is now the number of the blank line; the data lines follow it\n",
    "    readDataTfList(fh, i)\n",
    "  end\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:53.971000+02:00",
     "start_time": "2018-07-20T20:44:53.872Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readDataTfList (generic function with 1 method)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the data lines of a TF feature from the open file `fh` into a list.\n",
    "#\n",
    "# fh:     the open feature file; reading continues at its current position\n",
    "# firstI: the number of the last line read so far, so that error messages can\n",
    "#         mention line numbers\n",
    "#\n",
    "# Lines are taken apart as in `readDataTf`, but the values are appended to an\n",
    "# array in reading order, once for every node a line covers; the node numbers\n",
    "# themselves are not stored.\n",
    "# Returns `(errors, data)`: the count of skipped lines and the array of values.\n",
    "function readDataTfList(fh, firstI)\n",
    "  lineNo = firstI\n",
    "  nextNode = 1\n",
    "  data = Array{String, 1}()\n",
    "  maxFields = 2\n",
    "  isNum = false  # values are treated as strings throughout\n",
    "  errors = 0\n",
    "  for line in eachline(fh)\n",
    "    lineNo += 1\n",
    "    fields = split(rstrip(line, '\\n'), \"\\t\")\n",
    "    nFields = length(fields)\n",
    "    if nFields > maxFields\n",
    "      error(\"$(lineNo) : wrongFields\")\n",
    "      errors += 1\n",
    "      continue\n",
    "    end\n",
    "    if nFields == maxFields\n",
    "      nodes = setFromSpec(fields[1])\n",
    "      valTf = fields[end]\n",
    "    else\n",
    "      nodes = Set([nextNode])\n",
    "      valTf = nFields == 1 ? fields[1] : \"\"\n",
    "    end\n",
    "    nextNode = maximum(nodes) + 1\n",
    "    # A non-empty value is unescaped; an empty one stays empty, or is left out\n",
    "    # altogether when the feature is numeric.\n",
    "    if valTf != \"\"\n",
    "      value = valueFromTf(valTf)\n",
    "    elseif isNum\n",
    "      value = nothing\n",
    "    else\n",
    "      value = \"\"\n",
    "    end\n",
    "    value === nothing && continue\n",
    "    for n in nodes\n",
    "      push!(data, value)\n",
    "    end\n",
    "  end\n",
    "  (errors, data)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read a TF feature by slurping."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:56.236000+02:00",
     "start_time": "2018-07-20T20:44:56.124Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readTfSlurp (generic function with 1 method)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read a TF feature by loading the whole file into memory first.\n",
    "#\n",
    "# The file is split into lines (an empty last line is dropped), the `@` metadata\n",
    "# lines and the blank line after them are passed over, and the lines are handed\n",
    "# to `readDataTfSlurp` together with the index of the line after the blank line.\n",
    "# Returns the result of `readDataTfSlurp`, or `false` after reporting a missing\n",
    "# file or a missing blank line after the metadata.\n",
    "function readTfSlurp(path)\n",
    "  if !isfile(path)\n",
    "    error(\"TF reading: feature file '$path' does not exist\")\n",
    "    return false\n",
    "  end\n",
    "  lines = split(open(fh -> read(fh, String), path), \"\\n\")\n",
    "  lines[end] == \"\" && pop!(lines)\n",
    "  headerEnd = 0\n",
    "  for line in lines\n",
    "    headerEnd += 1\n",
    "    startswith(line, \"@\") && continue\n",
    "    line == \"\" && break\n",
    "    error(\"Line $headerEnd: missing blank line after metadata\")\n",
    "    return false\n",
    "  end\n",
    "  readDataTfSlurp(lines, headerEnd + 1)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:44:58.625000+02:00",
     "start_time": "2018-07-20T20:44:58.521Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readDataTfSlurp (generic function with 1 method)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the data lines of a TF feature from lines that are already in memory.\n",
    "#\n",
    "# lines:  all lines of the feature file\n",
    "# firstI: the index in `lines` of the first data line\n",
    "#\n",
    "# A data line holds either a value only, or a node spec, a tab and a value.\n",
    "# A value without node spec goes to the node after the highest node of the\n",
    "# previous accepted line (node 1 on the first line). Lines with more than one\n",
    "# tab are reported with their line number, counted and skipped.\n",
    "# Returns `(errors, data)`: the count of skipped lines and the node => value Dict.\n",
    "function readDataTfSlurp(lines, firstI)\n",
    "  implicit_node = 1\n",
    "  data = Dict{Integer, String}()\n",
    "  normFields = 2\n",
    "  isNum = false  # values are treated as strings throughout\n",
    "  errors = 0\n",
    "  # Walk the indices rather than the slice `lines[firstI:end]`: the slice makes a copy\n",
    "  # of the array first, and the index is the true line number for the error messages\n",
    "  # (a counter started at `firstI` and raised before use reported one line too far).\n",
    "  for i in firstI:length(lines)\n",
    "    fields = split(lines[i], \"\\t\")\n",
    "    lfields = length(fields)\n",
    "    if lfields > normFields\n",
    "      error(\"$(i) : wrongFields\")\n",
    "      errors += 1\n",
    "      continue\n",
    "    end\n",
    "    if lfields == normFields\n",
    "      nodes = setFromSpec(fields[1])\n",
    "      valTf = fields[end]\n",
    "    else\n",
    "      nodes = Set([implicit_node])\n",
    "      if lfields == 1\n",
    "        valTf = fields[1]\n",
    "      else\n",
    "        valTf = \"\"\n",
    "      end\n",
    "    end\n",
    "    implicit_node = maximum(nodes) + 1\n",
    "    # A non-empty value is unescaped; an empty one stays empty, or is left out\n",
    "    # altogether when the feature is numeric.\n",
    "    if valTf != \"\"\n",
    "      value = valueFromTf(valTf)\n",
    "    elseif isNum\n",
    "      value = nothing\n",
    "    else\n",
    "      value = \"\"\n",
    "    end\n",
    "    for n in nodes\n",
    "      if value !== nothing\n",
    "        data[n] = value\n",
    "      end\n",
    "    end\n",
    "  end\n",
    "  (errors, data)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A possibly optimized function to read a feature from already slurped data lines."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:00.974000+02:00",
     "start_time": "2018-07-20T20:45:00.875Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "readDataTfSlurpOpt (generic function with 1 method)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Variant of `readDataTfSlurp` with concretely typed locals and containers.\n",
    "#\n",
    "# lines:  all lines of the feature file\n",
    "# firstI: the index in `lines` of the first data line\n",
    "#\n",
    "# Lines with more than one tab are reported with their line number, counted and\n",
    "# skipped.\n",
    "# Returns `(errors, data)`: the count of skipped lines as a UInt32 and a Dict\n",
    "# from node (UInt32) to value.\n",
    "function readDataTfSlurpOpt(lines, firstI::UInt32)\n",
    "  implicit_node::UInt32 = 1\n",
    "  data = Dict{UInt32, SubString{String}}()\n",
    "  normFields = 2\n",
    "  isNum::Bool = false  # values are treated as strings throughout\n",
    "  errors::UInt32 = 0\n",
    "  # Walk the indices rather than the slice `lines[firstI:end]`: the slice makes a copy\n",
    "  # of the array first, and the index is the true line number for the error messages\n",
    "  # (a counter started at `firstI` and raised before use reported one line too far).\n",
    "  for i in firstI:length(lines)\n",
    "    fields = split(lines[i], \"\\t\")\n",
    "    # Deliberately not narrowed to UInt8: that conversion throws on a line with more\n",
    "    # than 255 fields, where the line should be reported as wrong instead.\n",
    "    lfields = length(fields)\n",
    "    if lfields > normFields\n",
    "      error(\"$(i) : wrongFields\")\n",
    "      errors += 1\n",
    "      continue\n",
    "    end\n",
    "    if lfields == normFields\n",
    "      nodes::Set{UInt32} = setFromSpec(fields[1])\n",
    "      valTf = fields[end]\n",
    "    else\n",
    "      nodes = Set{UInt32}([implicit_node])\n",
    "      if lfields == 1\n",
    "        valTf = fields[1]\n",
    "      else\n",
    "        valTf = \"\"\n",
    "      end\n",
    "    end\n",
    "    implicit_node = maximum(nodes) + 1\n",
    "    # A non-empty value is unescaped; an empty one stays empty, or is left out\n",
    "    # altogether when the feature is numeric.\n",
    "    if valTf != \"\"\n",
    "      value = valueFromTf(valTf)\n",
    "    elseif isNum\n",
    "      value = nothing\n",
    "    else\n",
    "      value = \"\"\n",
    "    end\n",
    "    for n in nodes\n",
    "      if value !== nothing\n",
    "        data[n] = value\n",
    "      end\n",
    "    end\n",
    "  end\n",
    "  (errors, data)\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: straight TF reading"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:07.070000+02:00",
     "start_time": "2018-07-20T20:45:03.454Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, Dict{Integer,String}(Pair{Integer,String}(247825, \"פְּנֵ֖י\"),Pair{Integer,String}(43031, \"אֲדָנִ֗ים\"),Pair{Integer,String}(349542, \"תְּבוּנָֽה\"),Pair{Integer,String}(323003, \"אֱ֭לֹהִים\"),Pair{Integer,String}(355530, \"לַֽ\"),Pair{Integer,String}(372485, \"דִֽי\"),Pair{Integer,String}(375950, \"רַחֲמִ֖ים\"),Pair{Integer,String}(319122, \"יִתָּצְךָ֪\"),Pair{Integer,String}(61670, \"יִטְמָ֖א\"),Pair{Integer,String}(119601, \"וַ\")…))"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the feature by streaming the file; the globals `errors` and `data` are shown by the next cell.\n",
    "(errors, data) = readTf(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 3.5s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:10.317000+02:00",
     "start_time": "2018-07-20T20:45:10.120Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "# Summarize what the streaming read bound to `errors` and `data`.\n",
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: TF reading as list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:18.750000+02:00",
     "start_time": "2018-07-20T20:45:16.370Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, String[\"בְּ\", \"רֵאשִׁ֖ית\", \"בָּרָ֣א\", \"אֱלֹהִ֑ים\", \"אֵ֥ת\", \"הַ\", \"שָּׁמַ֖יִם\", \"וְ\", \"אֵ֥ת\", \"הָ\"  …  \"מִֽי\", \"בָכֶ֣ם\", \"מִ\", \"כָּל\", \"עַמֹּ֗ו\", \"יְהוָ֧ה\", \"אֱלֹהָ֛יו\", \"עִמֹּ֖ו\", \"וְ\", \"יָֽעַל\"])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same feature, values collected in an array; this rebinds the globals `errors` and `data`.\n",
    "(errors, data) = readTfList(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 2.5s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:21.172000+02:00",
     "start_time": "2018-07-20T20:45:21.146Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "# Summarize the list variant; `data` is an array here, indexed by position.\n",
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: TF slurping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:26.852000+02:00",
     "start_time": "2018-07-20T20:45:23.423Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0, Dict{Integer,String}(Pair{Integer,String}(247825, \"פְּנֵ֖י\"),Pair{Integer,String}(43031, \"אֲדָנִ֗ים\"),Pair{Integer,String}(349542, \"תְּבוּנָֽה\"),Pair{Integer,String}(323003, \"אֱ֭לֹהִים\"),Pair{Integer,String}(355530, \"לַֽ\"),Pair{Integer,String}(372485, \"דִֽי\"),Pair{Integer,String}(375950, \"רַחֲמִ֖ים\"),Pair{Integer,String}(319122, \"יִתָּצְךָ֪\"),Pair{Integer,String}(61670, \"יִטְמָ֖א\"),Pair{Integer,String}(119601, \"וַ\")…))"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Same feature, read by slurping the whole file; this rebinds the globals `errors` and `data`.\n",
    "(errors, data) = readTfSlurp(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 3.8s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:28.929000+02:00",
     "start_time": "2018-07-20T20:45:28.854Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "# Summarize what the slurping read bound to `errors` and `data`.\n",
    "showResults(errors, data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Test: slurping and then optimized TF processing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:32.466000+02:00",
     "start_time": "2018-07-20T20:45:31.802Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(SubString{String}[\"@node\", \"@author=Eep Talstra Centre for Bible and Computer\", \"@dataset=BHSA\", \"@datasetName=Biblia Hebraica Stuttgartensia Amstelodamensis\", \"@email=shebanq@ancient-data.org\", \"@encoders=Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)\", \"@valueType=str\", \"@version=_temp\", \"@website=https://shebanq.ancient-data.org\", \"@writtenBy=Text-Fabric\"  …  \"מִֽי\", \"בָכֶ֣ם\", \"מִ\", \"כָּל\", \"עַמֹּ֗ו\", \"יְהוָ֧ה\", \"אֱלֹהָ֛יו\", \"עִמֹּ֖ו\", \"וְ\", \"יָֽעַל\"], 0x0000000d)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Slurp the file only: `lines` holds all lines, `first` the index of the first data line.\n",
    "# NOTE(review): a global named `first` stands in the way of `Base.first` in this session - consider renaming.\n",
    "(lines, first) = readFile(featurePath)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 0.12s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:46.758000+02:00",
     "start_time": "2018-07-20T20:45:44.361Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0x00000000, Dict(0x0003c811=>\"פְּנֵ֖י\",0x0000a817=>\"אֲדָנִ֗ים\",0x00055566=>\"תְּבוּנָֽה\",0x0004edbb=>\"אֱ֭לֹהִים\",0x00056cca=>\"לַֽ\",0x0005af05=>\"דִֽי\",0x0005bc8e=>\"רַחֲמִ֖ים\",0x0004de92=>\"יִתָּצְךָ֪\",0x0000f0e6=>\"יִטְמָ֖א\",0x0001d331=>\"וַ\"…))"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Process the lines slurped by the previous cell; this rebinds the globals `errors` and `data`.\n",
    "(errors, data) = readDataTfSlurpOpt(lines, first)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Execution time: around 2.2s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-07-20T22:45:48.418000+02:00",
     "start_time": "2018-07-20T20:45:48.406Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "426584 results, last node 426584\n",
      "בְּ\n",
      "רֵאשִׁ֖ית\n",
      "יָֽעַל\n"
     ]
    }
   ],
   "source": [
    "# Summarize what the typed variant bound to `errors` and `data`.\n",
    "showResults(errors, data)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.4",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}