{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "
{mis} |
{rec} |
{dep} |
{dub} |
{out} |
{nor} |
{} {}
'.format(\n", " n,\n", " reffl,\n", " n,\n", " condense(verse_labels),\n", " )" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "def index_clique_coarse(bnm, n, ii):\n", " verse_labels = []\n", " for i in sorted(ii, key=lambda c: (-len(chunks[c]), c))[0:LARGE_CLIQUE_SIZE]:\n", " chunk = chunks[i]\n", " fword = chunk[0]\n", " book = F.book.v(L.u(fword, otype=\"book\")[0])\n", " chapter = F.chapter.v(L.u(fword, otype=\"chapter\")[0])\n", " verse = F.verse.v(L.u(fword, otype=\"verse\")[0])\n", " verse_labels.append((book, chapter, verse))\n", " reffl = \"{}_{}\".format(bnm, n // CLIQUES_PER_FILE)\n", " extra = (\n", " \"+ {} ...\".format(len(ii) - LARGE_CLIQUE_SIZE)\n", " if len(ii) > LARGE_CLIQUE_SIZE\n", " else \"\"\n", " )\n", " return '{} {}{}
'.format(\n", " n,\n", " reffl,\n", " n,\n", " condense(verse_labels),\n", " extra,\n", " )" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "def lines_chapter(c):\n", " lines = []\n", " for v in L.d(c, otype=\"verse\"):\n", " vl = F.verse.v(v)\n", " text = \"\".join(\n", " \"{}{}\".format(Fs(TEXT_FEATURE).v(w), Fs(TRAILER_FEATURE).v(w))\n", " for w in L.d(v, otype=\"word\")\n", " )\n", " lines.append(\"{} {}\".format(vl, text.replace(\"\\n\", \" \")))\n", " return lines" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "def compare_chapters(c1, c2, lb1, lb2):\n", " dh = difflib.HtmlDiff(wrapcolumn=80)\n", " table_html = dh.make_table(\n", " lines_chapter(c1),\n", " lines_chapter(c2),\n", " fromdesc=lb1,\n", " todesc=lb2,\n", " context=False,\n", " numlines=5,\n", " )\n", " htext = \"\"\"{}{}\"\"\".format(diffhead, table_html)\n", " return htext" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 5.8.3 Compiling the table of experiments\n", "\n", "Here we generate the table of experiments, complete with the colouring according to their assessments." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[18]:" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "# generate the table of experiments\n", "def gen_html(standalone=False):\n", " global other_exps\n", " TF.info(\n", " \"EXPERIMENT: Generating html report{}\".format(\n", " \"(standalone)\" if standalone else \"\"\n", " )\n", " )\n", " stats = collections.Counter()\n", " pre = (\n", " \"\"\"\n", "\n", "\n", "\n", "{}\n", "\n", "\n", "\"\"\".format(\n", " ecss\n", " )\n", " if standalone\n", " else \"\"\n", " )\n", "\n", " post = (\n", " \"\"\"\n", "\n", "\"\"\"\n", " if standalone\n", " else \"\"\n", " )\n", "\n", " experiments = \"\"\"\n", "{}\n", "{}\n", "chunk type | chunk size | similarity method | {}{} | \".format(sim_thr) for sim_thr in SIMILARITIES)\n", " )\n", "\n", " for chunk_f in (True, False):\n", " if chunk_f:\n", " chunk_items = CHUNK_SIZES\n", " else:\n", " chunk_items = CHUNK_OBJECTS\n", " chunk_lb = CHUNK_LBS[chunk_f]\n", " for chunk_i in chunk_items:\n", " for sim_m in SIM_METHODS:\n", " set_matrix_threshold(sim_m=sim_m, chunk_o=chunk_i)\n", " these_outputs = outputs.get(MATRIX_THRESHOLD, {})\n", " experiments += \"|
---|---|---|---|---|
{} | {} | {} | \".format(\n", " CHUNK_LABELS[chunk_f],\n", " chunk_i,\n", " sim_m,\n", " )\n", " for sim_thr in SIMILARITIES:\n", " okey = (chunk_lb, chunk_i, sim_m, sim_thr)\n", " values = these_outputs.get(okey)\n", " if values is None:\n", " result = ''\n", " stats[\"mis\"] += 1\n", " else:\n", " (npassages, ncliques, longest_clique_len) = values\n", " cls = assess_exp(\n", " chunk_f, npassages, ncliques, longest_clique_len\n", " )\n", " stats[cls] += 1\n", " (lr_el, lr_lb) = (\"\", \"\")\n", " if (\n", " CHUNK_LB,\n", " CHUNK_DESC,\n", " SIMILARITY_METHOD,\n", " SIMILARITY_THRESHOLD,\n", " ) == (\n", " chunk_lb,\n", " chunk_i,\n", " sim_m,\n", " sim_thr,\n", " ):\n", " lr_el = '*'\n", " lr_lb = VALUE_LABELS[\"lr\"]\n", " result = \"\"\"\n", " | {}\n",
" {} \n", " {} \n", " {}\n", " | \"\"\".format(\n",
" cls,\n",
" lr_lb,\n",
" lr_el,\n",
" npassages,\n",
" \"\" if standalone else LOCAL_BASE_OUTP + \"/\",\n",
" EXPERIMENT_DIR,\n",
" chunk_lb,\n",
" chunk_i,\n",
" sim_m,\n",
" MATRIX_THRESHOLD,\n",
" sim_thr,\n",
" ncliques,\n",
" longest_clique_len,\n",
" )\n",
" experiments += result\n",
" experiments += \"
chunking method | {} |
---|---|
chunking description | {} |
similarity method | {} |
similarity threshold | {} |
These results look good, so a binary chapter comparison has been generated
\"\n", " for cl in sorted(bin_cliques):\n", " lb1 = \"{} {}\".format(F.book.v(cl[0][0]), F.chapter.v(cl[0][1]))\n", " lb2 = \"{} {}\".format(F.book.v(cl[1][0]), F.chapter.v(cl[1][1]))\n", " hfilename = \"{}_vs_{}.html\".format(lb1, lb2).replace(\" \", \"_\")\n", " hfilepath = \"{}/{}/{}\".format(LOCAL_BASE_OUTP, CHAPTER_DIR, hfilename)\n", " chapter_diffs.append(\n", " (\n", " lb1,\n", " cl[0][1],\n", " lb2,\n", " cl[1][1],\n", " \"{}/{}/{}/{}\".format(\n", " SHEBANQ_TOOL,\n", " LOCAL_BASE_OUTP,\n", " CHAPTER_DIR,\n", " hfilename,\n", " ),\n", " )\n", " )\n", " if not os.path.exists(hfilepath):\n", " htext = compare_chapters(cl[0][1], cl[1][1], lb1, lb2)\n", " with open(hfilepath, \"w\") as f:\n", " f.write(htext)\n", " if VERBOSE:\n", " TF.info(\n", " \"PRINT ({} {} {} M>{} S>{}): written {}\".format(\n", " CHUNK_LB,\n", " CHUNK_DESC,\n", " SIMILARITY_METHOD,\n", " MATRIX_THRESHOLD,\n", " SIMILARITY_THRESHOLD,\n", " hfilename,\n", " )\n", " )\n", " nnew += 1\n", " else:\n", " nexist += 1\n", " clique_links.append(\n", " (\n", " \"../{}/{}\".format(CHAPTER_DIR, hfilename),\n", " \"{} versus {}\".format(lb1, lb2),\n", " )\n", " )\n", " TF.info(\n", " \"PRINT ({} {} {} M>{} S>{}): Chapter diffs: {} newly created and {} already existing\".format(\n", " CHUNK_LB,\n", " CHUNK_DESC,\n", " SIMILARITY_METHOD,\n", " MATRIX_THRESHOLD,\n", " SIMILARITY_THRESHOLD,\n", " nnew,\n", " nexist,\n", " )\n", " )\n", " else:\n", " bcc_text = \"These results look dubious at best, so no binary chapter comparison has been generated
\"\n", "\n", " allgeni_html = (\n", " index_clique(cliques_name, i, c, ncliques) for (i, c) in enumerate(cliques)\n", " )\n", "\n", " allgen_htmls = []\n", " allgen_html = \"\"\n", "\n", " for (i, c) in enumerate(cliques):\n", " if i % CLIQUES_PER_FILE == 0:\n", " if i > 0:\n", " allgen_htmls.append(allgen_html)\n", " allgen_html = \"\"\n", " allgen_html += '{}\n", "\n", "\"\"\"\n", "\n", " a_tpl_file = ''\n", "\n", " index_html_file = index_html_tpl.format(\n", " a_tpl_file.format(*clique_links[0]),\n", " bcc_text,\n", " \"\\n\".join(a_tpl_file.format(*c) for c in clique_links[1:]),\n", " )\n", "\n", " listing_html = \"{}\\n\".format(\n", " \"\\n\".join(allgeni_html),\n", " )\n", "\n", " for (subdir, fname, content_html, tit) in (\n", " (None, index_name, index_html_file, \"Index \" + param_lab),\n", " (base_name, all_name, listing_html, \"Listing \" + param_lab),\n", " (base_name, cliques_name, allgen_htmls, \"Cliques \" + param_lab),\n", " ):\n", " subdir = \"\" if subdir is None else (subdir + \"/\")\n", " subdirabs = \"{}/{}/{}\".format(LOCAL_BASE_OUTP, EXPERIMENT_DIR, subdir)\n", " if not os.path.exists(subdirabs):\n", " os.makedirs(subdirabs)\n", "\n", " if type(content_html) is list:\n", " for (i, c_h) in enumerate(content_html):\n", " fn = \"{}_{}\".format(fname, i)\n", " t = \"{}_{}\".format(tit, i)\n", " with open(\n", " \"{}/{}/{}{}.html\".format(\n", " LOCAL_BASE_OUTP, EXPERIMENT_DIR, subdir, fn\n", " ),\n", " \"w\",\n", " ) as f:\n", " f.write(\n", " content_file_tpl.format(t, css, t, param_spec, c_h, meta_html)\n", " )\n", " else:\n", " with open(\n", " \"{}/{}/{}{}.html\".format(\n", " LOCAL_BASE_OUTP, EXPERIMENT_DIR, subdir, fname\n", " ),\n", " \"w\",\n", " ) as f:\n", " f.write(\n", " content_file_tpl.format(\n", " tit, css, tit, param_spec, content_html, meta_html\n", " )\n", " )\n", " destination = outputs.setdefault(MATRIX_THRESHOLD, {})\n", " destination[(CHUNK_LB, CHUNK_DESC, SIMILARITY_METHOD, SIMILARITY_THRESHOLD)] = (\n", " len(passages),\n", " len(cliques),\n", " l_c_l,\n", " )\n", " TF.info(\n", " \"PRINT ({} {} {} M>{} S>{}): formatted {} cliques ({} files) {} {} binary chapter diffs\".format(\n", " CHUNK_LB,\n", " CHUNK_DESC,\n", " SIMILARITY_METHOD,\n", " MATRIX_THRESHOLD,\n", " SIMILARITY_THRESHOLD,\n", " len(cliques),\n", " len(allgen_htmls),\n", " cdoing,\n", " len(bin_cliques),\n", " )\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.9 Running experiments\n", "\n", "The workflows of doing a single experiment, and then all experiments, are defined." 
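, "\n", "For example (an illustrative sketch; the actual invocations for this pipeline are made in section 7 below, and the parameter combination shown here is just one possible choice):\n", "\n", "```python\n", "do_experiment(False, \"verse\", \"SET\", 75, True)  # one experiment: verse chunks, SET similarity, threshold 75\n", "do_all_experiments()                             # the full parameter grid (may take several hours)\n", "show_all_experiments()                           # regenerate the reports from the stored outputs\n", "```"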
] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[20]:" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "outputs = {}" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "def writeoutputs():\n", " global outputs\n", " with open(EXPERIMENT_PATH, \"wb\") as f:\n", " pickle.dump(outputs, f, protocol=PICKLE_PROTOCOL)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "def readoutputs():\n", " global outputs\n", " if not os.path.exists(EXPERIMENT_PATH):\n", " outputs = {}\n", " else:\n", " with open(EXPERIMENT_PATH, \"rb\") as f:\n", " outputs = pickle.load(f)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "def do_experiment(chunk_f, chunk_i, sim_m, sim_thr, do_index):\n", " if do_index:\n", " readoutputs()\n", " (do_chunk, do_prep, do_sim, do_clique, skip) = do_params(\n", " chunk_f, chunk_i, sim_m, sim_thr\n", " )\n", " if skip:\n", " return\n", " chunking(do_chunk)\n", " preparing(do_prep)\n", " similarity(do_sim)\n", " cliqueing(do_clique)\n", " printing()\n", " if do_index:\n", " writeoutputs()\n", " gen_html()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "def do_only_chunk(chunk_f, chunk_i):\n", " do_chunk = do_params_chunk(chunk_f, chunk_i)\n", " chunking(do_chunk)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "def reset_experiments():\n", " global outputs\n", " readoutputs()\n", " outputs = {}\n", " reset_params()\n", " writeoutputs()\n", " gen_html()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "def do_all_experiments(no_fixed=False, only_object=None):\n", " global outputs\n", " reset_experiments()\n", " for chunk_f in (False,) if no_fixed else (True, False):\n", " if chunk_f:\n", " chunk_items = CHUNK_SIZES\n", " else:\n", " chunk_items = CHUNK_OBJECTS if only_object is None else (only_object,)\n", " for chunk_i in chunk_items:\n", " for sim_m in SIM_METHODS:\n", " for sim_thr in SIMILARITIES:\n", " do_experiment(chunk_f, chunk_i, sim_m, sim_thr, False)\n", " writeoutputs()\n", " gen_html()\n", " gen_html(standalone=True)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "def do_all_chunks(no_fixed=False, only_object=None):\n", " global outputs\n", " reset_experiments()\n", " for chunk_f in (False,) if no_fixed else (True, False):\n", " if chunk_f:\n", " chunk_items = CHUNK_SIZES\n", " else:\n", " chunk_items = CHUNK_OBJECTS if only_object is None else (only_object,)\n", " for chunk_i in chunk_items:\n", " do_only_chunk(chunk_f, chunk_i)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "def show_all_experiments():\n", " readoutputs()\n", " gen_html()\n", " gen_html(standalone=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6a" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TF features\n", "\n", "Based on selected similarity matrices, we produce an\n", "edge features between verses, containing weighted links to parallel verses.\n", "\n", "The features to deliver are called `crossrefSET` and `crossrefLCS` and `crossref`.\n", "\n", "These are edge feature, both are symmetric, and hence redundant.\n", "For every node, the *from* and *to* edges are identical.\n", "\n", "The 
`SET` variant is based on set similarity, the `LCS` one on longest common subsequence\n", "similarity.\n", "\n", "The `crossref` feature takes the union of both methods, with the average confidence.\n", "\n", "The weight is the similarity, as an integer percentage, as it comes from the similarity matrix.\n", "\n", "## Discussion\n", "We only produce the results of the similarity computation (the matrix); we do not do the cliqueing.\n", "There are many ways to make cliques, and that can easily be done by users of the data, once the\n", "matrix results are in place.\n", "We also do not produce pretty outputs, chapter diffs, and other goodies.\n", "Just the raw similarity data.\n", "\n", "The matrix computation is expensive.\n", "We use fixed settings:\n", "* verse chunks\n", "* `SET` method / `LCS` method\n", "* matrix threshold 50 / 60\n", "* similarity threshold 75\n", "\n", "That is, we compute a matrix that contains all pairs with similarity above 50 or 60,\n", "depending on whether we use the `SET` method or the `LCS` method.\n", "\n", "From that matrix, we only use the similarities above 75.\n", "This gives us room to play without recomputing the matrix.\n", "\n", "We do not want to redo this computation if it can be avoided.\n", "\n", "Verse similarity is not very sensitive to changes in the encoding.\n", "It is very likely that similar verses in one version of the data agree with similar\n", "verses in all other versions.\n", "\n", "However, the node numbers of verses may change from version to version, so that part\n", "must be done again for each version.\n", "\n", "This is how we proceed:\n", "* the matrix computation gives us triples `(v1, v2, d)`, where `v1`, `v2` are verse nodes and `d` is their similarity\n", "* we store the result of the matrix computation in a tab-separated file with the fields\n", " `method, v1, v2, d, v1Ref, v2Ref`, where `v1Ref` and `v2Ref` are verse references,\n", " each consisting of exactly 3 fields: book, chapter, verse\n", "* NB: the similarity table has only one entry for each pair of similar verses per method.\n", " If `(v1, v2)` is in the table, `(v2, v1)` is not in the table, per method.\n", "\n", "When we run this notebook for the pipeline, we check for the presence of this file.\n", "If it is present, we use the verse references (`v1Ref`, `v2Ref`) in it to compute the verse nodes that are valid for the\n", "version we are going to produce.\n", "That gives us all the data we need, so we can skip the matrix computation.\n", "\n", "If the file is not present, we have to compute the matrix.\n", "There is also a parameter, `FORCE_MATRIX`, which can force a recomputation of the matrix." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We need some utility functions geared to TF feature production.\n", "The `get_verse()` function is simpler, and we do not have to run full experiments." 
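, "\n", "As an aside, here is a small illustrative sketch (with made-up node numbers, not the pipeline code itself) of the combination rule described above: `crossref` takes the union of the `SET` and `LCS` pairs, with the rounded average confidence where both methods found a pair.\n", "\n", "```python\n", "# illustrative sketch: per-method mappings are verseNode1 -> {verseNode2: similarity as integer percentage}\n", "simSET = {1414625: {1435841: 100}}\n", "simLCS = {1414625: {1435841: 90}, 1414401: {1414407: 84}}\n", "\n", "crossref = {}\n", "for sims in (simSET, simLCS):\n", "    other = simLCS if sims is simSET else simSET\n", "    for (v1, targets) in sims.items():\n", "        for (v2, s) in targets.items():\n", "            otherS = other.get(v1, {}).get(v2)\n", "            # union of both methods, averaged confidence where both methods found the pair\n", "            w = s if otherS is None else int(round((s + otherS) / 2))\n", "            crossref.setdefault(v1, {})[v2] = w\n", "            crossref.setdefault(v2, {})[v1] = w  # the edges are symmetric\n", "```\n", "\n", "The pipeline code further below does essentially this while iterating over the rows of the similarity table."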
] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[21]:" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "def writeSimTable(similars):\n", " with open(TF_TABLE, \"w\") as h:\n", " for entry in similars:\n", " h.write(\"{}\\n\".format(\"\\t\".join(str(x) for x in entry)))" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "def readSimTable():\n", " similars = []\n", " stats = set()\n", "\n", " with open(TF_TABLE) as h:\n", " for line in h:\n", " (\n", " method,\n", " v1,\n", " v2,\n", " sim,\n", " book1,\n", " chapter1,\n", " verse1,\n", " book2,\n", " chapter2,\n", " verse2,\n", " ) = line.rstrip(\"\\n\").split(\"\\t\")\n", " verseNode1 = T.nodeFromSection((book1, int(chapter1), int(verse1)))\n", " verseNode2 = T.nodeFromSection((book2, int(chapter2), int(verse2)))\n", " if verseNode1 != int(v1):\n", " stats.add(verseNode1)\n", " if verseNode2 != int(v2):\n", " stats.add(verseNode2)\n", " similars.append(\n", " (\n", " method,\n", " verseNode1,\n", " verseNode2,\n", " int(sim),\n", " book1,\n", " int(chapter1),\n", " int(verse1),\n", " book2,\n", " int(chapter2),\n", " int(verse2),\n", " )\n", " )\n", " nStats = len(stats)\n", " if nStats:\n", " utils.caption(\n", " 0,\n", " \"\\t\\tINFO: {} verse nodes have been changed between versions\".format(\n", " nStats\n", " ),\n", " )\n", " utils.caption(0, \"\\t\\tINFO: We will save and use the recomputed ones\")\n", " writeSimTable(similars)\n", " else:\n", " utils.caption(\n", " 0, \"\\t\\tINFO: All verse nodes are the same as in the previous version\"\n", " )\n", " return similars" ] }, { "cell_type": "code", "execution_count": 76, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "def makeSimTable():\n", " similars = []\n", " for (method, similarityCutoff) in (\n", " (\"SET\", 75),\n", " (\"LCS\", 75),\n", " ):\n", " (do_chunk, do_prep, do_sim, do_clique, skip) = do_params(\n", " False, \"verse\", method, similarityCutoff\n", " )\n", " chunking(do_chunk)\n", " preparing(do_prep)\n", " similarity(do_sim or FORCE_MATRIX)\n", " theseSimilars = []\n", " for ((chunk1, chunk2), sim) in sorted(\n", " (x, d) for (x, d) in chunk_dist.items() if d >= similarityCutoff\n", " ):\n", " verseNode1 = L.u(chunks[chunk1][0], otype=\"verse\")[0]\n", " verseNode2 = L.u(chunks[chunk2][0], otype=\"verse\")[0]\n", " simInt = int(round(sim))\n", " heading1 = T.sectionFromNode(verseNode1)\n", " heading2 = T.sectionFromNode(verseNode2)\n", " theseSimilars.append(\n", " (method, verseNode1, verseNode2, simInt, *heading1, *heading2)\n", " )\n", " utils.caption(\n", " 0,\n", " \"\\tMethod {}: found {} similar pairs of verses\".format(\n", " method, len(theseSimilars)\n", " ),\n", " )\n", " similars.extend(theseSimilars)\n", " writeSimTable(similars)\n", " return similars" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[22]:" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 
13s CROSSREFS: Fetching crossrefs .\n", "..............................................................................................\n" ] } ], "source": [ "utils.caption(4, \"CROSSREFS: Fetching crossrefs\")" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| 13s \tReading existing /Users/werk/github/etcbc/parallels/_temp/parallelTable.tsv\n" ] } ], "source": [ "xTable = os.path.exists(TF_TABLE)\n", "if FORCE_MATRIX:\n", " utils.caption(\n", " 0,\n", " \"\\t{} requested of {}\".format(\n", " \"Recomputing\" if xTable else \"computing\",\n", " TF_TABLE,\n", " ),\n", " )\n", "else:\n", " if xTable:\n", " utils.caption(0, \"\\tReading existing {}\".format(TF_TABLE))\n", " else:\n", " utils.caption(0, \"\\tComputing missing {}\".format(TF_TABLE))" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| 13s \t\tINFO: All verse nodes are the same as in the previous version\n" ] } ], "source": [ "if FORCE_MATRIX or not xTable:\n", " similars = makeSimTable()\n", "else:\n", " similars = readSimTable()" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[23]:" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('LCS', 1414401, 1414407, 84, 'Genesis', 1, 13, 'Genesis', 1, 19)\n", "('LCS', 1414401, 1414411, 89, 'Genesis', 1, 13, 'Genesis', 1, 23)\n", "('LCS', 1414403, 1414405, 77, 'Genesis', 1, 15, 'Genesis', 1, 17)\n", "('LCS', 1414407, 1414411, 84, 'Genesis', 1, 19, 'Genesis', 1, 23)\n", "('LCS', 1414498, 1414501, 79, 'Genesis', 5, 4, 'Genesis', 5, 7)\n", "('LCS', 1414498, 1414507, 75, 'Genesis', 5, 4, 'Genesis', 5, 13)\n", "('LCS', 1414498, 1414510, 78, 'Genesis', 5, 4, 'Genesis', 5, 16)\n", "('LCS', 1414498, 1414513, 86, 'Genesis', 5, 4, 'Genesis', 5, 19)\n", "('LCS', 1414498, 1414524, 77, 'Genesis', 5, 4, 'Genesis', 5, 30)\n", "('LCS', 1414498, 1414666, 79, 'Genesis', 5, 4, 'Genesis', 11, 11)\n", "('SET', 1414505, 1414623, 80, 'Genesis', 5, 11, 'Genesis', 9, 29)\n", "('SET', 1414510, 1414513, 77, 'Genesis', 5, 16, 'Genesis', 5, 19)\n", "('SET', 1414625, 1435841, 100, 'Genesis', 10, 2, '1_Chronicles', 1, 5)\n", "('SET', 1414629, 1435844, 100, 'Genesis', 10, 6, '1_Chronicles', 1, 8)\n", "('SET', 1414630, 1435845, 100, 'Genesis', 10, 7, '1_Chronicles', 1, 9)\n", "('SET', 1414631, 1435846, 100, 'Genesis', 10, 8, '1_Chronicles', 1, 10)\n", "('SET', 1414636, 1435847, 100, 'Genesis', 10, 13, '1_Chronicles', 1, 11)\n", "('SET', 1414637, 1435848, 100, 'Genesis', 10, 14, '1_Chronicles', 1, 12)\n", "('SET', 1414638, 1435849, 100, 'Genesis', 10, 15, '1_Chronicles', 1, 13)\n", "('SET', 1414639, 1414770, 83, 'Genesis', 10, 16, 'Genesis', 15, 21)\n" ] } ], "source": [ "if not SCRIPT:\n", " print(\"\\n\".join(sorted(repr(sim) for sim in similars if sim[0] == \"LCS\")[0:10]))\n", " print(\"\\n\".join(sorted(repr(sim) for sim in similars if sim[0] == \"SET\")[0:10]))" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "crossrefData = {}\n", "otherMethod = dict(LCS=\"SET\", SET=\"LCS\")" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [ "for (method, v1, v2, sim, *x) in similars:\n", " crossrefData.setdefault(method, {}).setdefault(v1, {})[v2] = sim\n", " 
crossrefData.setdefault(method, {}).setdefault(v2, {})[v1] = sim\n", " omethod = otherMethod[method]\n", " otherSim = crossrefData.get(omethod, {}).get(v1, {}).get(v2, None)\n", " thisSim = sim if otherSim is None else int(round((otherSim + sim) / 2))\n", " crossrefData.setdefault(\"\", {}).setdefault(v1, {})[v2] = thisSim\n", " crossrefData.setdefault(\"\", {}).setdefault(v2, {})[v1] = thisSim" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating parallels module for Text-Fabric\n", "\n", "We generate the feature `crossref`.\n", "It is an edge feature between verse nodes, with the similarity as weight." ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 6m 16s Writing TF parallel features .\n", "..............................................................................................\n" ] } ], "source": [ "utils.caption(4, \"Writing TF parallel features\")" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "newFeatureStr = \"crossref crossrefSET crossrefLCS\"\n", "newFeatures = newFeatureStr.strip().split()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "genericMetaPath = f\"{thisRepo}/yaml/generic.yaml\"\n", "parallelsMetaPath = f\"{thisRepo}/yaml/parallels.yaml\"\n", "\n", "with open(genericMetaPath) as fh:\n", " genericMeta = yaml.load(fh, Loader=yaml.FullLoader)\n", " genericMeta[\"version\"] = VERSION\n", "with open(parallelsMetaPath) as fh:\n", " parallelsMeta = formatMeta(yaml.load(fh, Loader=yaml.FullLoader))\n", "\n", "metaData = {\"\": genericMeta, **parallelsMeta}" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "nodeFeatures = dict()\n", "edgeFeatures = dict()\n", "for method in [\"\"] + list(otherMethod):\n", " edgeFeatures[\"crossref{}\".format(method)] = crossrefData[method]" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "for newFeature in newFeatures:\n", " metaData[newFeature][\"valueType\"] = \"int\"\n", " metaData[newFeature][\"edgeValues\"] = True" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "TF = Fabric(locations=thisTempTf, silent=True)\n", "TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating simple `crossref` notes for SHEBANQ\n", "We base them on the average of both methods, we supply the confidence." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[33]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MAX_REFS = 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def condenseX(vlabels):\n", " cnd = []\n", " (cur_b, cur_c) = (None, None)\n", " for (b, c, v, d) in vlabels:\n", " sep = (\n", " \"\"\n", " if cur_b is None\n", " else \". 
\"\n", " if cur_b != b\n", " else \"; \"\n", " if cur_c != c\n", " else \", \"\n", " )\n", " show_b = b + \" \" if cur_b != b else \"\"\n", " show_c = str(c) + \":\" if cur_b != b or cur_c != c else \"\"\n", " (cur_b, cur_c) = (b, c)\n", " cnd.append(\"{}[{}{}{}{}]\".format(sep, show_b, show_c, v, d))\n", " return cnd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "crossrefBase = crossrefData[\"\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "refsGrouped = []\n", "nCrossrefs = 0\n", "for (x, refs) in crossrefBase.items():\n", " vys = sorted(refs.keys())\n", " nCrossrefs += len(vys)\n", " currefs = []\n", " for vy in vys:\n", " nr = len(currefs)\n", " if nr == MAX_REFS:\n", " refsGrouped.append((x, tuple(currefs)))\n", " currefs = []\n", " currefs.append(vy)\n", " if len(currefs):\n", " refsGrouped.append((x, tuple(currefs)))" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| 8m 58s Compiled 31742 cross references into 6215 notes\n" ] } ], "source": [ "refsCompiled = []\n", "for (x, vys) in refsGrouped:\n", " vysd = [\n", " (*T.sectionFromNode(vy, lang=\"la\"), \" ~{}%\".format(crossrefBase[x][vy]))\n", " for vy in vys\n", " ]\n", " vysl = condenseX(vysd)\n", " these_refs = []\n", " for (i, vy) in enumerate(vysd):\n", " link_text = vysl[i]\n", " link_target = \"{} {}:{}\".format(vy[0], vy[1], vy[2])\n", " these_refs.append(\"{}({})\".format(link_text, link_target))\n", " refsCompiled.append((x, \" \".join(these_refs)))\n", "utils.caption(\n", " 0,\n", " \"Compiled {} cross references into {} notes\".format(nCrossrefs, len(refsCompiled)),\n", ")" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[34]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sfields = \"\"\"\n", " version\n", " book\n", " chapter\n", " verse\n", " clause_atom\n", " is_shared\n", " is_published\n", " status\n", " keywords\n", " ntext\n", "\"\"\".strip().split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sfields_fmt = (\"{}\\t\" * (len(sfields) - 1)) + \"{}\\n\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ofs = open(\"{}/{}\".format(thisNotes, notesFile), \"w\")\n", "ofs.write(\"{}\\n\".format(\"\\t\".join(sfields)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for (v, refs) in refsCompiled:\n", " firstWord = L.d(v, otype=\"word\")[0]\n", " ca = F.number.v(L.u(firstWord, otype=\"clause_atom\")[0])\n", " (bk, ch, vs) = T.sectionFromNode(v, lang=\"la\")\n", " ofs.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk,\n", " ch,\n", " vs,\n", " ca,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " refs,\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "| 8m 58s Generated 6215 notes\n" ] } ], "source": [ "utils.caption(0, \"Generated {} notes\".format(len(refsCompiled)))\n", "ofs.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Diffs\n", "\n", "Check differences with previous versions." 
] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[35]:" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 9m 05s Check differences with previous version .\n", "..............................................................................................\n", "| 9m 05s \t3 features to add\n", "| 9m 05s \t\tcrossref\n", "| 9m 05s \t\tcrossrefLCS\n", "| 9m 05s \t\tcrossrefSET\n", "| 9m 05s \tno features to delete\n", "| 9m 05s \t0 features in common\n", "| 9m 05s Done\n" ] } ], "source": [ "utils.checkDiffs(thisTempTf, thisTf, only=set(newFeatures))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Deliver\n", "\n", "Copy the new TF feature from the temporary location where it has been created to its final destination." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[36]:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 9m 19s Deliver data set to /Users/dirk/github/etcbc/parallels/tf/2021 .\n", "..............................................................................................\n" ] } ], "source": [ "utils.deliverDataset(thisTempTf, thisTf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compile TF" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[38]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "utils.caption(4, \"Load and compile the new TF features\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 
10m 25s Load and compile the new TF features .\n", "..............................................................................................\n", "This is Text-Fabric 8.5.13\n", "Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html\n", "\n", "117 features found and 0 ignored\n", " 0.00s loading features ...\n", " | 0.00s Dataset without structure sections in otext:no structure functions in the T-API\n", " 3.47s All features loaded/computed - for details use loadLog()\n" ] }, { "data": { "text/plain": [ "[('Computed',\n", " 'computed-data',\n", " ('C Computed', 'Call AllComputeds', 'Cs ComputedString')),\n", " ('Features', 'edge-features', ('E Edge', 'Eall AllEdges', 'Es EdgeString')),\n", " ('Fabric', 'loading', ('TF',)),\n", " ('Locality', 'locality', ('L Locality',)),\n", " ('Nodes', 'navigating-nodes', ('N Nodes',)),\n", " ('Features',\n", " 'node-features',\n", " ('F Feature', 'Fall AllFeatures', 'Fs FeatureString')),\n", " ('Search', 'search', ('S Search',)),\n", " ('Text', 'text', ('T Text',))]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "TF = Fabric(locations=[coreTf, thisTf], modules=[\"\"])\n", "api = TF.load(newFeatureStr)\n", "api.makeAvailableIn(globals())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We list all the `crossrefs` that the verses of Genesis 10 are involved in." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[39]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "utils.caption(4, \"Test: crossrefs of Genesis 10\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "chapter = (\"Genesis\", 10)\n", "chapterNode = T.nodeFromSection(chapter)\n", "startVerses = {}" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "lines_to_next_cell": 2 }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "..............................................................................................\n", ". 
10m 33s Test: crossrefs of Genesis 10 .\n", "..............................................................................................\n", "| 10m 33s \tMethod \n", "| 10m 33s \t\t20 start verses\n", "\t\tGenesis 10:2\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:5 confidende 100%\n", "\t\tGenesis 10:3\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:6 confidende 95%\n", "\t\tGenesis 10:4\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:7 confidende 95%\n", "\t\tGenesis 10:6\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:8 confidende 100%\n", "\t\tGenesis 10:7\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:9 confidende 100%\n", "\t\tGenesis 10:8\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:10 confidende 100%\n", "\t\tGenesis 10:13\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:11 confidende 100%\n", "\t\tGenesis 10:14\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:12 confidende 100%\n", "\t\tGenesis 10:15\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:13 confidende 100%\n", "\t\tGenesis 10:16\n", "| 10m 33s \t\t ----------> Genesis 15:21 confidende 83%\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:14 confidende 100%\n", "\t\tGenesis 10:17\n", "| 10m 33s \t\t ----------> Genesis 15:20 confidende 76%\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:15 confidende 100%\n", "\t\tGenesis 10:20\n", "| 10m 33s \t\t ----------> Genesis 10:31 confidende 87%\n", "\t\tGenesis 10:22\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:17 confidende 77%\n", "\t\tGenesis 10:24\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:18 confidende 100%\n", "\t\tGenesis 10:25\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:19 confidende 100%\n", "\t\tGenesis 10:26\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:20 confidende 100%\n", "\t\tGenesis 10:27\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:21 confidende 100%\n", "| 10m 33s \t\t ----------> 2_Chronicles 11:9 confidende 78%\n", "\t\tGenesis 10:28\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:22 confidende 100%\n", "\t\tGenesis 10:29\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:23 confidende 100%\n", "\t\tGenesis 10:31\n", "| 10m 33s \t\t ----------> Genesis 10:20 confidende 87%\n", "| 10m 33s \tMethod SET\n", "| 10m 33s \t\t20 start verses\n", "\t\tGenesis 10:2\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:5 confidende 100%\n", "\t\tGenesis 10:3\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:6 confidende 95%\n", "\t\tGenesis 10:4\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:7 confidende 95%\n", "\t\tGenesis 10:6\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:8 confidende 100%\n", "\t\tGenesis 10:7\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:9 confidende 100%\n", "\t\tGenesis 10:8\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:10 confidende 100%\n", "\t\tGenesis 10:13\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:11 confidende 100%\n", "\t\tGenesis 10:14\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:12 confidende 100%\n", "\t\tGenesis 10:15\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:13 confidende 100%\n", "\t\tGenesis 10:16\n", "| 10m 33s \t\t ----------> Genesis 15:21 confidende 83%\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:14 confidende 100%\n", "\t\tGenesis 10:17\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:15 confidende 100%\n", "\t\tGenesis 10:20\n", "| 10m 33s \t\t ----------> Genesis 10:31 confidende 80%\n", "\t\tGenesis 10:22\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:17 confidende 77%\n", "\t\tGenesis 10:24\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:18 confidende 100%\n", "\t\tGenesis 10:25\n", "| 10m 33s \t\t ----------> 
1_Chronicles 1:19 confidende 100%\n", "\t\tGenesis 10:26\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:20 confidende 100%\n", "\t\tGenesis 10:27\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:21 confidende 100%\n", "\t\tGenesis 10:28\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:22 confidende 100%\n", "\t\tGenesis 10:29\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:23 confidende 100%\n", "\t\tGenesis 10:31\n", "| 10m 33s \t\t ----------> Genesis 10:20 confidende 80%\n", "| 10m 33s \tMethod LCS\n", "| 10m 33s \t\t20 start verses\n", "\t\tGenesis 10:2\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:5 confidende 100%\n", "\t\tGenesis 10:3\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:6 confidende 95%\n", "\t\tGenesis 10:4\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:7 confidende 95%\n", "\t\tGenesis 10:6\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:8 confidende 100%\n", "\t\tGenesis 10:7\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:9 confidende 100%\n", "\t\tGenesis 10:8\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:10 confidende 100%\n", "\t\tGenesis 10:13\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:11 confidende 100%\n", "\t\tGenesis 10:14\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:12 confidende 100%\n", "\t\tGenesis 10:15\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:13 confidende 100%\n", "\t\tGenesis 10:16\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:14 confidende 100%\n", "\t\tGenesis 10:17\n", "| 10m 33s \t\t ----------> Genesis 15:20 confidende 76%\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:15 confidende 100%\n", "\t\tGenesis 10:20\n", "| 10m 33s \t\t ----------> Genesis 10:31 confidende 94%\n", "\t\tGenesis 10:22\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:17 confidende 77%\n", "\t\tGenesis 10:24\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:18 confidende 100%\n", "\t\tGenesis 10:25\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:19 confidende 100%\n", "\t\tGenesis 10:26\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:20 confidende 100%\n", "\t\tGenesis 10:27\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:21 confidende 100%\n", "| 10m 33s \t\t ----------> 2_Chronicles 11:9 confidende 78%\n", "\t\tGenesis 10:28\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:22 confidende 100%\n", "\t\tGenesis 10:29\n", "| 10m 33s \t\t ----------> 1_Chronicles 1:23 confidende 100%\n", "\t\tGenesis 10:31\n", "| 10m 33s \t\t ----------> Genesis 10:20 confidende 94%\n" ] } ], "source": [ "for method in [\"\", \"SET\", \"LCS\"]:\n", " utils.caption(0, \"\\tMethod {}\".format(method))\n", " for verseNode in L.d(chapterNode, otype=\"verse\"):\n", " crossrefs = Es(\"crossref{}\".format(method)).f(verseNode)\n", " if crossrefs:\n", " startVerses[T.sectionFromNode(verseNode)] = crossrefs\n", " utils.caption(0, \"\\t\\t{} start verses\".format(len(startVerses)))\n", " for (start, crossrefs) in sorted(startVerses.items()):\n", " utils.caption(0, \"\\t\\t{} {}:{}\".format(*start), continuation=True)\n", " for (target, confidence) in crossrefs:\n", " utils.caption(\n", " 0,\n", " \"\\t\\t{:>20} {:<20} confidende {:>3}%\".format(\n", " \"-\" * 10 + \">\",\n", " \"{} {}:{}\".format(*T.sectionFromNode(target)),\n", " confidence,\n", " ),\n", " )" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[29]:" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "lines_to_next_cell": 2 }, "outputs": [], "source": [ "if SCRIPT:\n", " stop(good=True)" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "# 6b. SHEBANQ annotations\n", "\n", "The code below generates extensive `crossref` notes for `4b`, including clique overviews and chapter diffs.\n", "But since the pipeline in October 2017, we generate much simpler notes.\n", "That code is above.\n", "\n", "We retain this code here, in case we want to expand the `crossref` functionality in the future again.\n", "\n", "Based on selected similarity matrices, we produce a SHEBANQ note set of cross references for similar passages." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[30]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_verse(i, ca=False):\n", " return get_verse_w(chunks[i][0], ca=ca)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_verse_o(o, ca=False):\n", " return get_verse_w(L.d(o, otype=\"word\")[0], ca=ca)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_verse_w(w, ca=False):\n", " book = F.book.v(L.u(w, otype=\"book\")[0])\n", " chapter = F.chapter.v(L.u(w, otype=\"chapter\")[0])\n", " verse = F.verse.v(L.u(w, otype=\"verse\")[0])\n", " if ca:\n", " ca = F.number.v(L.u(w, otype=\"clause_atom\")[0])\n", " return (book, chapter, verse, ca) if ca else (book, chapter, verse)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def key_verse(x):\n", " return (book_rank[x[0]], int(x[1]), int(x[2]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "MAX_REFS = 10" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def condensex(vlabels):\n", " cnd = []\n", " (cur_b, cur_c) = (None, None)\n", " for (b, c, v, d) in vlabels:\n", " sep = (\n", " \"\"\n", " if cur_b is None\n", " else \". 
\"\n", " if cur_b != b\n", " else \"; \"\n", " if cur_c != c\n", " else \", \"\n", " )\n", " show_b = b + \" \" if cur_b != b else \"\"\n", " show_c = c + \":\" if cur_b != b or cur_c != c else \"\"\n", " (cur_b, cur_c) = (b, c)\n", " cnd.append(\"{}{}{}{}{}\".format(sep, show_b, show_c, v, d))\n", " return cnd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfields = \"\"\"\n", " book1\n", " chapter1\n", " verse1\n", " book2\n", " chapter2\n", " verse2\n", " similarity\n", "\"\"\".strip().split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dfields_fmt = (\"{}\\t\" * (len(dfields) - 1)) + \"{}\\n\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_crossrefs():\n", " global crossrefs\n", " TF.info(\"CROSSREFS: Fetching crossrefs\")\n", " crossrefs_proto = {}\n", " crossrefs = {}\n", " (chunk_f, chunk_i, sim_m) = SHEBANQ_MATRIX\n", " sim_thr = SHEBANQ_SIMILARITY\n", " (do_chunk, do_prep, do_sim, do_clique, skip) = do_params(\n", " chunk_f, chunk_i, sim_m, sim_thr\n", " )\n", " if skip:\n", " return\n", " TF.info(\n", " \"CROSSREFS ({} {} {} S>{})\".format(CHUNK_LBS[chunk_f], chunk_i, sim_m, sim_thr)\n", " )\n", " crossrefs_proto = {x for x in chunk_dist.items() if x[1] >= sim_thr}\n", " TF.info(\n", " \"CROSSREFS ({} {} {} S>{}): found {} pairs\".format(\n", " CHUNK_LBS[chunk_f],\n", " chunk_i,\n", " sim_m,\n", " sim_thr,\n", " len(crossrefs_proto),\n", " )\n", " )\n", " f = open(CROSSREF_DB_PATH, \"w\")\n", " f.write(\"{}\\n\".format(\"\\t\".join(dfields)))\n", " for ((x, y), d) in crossrefs_proto:\n", " vx = get_verse(x)\n", " vy = get_verse(y)\n", " rd = int(round(d))\n", " crossrefs.setdefault(x, {})[vy] = rd\n", " crossrefs.setdefault(y, {})[vx] = rd\n", " f.write(dfields_fmt.format(*(vx + vy + (rd,))))\n", " total = sum(len(x) for x in crossrefs.values())\n", " f.close()\n", " TF.info(\n", " \"CROSSREFS: Found {} crossreferences and wrote {} pairs\".format(\n", " total, len(crossrefs_proto)\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_specific_crossrefs(chunk_f, chunk_i, sim_m, sim_thr, write_to):\n", " (do_chunk, do_prep, do_sim, do_clique, skip) = do_params(\n", " chunk_f, chunk_i, sim_m, sim_thr\n", " )\n", " if skip:\n", " return\n", " chunking(do_chunk)\n", " preparing(do_prep)\n", " similarity(do_sim)\n", "\n", " TF.info(\"CROSSREFS: Fetching crossrefs\")\n", " crossrefs_proto = {}\n", " crossrefs = {}\n", " (do_chunk, do_prep, do_sim, do_clique, skip) = do_params(\n", " chunk_f, chunk_i, sim_m, sim_thr\n", " )\n", " if skip:\n", " return\n", " TF.info(\n", " \"CROSSREFS ({} {} {} S>{})\".format(CHUNK_LBS[chunk_f], chunk_i, sim_m, sim_thr)\n", " )\n", " crossrefs_proto = {x for x in chunk_dist.items() if x[1] >= sim_thr}\n", " TF.info(\n", " \"CROSSREFS ({} {} {} S>{}): found {} pairs\".format(\n", " CHUNK_LBS[chunk_f],\n", " chunk_i,\n", " sim_m,\n", " sim_thr,\n", " len(crossrefs_proto),\n", " )\n", " )\n", " f = open(\"files/{}\".format(write_to), \"w\")\n", " f.write(\"{}\\n\".format(\"\\t\".join(dfields)))\n", " for ((x, y), d) in crossrefs_proto:\n", " vx = get_verse(x)\n", " vy = get_verse(y)\n", " rd = int(round(d))\n", " crossrefs.setdefault(x, {})[vy] = rd\n", " crossrefs.setdefault(y, {})[vx] = rd\n", " f.write(dfields_fmt.format(*(vx + vy + (rd,))))\n", " total = sum(len(x) for x in crossrefs.values())\n", " f.close()\n", " 
TF.info(\n", " \"CROSSREFS: Found {} crossreferences and wrote {} pairs\".format(\n", " total, len(crossrefs_proto)\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def compile_refs():\n", " global refs_compiled\n", " refs_grouped = []\n", " for x in sorted(crossrefs):\n", " refs = crossrefs[x]\n", " vys = sorted(refs.keys(), key=key_verse)\n", " currefs = []\n", " for vy in vys:\n", " nr = len(currefs)\n", " if nr == MAX_REFS:\n", " refs_grouped.append((x, tuple(currefs)))\n", " currefs = []\n", " currefs.append(vy)\n", " if len(currefs):\n", " refs_grouped.append((x, tuple(currefs)))\n", " refs_compiled = []\n", " for (x, vys) in refs_grouped:\n", " vysd = [(vy[0], vy[1], vy[2], \" ~{}%\".format(crossrefs[x][vy])) for vy in vys]\n", " vysl = condensex(vysd)\n", " these_refs = []\n", " for (i, vy) in enumerate(vysd):\n", " link_text = vysl[i]\n", " link_target = \"{} {}:{}\".format(vy[0], vy[1], vy[2])\n", " these_refs.append(\"[{}]({})\".format(link_text, link_target))\n", " refs_compiled.append((x, \" \".join(these_refs)))\n", " TF.info(\n", " \"CROSSREFS: Compiled cross references into {} notes\".format(len(refs_compiled))\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_chapter_diffs():\n", " global chapter_diffs\n", " chapter_diffs = []\n", " for cl in sorted(bin_cliques):\n", " lb1 = \"{} {}\".format(F.book.v(cl[0][0]), F.chapter.v(cl[0][1]))\n", " lb2 = \"{} {}\".format(F.book.v(cl[1][0]), F.chapter.v(cl[1][1]))\n", " hfilename = \"{}_vs_{}.html\".format(lb1, lb2).replace(\" \", \"_\")\n", " chapter_diffs.append(\n", " (\n", " lb1,\n", " cl[0][1],\n", " lb2,\n", " cl[1][1],\n", " \"{}/{}/{}/{}\".format(\n", " SHEBANQ_TOOL,\n", " LOCAL_BASE_OUTP,\n", " CHAPTER_DIR,\n", " hfilename,\n", " ),\n", " )\n", " )\n", " TF.info(\"CROSSREFS: Added {} chapter diffs\".format(2 * len(chapter_diffs)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_clique_refs():\n", " global clique_refs\n", " clique_refs = []\n", " for (i, c) in enumerate(cliques):\n", " for j in c:\n", " seq = i // CLIQUES_PER_FILE\n", " clique_refs.append(\n", " (\n", " j,\n", " i,\n", " \"{}/{}/{}/{}/clique_{}_{}.html#c_{}\".format(\n", " SHEBANQ_TOOL,\n", " LOCAL_BASE_OUTP,\n", " EXPERIMENT_DIR,\n", " base_name,\n", " base_name,\n", " seq,\n", " i,\n", " ),\n", " )\n", " )\n", " TF.info(\"CROSSREFS: Added {} clique references\".format(len(clique_refs)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sfields = \"\"\"\n", " version\n", " book\n", " chapter\n", " verse\n", " clause_atom\n", " is_shared\n", " is_published\n", " status\n", " keywords\n", " ntext\n", "\"\"\".strip().split()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sfields_fmt = (\"{}\\t\" * (len(sfields) - 1)) + \"{}\\n\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_notes():\n", " with open(NOTES_PATH, \"w\") as f:\n", " f.write(\"{}\\n\".format(\"\\t\".join(sfields)))\n", " x = next(F.otype.s(\"word\"))\n", " (bk, ch, vs, ca) = get_verse(x, ca=True)\n", " f.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk,\n", " ch,\n", " vs,\n", " ca,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " \"\"\"The crossref notes are the result of a computation without manual tweaks.\n", 
"Parameters: chunk by verse, similarity method SET with threshold 65.\n", "[Here](tool=parallel) is an account of the generation method.\"\"\".replace(\n", " \"\\n\", \" \"\n", " ),\n", " )\n", " )\n", " for (lb1, ch1, lb2, ch2, fl) in chapter_diffs:\n", " (bk1, ch1, vs1, ca1) = get_verse_o(ch1, ca=True)\n", " (bk2, ch2, vs2, ca2) = get_verse_o(ch2, ca=True)\n", " f.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk1,\n", " ch1,\n", " vs1,\n", " ca1,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " \"[chapter diff with {}](tool:{})\".format(lb2, fl),\n", " )\n", " )\n", " f.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk2,\n", " ch2,\n", " vs2,\n", " ca2,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " \"[chapter diff with {}](tool:{})\".format(lb1, fl),\n", " )\n", " )\n", " for (x, refs) in refs_compiled:\n", " (bk, ch, vs, ca) = get_verse(x, ca=True)\n", " f.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk,\n", " ch,\n", " vs,\n", " ca,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " refs,\n", " )\n", " )\n", " for (chunk, clique, fl) in clique_refs:\n", " (bk, ch, vs, ca) = get_verse(chunk, ca=True)\n", " f.write(\n", " sfields_fmt.format(\n", " VERSION,\n", " bk,\n", " ch,\n", " vs,\n", " ca,\n", " \"T\",\n", " \"\",\n", " CROSSREF_STATUS,\n", " CROSSREF_KEYWORD,\n", " \"[all variants (clique {})](tool:{})\".format(clique, fl),\n", " )\n", " )\n", "\n", " TF.info(\n", " \"CROSSREFS: Generated {} notes\".format(\n", " 1 + len(refs_compiled) + 2 * len(chapter_diffs) + len(clique_refs)\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "lines_to_next_cell": 2 }, "outputs": [], "source": [ "def crossrefs2shebanq():\n", " expr = SHEBANQ_MATRIX + (SHEBANQ_SIMILARITY,)\n", " do_experiment(*(expr + (True,)))\n", " get_crossrefs()\n", " compile_refs()\n", " get_chapter_diffs()\n", " get_clique_refs()\n", " generate_notes()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7. Main\n", "\n", "In the cell below you can select the experiments you want to carry out.\n", "\n", "The previous cells contain just definitions and parameters.\n", "The next cell will do work.\n", "\n", "If none of the matrices and cliques have been computed before on the system where this runs, doing all experiments might take multiple hours (4-8)." 
] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[ ]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "reset_params()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```\n", "do_experiment(False, 'sentence', 'LCS', 60, False)\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "do_all_experiments()" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "```\n", "do_all_experiments(no_fixed=True, only_object='chapter')\n", "crossrefs2shebanq()\n", "show_all_experiments()\n", "get_specific_crossrefs(False, 'verse', 'LCS', 60, 'crossrefs_lcs_db.txt')\n", "do_all_chunks()\n", "```\n" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[ ]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "lines_to_next_cell": 2 }, "outputs": [], "source": [ "HTML(ecss)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 8. Overview of the similarities\n", "\n", "Here are the plots of two similarity matrices\n", "* with verses as chunks and SET as similarity method\n", "* with verses as chunks and LCS as similarity method\n", "\n", "Horizontally you see the degree of similarity from 0 to 100%, vertically the number of pairs that have that (rounded) similarity. This axis is logarithmic." ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[ ]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "do_experiment(False, \"verse\", \"SET\", 60, False)\n", "distances = collections.Counter()\n", "for (x, d) in chunk_dist.items():\n", " distances[int(round(d))] += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "lines_to_next_cell": 2 }, "outputs": [], "source": [ "x = range(MATRIX_THRESHOLD, 101)\n", "fig = plt.figure(figsize=[15, 4])\n", "plt.plot(x, [math.log(max((1, distances[y]))) for y in x], \"b-\")\n", "plt.axis([MATRIX_THRESHOLD, 101, 0, 15])\n", "plt.xlabel(\"similarity as %\")\n", "plt.ylabel(\"log # similarities\")\n", "plt.xticks(x, x, rotation=\"vertical\")\n", "plt.margins(0.2)\n", "plt.subplots_adjust(bottom=0.15)\n", "plt.title(\"distances\")" ] }, { "cell_type": "markdown", "metadata": { "lines_to_next_cell": 2 }, "source": [ "In[ ]:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "do_experiment(False, \"verse\", \"LCS\", 60, False)\n", "distances = collections.Counter()\n", "for (x, d) in chunk_dist.items():\n", " distances[int(round(d))] += 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "lines_to_next_cell": 2 }, "outputs": [], "source": [ "x = range(MATRIX_THRESHOLD, 101)\n", "fig = plt.figure(figsize=[15, 4])\n", "plt.plot(x, [math.log(max((1, distances[y]))) for y in x], \"b-\")\n", "plt.axis([MATRIX_THRESHOLD, 101, 0, 15])\n", "plt.xlabel(\"similarity as %\")\n", "plt.ylabel(\"log # similarities\")\n", "plt.xticks(x, x, rotation=\"vertical\")\n", "plt.margins(0.2)\n", "plt.subplots_adjust(bottom=0.15)\n", "plt.title(\"distances\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In[ ]:" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": 
"python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" }, "toc": { "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "toc_cell": true, "toc_position": {}, "toc_section_display": "block", "toc_window_display": false }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }