"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=4)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause ZYqX NA
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 3"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 4"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(results, condenseType=\"clause\", end=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we want such pairs, but then where the grammatical number differs."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"clause\n",
" phrase function=Pred\n",
" w1:word pdp=verb\n",
" phrase function=Subj\n",
" =: w2:word pdp=subs\n",
" :=\n",
"w1 .nu#nu. w2\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.65s 739 results\n"
]
}
],
"source": [
"results = A.search(query)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | clause | phrase | word | phrase | word |
\n",
"1 | Genesis 1:1 | בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ | בָּרָ֣א | בָּרָ֣א | אֱלֹהִ֑ים | אֱלֹהִ֑ים |
\n",
"2 | Genesis 1:3 | וַיֹּ֥אמֶר אֱלֹהִ֖ים | יֹּ֥אמֶר | יֹּ֥אמֶר | אֱלֹהִ֖ים | אֱלֹהִ֖ים |
\n",
"3 | Genesis 1:4 | וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָאֹ֖ור | יַּ֧רְא | יַּ֧רְא | אֱלֹהִ֛ים | אֱלֹהִ֛ים |
\n",
"4 | Genesis 1:4 | וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָאֹ֖ור וּבֵ֥ין הַחֹֽשֶׁךְ׃ | יַּבְדֵּ֣ל | יַּבְדֵּ֣ל | אֱלֹהִ֔ים | אֱלֹהִ֔ים |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=4)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause xQtX NA
phrase PP Time
function=Time
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
phrase PP Objc
function=Objc
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 3"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
phrase PP Objc
function=Objc
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 4"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WayX NA
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
phrase PP Cmpl
function=Cmpl
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(results, condenseType=\"clause\", end=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"and now where the subject is not God(s)."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"clause\n",
" phrase function=Pred\n",
" w1:word pdp=verb\n",
" phrase function=Subj\n",
" =: w2:word pdp=subs lex#>LHJM/\n",
" :=\n",
"w1 .nu#nu. w2\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.77s 525 results\n"
]
}
],
"source": [
"results = A.search(query)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | clause | phrase | word | phrase | word |
\n",
"1 | Genesis 1:14 | יְהִ֤י מְאֹרֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם | יְהִ֤י | יְהִ֤י | מְאֹרֹת֙ | מְאֹרֹת֙ |
\n",
"2 | Genesis 3:5 | וְנִפְקְח֖וּ עֵֽינֵיכֶ֑ם | נִפְקְח֖וּ | נִפְקְח֖וּ | עֵֽינֵיכֶ֑ם | עֵֽינֵיכֶ֑ם |
\n",
"3 | Genesis 7:22 | כֹּ֡ל מִכֹּ֛ל מֵֽתוּ׃ | מֵֽתוּ׃ | מֵֽתוּ׃ | כֹּ֡ל | כֹּ֡ל |
\n",
"4 | Genesis 18:32 | אוּלַ֛י יִמָּצְא֥וּן שָׁ֖ם עֲשָׂרָ֑ה | יִמָּצְא֥וּן | יִמָּצְא֥וּן | עֲשָׂרָ֑ה | עֲשָׂרָ֑ה |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=4)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause ZYqX NA
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
phrase PP Loca
function=Loca
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause WQtX Resu
phrase CP Conj
function=Conj
phrase VP Pred
function=Pred
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 3"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause XQtl NA
phrase NP Subj
function=Subj
clause XQtl NA
phrase PP Adju
function=Adju
clause XQtl NA
phrase VP Pred
function=Pred
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 4"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause xYqX NA
phrase AdvP Modi
function=Modi
phrase VP Pred
function=Pred
phrase AdvP Cmpl
function=Cmpl
phrase NP Subj
function=Subj
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(results, condenseType=\"clause\", end=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Edges\n",
"\n",
"Note that all *edge* features in the dataset correspond to three relational operators.\n",
"For example, `mother` gives rise to the operators `-mother>` and ``.\n",
"\n",
"### Simple edges\n",
"Here is an example: look for pairs of clauses of which one is the mother of the other.\n",
"In our dataset, there is an *edge* between the two clauses, and this edge is coded in the feature `mother`.\n",
"The following query shows how to use the `mother` edge information."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:06.688698Z",
"start_time": "2018-05-24T08:00:05.864656Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.08s 13917 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
"clause\n",
"-mother> clause\n",
"\"\"\"\n",
"results = A.search(query)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:06.688698Z",
"start_time": "2018-05-24T08:00:05.864656Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | clause | clause |
\n",
"1 | Genesis 1:4 | כִּי־טֹ֑וב | וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָאֹ֖ור |
\n",
"2 | Genesis 1:10 | כִּי־טֹֽוב׃ | וַיַּ֥רְא אֱלֹהִ֖ים |
\n",
"3 | Genesis 1:12 | כִּי־טֹֽוב׃ | וַיַּ֥רְא אֱלֹהִ֖ים |
\n",
"4 | Genesis 1:14 | לְהַבְדִּ֕יל בֵּ֥ין הַיֹּ֖ום וּבֵ֣ין הַלָּ֑יְלָה | יְהִ֤י מְאֹרֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם |
\n",
"5 | Genesis 1:15 | לְהָאִ֖יר עַל־הָאָ֑רֶץ | וְהָי֤וּ לִמְאֹורֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם |
\n",
"6 | Genesis 1:17 | לְהָאִ֖יר עַל־הָאָֽרֶץ׃ | וַיִּתֵּ֥ן אֹתָ֛ם אֱלֹהִ֖ים בִּרְקִ֣יעַ הַשָּׁמָ֑יִם |
\n",
"7 | Genesis 1:18 | וְלִמְשֹׁל֙ בַּיֹּ֣ום וּבַלַּ֔יְלָה | לְהָאִ֖יר עַל־הָאָֽרֶץ׃ |
\n",
"8 | Genesis 1:18 | וּֽלֲהַבְדִּ֔יל בֵּ֥ין הָאֹ֖ור וּבֵ֣ין הַחֹ֑שֶׁךְ | וְלִמְשֹׁל֙ בַּיֹּ֣ום וּבַלַּ֔יְלָה |
\n",
"9 | Genesis 1:18 | כִּי־טֹֽוב׃ | וַיַּ֥רְא אֱלֹהִ֖ים |
\n",
"10 | Genesis 1:21 | כִּי־טֹֽוב׃ | וַיַּ֥רְא אֱלֹהִ֖ים |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The mother relation is not always between clause nodes. \n",
"What if we are interested in all nodes between which the mother relation exists, irrespective\n",
"of the type?\n",
"\n",
"Use the `.` in the query instead of `clause`. \n",
"The `.` stands for: *any node type*."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:06.688698Z",
"start_time": "2018-05-24T08:00:05.864656Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.75s 182269 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
".\n",
"-mother> .\n",
"\"\"\"\n",
"results = A.search(query)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | clause_atom (+1) | clause_atom (+1) |
\n",
"1 | Genesis 1:1 | אֵ֥ת הָאָֽרֶץ׃ | אֵ֥ת הַשָּׁמַ֖יִם |
\n",
"2 | Genesis 1:2 | וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ | בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ |
\n",
"3 | Genesis 1:2 | בֹ֔הוּ | תֹ֨הוּ֙ |
\n",
"4 | Genesis 1:2 | וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום | וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=4, colorMap={1: \"salmon\", 2: \"cyan\"})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can show more of the edges.\n",
"\n",
"Let's highlight all edges in the result in yellow."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse
sentence 1
sentence_atom 1
clause xQtX NA
clause_atom 0
mother•\n",
"⇥\n",
" \n",
"⇥\n",
"
phrase PP Objc
phrase_atom PP NA
subphrase
mother•\n",
"⇥\n",
"
subphrase
mother•\n",
"↦\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(\n",
" results,\n",
" end=1,\n",
" colorMap={1: \"salmon\", 2: \"cyan\"},\n",
" hiddenTypes={\"half_verse\"},\n",
" edgeHighlights=dict(mother={p: \"yellow\" for p in results}),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we color the edges between subphrases orange, the edges between clause atoms green, and the other edges yellow."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"ehighlights = {p: \"yellow\" for p in results}\n",
"\n",
"for (f, t) in results:\n",
" fType = F.otype.v(f)\n",
" tType = F.otype.v(t)\n",
" ehighlights[(f, t)] = (\n",
" (\n",
" \"orange\"\n",
" if fType == \"subphrase\"\n",
" else \"lightgreen\"\n",
" if fType == \"clause_atom\"\n",
" else \"yellow\"\n",
" )\n",
" if fType == tType\n",
" else \"yellow\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse
sentence 1
sentence_atom 1
clause xQtX NA
clause_atom 0
mother•\n",
"⇥\n",
" \n",
"⇥\n",
"
phrase PP Objc
phrase_atom PP NA
subphrase
mother•\n",
"⇥\n",
"
subphrase
mother•\n",
"↦\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(\n",
" results,\n",
" end=1,\n",
" colorMap={1: \"salmon\", 2: \"cyan\"},\n",
" hiddenTypes={\"half_verse\"},\n",
" edgeHighlights=dict(mother=ehighlights),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's have a look at result 2:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse
sentence 1
sentence_atom 1
clause xQtX NA
clause_atom 0
mother•\n",
"⇥\n",
" \n",
"⇥\n",
"
phrase PP Objc
phrase_atom PP NA
subphrase
mother•\n",
"⇥\n",
"
subphrase
mother•\n",
"↦\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse
sentence 2
sentence_atom 2
clause WXQt NA
clause_atom 422
mother•\n",
"⇥\n",
"\n",
"↦\n",
"
phrase NP PreC
phrase_atom NP NA
subphrase
mother•\n",
"⇥\n",
"
subphrase
mother•\n",
"↦\n",
"
sentence 3
sentence_atom 3
clause NmCl NA
clause_atom 402
mother•\n",
"⇥\n",
"\n",
"↦\n",
"
phrase PP PreC
phrase_atom PP NA
subphrase
mother•\n",
"↦\n",
"
sentence 4
sentence_atom 4
clause Ptcp NA
clause_atom 460
mother•\n",
"↦\n",
"
phrase NP Subj
phrase_atom NP NA
subphrase
mother•\n",
"↦\n",
"
phrase PP Cmpl
phrase_atom PP NA
subphrase
mother•\n",
"↦\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(\n",
" results,\n",
" start=2,\n",
" end=2,\n",
" colorMap={1: \"salmon\", 2: \"cyan\"},\n",
" hiddenTypes={\"half_verse\"},\n",
" edgeHighlights=dict(mother=ehighlights),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What about those yellow edges in the subphrases above? Didn't we say that those should be orange?\n",
"\n",
"No, because they do not point to a subphrase, but to the word in the subphrase. To make that\n",
"even more explicit, we show the node numbers:"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse:1414389
sentence:1172308 1
sentence_atom:1236025 1
clause:427559 xQtX NA
clause_atom:515690 0
mother•\n",
"515691⇥\n",
" \n",
"515694⇥\n",
"
phrase:651576 PP Objc
phrase_atom:904779 PP NA
subphrase:1300539
mother•\n",
"1300540⇥\n",
"
subphrase:1300540
mother•\n",
"↦1300539\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"verse:1414390
sentence:1172309 2
sentence_atom:1236026 2
clause:427560 WXQt NA
clause_atom:515691 422
mother•\n",
"515692⇥\n",
"\n",
"↦515690\n",
"
phrase:651580 NP PreC
phrase_atom:904783 NP NA
subphrase:1300541
mother•\n",
"1300542⇥\n",
"
subphrase:1300542
mother•\n",
"↦1300541\n",
"
sentence:1172310 3
sentence_atom:1236027 3
clause:427561 NmCl NA
clause_atom:515692 402
mother•\n",
"515693⇥\n",
"\n",
"↦515691\n",
"
phrase:651583 PP PreC
phrase_atom:904786 PP NA
subphrase:1300543
mother•\n",
"1300544⇥\n",
"
subphrase:1300544
mother•\n",
"↦22\n",
"
sentence:1172311 4
sentence_atom:1236028 4
clause:427562 Ptcp NA
clause_atom:515693 460
mother•\n",
"↦515692\n",
"
phrase:651585 NP Subj
phrase_atom:904788 NP NA
subphrase:1300545
mother•\n",
"1300546⇥\n",
"
subphrase:1300546
mother•\n",
"↦25\n",
"
phrase:651587 PP Cmpl
phrase_atom:904790 PP NA
subphrase:1300547
mother•\n",
"1300548⇥\n",
"
subphrase:1300548
mother•\n",
"↦29\n",
"
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(\n",
" results,\n",
" start=2,\n",
" end=2,\n",
" colorMap={1: \"salmon\", 2: \"cyan\"},\n",
" withNodes=True,\n",
" hiddenTypes={\"half_verse\"},\n",
" edgeHighlights=dict(mother=ehighlights),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A clause and its mother do not have to be in the same verse.\n",
"We are going to fetch are the cases where they are in different verses.\n",
"\n",
"Note that we need a more flexible syntax here, where we specify a few templates, give names\n",
"to a few positions in the template, and then constrain those positions\n",
"by stipulating relationships between them.\n",
"\n",
"> **Caution**\n",
"Referring to verses is not as innocent as it seems.\n",
"That will be addressed in [gaps](searchGaps.ipynb)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:11.096751Z",
"start_time": "2018-05-24T08:00:10.585477Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.13s 710 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
"v1:verse\n",
" c1:clause\n",
"v2:verse\n",
" c2:clause\n",
"\n",
"c1 -mother> c2\n",
"v1 # v2\n",
"\"\"\"\n",
"results = A.search(query)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:11.096751Z",
"start_time": "2018-05-24T08:00:10.585477Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | verse | clause | verse | clause |
\n",
"1 | Genesis 1:18 | | וְלִמְשֹׁל֙ בַּיֹּ֣ום וּבַלַּ֔יְלָה | | לְהָאִ֖יר עַל־הָאָֽרֶץ׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We want to see the different verse references in the table.\n",
"\n",
"We can skip the verse columns first:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:11.096751Z",
"start_time": "2018-05-24T08:00:10.585477Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | verse | clause | verse | clause |
\n",
"1 | Genesis 1:18 | | וְלִמְשֹׁל֙ בַּיֹּ֣ום וּבַלַּ֔יְלָה | | לְהָאִ֖יר עַל־הָאָֽרֶץ׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=1, skipCols=\"1 3\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"and then specify that the remaining columns (the clauses) show the passage:"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:11.096751Z",
"start_time": "2018-05-24T08:00:10.585477Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | verse | clause | verse | clause |
\n",
"1 | | Genesis 1:18 וְלִמְשֹׁל֙ בַּיֹּ֣ום וּבַלַּ֔יְלָה | | לְהָאִ֖יר עַל־הָאָֽרֶץ׃ |
\n",
"2 | | Genesis 2:7 וַיִּיצֶר֩ יְהוָ֨ה אֱלֹהִ֜ים אֶת־הָֽאָדָ֗ם עָפָר֙ מִן־הָ֣אֲדָמָ֔ה | | בְּיֹ֗ום |
\n",
"3 | | Genesis 7:3 לְחַיֹּ֥ות זֶ֖רַע עַל־פְּנֵ֥י כָל־הָאָֽרֶץ׃ | | מִכֹּ֣ל׀ הַבְּהֵמָ֣ה הַטְּהֹורָ֗ה תִּֽקַּח־לְךָ֛ שִׁבְעָ֥ה שִׁבְעָ֖ה אִ֣ישׁ וְאִשְׁתֹּ֑ו |
\n",
"4 | | Genesis 22:17 כִּֽי־בָרֵ֣ךְ אֲבָרֶכְךָ֗ | | כִּ֗י |
\n",
"5 | | Genesis 24:44 הִ֣וא הָֽאִשָּׁ֔ה | | הָֽעַלְמָה֙ |
\n",
"6 | | Genesis 27:45 עַד־שׁ֨וּב אַף־אָחִ֜יךָ מִמְּךָ֗ | | עַ֥ד אֲשֶׁר־תָּשׁ֖וּב חֲמַ֥ת אָחִֽיךָ׃ |
\n",
"7 | | Genesis 36:16 אַלּֽוּף־קֹ֛רַח אַלּ֥וּף גַּעְתָּ֖ם אַלּ֣וּף עֲמָלֵ֑ק | | בְּנֵ֤י אֱלִיפַז֙ בְּכֹ֣ור עֵשָׂ֔ו אַלּ֤וּף תֵּימָן֙ אַלּ֣וּף אֹומָ֔ר אַלּ֥וּף צְפֹ֖ו אַלּ֥וּף קְנַֽז׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=7, skipCols=\"1 3\", withPassage=\"1 2\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Edges with values\n",
"\n",
"There are also edge features that somehow *qualify* the relation between nodes they specify.\n",
"\n",
"The edge feature `crossref` in the\n",
"[parallels](https://github.com/ETCBC/parallels)\n",
"module specifies a relationship between verses: they are *parallel* if they are similar.\n",
"But `crossref` also tells you how similar, in the form of a number that is the percentage of similarity\n",
"according to the measure used by the algorithm to detect the parallels.\n",
"\n",
"This number is called the *value* of the `crossref` edge.\n",
"In our search templates we make use of the *values* of edge features.\n",
"\n",
"Not all edge features provide values. `mother` does not. But `crossref` does.\n",
"\n",
"Here is how many cross-references we have. The `crossref` edge feature is symmetric: if `v` is parallel to `w`, `w` is parallel to `v`. So in our query we stipulate that `v` comes before `w`:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:35.648065Z",
"start_time": "2018-05-24T08:00:35.276033Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.06s 15871 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
"v:verse\n",
"-crossref> w:verse\n",
"v < w\n",
"\"\"\"\n",
"results = A.search(query)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We get a quick overview of the similarity distribution of parallels by means of `freqList()`:"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:38.315507Z",
"start_time": "2018-05-24T08:00:38.291652Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"((100, 8456),\n",
" (80, 7796),\n",
" (84, 2874),\n",
" (86, 2328),\n",
" (76, 1274),\n",
" (77, 1220),\n",
" (78, 1170),\n",
" (79, 844),\n",
" (81, 844),\n",
" (75, 836),\n",
" (83, 754),\n",
" (88, 730),\n",
" (82, 720),\n",
" (92, 250),\n",
" (85, 248),\n",
" (90, 240),\n",
" (91, 216),\n",
" (94, 160),\n",
" (87, 148),\n",
" (95, 148),\n",
" (89, 142),\n",
" (96, 90),\n",
" (93, 88),\n",
" (98, 76),\n",
" (99, 58),\n",
" (97, 32))"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"E.crossref.freqList()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we want the cases with a high similarity, we can say:"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:40.831880Z",
"start_time": "2018-05-24T08:00:40.657543Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.04s 4356 results\n"
]
},
{
"data": {
"text/html": [
"n | verse | verse |
\n",
"1 | Genesis 10:2 | 1_Chronicles 1:5 |
\n",
"2 | Genesis 10:6 | 1_Chronicles 1:8 |
\n",
"3 | Genesis 10:7 | 1_Chronicles 1:9 |
\n",
"4 | Genesis 10:8 | 1_Chronicles 1:10 |
\n",
"5 | Genesis 10:13 | 1_Chronicles 1:11 |
\n",
"6 | Genesis 10:14 | 1_Chronicles 1:12 |
\n",
"7 | Genesis 10:15 | 1_Chronicles 1:13 |
\n",
"8 | Genesis 10:16 | 1_Chronicles 1:14 |
\n",
"9 | Genesis 10:17 | 1_Chronicles 1:15 |
\n",
"10 | Genesis 10:24 | 1_Chronicles 1:18 |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"v:verse\n",
"-crossref>95> w:verse\n",
"v < w\n",
"\"\"\"\n",
"results = A.search(query)\n",
"A.table(results, end=10, withPassage=\"1 2\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can also see the verses written out:"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | verse | verse |
\n",
"1 | Genesis 10:2 בְּנֵ֣י יֶ֔פֶת גֹּ֣מֶר וּמָגֹ֔וג וּמָדַ֖י וְיָוָ֣ן וְתֻבָ֑ל וּמֶ֖שֶׁךְ וְתִירָֽס׃ | 1_Chronicles 1:5 בְּנֵ֣י יֶ֔פֶת גֹּ֣מֶר וּמָגֹ֔וג וּמָדַ֖י וְיָוָ֣ן וְתֻבָ֑ל וּמֶ֖שֶׁךְ וְתִירָֽס׃ ס |
\n",
"2 | Genesis 10:6 וּבְנֵ֖י חָ֑ם כּ֥וּשׁ וּמִצְרַ֖יִם וּפ֥וּט וּכְנָֽעַן׃ | 1_Chronicles 1:8 בְּנֵ֖י חָ֑ם כּ֥וּשׁ וּמִצְרַ֖יִם פּ֥וּט וּכְנָֽעַן׃ |
\n",
"3 | Genesis 10:7 וּבְנֵ֣י כ֔וּשׁ סְבָא֙ וַֽחֲוִילָ֔ה וְסַבְתָּ֥ה וְרַעְמָ֖ה וְסַבְתְּכָ֑א וּבְנֵ֥י רַעְמָ֖ה שְׁבָ֥א וּדְדָֽן׃ | 1_Chronicles 1:9 וּבְנֵ֣י כ֔וּשׁ סְבָא֙ וַחֲוִילָ֔ה וְסַבְתָּ֥א וְרַעְמָ֖א וְסַבְתְּכָ֑א וּבְנֵ֥י רַעְמָ֖א שְׁבָ֥א וּדְדָֽן׃ ס |
\n",
"4 | Genesis 10:8 וְכ֖וּשׁ יָלַ֣ד אֶת־נִמְרֹ֑ד ה֣וּא הֵחֵ֔ל לִֽהְיֹ֥ות גִּבֹּ֖ר בָּאָֽרֶץ׃ | 1_Chronicles 1:10 וְכ֖וּשׁ יָלַ֣ד אֶת־נִמְרֹ֑וד ה֣וּא הֵחֵ֔ל לִהְיֹ֥ות גִּבֹּ֖ור בָּאָֽרֶץ׃ ס |
\n",
"5 | Genesis 10:13 וּמִצְרַ֡יִם יָלַ֞ד אֶת־לוּדִ֧ים וְאֶת־עֲנָמִ֛ים וְאֶת־לְהָבִ֖ים וְאֶת־נַפְתֻּחִֽים׃ | 1_Chronicles 1:11 וּמִצְרַ֡יִם יָלַ֞ד אֶת־לוּדִ֧ים וְאֶת־עֲנָמִ֛ים וְאֶת־לְהָבִ֖ים וְאֶת־נַפְתֻּחִֽים׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, end=5, withPassage=\"1 2\", full=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we want to inspect the cases with a lower similarity:"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:43.547559Z",
"start_time": "2018-05-24T08:00:43.379437Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.03s 2672 results\n"
]
},
{
"data": {
"text/html": [
"n | verse | verse |
\n",
"1 | Genesis 1:15 וְהָי֤וּ לִמְאֹורֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם לְהָאִ֖יר עַל־הָאָ֑רֶץ וַֽיְהִי־כֵֽן׃ | Genesis 1:17 וַיִּתֵּ֥ן אֹתָ֛ם אֱלֹהִ֖ים בִּרְקִ֣יעַ הַשָּׁמָ֑יִם לְהָאִ֖יר עַל־הָאָֽרֶץ׃ |
\n",
"2 | Genesis 5:4 וַיִּֽהְי֣וּ יְמֵי־אָדָ֗ם אַֽחֲרֵי֙ הֹולִידֹ֣ו אֶת־שֵׁ֔ת שְׁמֹנֶ֥ה מֵאֹ֖ת שָׁנָ֑ה וַיֹּ֥ולֶד בָּנִ֖ים וּבָנֹֽות׃ | Genesis 5:7 וַֽיְחִי־שֵׁ֗ת אַֽחֲרֵי֙ הֹולִידֹ֣ו אֶת־אֱנֹ֔ושׁ שֶׁ֣בַע שָׁנִ֔ים וּשְׁמֹנֶ֥ה מֵאֹ֖ות שָׁנָ֑ה וַיֹּ֥ולֶד בָּנִ֖ים וּבָנֹֽות׃ |
\n",
"3 | Genesis 5:4 וַיִּֽהְי֣וּ יְמֵי־אָדָ֗ם אַֽחֲרֵי֙ הֹולִידֹ֣ו אֶת־שֵׁ֔ת שְׁמֹנֶ֥ה מֵאֹ֖ת שָׁנָ֑ה וַיֹּ֥ולֶד בָּנִ֖ים וּבָנֹֽות׃ | Genesis 5:13 וַיְחִ֣י קֵינָ֗ן אַחֲרֵי֙ הֹולִידֹ֣ו אֶת־מַֽהֲלַלְאֵ֔ל אַרְבָּעִ֣ים שָׁנָ֔ה וּשְׁמֹנֶ֥ה מֵאֹ֖ות שָׁנָ֑ה וַיֹּ֥ולֶד בָּנִ֖ים וּבָנֹֽות׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"v:verse\n",
"-crossref<80> w:verse\n",
"v < w\n",
"\"\"\"\n",
"results = A.search(query)\n",
"A.table(results, end=3, withPassage=\"1 2\", full=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This shows how all features in your data can be queried in search templates, even the features that give values\n",
"to edges."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature conditions\n",
"\n",
"So far we have seen feature conditions in templates of these forms\n",
"\n",
"```\n",
"node feature=value\n",
"```\n",
"\n",
"But there is more."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Trivially true\n",
"\n",
"You can say\n",
"\n",
"```\n",
"node feature*\n",
"```\n",
"\n",
"which selects all nodes, irrespective of the existence or value of feature.\n",
"\n",
"This is a useless criterion in the sense that it does not influence the set of results.\n",
"\n",
"But when some applications run queries for you, they might use the features mentioned in your query\n",
"to decorate the results retrieved.\n",
"\n",
"This is your way to tell such applications that you want the values of `feature` included in your results.\n",
"\n",
"The text fabric browser looks at the features when it exports your results to CSV."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.35s 426590 results\n",
"426590\n",
" 0.34s 426590 results\n",
"426590\n"
]
}
],
"source": [
"query1 = \"\"\"\n",
"word vt*\n",
"\"\"\"\n",
"\n",
"query2 = \"\"\"\n",
"word\n",
"\"\"\"\n",
"\n",
"results = A.search(query1)\n",
"print(len(results))\n",
"\n",
"results = A.search(query1)\n",
"print(len(results))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inequality\n",
"\n",
"You can also say\n",
"\n",
"```\n",
"node feature#value\n",
"```\n",
"which selects nodes where the feature does not have `value`."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multiple values\n",
"\n",
"When stating a feature condition, such as `chapter=1`,\n",
"you may also specify a list of alternative values:\n",
"\n",
"```\n",
" chapter=1|2|3\n",
"```\n",
"\n",
"You may list as many values as you wish, for every feature.\n",
"\n",
"It also works with inequalities:\n",
"\n",
"```\n",
" chapter#1|2|3\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's find all verbally inflected words that are:\n",
"not in the qal, not in the third person, not in the singular,\n",
"not in the masculine."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.35s 271 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
"word sp=verb vs#qal vt#infc|infa|ptca|ptcp ps#p3 nu#sg gn#m\n",
"\"\"\"\n",
"\n",
"A.displaySetup(extraFeatures=\"vt ps nu gn\")\n",
"results = A.search(query, shallow=True)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for r in sorted(results)[0:5]:\n",
" A.pretty(r)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"A.displayReset(\"extraFeatures\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Existence of values\n",
"\n",
"If you are not interested in the particular value of a feature,\n",
"but only in whether there is a value or not, you can express that.\n",
"\n",
"### Qere\n",
"\n",
"We can ask for all words that have a qere.\n",
"Just leave out the `=value` part.\n",
"\n",
"```\n",
"word qere\n",
"```\n",
"\n",
"Conversely, we can ask for words without a qere.\n",
"Just add a `#` after the feature name.\n",
"\n",
"```\n",
"word qere#\n",
"```\n",
"\n",
"Let's test it."
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:49.932231Z",
"start_time": "2018-05-24T08:00:48.725647Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Words in total:\n",
" 0.25s 426590 results\n",
"Words with a qere:\n",
" 0.12s 1892 results\n",
"Words without a qere:\n",
" 0.30s 424698 results\n",
"qereWords + plainWords == allWords ? True\n"
]
}
],
"source": [
"query = \"\"\"\n",
"word\n",
"\"\"\"\n",
"print(\"Words in total:\")\n",
"results = A.search(query)\n",
"allWords = len(results)\n",
"\n",
"print(\"Words with a qere:\")\n",
"query = \"\"\"\n",
"word qere\n",
"\"\"\"\n",
"results = A.search(query)\n",
"qereWords = len(results)\n",
"\n",
"print(\"Words without a qere:\")\n",
"query = \"\"\"\n",
"word qere#\n",
"\"\"\"\n",
"results = A.search(query)\n",
"plainWords = len(results)\n",
"\n",
"print(f\"qereWords + plainWords == allWords ? {qereWords + plainWords == allWords}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Boundaries\n",
"\n",
"For features with *numerical* values, we may ask for values higher or lower than a given value.\n",
"\n",
"The\n",
"[dist](https://etcbc.github.io/bhsa/features/hebrew/2017/dist.html)\n",
"feature gives the distance between an object and its mother.\n",
"\n",
"We want to see it values by means of `freqList()`, but the feature is not yet loaded.\n",
"Let's do a query with it, after running it, the feature is loaded."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:55.805149Z",
"start_time": "2018-05-24T08:00:55.469647Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 1.67s 598 results\n"
]
}
],
"source": [
"query = \"\"\"\n",
"clause dist=1\n",
"\"\"\"\n",
"results = A.search(query)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can explore the frequencies:"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:00:58.342986Z",
"start_time": "2018-05-24T08:00:57.929824Z"
},
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"((0, 631151),\n",
" (-1, 104911),\n",
" (-2, 38188),\n",
" (-3, 14986),\n",
" (-4, 7665),\n",
" (-5, 3657),\n",
" (-6, 2145),\n",
" (1, 1773),\n",
" (-7, 1380),\n",
" (-8, 918))"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F.dist.freqList()[0:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us say we are interested in clause only. The feature `dist` is defined for multiple node types.\n",
"We can pass a set of node types to `freqList()` in order to get the frequencies restricted to those types:"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:00.822906Z",
"start_time": "2018-05-24T08:01:00.224369Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"((0, 67340),\n",
" (-1, 11593),\n",
" (-2, 3265),\n",
" (-3, 2437),\n",
" (-4, 1384),\n",
" (-5, 668),\n",
" (1, 598),\n",
" (-6, 329),\n",
" (-7, 167),\n",
" (-8, 70))"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"F.dist.freqList({\"clause\"})[0:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are negative distances. In those cases the mother precedes the daughter. Let's get the mothers that\n",
"precede their daughters by a large amount."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:05.718152Z",
"start_time": "2018-05-24T08:01:05.541047Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.03s 86 results\n"
]
},
{
"data": {
"text/html": [
"n | p | clause |
\n",
"1 | Genesis 25:12 | אֲשֶׁ֨ר יָלְדָ֜ה הָגָ֧ר הַמִּצְרִ֛ית שִׁפְחַ֥ת שָׂרָ֖ה לְאַבְרָהָֽם׃ |
\n",
"2 | Genesis 30:33 | אֲשֶׁר־אֵינֶנּוּ֩ נָקֹ֨ד וְטָל֜וּא בָּֽעִזִּ֗ים וְחוּם֙ בַּכְּשָׂבִ֔ים |
\n",
"3 | Genesis 49:11 | אֹסְרִ֤י לַגֶּ֨פֶן֙ עִירֹ֔ו |
\n",
"4 | Genesis 50:13 | אֲשֶׁ֣ר קָנָה֩ אַבְרָהָ֨ם אֶת־הַשָּׂדֶ֜ה לַאֲחֻזַּת־קֶ֗בֶר מֵאֵ֛ת עֶפְרֹ֥ן הַחִתִּ֖י |
\n",
"5 | Exodus 18:8 | אֲשֶׁ֨ר עָשָׂ֤ה יְהוָה֙ לְפַרְעֹ֣ה וּלְמִצְרַ֔יִם עַ֖ל אֹודֹ֣ת יִשְׂרָאֵ֑ל |
\n",
"6 | Exodus 25:9 | אֲשֶׁ֤ר אֲנִי֙ מַרְאֶ֣ה אֹותְךָ֔ אֵ֚ת תַּבְנִ֣ית הַמִּשְׁכָּ֔ן וְאֵ֖ת תַּבְנִ֣ית כָּל־כֵּלָ֑יו |
\n",
"7 | Exodus 38:26 | הָעֹבֵ֜ר עַל־הַפְּקֻדִ֗ים מִבֶּ֨ן עֶשְׂרִ֤ים שָׁנָה֙ וָמַ֔עְלָה |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"clause dist<-10\n",
"\"\"\"\n",
"results = A.search(query)\n",
"A.table(sorted(results), end=7)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Regular expressions\n",
"\n",
"An even more powerful way of specifying desired feature values is by regular expressions.\n",
"You can do this for *string-valued* values features only.\n",
"\n",
"Instead of specifying a feature condition like this\n",
"\n",
"```\n",
"typ=WIm0\n",
"```\n",
"\n",
"or\n",
"\n",
"```\n",
"typ=WIm0|WImX\n",
"```\n",
"\n",
"you can say\n",
"\n",
"```\n",
"typ~WIm[0X]\n",
"```\n",
"\n",
"Note that you do not use the `=` between feature name and value specification,\n",
"but `~`.\n",
"\n",
"The syntax and semantics of regular expressions are those as defined in the\n",
"[Python docs](https://docs.python.org/3/library/re.html#regular-expression-syntax).\n",
"\n",
"Note, that if you need to enter a `\\` in the regular expression, you have to double it.\n",
"Also, when you need a space in it, you have to put a `\\` in front of it."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### No value no match\n",
"\n",
"If you search with regular expressions, then nodes without a value do not match any regular expression.\n",
"\n",
"The regular expression `.*` matches everything.\n",
"\n",
"#### Qere\n",
"\n",
"Not all words have a qere.\n",
"\n",
"So we expect the following template to list all words that do have a qere and none of those that don't."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:11.104476Z",
"start_time": "2018-05-24T08:01:10.518168Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.14s 1892 results\n",
"Compare this with qere words: 1892: Equal\n"
]
}
],
"source": [
"query = \"\"\"\n",
"word qere~.*\n",
"\"\"\"\n",
"results = list(A.search(query))\n",
"matchWords = len(results)\n",
"print(\n",
" \"Compare this with qere words: \"\n",
" f'{qereWords}: {\"Equal\" if matchWords == qereWords else \"Unequal\"}'\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### More examples\n",
"\n",
"#### Two letter nouns\n",
"\n",
"We pick two letter nouns that start with an aleph."
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:13.922452Z",
"start_time": "2018-05-24T08:01:13.089321Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.21s 816 results\n"
]
},
{
"data": {
"text/html": [
"n | p | word |
\n",
"1 | Genesis 2:6 | אֵ֖ד |
\n",
"2 | Genesis 3:20 | אֵ֥ם |
\n",
"3 | Genesis 14:18 | אֵ֥ל |
\n",
"4 | Genesis 14:19 | אֵ֣ל |
\n",
"5 | Genesis 14:20 | אֵ֣ל |
\n",
"6 | Genesis 14:22 | אֵ֣ל |
\n",
"7 | Genesis 15:17 | אֵ֔שׁ |
\n",
"8 | Genesis 16:13 | אֵ֣ל |
\n",
"9 | Genesis 17:1 | אֵ֣ל |
\n",
"10 | Genesis 17:4 | אַ֖ב |
\n",
"11 | Genesis 17:5 | אַב־ |
\n",
"12 | Genesis 19:24 | אֵ֑שׁ |
\n",
"13 | Genesis 21:33 | אֵ֥ל |
\n",
"14 | Genesis 22:6 | אֵ֖שׁ |
\n",
"15 | Genesis 22:7 | אֵשׁ֙ |
\n",
"16 | Genesis 24:29 | אָ֖ח |
\n",
"17 | Genesis 27:45 | אַף־ |
\n",
"18 | Genesis 28:3 | אֵ֤ל |
\n",
"19 | Genesis 28:5 | אֵ֥ם |
\n",
"20 | Genesis 30:2 | אַ֥ף |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"word sp=subs g_cons~^>.$\n",
"\"\"\"\n",
"results = A.search(query, sort=True)\n",
"A.table(results, end=20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let us zoom in on one of the results.\n",
"We want to know more about the lexeme in question.\n",
"\n",
"There are several methods to do that."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Show the nodes\n",
"\n",
"First of all, let us show the nodes."
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:17.632676Z",
"start_time": "2018-05-24T08:01:17.624599Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.table(results, start=20, end=20, withNodes=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can use `pretty()` to get more info."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:20.072171Z",
"start_time": "2018-05-24T08:01:20.065240Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.pretty(results[19][0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that under the word is a link to its lexeme entry in SHEBANQ.\n",
"\n",
"##### Programmatically\n",
"With a bit of TF juggling you could also have got this link programmatically:"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"lx = L.u(results[19][0], otype=\"lex\")[0]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"אַף"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.webLink(lx)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Enrich the query\n",
"\n",
"We can also add some context to the query.\n",
"Since we are interested in the lexemes, let's add those to the query.\n",
"\n",
"Every word lies embedded in a lexeme."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:27.629901Z",
"start_time": "2018-05-24T08:01:26.793939Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.20s 816 results\n"
]
},
{
"data": {
"text/html": [
"n | p | lex | word |
\n",
"1 | Exodus 4:8 | אֹות | אֹ֣ת |
\n",
"2 | Exodus 4:8 | אֹות | אֹ֥ת |
\n",
"3 | Exodus 8:19 | אֹות | אֹ֥ת |
\n",
"4 | Exodus 12:13 | אֹות | אֹ֗ת |
\n",
"5 | Genesis 2:6 | אֵד | אֵ֖ד |
\n",
"6 | Genesis 27:45 | אַף | אַף־ |
\n",
"7 | Genesis 30:2 | אַף | אַ֥ף |
\n",
"8 | Exodus 4:14 | אַף | אַ֨ף |
\n",
"9 | Exodus 11:8 | אַף | אָֽף׃ ס |
\n",
"10 | Exodus 32:19 | אַף | אַ֣ף |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"lex\n",
" word sp=subs g_cons~^>.$\n",
"\"\"\"\n",
"results = A.search(query)\n",
"A.table(results, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Same amount of results, but the order is different.\n",
"We just use Python to get the lexemes only, together with their first occurrence.\n",
"We make a list of tuples, and feed that to `A.table()`."
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:38.168240Z",
"start_time": "2018-05-24T08:01:38.158934Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"n | p | lex | word |
\n",
"1 | Exodus 4:8 | אֹות | אֹ֣ת |
\n",
"2 | Genesis 2:6 | אֵד | אֵ֖ד |
\n",
"3 | Genesis 27:45 | אַף | אַף־ |
\n",
"4 | Genesis 17:4 | אָב | אַ֖ב |
\n",
"5 | Genesis 3:20 | אֵם | אֵ֥ם |
\n",
"6 | Genesis 24:29 | אָח | אָ֖ח |
\n",
"7 | Isaiah 20:6 | אִי | אִ֣י |
\n",
"8 | Genesis 14:18 | אֵל | אֵ֥ל |
\n",
"9 | Genesis 15:17 | אֵשׁ | אֵ֔שׁ |
\n",
"10 | Genesis 31:29 | אֵל | אֵ֣ל |
\n",
"11 | 2_Samuel 18:5 | אַט | אַט־ |
\n",
"12 | 2_Samuel 14:19 | אִשׁ | אִ֣שׁ׀ |
\n",
"13 | Ezekiel 40:48 | אַיִל | אֵ֣ל |
\n",
"14 | Jeremiah 36:22 | אָח | אָ֖ח |
\n",
"15 | Job 24:25 | אַל | אַ֗ל |
\n",
"16 | Ezra 5:8 | אָע | אָ֖ע |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"lexemes = set()\n",
"lexResults = []\n",
"for (lex, word) in results:\n",
" if lex not in lexemes:\n",
" lexemes.add(lex)\n",
" lexResults.append((lex, word))\n",
"A.table(lexResults)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Observe how you can use a query to get an interesting node set,\n",
"which you can then massage using standard Python machinery,\n",
"after which you can display the results prettily with `A.table()` or `A.show()`.\n",
"\n",
"**The take-away lesson is: you can use `A.table()` and `A.show()` on arbitrary iterables of tuples of nodes,\n",
"whether or not they come from an executed query.**\n",
"\n",
"The headers of the tables are taken from the node types of all tuples, but it shows the most\n",
"frequent one only. \n",
"If there are more types in the same column, it will be indicated, and if you hover over the `(+1)` you see which\n",
"types are also present."
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:41.297750Z",
"start_time": "2018-05-24T08:01:41.291652Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tuples = (\n",
" (1, 1000000),\n",
" (1000001, 2),\n",
")\n",
"A.table(tuples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Also `A.show()` makes perfect sense in this case."
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:43.690920Z",
"start_time": "2018-05-24T08:01:43.667562Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"clause 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"clause 3"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(tuples, condensed=True, condenseType=\"clause\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Everything that is part of a result, we see properly highlighted, but we can not discern what belongs to result 1 and what to result 2.\n",
"\n",
"That becomes clear if we uncondense:"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:46.770065Z",
"start_time": "2018-05-24T08:01:46.729601Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(tuples, condensed=False, condenseType=\"clause\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### we-x clauses with a non-qal verb\n",
"\n",
"If you look at the [clause types](https://etcbc.github.io/bhsa/features/hebrew/2017/typ.html)\n",
"you see a lot of types indicating that the clause starts with `we`:\n",
"\n",
"```\n",
"Way0\tWayyiqtol-null clause\n",
"WayX\tWayyiqtol-X clause\n",
"WIm0\tWe-imperative-null clause\n",
"WImX\tWe-imperative-X clause\n",
"WQt0\tWe-qatal-null clause\n",
"WQtX\tWe-qatal-X clause\n",
"WxI0\tWe-x-imperative-null clause\n",
"WXIm\tWe-X-imperative clause\n",
"WxIX\tWe-x-imperative-X clause\n",
"WxQ0\tWe-x-qatal-null clause\n",
"WXQt\tWe-X-qatal clause\n",
"WxQX\tWe-x-qatal-X clause\n",
"WxY0\tWe-x-yiqtol-null clause\n",
"WXYq\tWe-X-yiqtol clause\n",
"WxYX\tWe-x-yiqtol-X clause\n",
"WYq0\tWe-yiqtol-null clause\n",
"WYqX\tWe-yiqtol-X clause\n",
"```\n",
"\n",
"We are interested in the `We-x` and `We-X` clauses, so all clauses whose `typ` starts with `Wx` or `WX`.\n",
"\n",
"There are quite a number of verb stems. By means of a regular expression we can pick everything except `qal`.\n",
"\n",
"In the\n",
"[Python docs on regular expressions](https://docs.python.org/3/library/re.html#regular-expression-syntax)\n",
"we see that we can check for that by `^(?:!qal)`."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:01:53.494281Z",
"start_time": "2018-05-24T08:01:52.486679Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.24s 3098 results\n"
]
},
{
"data": {
"text/html": [
"n | p | clause | word |
\n",
"1 | Genesis 1:20 | וְעֹוף֙ יְעֹופֵ֣ף עַל־הָאָ֔רֶץ עַל־פְּנֵ֖י רְקִ֥יעַ הַשָּׁמָֽיִם׃ | יְעֹופֵ֣ף |
\n",
"2 | Genesis 2:10 | וּמִשָּׁם֙ יִפָּרֵ֔ד | יִפָּרֵ֔ד |
\n",
"3 | Genesis 2:25 | וְלֹ֖א יִתְבֹּשָֽׁשׁוּ׃ | יִתְבֹּשָֽׁשׁוּ׃ |
\n",
"4 | Genesis 3:18 | וְקֹ֥וץ וְדַרְדַּ֖ר תַּצְמִ֣יחַֽ לָ֑ךְ | תַּצְמִ֣יחַֽ |
\n",
"5 | Genesis 4:4 | וְהֶ֨בֶל הֵבִ֥יא גַם־ה֛וּא מִבְּכֹרֹ֥ות צֹאנֹ֖ו וּמֵֽחֶלְבֵהֶ֑ן | הֵבִ֥יא |
\n",
"6 | Genesis 4:7 | וְאִם֙ לֹ֣א תֵיטִ֔יב | תֵיטִ֔יב |
\n",
"7 | Genesis 4:14 | וּמִפָּנֶ֖יךָ אֶסָּתֵ֑ר | אֶסָּתֵ֑ר |
\n",
"8 | Genesis 4:26 | וּלְשֵׁ֤ת גַּם־הוּא֙ יֻלַּד־בֵּ֔ן | יֻלַּד־ |
\n",
"9 | Genesis 6:1 | וּבָנֹ֖ות יֻלְּד֥וּ לָהֶֽם׃ | יֻלְּד֥וּ |
\n",
"10 | Genesis 6:12 | וְהִנֵּ֣ה נִשְׁחָ֑תָה | נִשְׁחָ֑תָה |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"clause typ~^W[xX]\n",
" word sp=verb vs#qal\n",
"\"\"\"\n",
"results = list(A.search(query))\n",
"A.table(results, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Find all glosses with a space"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:02:00.113562Z",
"start_time": "2018-05-24T08:02:00.028266Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.01s 406 results\n"
]
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = r\"\"\"\n",
"lex gloss~[\\ ] sp=subs\n",
"\"\"\"\n",
"results = list(A.search(query))\n",
"A.table(results, start=1, end=4)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 1"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"lex תְּהֹום
primeval oceansp=subs
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 2"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"lex תַּחַת
under partsp=subs
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 3"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"lex יַבָּשָׁה
dry landsp=subs
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 4"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"lex דֶּשֶׁא
young grasssp=subs
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.show(results, condensed=False, start=1, end=4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Custom sets\n",
"\n",
"Eventually you reach cases where search templates are just not up to it.\n",
"\n",
"Examples:\n",
"\n",
"* What if you want to restrict a search to sentences that do not contain infrequent words?\n",
"* It is fairly tricky to look for gapped phrases. What if you look for complex patterns, but only in\n",
" gapped phrases?\n",
"\n",
"Before you dive head over heels into hand coding, here is an intermediate solution.\n",
"You can create node sets by means of search, and then use those node sets in other search templates\n",
"at the places where you have node types."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can make custom sets with arbitrary nodes, not all of the same type.\n",
"Let's collect all non-word, non-lex nodes that contain fairly frequent words only.\n",
"We also collect a set of nodes that contain highly infrequent words.\n",
"\n",
"There is a feature for that, [`rank_lex`](https://etcbc.github.io/bhsa/features/hebrew/2017/rank_lex.html).\n",
"Since we have not loaded it, we do so now."
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:05:53.690573Z",
"start_time": "2018-05-24T08:05:53.592190Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"TF.load(\"rank_lex\", add=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We set a threshold `COMMON_RANK`, and pick all objects with only high ranking words, their ranks between 0 and `COMMON_RANK`.\n",
"\n",
"We set a threshold `RARE_RANK`, and pick all objects that contain at least one low ranking word, its rank higher than `RARE_RANK`."
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:06:08.187551Z",
"start_time": "2018-05-24T08:06:00.209985Z"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"669195 members in set frequent\n",
"425320 members in set infrequent\n"
]
}
],
"source": [
"COMMON_RANK = 100\n",
"RARE_RANK = 500\n",
"\n",
"frequent = set()\n",
"infrequent = set()\n",
"\n",
"for n in N.walk():\n",
" nTp = F.otype.v(n)\n",
" if nTp == \"lex\":\n",
" continue\n",
" if nTp == \"word\":\n",
" ranks = [F.rank_lex.v(n)]\n",
" else:\n",
" ranks = [F.rank_lex.v(w) for w in L.d(n, otype=\"word\")]\n",
" maxRank = max(ranks)\n",
" minRank = min(ranks)\n",
" if maxRank < COMMON_RANK:\n",
" frequent.add(n)\n",
" if maxRank > RARE_RANK:\n",
" infrequent.add(n)\n",
"\n",
"print(f\"{len(frequent):>6} members in set frequent\")\n",
"print(f\"{len(infrequent):>6} members in set infrequent\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can do all kinds of searches within the domain of `frequent` and `infrequent` things.\n",
"\n",
"We give the names to all the sets and put them in a dictionary."
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:07:11.688552Z",
"start_time": "2018-05-24T08:07:11.685127Z"
},
"tags": []
},
"outputs": [],
"source": [
"customSets = dict(\n",
" frequent=frequent,\n",
" infrequent=infrequent,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then we pass it to `A.search()` with a query to look for sentences with a rare word that have a clause with only frequent words:"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.43s 4311 results\n"
]
},
{
"data": {
"text/html": [
"n | p | sentence | clause |
\n",
"5 | Genesis 1:25 | וַיַּ֥רְא אֱלֹהִ֖ים כִּי־טֹֽוב׃ | וַיַּ֥רְא אֱלֹהִ֖ים |
\n",
"6 | Genesis 1:29 | הִנֵּה֩ נָתַ֨תִּי לָכֶ֜ם אֶת־כָּל־עֵ֣שֶׂב׀ זֹרֵ֣עַ זֶ֗רַע אֲשֶׁר֙ עַל־פְּנֵ֣י כָל־הָאָ֔רֶץ וְאֶת־כָּל־הָעֵ֛ץ אֲשֶׁר־בֹּ֥ו פְרִי־עֵ֖ץ זֹרֵ֣עַ זָ֑רַע וּֽלְכָל־חַיַּ֣ת הָ֠אָרֶץ וּלְכָל־עֹ֨וף הַשָּׁמַ֜יִם וּלְכֹ֣ל׀ רֹומֵ֣שׂ עַל־הָאָ֗רֶץ אֲשֶׁר־בֹּו֙ נֶ֣פֶשׁ חַיָּ֔ה אֶת־כָּל־יֶ֥רֶק עֵ֖שֶׂב לְאָכְלָ֑ה | אֲשֶׁר֙ עַל־פְּנֵ֣י כָל־הָאָ֔רֶץ |
\n",
"7 | Genesis 2:2 | וַיְכַ֤ל אֱלֹהִים֙ בַּיֹּ֣ום הַשְּׁבִיעִ֔י מְלַאכְתֹּ֖ו אֲשֶׁ֣ר עָשָׂ֑ה | אֲשֶׁ֣ר עָשָׂ֑ה |
\n",
"8 | Genesis 2:2 | וַיִּשְׁבֹּת֙ בַּיֹּ֣ום הַשְּׁבִיעִ֔י מִכָּל־מְלַאכְתֹּ֖ו אֲשֶׁ֥ר עָשָֽׂה׃ | אֲשֶׁ֥ר עָשָֽׂה׃ |
\n",
"9 | Genesis 2:3 | כִּ֣י בֹ֤ו שָׁבַת֙ מִכָּל־מְלַאכְתֹּ֔ו אֲשֶׁר־בָּרָ֥א אֱלֹהִ֖ים לַעֲשֹֽׂות׃ פ | לַעֲשֹֽׂות׃ פ |
\n",
"10 | Genesis 2:4 | בְּיֹ֗ום עֲשֹׂ֛ות יְהוָ֥ה אֱלֹהִ֖ים אֶ֥רֶץ וְשָׁמָֽיִם׃ וַיִּיצֶר֩ יְהוָ֨ה אֱלֹהִ֜ים אֶת־הָֽאָדָ֗ם עָפָר֙ מִן־הָ֣אֲדָמָ֔ה | בְּיֹ֗ום |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"infrequent otype=sentence\n",
" frequent otype=clause\n",
"\"\"\"\n",
"results = A.search(query, sets=customSets)\n",
"A.table(results, start=5, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are going to show this really nice:\n",
"\n",
"* we add the feature `rank_lex` to the display\n",
"* we suppress the other features\n",
"* we color the rare words and the common words differently"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"ExecuteTime": {
"end_time": "2018-05-24T08:07:22.498973Z",
"start_time": "2018-05-24T08:07:22.065761Z"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"result 6"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"result 7"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"A.displaySetup(extraFeatures=\"rank_lex\")\n",
"highlights = {}\n",
"for (sentence, clause) in results:\n",
" highlights[sentence] = \"magenta\"\n",
" highlights[clause] = \"cyan\"\n",
" for w in L.d(sentence, otype=\"word\"):\n",
" if F.rank_lex.v(w) > RARE_RANK:\n",
" highlights[w] = \"magenta\"\n",
" for w in L.d(clause, otype=\"word\"):\n",
" if F.rank_lex.v(w) < COMMON_RANK:\n",
" highlights[w] = \"cyan\"\n",
"A.show(\n",
" results,\n",
" condensed=False,\n",
" start=6,\n",
" end=7,\n",
" suppress={\"sp\", \"vt\", \"vs\", \"function\", \"typ\", \"otype\"},\n",
" highlights=highlights,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now infrequent sentences ending in a frequent word:"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.45s 10798 results\n"
]
},
{
"data": {
"text/html": [
"n | p | sentence | word |
\n",
"5 | Genesis 1:9 | יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶל־מָקֹ֣ום אֶחָ֔ד | אֶחָ֔ד |
\n",
"6 | Genesis 1:10 | וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ | אֶ֔רֶץ |
\n",
"7 | Genesis 1:11 | תַּֽדְשֵׁ֤א הָאָ֨רֶץ֙ דֶּ֔שֶׁא עֵ֚שֶׂב מַזְרִ֣יעַ זֶ֔רַע עֵ֣ץ פְּרִ֞י עֹ֤שֶׂה פְּרִי֙ לְמִינֹ֔ו אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו עַל־הָאָ֑רֶץ | אָ֑רֶץ |
\n",
"8 | Genesis 1:15 | וְהָי֤וּ לִמְאֹורֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם לְהָאִ֖יר עַל־הָאָ֑רֶץ | אָ֑רֶץ |
\n",
"9 | Genesis 1:22 | וְהָעֹ֖וף יִ֥רֶב בָּאָֽרֶץ׃ | אָֽרֶץ׃ |
\n",
"10 | Genesis 1:26 | וְיִרְדּוּ֩ בִדְגַ֨ת הַיָּ֜ם וּבְעֹ֣וף הַשָּׁמַ֗יִם וּבַבְּהֵמָה֙ וּבְכָל־הָאָ֔רֶץ וּבְכָל־הָרֶ֖מֶשׂ הָֽרֹמֵ֥שׂ עַל־הָאָֽרֶץ׃ | אָֽרֶץ׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"infrequent otype=sentence\n",
" := frequent otype=word\n",
"\"\"\"\n",
"results = A.search(query, sets=customSets)\n",
"A.table(results, start=5, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a check, we replace the custom set `frequent` by the ordinary type `word` with a rank condition."
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0.37s 10798 results\n"
]
},
{
"data": {
"text/html": [
"n | p | sentence | word |
\n",
"5 | Genesis 1:9 | יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶל־מָקֹ֣ום אֶחָ֔ד | אֶחָ֔ד |
\n",
"6 | Genesis 1:10 | וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ | אֶ֔רֶץ |
\n",
"7 | Genesis 1:11 | תַּֽדְשֵׁ֤א הָאָ֨רֶץ֙ דֶּ֔שֶׁא עֵ֚שֶׂב מַזְרִ֣יעַ זֶ֔רַע עֵ֣ץ פְּרִ֞י עֹ֤שֶׂה פְּרִי֙ לְמִינֹ֔ו אֲשֶׁ֥ר זַרְעֹו־בֹ֖ו עַל־הָאָ֑רֶץ | אָ֑רֶץ |
\n",
"8 | Genesis 1:15 | וְהָי֤וּ לִמְאֹורֹת֙ בִּרְקִ֣יעַ הַשָּׁמַ֔יִם לְהָאִ֖יר עַל־הָאָ֑רֶץ | אָ֑רֶץ |
\n",
"9 | Genesis 1:22 | וְהָעֹ֖וף יִ֥רֶב בָּאָֽרֶץ׃ | אָֽרֶץ׃ |
\n",
"10 | Genesis 1:26 | וְיִרְדּוּ֩ בִדְגַ֨ת הַיָּ֜ם וּבְעֹ֣וף הַשָּׁמַ֗יִם וּבַבְּהֵמָה֙ וּבְכָל־הָאָ֔רֶץ וּבְכָל־הָרֶ֖מֶשׂ הָֽרֹמֵ֥שׂ עַל־הָאָֽרֶץ׃ | אָֽרֶץ׃ |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"query = \"\"\"\n",
"infrequent otype=sentence\n",
" := word rank_lex<100\n",
"\"\"\"\n",
"results = A.search(query, sets=customSets)\n",
"A.table(results, start=5, end=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that no matter how expensive the construction of a set has been, once you have it, queries based on it are just fast. There is no penalty when you use given sets instead of the familiar node types."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# All steps\n",
"\n",
"* **[start](start.ipynb)** your first step in mastering the bible computationally\n",
"* **[display](display.ipynb)** become an expert in creating pretty displays of your text structures\n",
"* **[search](search.ipynb)** turbo charge your hand-coding with search templates\n",
"\n",
"---\n",
"\n",
"advanced\n",
"\n",
"You have seen how to filter on feature values, of nodes and of edges.\n",
"\n",
"Now we want to set up sets for real.\n",
"\n",
"[sets](searchSets.ipynb)\n",
"[relations](searchRelations.ipynb)\n",
"[quantifiers](searchQuantifiers.ipynb)\n",
"[from MQL](searchFromMQL.ipynb)\n",
"[rough](searchRough.ipynb)\n",
"[gaps](searchGaps.ipynb)\n",
"\n",
"---\n",
"\n",
"* **[export Excel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results\n",
"* **[share](share.ipynb)** draw in other people's data and let them use yours\n",
"* **[export](export.ipynb)** export your dataset as an Emdros database\n",
"* **[annotate](annotate.ipynb)** annotate plain text by means of other tools and import the annotations as TF features\n",
"* **[map](map.ipynb)** map somebody else's annotations to a new version of the corpus\n",
"* **[volumes](volumes.ipynb)** work with selected books only\n",
"* **[trees](trees.ipynb)** work with the BHSA data as syntax trees\n",
"\n",
"CC-BY Dirk Roorda"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}