{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Lexemes\n", "\n", "Various ways to list the lexeme base of individual chapters in the Hebrew Bible.\n", "\n", "The *lexeme base* of a passage is the set of lexemes that occur in that passage.\n", "\n", "We define a function, ``lexbase(passages, excluded=xpassages)``, \n", "that produces a file of the lexemes that occur in a given list of passages and do not occur in another given list of passages.\n", "\n", "If you have LAF-Fabric working and downloaded this notebook, you can call this function yourself in order to generate \n", "lexeme bases of arbitrary passages.\n", "\n", "We also produce standard files with the lexeme bases of individual books, chapters and verses in the Bible.\n", "\n", "\n", "# Output\n", "\n", "The output files are organized as follows:\n", "\n", "* all files are comma separated text files that can be imported in a spreadsheet application such as OpenOffice or Excel;\n", "* every line corresponds to a lexeme in the lexeme base and contains the following information:\n", " * lexeme (unique identifier in transcription, containing `` / [ = `` characters),\n", " * frequency (number of occurrences of this lexeme in the whole Hebrew Bible),\n", " * ``lex_utf8`` feature (the lexeme in Hebrew as it occurs in the ETCBC text database),\n", " * ``g_entry_heb`` feature (the vocalized lexeme as it is listed in the ETCBC lexicon),\n", " * ``sp`` feature (part of speech),\n", " * ``gloss`` feature.\n", " \n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0.00s This is LAF-Fabric 4.5.0\n", "API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html\n", "Feature doc: http://shebanq-doc.readthedocs.org/en/latest/texts/welcome.html\n", "\n" ] } ], "source": [ "import sys, 
collections, re\n", "\n", "from laf.fabric import LafFabric\n", "from etcbc.preprocess import prepare\n", "fabric = LafFabric()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0.00s LOADING API: please wait ... \n", " 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T13-46-20\n", " 0.65s INFO: USING DATA COMPILED AT: 2015-05-04T14-07-34\n", " 3.67s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/__log__lexemes.txt\n", " 14s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK lexemes AT 2015-05-27T15-43-40\n" ] } ], "source": [ "version = '4b'\n", "fabric.load('etcbc{}'.format(version), 'lexicon', 'lexemes', {\n", " \"xmlids\": {\"node\": False, \"edge\": False},\n", " \"features\": ('''\n", " otype\n", " lex lex_utf8 g_entry_heb\n", " sp gloss\n", " book chapter verse\n", " ''',''),\n", " \"prepare\": prepare,\n", " \"primary\": False,\n", "})\n", "exec(fabric.localnames.format(var='fabric'))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "csvdir = my_file('csv')\n", "passagedir = my_file('passage')\n", "%mkdir -p {csvdir}\n", "%mkdir -p {passagedir}" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "# Passage syntax\n", "\n", "passages = | separated list of passage\n", "passage = bookname (chapterranges | (chapter : verseranges))\n", "chapterranges = empty | (, separated list of numberrange)\n", "verseranges = empty | (, separated list of numberrange)\n", "numberrange = number | (number - number)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [], "source": [ "passage_pat = re.compile('^\\s*([A-Za-z0-9_]+)\\s*([0-9,-]*)\\s*:?\\s*([0-9,-]*)\\s*$')\n", "\n", "lex_info = {}\n", "lex_section = {}\n", "lex_count = collections.Counter()\n", "for v in F.otype.s('verse'):\n", " bk = F.book.v(L.u('book', v))\n", " ch 
= F.chapter.v(L.u('chapter', v))\n", " vs = F.verse.v(v)\n", " for w in L.d('word', v):\n", " lex = F.lex.v(w)\n", " if lex not in lex_info:\n", " lex_info[lex] = (F.lex_utf8.v(w), F.g_entry_heb.v(w), F.sp.v(w), F.gloss.v(w))\n", " lex_section.setdefault(bk, {}).setdefault(ch, {}).setdefault(vs, collections.Counter())[lex] += 1\n", " lex_count[lex] += 1\n", "\n", "def verse_index():\n", " result = {}\n", " for v in F.verse.s():\n", " bk = F.book.v(L.u('book', v))\n", " ch = F.chapter.v(L.u('chapter', v))\n", " vs = F.verse.v(v)\n", " result.setdefault(bk, {}).setdefault(ch, {})[vs] = v\n", " return result\n", "\n", "vindex = verse_index()\n", "\n", "def parse_passages(passages):\n", " lexemes = set()\n", " for p in passages.strip().split('|'):\n", " lexemes |= parse_passage(p.strip())\n", " return lexemes\n", "\n", "def parse_ranges(rangespec, kind, passage, source, subsources=None):\n", " numbers = set()\n", " if rangespec == '':\n", " if subsources == None:\n", " return set(source.keys())\n", " else:\n", " for subsource in subsources:\n", " if subsource in source:\n", " numbers |= set(source[subsource].keys())\n", " return numbers\n", " ranges = rangespec.split(',')\n", " good = True\n", " for r in ranges:\n", " comps = r.split('-', 1)\n", " if len(comps) == 1:\n", " b = comps[0]\n", " e = comps[0]\n", " else:\n", " (b,e) = comps\n", " if not (b.isdigit() and e.isdigit()):\n", " print('Error: Not a valid {} range: [{}] in [{}]'.format(kind, r, passage))\n", " good = False\n", " else:\n", " b = int(b)\n", " e = int(e)\n", " for c in range(b, e+1):\n", " crep = str(c)\n", " if subsources == None:\n", " if crep not in source:\n", " print('Warning: No such {}: {} ([{}] in [{}])'.format(kind, crep, r, passage))\n", " numbers.add(crep)\n", " else:\n", " for subsource in subsources:\n", " if subsource not in source or crep not in source[subsource]:\n", " print('Warning: No such {}: {}:{} ([{}] in [{}])'.format(kind, subsource, crep, r, passage))\n", " 
numbers.add(crep)\n", " return numbers\n", " \n", "# Convert a single passage spec into the set of lexemes occurring in it.\n", "def parse_passage(passage):\n", " lexemes = set()\n", " result = passage_pat.match(passage)\n", " if result == None:\n", " print('Error: Not a valid passage: {}'.format(passage))\n", " return lexemes\n", " (book, chapterspec, versespec) = result.group(1,2,3)\n", " if book not in vindex:\n", " print('Error: Not a valid book: {} in {}'.format(book, passage))\n", " return lexemes\n", " chapters = parse_ranges(chapterspec, 'chapter', passage, vindex[book])\n", " verses = parse_ranges(versespec, 'verse', passage, vindex[book], chapters)\n", "\n", " vnodes = set()\n", " for ch in vindex[book]:\n", " if ch not in chapters: continue\n", " for vs in vindex[book][ch]:\n", " if vs not in verses: continue\n", " vnodes.add(vindex[book][ch][vs])\n", " lexemes = set()\n", " for v in vnodes:\n", " for w in L.d('word', v):\n", " lexemes.add(F.lex.v(w))\n", " return lexemes\n", " \n", "# Write the lexeme base of ``passages`` (minus that of ``excluded``) to a CSV file,\n", "# echoing at most the first 20 rows to stdout.\n", "def lexbase(passages, excluded=None):\n", " lexemes = parse_passages(passages)\n", " outlexemes = set() if excluded == None else parse_passages(excluded)\n", " lexemes -= outlexemes\n", " fileid = '{}{}'.format(\n", " passages, \n", " '' if excluded == None else ' minus {}'.format(excluded)\n", " )\n", " filename = 'passage/{}.csv'.format(fileid.replace(':','_'))\n", " of = outfile(filename)\n", " i = 0\n", " limit = 20\n", " nlex = len(lexemes)\n", " shown = min((nlex, limit))\n", " print('==== {} ==== showing {} of {} lexemes here ===='.format(fileid, shown, nlex))\n", " for lx in sorted(lexemes, key=lambda x: (-lex_count[x], x)):\n", " (l_utf8, l_vc, l_sp, l_gl) = lex_info[lx]\n", " # CSV row: lexeme, frequency, lex_utf8, vocalized entry, part of speech, gloss\n", " line = '\"{}\",{},\"{}\",\"{}\",\"{}\",\"{}\"\\n'.format(lx, lex_count[lx], l_utf8, l_vc, l_sp, l_gl)\n", " of.write(line)\n", " if i < limit: sys.stdout.write(line)\n", " i += 1\n", " of.close()\n", " print('See {}\\n'.format(my_file(filename)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Examples\n", "\n", "Here are some examples of the flexibility 
with which you can call the ``lexbase`` function." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "==== Genesis 2 ==== showing 20 of 131 lexemes here ====\n", "\"W\",51004,ו\",\"וְ\",\"conj\",\"and\"\n", "\"H\",30386,ה\",\"הַ\",\"art\",\"the\"\n", "\"L\",20447,ל\",\"לְ\",\"prep\",\"to\"\n", "\"B\",15767,ב\",\"בְּ\",\"prep\",\"in\"\n", "\">T\",11017,את\",\"אֵת\",\"prep\",\"\"\n", "\"MN\",7681,מן\",\"מִן\",\"prep\",\"from\"\n", "\"JHWH/\",6828,יהוה/\",\"יהוה\",\"nmpr\",\"YHWH\"\n", "\"L\",5521,אל\",\"אֶל\",\"prep\",\"to\"\n", "\">CR\",5500,אשׁר\",\"אֲשֶׁר\",\"conj\",\"\"\n", "\"KL/\",5495,כל/\",\"כֹּל\",\"subs\",\"whole\"\n", "\">MR[\",5378,אמר[\",\"אָמַר\",\"verb\",\"say\"\n", "\"L>\",5249,לא\",\"לֹא\",\"nega\",\"not\"\n", "\"KJ\",4483,כי\",\"כִּי\",\"conj\",\"that\"\n", "\"HJH[\",3561,היה[\",\"הָיָה\",\"verb\",\"be\"\n", "\"K\",2965,כ\",\"כְּ\",\"prep\",\"as\"\n", "\"LHJM/\",2601,אלהים/\",\"אֱלֹהִים\",\"subs\",\"god(s)\"\n", "\"BW>[\",2570,בוא[\",\"בֹּוא\",\"verb\",\"come\"\n", "\">RY/\",2504,ארץ/\",\"אֶרֶץ\",\"subs\",\"earth\"\n", "See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2.csv\n", "\n", "==== Genesis 2 minus Genesis 1 ==== showing 20 of 88 lexemes here ====\n", "\"JHWH/\",6828,יהוה/\",\"יהוה\",\"nmpr\",\"YHWH\"\n", "\"L>\",5249,לא\",\"לֹא\",\"nega\",\"not\"\n", "\"BW>[\",2570,בוא[\",\"בֹּוא\",\"verb\",\"come\"\n", "\">JC/\",2186,אישׁ/\",\"אִישׁ\",\"subs\",\"man\"\n", "\"HLK[\",1554,הלך[\",\"הָלַךְ\",\"verb\",\"walk\"\n", "\"HW>\",1409,הוא\",\"הוּא\",\"prps\",\"he\"\n", "\">B/\",1226,אב/\",\"אָב\",\"subs\",\"father\"\n", "\"LQX[\",965,לקח[\",\"לָקַח\",\"verb\",\"take\"\n", "\"KL[\",817,אכל[\",\"אָכַל\",\"verb\",\"eat\"\n", "\">JN/\",788,אין/\",\"אַיִן\",\"subs\",\"\"\n", "\">CH/\",781,אשׁה/\",\"אִשָּׁה\",\"subs\",\"woman\"\n", "\">LH\",747,אלה\",\"אֵלֶּה\",\"prde\",\"these\"\n", 
"\"R>C/\",613,ראשׁ/\",\"רֹאשׁ\",\"subs\",\"head\"\n", "\"FJM[\",609,שׂים[\",\"שִׂים\",\"verb\",\"put\"\n", "\"Z>T\",604,זאת\",\"זֹאת\",\"prde\",\"this\"\n", "\"MH\",587,מה\",\"מָה\",\"prin\",\"what\"\n", "See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 2 minus Genesis 1.csv\n", "\n", "==== Genesis 3-4,10 minus Genesis 1-2 ==== showing 20 of 248 lexemes here ====\n", "\"BN/\",4937,בן/\",\"בֵּן\",\"subs\",\"son\"\n", "\"JD/\",1635,יד/\",\"יָד\",\"subs\",\"hand\"\n", "\"M\",1068,אם\",\"אִם\",\"conj\",\"if\"\n", "\"CWB[\",1037,שׁוב[\",\"שׁוּב\",\"verb\",\"return\"\n", "\"JD<[\",991,ידע[\",\"יָדַע\",\"verb\",\"know\"\n", "\"T==\",848,את==\",\"אֵת\",\"prep\",\"together with\"\n", "\"GM\",769,גם\",\"גַּם\",\"advb\",\"even\"\n", "\">TH\",747,אתה\",\"אַתָּה\",\"prps\",\"you\"\n", "\"H=\",743,ה=\",\"הֲ\",\"inrg\",\"\"\n", "\">XR/\",718,אחר/\",\"אַחַר\",\"subs\",\"after\"\n", "\"DRK/\",706,דרך/\",\"דֶּרֶךְ\",\"subs\",\"way\"\n", "\"MYRJM/\",681,מצרים/\",\"מִצְרַיִם\",\"nmpr\",\"Egypt\"\n", "\"QWM[\",664,קום[\",\"קוּם\",\"verb\",\"arise\"\n", "See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Genesis 3-4,10 minus Genesis 1-2.csv\n", "\n", "==== Exodus minus Genesis ==== showing 20 of 631 lexemes here ====\n", "\"MCH=/\",766,משׁה=/\",\"מֹשֶׁה\",\"nmpr\",\"Moses\"\n", "\"QDC/\",469,קדשׁ/\",\"קֹדֶשׁ\",\"subs\",\"holiness\"\n", "\">HRWN/\",347,אהרון/\",\"אַהֲרֹון\",\"nmpr\",\"Aaron\"\n", "\"LWJ/\",296,לוי/\",\"לֵוִי\",\"adjv\",\"Levite\"\n", "\"KTB[\",231,כתב[\",\"כָּתַב\",\"verb\",\"write\"\n", "\"JHWCWBD[\",193,אבד[\",\"אָבַד\",\"verb\",\"perish\"\n", "\"LXM[\",171,לחם[\",\"לָחַם\",\"verb\",\"fight\"\n", "\"XKMH/\",157,חכמה/\",\"חָכְמָה\",\"subs\",\"wisdom\"\n", "\"FMX[\",154,שׂמח[\",\"שָׂמַח\",\"verb\",\"rejoice\"\n", "\"L[\",103,גאל[\",\"גָּאַל\",\"verb\",\"redeem\"\n", "\"CJT[\",85,שׁית[\",\"שִׁית\",\"verb\",\"put\"\n", "\"MKR[\",80,מכר[\",\"מָכַר\",\"verb\",\"sell\"\n", "\"ZR/\",70,זר/\",\"זָר\",\"adjv\",\"strange\"\n", 
"\"H/\",34,לאה/\",\"לֵאָה\",\"nmpr\",\"Leah\"\n", "\"RXM/\",31,רחם/\",\"רֶחֶם\",\"subs\",\"womb\"\n", "\"TJMN/\",23,תימן/\",\"תֵּימָן\",\"subs\",\"south\"\n", "\"NLH/\",14,גאלה/\",\"גְּאֻלָּה\",\"subs\",\"right of buying back\"\n", "\"KHNH/\",14,כהנה/\",\"כְּהֻנָּה\",\"subs\",\"priesthood\"\n", "\"BRQ=/\",13,ברק=/\",\"בָּרָק\",\"nmpr\",\"Barak\"\n", "See /Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/lexemes/passage/Numeri 1-3_10-15|Judices 5_1,3,5,7,9|Ruth 4 minus Chronica_I|Chronica_II.csv\n", "\n" ] } ], "source": [ "lexbase('Genesis 2', excluded=None)\n", "lexbase('Genesis 2', excluded='Genesis 1')\n", "lexbase('Genesis 3-4,10', excluded='Genesis 1-2')\n", "lexbase('Exodus', excluded='Genesis')\n", "lexbase('Numeri 1-3:10-15|Judices 5:1,3,5,7,9|Ruth 4', excluded='Chronica_I|Chronica_II')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Standard lexeme files\n", "\n", "Here we produce several lexeme files for books and chapters.\n", "\n", "## Output kind\n", "\n", "There are normal and incremental output files.\n", "In a normal output file, you find all lexemes for the indicated chapters and verses.\n", "In an *incremental* file, you find per indicated passage the lexemes that are new with respect to the previous passages (either the previous verses in the chapter, or the previous chapters in the book).\n", "\n", "## Output files\n", "\n", "* all_lexemes.csv contains a listing of all lexemes, ordered by frequency\n", "* *book*.csv contains a listing of all lexemes in that book\n", "* *book*_per_ch.csv contains a listing of all lexemes in that book, organized by chapter\n", "* *book*_per_ch_inc.csv contains a listing of all lexemes in that book, organized by chapter, where each chapter lists only the lexemes that did not occur in previous chapters of that book\n", "* *book*_per_vs.csv contains a listing of all lexemes in that book, organized by chapter and then by verse\n", "* *book*_per_vs_inc.csv contains a listing of all lexemes in that book, 
organized by chapter and then by verse, where each verse lists only the lexemes that did not occur in previous verses of that same chapter\n", "\n", "## Output location\n", "\n", "You can download the files as they have been generated by my LAF-Fabric installation via my SURFdrive:\n", "[version 4](https://surfdrive.surf.nl/files/public.php?service=files&t=dca1e8094a9b3c4c79f07d17306d12bd) \n", "[version 4b](https://surfdrive.surf.nl/files/public.php?service=files&t=faf643c647abb6bfd052f6c2898efae5)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [], "source": [ "outf = outfile(\"csv/all_lexemes.csv\")\n", "for (l, f) in sorted(lex_count.items(), key=lambda x: -x[1]):\n", " (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]\n", " outf.write('\"{}\",{},\"{}\",\"{}\",\"{}\",\"{}\"\\n'.format(\n", " l, f, l_utf8, l_vc, l_sp, l_gl,\n", " ))\n", "outf.close()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for bk in sorted(lex_section):\n", " outfb = outfile(\"csv/{}.csv\".format(bk))\n", " outfc = outfile(\"csv/{}_per_ch.csv\".format(bk))\n", " outfci = outfile(\"csv/{}_per_ch_inc.csv\".format(bk))\n", " outfv = outfile(\"csv/{}_per_vs.csv\".format(bk))\n", " outfvi = outfile(\"csv/{}_per_vs_inc.csv\".format(bk))\n", " bk_lex = set()\n", " for ch in sorted(lex_section[bk], key=lambda x: int(x)):\n", " ch_lex = set()\n", " for vs in sorted(lex_section[bk][ch], key=lambda x: int(x)):\n", " for l in sorted(lex_section[bk][ch][vs]):\n", " (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]\n", " f = lex_count[l]\n", " line = '\"{}\",{},{},\"{}\",{},\"{}\",\"{}\",\"{}\",\"{}\"\\n'.format(\n", " bk, ch, vs, l, f, l_utf8, l_vc, l_sp, l_gl,\n", " )\n", " outfv.write(line)\n", " if l not in ch_lex:\n", " ch_lex.add(l)\n", " outfvi.write(line)\n", " if l not in bk_lex:\n", " bk_lex.add(l)\n", " for l in sorted(ch_lex):\n", " (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]\n", " f = 
lex_count[l]\n", " line = '\"{}\",{},\"{}\",{},\"{}\",\"{}\",\"{}\",\"{}\"\\n'.format(\n", " bk, ch, l, f, l_utf8, l_vc, l_sp, l_gl,\n", " )\n", " outfc.write(line)\n", " if l not in bk_lex:\n", " bk_lex.add(l)\n", " outfci.write(line)\n", " for l in sorted(bk_lex):\n", " (l_utf8, l_vc, l_sp, l_gl) = lex_info[l]\n", " f = lex_count[l]\n", " line = '\"{}\",\"{}\",{},\"{}\",\"{}\",\"{}\",\"{}\"\\n'.format(\n", " bk, l, f, l_utf8, l_vc, l_sp, l_gl,\n", " )\n", " outfb.write(line)\n", " outfb.close()\n", " outfc.close()\n", " outfci.close() \n", " outfv.close()\n", " outfvi.close()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }