{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "\n", "\n", "\n", "" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Preparing a data file for participle research\n", "\n", "The source data is a file with mannual annotations by the LingVar group on a file with participle occurrences.\n", "This file derives from a set of qdf-files, filtered to contain only participle occurrences with context information, and then enriched with a complex custom annotation by hand.\n", "\n", "This notebook unravels the hand made annotations, distributes the relevant parts over several more columns, and finally transforms most columns in sets of level columns.\n", "\n", "This *levelling* of a column means, that if a column has a limited set of values, say ``aap``, ``noot``, ``mies``, we replace it by three columns, *has_aap*, *has_noot*, *has_mies*, having only boolean values.\n", "\n", "The code is organized in a set of successive stages, each having a limited task." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import sys, os\n", "import collections\n", "import re, csv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Definitions\n", "\n", "We replace the book information, originally given as label, by full information: book number, book name, book acronym." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "39 books\n" ] } ], "source": [ "books = {\n", "'AMOS': ('amos', 'amo'),\n", "'CAN': ('song of songs', 'sol'),\n", "'DAN': ('daniel', 'dan'),\n", "'DEUT': ('deuteronomy', 'deu'),\n", "'ESR': ('ezra', 'ezr'),\n", "'EST': ('esther', 'est'),\n", "'EXO': ('exodus', 'exo'),\n", "'EZE': ('ezekiel', 'eze'),\n", "'GEN': ('genesis', 'gen'),\n", "'HAB': ('habakkuk', 'hab'),\n", "'HAG': ('haggai', 'hag'),\n", "'HOS': ('hosea', 'hos'),\n", "'ICHR': ('1 chronicles', '1ch'),\n", "'IICHR': ('2 chronicles', '2ch'),\n", "'IIKON': ('2 kings', '2ki'),\n", "'IISA': ('2 samuel', '2sa'),\n", "'IKON': ('1 kings', '1ki'),\n", "'IOB': ('job', 'job'),\n", "'ISAM': ('1 samuel', '1sa'),\n", "'JER': ('jeremiah', 'jer'),\n", "'JES': ('isaiah', 'isa'),\n", "'JOE': ('joel', 'joe'),\n", "'JONA': ('jona', 'jon'),\n", "'JOZ': ('joshua', 'jos'),\n", "'LEV': ('leviticus', 'lev'),\n", "'MAL': ('malachi', 'mal'),\n", "'MICH': ('micah', 'mic'),\n", "'NAH': ('nahum', 'nah'),\n", "'NEH': ('nehemiah', 'neh'),\n", "'NUM': ('numbers', 'num'),\n", "'OBAD': ('obadiah', 'oba'),\n", "'PRO': ('proverbs', 'pro'),\n", "'PS': ('psalms', 'psa'),\n", "'QOH': ('qoheleth', 'qoh'),\n", "'RICHT': ('judges', 'jud'),\n", "'RUTH': ('ruth', 'rut'),\n", "'THR': ('lamentations', 'lam'),\n", "'ZACH': ('zechariah', 'zec'),\n", "'ZEP': ('zephaniah', 'zep'),\n", "}\n", "print(\"{} books\".format(len(books)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Staging\n", "\n", "Here is the code to go from stage to stage.\n", "Nothing fancy, just maintaining a bunch of global variables.\n", "Every stage has an input and output file handle, a place to drop error messages, and\n", "a set of current column headers, with index, so that we can refer to columns by name instead of by number.\n", "\n", "Successive stages transform the data, and may or may not add columns." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "base_dir = '{}/Dropbox/laf-fabric-output/etcbc4b/participle'.format(os.path.expanduser('~'))\n", "filepat = 'participia_compleet_r{}.csv'\n", "start_column_names = {0: tuple('C{:>02d}'.format(c+1) for c in range(17))}\n", "column_names = None\n", "column_index = None\n", "\n", "def infile(f): return open('{}/{}'.format(base_dir, f))\n", "def outfile(f): return open('{}/{}'.format(base_dir, f), mode='w')\n", "def msg(m):\n", " sys.stderr.write(m + '\\n')\n", " sys.stderr.flush()\n", "\n", "passage_pat = re.compile(r'([0-9]{2})\\s*([A-Z_]+)\\s*([0-9]+),([0-9]+)\\.([0-9]+)')\n", "\n", "errors = collections.defaultdict(lambda: collections.defaultdict(lambda: []))\n", "sourcef = None\n", "targetf = None\n", "data = None\n", "new_data = None\n", "nrows = None\n", "levels = None\n", "\n", "the_stage = 0\n", "\n", "def error(cat, r, f):\n", " errors[the_stage][cat].append((r,f))\n", " \n", "def make_passage(m):\n", " return (m.group(1), m.group(2), m.group(3), m.group(4), m.group(5))\n", "\n", "def stage_start(nr=None):\n", " global the_stage\n", " global sourcef\n", " global targetf\n", " global column_names\n", " global column_index\n", " global data\n", " global new_data\n", " global nrows\n", " global levels\n", " \n", " if nr == None:\n", " the_stage += 1\n", " else:\n", " the_stage = nr\n", " msg(\"===BEGIN==STAGE {}=====\".format(the_stage))\n", " column_names = list(start_column_names[the_stage - 1])\n", " column_index = dict((name, e) for (e, name) in enumerate(column_names))\n", " sfile = filepat.format(the_stage - 1)\n", " tfile = filepat.format(the_stage)\n", " msg('Column names before:\\n{}'.format(', '.join(column_names)))\n", " msg('Reading participle text data stage {} ({} => {}) ...'.format(the_stage, sfile, tfile))\n", " errors[the_stage] = collections.defaultdict(lambda: [])\n", " sourcef = infile(sfile)\n", " targetf = outfile(tfile)\n", " data = csv.reader(sourcef)\n", " new_data = csv.writer(targetf)\n", " nrows = 0\n", " levels = collections.defaultdict(lambda: collections.Counter())\n", "\n", "\n", "def stage_end(last=False):\n", " global targetf\n", " global columnindex\n", " \n", " start_column_names[the_stage] = tuple(column_names)\n", " column_index = dict((name, e) for (e, name) in enumerate(column_names))\n", "\n", " targetf.close()\n", " sourcef.close()\n", " tfile = filepat.format(the_stage)\n", " ffile = filepat.format('_final')\n", " targetf = infile(tfile)\n", " data = csv.reader(targetf)\n", " \n", " ncols = len(column_names)\n", " row_lengths = collections.Counter()\n", "\n", " if last:\n", " finalf = outfile(ffile)\n", " new_data = csv.writer(finalf)\n", " new_data.writerow(column_names)\n", "\n", " for row in data:\n", " row_lengths[len(row)] += 1\n", " for (e, field) in enumerate(row):\n", " levels[e][field] += 1\n", " if last:\n", " new_data.writerow(row)\n", " targetf.close()\n", " if last:\n", " finalf.close()\n", "\n", " show_n = 20\n", " for e in sorted(levels):\n", " valueset = levels[e].keys()\n", " lnv = len(valueset)\n", " examples = ' '.join(str(x) for x in sorted(valueset)[0:show_n])\n", " rest = ' ... {} more'.format(lnv - show_n) if lnv > show_n else ''\n", " print(\"{:<15} has {:>5} levels ({}{})\".format(column_names[e], lnv, examples, rest))\n", "\n", " msg(\"{:>5} total rows x {:>3} header columns\".format(nrows, ncols))\n", " for (rl, rw) in sorted(row_lengths.items(), key=lambda x: (-x[1], x[0])):\n", " msg(\"{:>5} body rows x {:>3} actual columns ({})\".format(rw, rl, 'OK' if rl == ncols else 'ERROR'))\n", " if errors:\n", " my_errors = errors[the_stage]\n", " for cat in sorted(my_errors):\n", " msg(\"Error: {} ({}x)\".format(cat, len(my_errors[cat])))\n", " for (r, f) in my_errors[cat]:\n", " msg(\"\\t{:>5}: {}\".format(r, f))\n", "\n", " msg(\"===END====STAGE {}=====\".format(the_stage))\n", "\n", "def show_col(colname):\n", " global column_index\n", " column_index = dict((name, e) for (e, name) in enumerate(column_names))\n", "\n", " print(\"Start levels of column {}\".format(colname))\n", " for (val, cnt) in sorted(levels[column_index[colname]].items(), key=lambda x: (-x[1], x[0])):\n", " print(\"{:<10}: {:>5}x\".format(val, cnt))\n", " print(\"End levels of column {}\".format(colname))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Rename and Remove\n", "\n", "* Rename several columns.\n", "* Remove several columns." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 1=====\n", "Column names before:\n", "C01, C02, C03, C04, C05, C06, C07, C08, C09, C10, C11, C12, C13, C14, C15, C16, C17\n", "Reading participle text data stage 1 (participia_compleet_r0.csv => participia_compleet_r1.csv) ...\n", " 9651 total rows x 15 header columns\n", " 9651 body rows x 15 actual columns (OK)\n", "===END====STAGE 1=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "C01 has 1294 levels (Aa Aa' Aav Ab'-2-v Ab-1- Ab-1-iv Ab-1-v Ab-2- Ab-2-v Ab-d2-v Ab-dj-v Ab-dj2-v Abv Ad Ad' Ad'-j-v Ad'v Ad-Pj- Ad-Pj-'v Ad-Pj-v ... 1274 more)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "domain has 98 levels (? ?? ??? ???Q ???QQQ ??N ??ND ??NDN ??NDND ??NQ ??NQQ ??Q ??QNQ ??QQ ?N ?ND ?NQ ?NQN ?NQND ?NQNQ ... 78 more)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 62 levels ( CR CR CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more)\n", "C10 has 16 levels ( >CR >M B-H C DJ H JCR K->CR KJ KMW LMCR)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif)\n", "C13 has 2 levels (act pas)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=1)\n", "\n", "rencols = dict(\n", " C02='state',\n", " C03='k',\n", " C04='domain',\n", " C06='phrf',\n", " C07='carc',\n", " C09='conj',\n", " C11='neg',\n", " C12='vstem',\n", " C14='lex',\n", " C16='clause',\n", " C17='comment',\n", ")\n", "delcols = '''C05 C17'''\n", "\n", "for old in rencols:\n", " column_names[column_index[old]] = rencols[old]\n", "\n", "delcols_sorted = sorted((column_index[x] for x in delcols.split()), reverse=True)\n", "\n", "for dc in delcols_sorted:\n", " del column_names[dc:dc+1]\n", "\n", "for row in data:\n", " for dc in delcols_sorted:\n", " del row[dc:dc+1]\n", " nrows += 1\n", " new_data.writerow(row)\n", "\n", "stage_end()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Book names, and the manual first column\n", "\n", "* Replace passage labels by book, chapter, verse indication, use English names and abbreviations for bible book names.\n", "* Split typ1 on -, fill with empty fields up to three fields (check the number of fields)\n", "* Strip trailing *v*-s from new column 1, and add them to the new column typ3.\n", "* If typ1 starts with *n* and original column 1 has at least one - and after the - is not a *w* or >: that is a mistake\n", "* extract ``w`` and ``>`` from new *typ1* and *typ3* and fill new *has_wav* and *has_alef* accordingly.\n", "* if typ1 is in a set of special values, replace those values with others\n", "* in column *lex*, strip the terminal [" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 2=====\n", "Column names before:\n", "C01, state, k, domain, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, C15, clause\n", "Reading participle text data stage 2 (participia_compleet_r1.csv => participia_compleet_r2.csv) ...\n", " 9651 total rows x 25 header columns\n", " 9651 body rows x 25 actual columns (OK)\n", "===END====STAGE 2=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2 has 367 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 347 more)\n", "typ3 has 130 levels ( : :a :e D Dv Dv: Dv:a E E:a Ei: Ev Ev:c Ev:s Nl O O: O:a O:a/qa O:e ... 110 more)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "domain has 98 levels (? ?? ??? ???Q ???QQQ ??N ??ND ??NDN ??NDND ??NQ ??NQQ ??Q ??QNQ ??QQ ?N ?ND ?NQ ?NQN ?NQND ?NQNQ ... 78 more)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 62 levels ( CR CR CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more)\n", "C10 has 16 levels ( >CR >M B-H C DJ H JCR K->CR KJ KMW LMCR)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif)\n", "C13 has 2 levels (act pas)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=2)\n", "\n", "bookabbs = set()\n", "\n", "typ1 = column_index['C01']\n", "bookl = column_index['C15']\n", "lex = column_index['lex']\n", "\n", "for row in data:\n", " lexval = row[lex]\n", " if not lexval.endswith('['):\n", " error('lexeme not ending on [', nrows, lexval)\n", " row[lex] = lexval.strip('[')\n", "\n", " match = passage_pat.match(row[bookl])\n", " if match:\n", " (booknum, bookabb, chapter, verse, seqnum) = match.groups()\n", " bookabbs.add(bookabb)\n", " (book_name, book_acro) = books[bookabb]\n", " chapnum = int(chapter)\n", " versenum = int(verse)\n", " vlabel = '{}{}.{}'.format(book_acro, chapnum, versenum)\n", " row[bookl:bookl+1] = (\n", " vlabel,\n", " int(booknum), \n", " book_name,\n", " book_acro,\n", " chapnum,\n", " versenum,\n", " int(seqnum),\n", " )\n", " else:\n", " error('Unrecognized passage', nrows, row[bookl])\n", "\n", " if row[typ1].count('-') > 2:\n", " error('More than 2 - in manual field', nrows, row[typ1])\n", " row[typ1:typ1+1] = (row[typ1].replace(\"'\",'').replace('\"','') + '-----').split('-')[0:3]\n", "\n", " row[typ1] = row[typ1].rstrip('\"')\n", " while row[typ1].endswith('v'):\n", " row[typ1] = row[typ1][0:-1]\n", " row[typ1+2] += 'v'\n", " if row[typ1].startswith('n') and not (row[typ1] == 'n' or row[typ1][1] in {'w', '>'}):\n", " error('n followed by stray characters in manual field', nrows, '-'.join(row[typ1:typ1+2]))\n", "\n", " has_wav = 1 if 'w' in row[typ1] or 'w' in row[typ1+2] else 0\n", " has_alef = 1 if '>' in row[typ1] or '>' in row[typ1+2] else 0\n", " row[typ1:typ1+1] = [\n", " row[typ1].replace('w', '').replace('>', ''), \n", " has_wav,\n", " has_alef,\n", " ]\n", " row[typ1+4] = row[typ1+4].replace('w', '').replace('>', '')\n", " if row[typ1] == 'N': row[typ1] = 'n'\n", " if row[typ1] == 'b2c': row[typ1] = 'b'\n", " if row[typ1].startswith('bijzin?nVv'):\n", " row[typ1] = 'n'\n", " row[typ1+3] = 'Vv'\n", " new_data.writerow(row)\n", " nrows += 1\n", "\n", "column_names[bookl:bookl+1] = ('vlabel', 'booknum', 'bookname', 'bookacro', 'chapter', 'verse', 'seqnum')\n", "column_names[typ1:typ1+1] = ('typ1', 'typ2', 'typ3')\n", "column_names[typ1:typ1+1] = ('typ1', 't1_wav', 't1_alef')\n", " \n", "stage_end()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Typ2 and Typ3\n", "\n", "\n", "* turn an ``a`` in *typ2* into a new feature *has_a* after typ2 and remove it from *typ2*, \n", " which is then called *typ2strip-a*\n", "* count the number of ``v``-s in *typ3*, add this as a new field *#_v* after *typ3*,\n", " and strip the ``v``-s from column *typ3*, which is then called *typ3strip-v*.\n", "* work out the values of column *domain*: translate them into level columns for Q, D, N, ? plus a column that gives the length (= the embedding level). The Q, D, N, ? count as follows: if on the last position: 1, second last: 0.5 else 0.\n", "\n", "After processing, analyse the column levels again, especially column *typ2strip-a*." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 3=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2, typ3, state, k, domain, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 3 (participia_compleet_r2.csv => participia_compleet_r3.csv) ...\n", " 9651 total rows x 31 header columns\n", " 9651 body rows x 31 actual columns (OK)\n", "===END====STAGE 3=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 62 levels ( CR CR CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more)\n", "C10 has 16 levels ( >CR >M B-H C DJ H JCR K->CR KJ KMW LMCR)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif)\n", "C13 has 2 levels (act pas)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=3)\n", "\n", "levels_0 = collections.defaultdict(lambda: collections.Counter())\n", "\n", "typ1 = column_index['typ1']\n", "typ2 = column_index['typ2']\n", "typ3 = column_index['typ3']\n", "dom = column_index['domain']\n", "dlvs = ['D', 'N', 'Q', '?']\n", "\n", "for row in data:\n", " d = row[dom]\n", " lend = len(d)\n", " these_dlvs = [0, 0, 0, 0]\n", " for (i, lv) in enumerate(dlvs):\n", " if d[-1] == lv:\n", " these_dlvs[i] = 1\n", " elif lend > 1 and d[-2] == lv:\n", " these_dlvs[i] = 0.5\n", " row[dom:dom+1] = these_dlvs + [lend]\n", " \n", " (f2, f3) = (row[typ2], 0)\n", " if row[typ1] in {'h', 'k'}:\n", " if f2.startswith('a'):\n", " f2 = f2.lstrip('a')\n", " f3 = 1\n", " row[typ2:typ2+1] = (f2, f3)\n", " \n", " nvs = row[typ3+1].count('v')\n", " row[typ3+1:typ3+2] = [\n", " row[typ3+1].replace('v',''), \n", " nvs,\n", " ]\n", " new_data.writerow(row)\n", " if row[typ1] in {'p', 'h', 'k'}:\n", " levels_0[row[typ1]][row[typ2]] += 1\n", " nrows += 1\n", "\n", "column_names[dom:dom+1] = ['dom_{}'.format(x) for x in dlvs] + ['dom_emb']\n", "column_names[typ2:typ2+1] = ('typ2strip-a', 't2_a')\n", "column_names[typ3+1:typ3+2] = ('typ3strip-v', 't3#v')\n", "\n", "stage_end()\n", "\n", "#show_col('typ1')\n", "#show_col('typ3strip-v')\n", "\n", "#for val0 in sorted(levels_0):\n", "# print(\"Levels of typ2strip-a if typ1 is {}:\".format(val0))\n", "# for (val, occ) in sorted(levels_0[val0].items(), key=lambda x: (-x[1], x[0])):\n", "# print(\"\\t{:<5} occurs {:>5}x\".format(val, occ)) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Typ2 continued\n", "\n", "Make levels for *type2strip-a*, but simplify the values.\n", "Only for the cases where typ1 is ``p``." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 4=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2strip-a, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 4 (participia_compleet_r3.csv => participia_compleet_r4.csv) ...\n", " 9651 total rows x 36 header columns\n", " 9651 body rows x 36 actual columns (OK)\n", "===END====STAGE 4=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_BLJ has 2 levels (0 1)\n", "t2_EJN has 2 levels (0 1)\n", "t2_HNH has 2 levels (0 1)\n", "t2_JC has 2 levels (0 1)\n", "t2_OWD has 2 levels (0 1)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 62 levels ( CR CR CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more)\n", "C10 has 16 levels ( >CR >M B-H C DJ H JCR K->CR KJ KMW LMCR)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 16 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal shaf tif)\n", "C13 has 2 levels (act pas)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=4)\n", "\n", "trans = '''\n", "H = HNH\n", "Hs = BLJ\n", "> = EJN\n", ">s = EJN\n", "< = OWD\n", "J = JC\n", "P> = EJN\n", " = EJN\n", "Js = JC\n", "r> = EJN\n", "r>s = EJN\n", ">< = EJN,OWD\n", "1>s = EJN\n", "B = BLJ\n", "Pb> = EJN\n", "hB = BLJ\n", "hJ = JC\n", "'''\n", "\n", "trans_table = dict(\n", " (x.strip(),set(y.strip().split(','))) \n", " for (x,y) in (\n", " z.strip().split('=') \n", " for z in trans.split('\\n') if z != ''\n", " )\n", ")\n", "t2_levels = set()\n", "for x in trans_table: t2_levels |= trans_table[x]\n", "t2_level_sorted = sorted(t2_levels)\n", "ll = len(t2_levels)\n", "\n", "typ1 = column_index['typ1']\n", "typ2 = column_index['typ2strip-a']\n", "\n", "for row in data:\n", " if row[typ1] == 'p':\n", " val = row[typ2]\n", " if val not in trans_table:\n", " error('Unrecognized level for typ2strip-a', nrows, val)\n", " row[typ2+1:typ2+1] = ['?' for x in t2_level_sorted]\n", " else:\n", " these_levels = trans_table[val]\n", " row[typ2+1:typ2+1] = [1 if x in these_levels else 0 for x in t2_level_sorted]\n", " else:\n", " row[typ2+1:typ2+1] = [0 for x in t2_level_sorted]\n", " nrows += 1\n", " new_data.writerow(row)\n", "\n", "column_names[typ2+1:typ2+1] = ['t2_{}'.format(x) for x in t2_level_sorted]\n", "stage_end()\n", "\n", "#show_col('typ3strip-v')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Verbal stem\n", "\n", "Deal with verbal stems, column *vstem* and the next one, which gives the values ``act`` and ``pas``.\n", "If *vstem* has ``qal`` and the next one has ``pas``, modify ``qal`` to ``qalp``.\n", "\n", "We use this stage for determining the levels, and later we do the actual levelling." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 5=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, C13, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 5 (participia_compleet_r4.csv => participia_compleet_r5.csv) ...\n", " 9651 total rows x 35 header columns\n", " 9651 body rows x 35 actual columns (OK)\n", "===END====STAGE 5=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_BLJ has 2 levels (0 1)\n", "t2_EJN has 2 levels (0 1)\n", "t2_HNH has 2 levels (0 1)\n", "t2_JC has 2 levels (0 1)\n", "t2_OWD has 2 levels (0 1)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 62 levels ( CR CR CR >CR-/W >L-H >M >T->CR >T-H >W >XR/->CR B-C B-H B-VRM/ ... 42 more)\n", "C10 has 16 levels ( >CR >M B-H C DJ H JCR K->CR KJ KMW LMCR)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=5)\n", "\n", "vstem = column_index['vstem']\n", "\n", "for row in data:\n", " val = row[vstem]\n", " val2 = row[vstem+1]\n", " if val == 'qal' and val2 == 'pas': val += 'p'\n", " row[vstem] = val\n", " del row[vstem+1:vstem+2]\n", " nrows += 1\n", " new_data.writerow(row)\n", "\n", "del column_names[vstem+1:vstem+2]\n", "stage_end()\n", "#show_col('vstem')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Conj\n", "\n", "Deal with conjunctions, column *conj* and the next one. If there is a value in the next one, take that, otherwise the value in the *conj* column. Then apply a translation table.\n", "\n", "The actual leveling occurs in the next stage." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 6=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, C10, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 6 (participia_compleet_r5.csv => participia_compleet_r6.csv) ...\n", " 9651 total rows x 34 header columns\n", " 9651 body rows x 34 actual columns (OK)\n", "===END====STAGE 6=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_BLJ has 2 levels (0 1)\n", "t2_EJN has 2 levels (0 1)\n", "t2_HNH has 2 levels (0 1)\n", "t2_JC has 2 levels (0 1)\n", "t2_OWD has 2 levels (0 1)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc has 179 levels (0 10 100 101 102 103 104 106 107 11 110 111 112 113 115 116 117 12 120 121 ... 159 more)\n", "C08 has 73 levels ( 0 100 101 102 103 106 110 111 112 113 116 12 120 121 122 123 126 127 130 ... 53 more)\n", "conj has 14 levels ( acr ad c di empty h h im kacer ki ow pn w)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=6)\n", "\n", "conj = column_index['conj']\n", "\n", "trans_conj = {\n", " '': 'empty',\n", " 'W': 'w',\n", " 'H': 'h',\n", " 'KJ': 'ki',\n", " '>CR': 'acr',\n", " 'W-/H': 'h',\n", " '>M': 'im',\n", " 'DJ': 'di',\n", " 'K->CR': 'kacer',\n", " 'W-/>M': 'im',\n", " 'C': 'c',\n", " 'W-/KJ': 'ki',\n", " 'L-H': 'h',\n", " '>T->CR': 'acr',\n", " 'K-H': 'h',\n", " 'PN': 'pn',\n", " 'W-/W': 'w',\n", " 'W': 'ow',\n", " 'KJ->M': 'im',\n", " '>XR/->CR': 'acr',\n", " 'KJ-/>M': 'im',\n", " 'LW': None,\n", " 'W-/>T-H': 'h',\n", " 'JCR': 'acr',\n", " 'LMCR': 'acr',\n", " 'CR': 'acr',\n", " 'CR': 'acr',\n", " 'CR-/W': 'w',\n", " '>L-H': 'h',\n", " '>T-H': 'h',\n", " 'B-C': 'c',\n", " 'B-H': 'h',\n", " 'B-VRM/': None,\n", " 'H-/W': 'w',\n", " 'JCR': 'acr',\n", " 'K-C': 'c',\n", " 'K-PH/->CR': 'acr',\n", " 'KJ-/LWL>': None,\n", " 'KMW': None,\n", " 'LWL>': None,\n", " 'MN': None,\n", " 'MN-DJ': 'di',\n", " 'MN-L-BD/-H': 'h',\n", " 'TXT/->CR': 'acr',\n", " 'W-/L-H': 'h ',\n", " 'W-/B-KL/-DJ': 'di',\n", " 'W-/DJ': 'di',\n", " 'W-/L->CR': 'acr',\n", " 'W-/L-H': 'h',\n", " 'W-/LW': None,\n", " 'W-/W-/W': 'w',\n", "}\n", "\n", "for row in data:\n", " val = row[conj]\n", " val2 = row[conj+1]\n", " if val2 != '': val = val2\n", " row[conj] = trans_conj[val] or ''\n", " del row[conj+1:conj+2]\n", " nrows += 1\n", " new_data.writerow(row)\n", "\n", "del column_names[conj+1:conj+2]\n", "stage_end()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Carc\n", "\n", "Deal with *carc*.\n", "\n", "There should be no empty carcs.\n", "\n", "In some cases we do not take the value in the *carc* column, but the value in the next column:\n", "* if the value is ``200`` or ``201`` take the value of the next column (which may be empty)\n", "\n", "There should not be other three digit values starting with ``2``.\n", "\n", "The result is a two or three digit number.\n", "\n", "Here are the rules for processing the resulting values\n", "* `` `` (empty value, coming from the next column): *carc1* = ``chain``, no levels in other columns\n", "* ``0``: *carc1* = ``txto``, no levels in other columns\n", "* ``10`` - ``16``: *carc1* = ``rela``, *carc2* = second digit, no level in *carc3*\n", "* ``50`` - ``74``: *carc1* = ``infc``, no levels in other columns\n", "* there should be no other two-digit values\n", "* ``999``: *carc1* = ``q``, no levels in other columns\n", "* *def* where d, e, f are digits: *carc1* = *d*, *carc2* = *e*, *carc3* = *f*" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 7=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc, C08, conj, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 7 (participia_compleet_r6.csv => participia_compleet_r7.csv) ...\n", " 9651 total rows x 35 header columns\n", " 9651 body rows x 35 actual columns (OK)\n", "===END====STAGE 7=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_BLJ has 2 levels (0 1)\n", "t2_EJN has 2 levels (0 1)\n", "t2_HNH has 2 levels (0 1)\n", "t2_JC has 2 levels (0 1)\n", "t2_OWD has 2 levels (0 1)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 77 levels ( : :a :c :e D D: D:a E E:a E:c E:s Ei: Nl O O: O:a O:a/qa O:c O:e ... 57 more)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state has 4 levels (: :a :c :e)\n", "k has 4 levels ( #NAAM? +K +K=)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf has 8 levels (AdjP AdvP DPrP NP PP PPrP PrNP VP)\n", "carc1 has 12 levels (1 3 4 5 6 7 8 chain infc q rela txto)\n", "carc2 has 9 levels ( 0 1 2 3 4 5 6 7)\n", "carc3 has 9 levels ( 0 1 2 3 4 5 6 7)\n", "conj has 14 levels ( acr ad c di empty h h im kacer ki ow pn w)\n", "neg has 6 levels ( >JN/ >L= BLJ/ L> MN->JN/)\n", "vstem has 17 levels (haf hif hit hof hsht htpa htpe nif pael pasq peal piel pual qal qalp shaf tif)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=7)\n", "\n", "carc = column_index['carc']\n", "\n", "for row in data:\n", " carc1 = ''\n", " carc2 = ''\n", " carc3 = ''\n", " code = str(row[carc])\n", " if code == '':\n", " error('Empty carc', nrows, code)\n", " if len(code) == 3 and code[0] == '2' and code[1:] not in {'00', '01'}:\n", " error('Strange carc in 200 range', nrows, code)\n", " if len(code) == 3 and code[0] == '2':\n", " code = str(row[carc+1])\n", " if code == '':\n", " carc1 = 'chain'\n", " elif code == '0':\n", " carc1 = 'txto'\n", " elif 10 <= int(code) <= 16:\n", " carc1 = 'rela'\n", " carc2 = code[1]\n", " elif 50 <= int(code) <= 74:\n", " carc1 = 'infc'\n", " elif len(code) == 2:\n", " error('Strange carc with two digits', nrows, code)\n", " elif code == '999':\n", " carc1 = 'q'\n", " else:\n", " (carc1,carc2,carc3) = (code[0], code[1], code[2])\n", " row[carc:carc+2] = (carc1, carc2, carc3)\n", " nrows += 1\n", " new_data.writerow(row)\n", "\n", "column_names[carc:carc+2] = ('carc1', 'carc2', 'carc3')\n", "stage_end()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generic levelling\n", "\n", "Now deal with several features of which we isolate known levels, strip those parts from the value, and leave the remainder in its original column.\n", "If there is no remainder in any column, may remove that column as well.\n", "In the feature spec below, set ``keep`` to ``False``." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "===BEGIN==STAGE 8=====\n", "Column names before:\n", "typ1, t1_wav, t1_alef, typ2strip-a, t2_BLJ, t2_EJN, t2_HNH, t2_JC, t2_OWD, t2_a, typ3strip-v, t3#v, state, k, dom_D, dom_N, dom_Q, dom_?, dom_emb, phrf, carc1, carc2, carc3, conj, neg, vstem, lex, vlabel, booknum, bookname, bookacro, chapter, verse, seqnum, clause\n", "Reading participle text data stage 8 (participia_compleet_r7.csv => participia_compleet_r8.csv) ...\n", " 9651 total rows x 112 header columns\n", " 9651 body rows x 112 actual columns (OK)\n", "===END====STAGE 8=====\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "typ1 has 15 levels (Aa Ab Ad Ak Am An Ap a b h k m n p r)\n", "t1_wav has 2 levels (0 1)\n", "t1_alef has 2 levels (0 1)\n", "typ2strip-a has 366 levels ( #c /directe/qa /hi /hi#a /hi#c /hi#s /hish /hitp /hitp#s /hitpolel /hitpolel#s /ho /ho#c /ni /ni#c /ni#s /ni#s_Ov /pi /pi#a ... 346 more)\n", "t2_BLJ has 2 levels (0 1)\n", "t2_EJN has 2 levels (0 1)\n", "t2_HNH has 2 levels (0 1)\n", "t2_JC has 2 levels (0 1)\n", "t2_OWD has 2 levels (0 1)\n", "t2_a has 2 levels (0 1)\n", "typ3strip-v has 26 levels ( : :/Hit3 :/qa :` :a :a/qa :a/qa1 :c :c/qa :e :s ; i i: i:a i:c n n: n:a ... 6 more)\n", "t3_D has 2 levels (0 1)\n", "t3_E has 2 levels (0 1)\n", "t3_Nhl has 2 levels (0 1)\n", "t3_Nl has 2 levels (0 1)\n", "t3_O has 2 levels (0 1)\n", "t3_PC has 2 levels (0 1)\n", "t3_S has 2 levels (0 1)\n", "t3_V has 2 levels (0 1)\n", "t3#v has 6 levels (0 1 2 3 4 5)\n", "state_:e has 2 levels (0 1)\n", "state_:c has 2 levels (0 1)\n", "state_:a has 2 levels (0 1)\n", "state_: has 1 levels (1)\n", "k_+K= has 2 levels (0 1)\n", "k_+K has 2 levels (0 1)\n", "k_#NAAM? has 2 levels (0 1)\n", "dom_D has 3 levels (0 0.5 1)\n", "dom_N has 3 levels (0 0.5 1)\n", "dom_Q has 3 levels (0 0.5 1)\n", "dom_? has 3 levels (0 0.5 1)\n", "dom_emb has 7 levels (1 2 3 4 5 6 7)\n", "phrf_VP has 2 levels (0 1)\n", "phrf_PrNP has 2 levels (0 1)\n", "phrf_PPrP has 2 levels (0 1)\n", "phrf_PP has 2 levels (0 1)\n", "phrf_NP has 2 levels (0 1)\n", "phrf_DPrP has 2 levels (0 1)\n", "phrf_AdvP has 2 levels (0 1)\n", "phrf_AdjP has 2 levels (0 1)\n", "carc1_txto has 2 levels (0 1)\n", "carc1_rela has 2 levels (0 1)\n", "carc1_q has 2 levels (0 1)\n", "carc1_infc has 2 levels (0 1)\n", "carc1_chain has 2 levels (0 1)\n", "carc1_8 has 2 levels (0 1)\n", "carc1_7 has 2 levels (0 1)\n", "carc1_6 has 2 levels (0 1)\n", "carc1_5 has 2 levels (0 1)\n", "carc1_4 has 2 levels (0 1)\n", "carc1_3 has 2 levels (0 1)\n", "carc1_1 has 2 levels (0 1)\n", "carc2_7 has 2 levels (0 1)\n", "carc2_6 has 2 levels (0 1)\n", "carc2_5 has 2 levels (0 1)\n", "carc2_4 has 2 levels (0 1)\n", "carc2_3 has 2 levels (0 1)\n", "carc2_2 has 2 levels (0 1)\n", "carc2_1 has 2 levels (0 1)\n", "carc2_0 has 2 levels (0 1)\n", "carc3_7 has 2 levels (0 1)\n", "carc3_6 has 2 levels (0 1)\n", "carc3_5 has 2 levels (0 1)\n", "carc3_4 has 2 levels (0 1)\n", "carc3_3 has 2 levels (0 1)\n", "carc3_2 has 2 levels (0 1)\n", "carc3_1 has 2 levels (0 1)\n", "carc3_0 has 2 levels (0 1)\n", "conj_w has 2 levels (0 1)\n", "conj_pn has 2 levels (0 1)\n", "conj_ow has 2 levels (0 1)\n", "conj_ki has 2 levels (0 1)\n", "conj_kacer has 2 levels (0 1)\n", "conj_im has 2 levels (0 1)\n", "conj_h has 2 levels (0 1)\n", "conj_h has 2 levels (0 1)\n", "conj_empty has 2 levels (0 1)\n", "conj_di has 2 levels (0 1)\n", "conj_c has 2 levels (0 1)\n", "conj_ad has 2 levels (0 1)\n", "conj_acr has 2 levels (0 1)\n", "neg_MN->JN/ has 2 levels (0 1)\n", "neg_L> has 2 levels (0 1)\n", "neg_BLJ/ has 2 levels (0 1)\n", "neg_>L= has 2 levels (0 1)\n", "neg_>JN/ has 2 levels (0 1)\n", "vstem_tif has 2 levels (0 1)\n", "vstem_shaf has 2 levels (0 1)\n", "vstem_qalp has 2 levels (0 1)\n", "vstem_qal has 2 levels (0 1)\n", "vstem_pual has 2 levels (0 1)\n", "vstem_piel has 2 levels (0 1)\n", "vstem_peal has 2 levels (0 1)\n", "vstem_pasq has 2 levels (0 1)\n", "vstem_pael has 2 levels (0 1)\n", "vstem_nif has 2 levels (0 1)\n", "vstem_htpe has 2 levels (0 1)\n", "vstem_htpa has 2 levels (0 1)\n", "vstem_hsht has 2 levels (0 1)\n", "vstem_hof has 2 levels (0 1)\n", "vstem_hit has 2 levels (0 1)\n", "vstem_hif has 2 levels (0 1)\n", "vstem_haf has 2 levels (0 1)\n", "lex has 906 levels (] [>CH ] [>L >XTH ] [*] [>DMTW ] [*
] [] [TMJD ] [*
] [] [*
] [HW> W-CLC M>WT H->JC ] [*
] [JM ] [*
] [L--SXR ] [*] [] [*] [] [*] [>NXNW ] [M-BJT_LXM JHWDH / PRJM ] [*] [B-->RY ] [*] [>LH ] [*] [CMJM W->RY ] [*] [CXR ] [] [*] [GDLWT <..>] [*] [GDLWT ] [*] [GDLWT ] [JN XQR ] [*] [GDLWT ] [B-MYRJM ] [*] [KL ] [*] [KL >LH ] ... 8964 more)\n" ] } ], "source": [ "stage_start(nr=8)\n", "\n", "features = collections.OrderedDict((\n", " ('vstem', {\n", " 'lvs': list(sorted('''qal qalp hif nif piel peal pual hit hof haf pael htpa hsht htpe pasq tif shaf'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('neg', {\n", " 'lvs': list(sorted('''>JN/ >L= BLJ/ L> MN->JN/'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('conj', {\n", " 'lvs': list(sorted('''acr ad c di empty h h im kacer ki ow pn w'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('carc3', {\n", " 'lvs': list(sorted('''0 1 2 3 4 5 6 7'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('carc2', {\n", " 'lvs': list(sorted('''0 1 2 3 4 5 6 7'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('carc1', {\n", " 'lvs': list(sorted('''1 3 4 5 6 7 8 chain infc q rela txto'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('phrf', {\n", " 'lvs': list(sorted('''AdjP AdvP DPrP NP PP PPrP PrNP VP'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('k', {\n", " 'lvs': list(sorted('''#NAAM? +K +K='''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('state', {\n", " 'lvs': list(sorted(''': :a :c :e'''.split(), reverse=True)), \n", " 'keep': False,\n", " }),\n", " ('typ3strip-v', {\n", " 'lvs': list(sorted('''V PC S O E D Nl Nhl'''.split())),\n", " 'keep': True,\n", " 'lvname': 't3',\n", " }),\n", "))\n", "\n", "colnums = []\n", "clevels = []\n", "keep = []\n", "for feat in (features):\n", " colnums.append(column_index[feat])\n", " clevels.append(features[feat]['lvs'])\n", " keep.append(features[feat]['keep'])\n", "\n", "for row in data:\n", " for (i, cn) in enumerate(colnums):\n", " val = row[cn]\n", " flags = []\n", " for lv in clevels[i]:\n", " if lv in val:\n", " if keep[i]: val = val.replace(lv, '')\n", " flag = 1\n", " else:\n", " flag = 0\n", " flags.append(flag)\n", " row[cn:cn+1] = ([val] if keep[i] else []) + flags\n", " new_data.writerow(row)\n", " nrows += 1\n", " \n", "for feat in features:\n", " cn = column_index[feat]\n", " keep = features[feat]['keep']\n", " lvname = features[feat].get('lvname', feat)\n", " lvs = features[feat]['lvs']\n", " column_names[cn:cn+1] = ([feat] if keep else []) + ['{}_{}'.format(lvname, x) for x in lvs]\n", "\n", "stage_end(last=True)\n", "\n", "#for feat in features:\n", "# if features[feat]['keep']: show_col(feat)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }