{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# ETCBC in R" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook exports the ETCBC database to an R data frame.\n", "The nodes are exported as rows; they correspond to the objects.\n", "The edges corresponding to the ETCBC features *mother*, *functional_parent*, and *distributional_parent* are\n", "exported as columns. For each row, such a column indicates the target of the corresponding outgoing edge.\n", "In the ETCBC data, objects have at most one outgoing edge of each type.\n", "\n", "Extra data, such as the lexicon, phonetic transcription, and ketiv-qere, is also included." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0.00s This is LAF-Fabric 4.5.6\n", "API reference: http://laf-fabric.readthedocs.org/en/latest/texts/API-reference.html\n", "Feature doc: https://shebanq.ancient-data.org/static/docs/featuredoc/texts/welcome.html\n", "\n" ] } ], "source": [ "import sys, collections\n", "%load_ext rpy2.ipython\n", "\n", "from laf.fabric import LafFabric\n", "import etcbc\n", "\n", "fabric = LafFabric()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0.00s LOADING API: please wait ... \n", " 0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56\n", " 0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21\n", " 1.07s LOGFILE=/Users/dirk/SURFdrive/laf-fabric-output/etcbc4b/hinr/__log__hinr.txt\n", " 1.07s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK hinr AT 2016-01-12T15-50-18\n" ] }, { "data": { "text/plain": [ "[('etcbc4',\n", " ['db.maxmonad',\n", " 'db.minmonad',\n", " 'db.monads',\n", " 'db.oid',\n", " 'db.otype',\n", " 'ft.code',\n", " 'ft.det',\n", " 'ft.dist',\n", " 'ft.dist_unit',\n", " 'ft.domain',\n", " 'ft.function',\n", " 'ft.g_cons',\n", " 'ft.g_cons_utf8',\n", " 'ft.g_lex',\n", " 'ft.g_lex_utf8',\n", " 'ft.g_nme',\n", " 'ft.g_nme_utf8',\n", " 'ft.g_pfm',\n", " 'ft.g_pfm_utf8',\n", " 'ft.g_prs',\n", " 'ft.g_prs_utf8',\n", " 'ft.g_uvf',\n", " 'ft.g_uvf_utf8',\n", " 'ft.g_vbe',\n", " 'ft.g_vbe_utf8',\n", " 'ft.g_vbs',\n", " 'ft.g_vbs_utf8',\n", " 'ft.g_word',\n", " 'ft.g_word_utf8',\n", " 'ft.gn',\n", " 'ft.is_root',\n", " 'ft.kind',\n", " 'ft.language',\n", " 'ft.lex',\n", " 'ft.lex_utf8',\n", " 'ft.ls',\n", " 'ft.mother_object_type',\n", " 'ft.nme',\n", " 'ft.nu',\n", " 'ft.number',\n", " 'ft.pdp',\n", " 'ft.pfm',\n", " 'ft.prs',\n", " 'ft.ps',\n", " 'ft.rela',\n", " 'ft.sp',\n", " 'ft.st',\n", " 'ft.tab',\n", " 'ft.trailer_utf8',\n", " 'ft.txt',\n", " 'ft.typ',\n", " 'ft.uvf',\n", " 'ft.vbe',\n", " 'ft.vbs',\n", " 'ft.vs',\n", " 'ft.vt',\n", " 'kq.g_qere_utf8',\n", " 'kq.qtrailer_utf8',\n", " 'lex.entry',\n", " 'lex.entry_heb',\n", " 'lex.entryid',\n", " 'lex.g_entry',\n", " 'lex.g_entry_heb',\n", " 'lex.gloss',\n", " 'lex.id',\n", " 'lex.lan',\n", " 'lex.nametype',\n", " 'lex.pos',\n", " 'lex.root',\n", " 'lex.subpos',\n", " 'ph.phono',\n", " 'ph.phono_sep',\n", " 'sft.book',\n", " 'sft.chapter',\n", " 'sft.label',\n", " 'sft.verse'])]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "API = fabric.load('etcbc4b', 'lexicon', 'hinr', {\n", " \"xmlids\": {\"node\": False, \"edge\": False},\n", " \"features\": ('''\n", "''',\"\"),\n", " \"primary\": False,\n", "})\n", "\n", "API['F_all']" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " 0.00s LOADING API: please wait ... \n", " 0.00s INFO: USING DATA COMPILED AT: 2015-11-02T15-08-56\n", " 0.00s INFO: USING DATA COMPILED AT: 2015-11-03T06-44-21\n", " 17s INFO: DATA LOADED FROM SOURCE etcbc4b AND ANNOX lexicon FOR TASK hinr AT 2016-01-12T15-50-51\n" ] } ], "source": [ "all_features = [x.split('.', 1)[1] for x in API['F_all'][0][1]]\n", "all_feature_str = ' '.join(all_features)\n", "\n", "API = fabric.load_again({\n", " \"xmlids\": {\"node\": False, \"edge\": False},\n", " \"features\": (all_feature_str,\"\"),\n", " \"primary\": False,\n", "})\n", "exec(fabric.localnames.format(var='fabric'))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "56m 22s 100000 nodes written\n", "56m 29s 200000 nodes written\n", "56m 36s 300000 nodes written\n", "56m 43s 400000 nodes written\n", "56m 50s 500000 nodes written\n", "56m 57s 600000 nodes written\n", "57m 04s 700000 nodes written\n", "57m 10s 800000 nodes written\n", "57m 18s 900000 nodes written\n", "57m 26s 1000000 nodes written\n", "57m 32s 1100000 nodes written\n", "57m 40s 1200000 nodes written\n", "57m 47s 1300000 nodes written\n", "57m 54s 1400000 nodes written\n", "57m 57s 1436858 nodes written and done\n" ] } ], "source": [ "hr = open('/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.txt', 'w')\n", "use_features = all_features\n", "# use_features = ['oid'] + all_features[60:77]\n", "hr.write('{}\\n'.format('\\t'.join(use_features)))\n", "chunk_size = 100000\n", "i = 0\n", "s = 0\n", "for n in NN():\n", " all_values = [F.item[x].v(n) for x in use_features]\n", " hr.write('{}\\n'.format(('\\t'.join(x or '' for x in all_values)).replace('\\n','')))\n", " i += 1\n", " s += 1\n", " if s == chunk_size:\n", " s = 0\n", " msg('{:>7} nodes written'.format(i))\n", "hr.close()\n", "msg('{:>7} nodes written and done'.format(i))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now read the data in R and save it in the compact .rds format.\n", "\n", "Note that we have to tell `read.table` to ignore quote and comment characters: they occur in the text data and must not be interpreted." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1] 1436858 76\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%R\n", "etcbc = read.table(\n", " '/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.txt', \n", " sep=\"\\t\", \n", " header=TRUE, \n", " comment.char=\"\",\n", " quote=\"\",\n", " as.is = TRUE\n", ")\n", "dim(etcbc)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%%R\n", "saveRDS(\n", " object=etcbc, \n", " file='/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.rds'\n", ")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 565792\r\n", "-rw-r--r-- 1 dirk staff 43M Jan 12 17:53 etcbc4b.rds\r\n", "-rw-r--r-- 1 dirk staff 233M Jan 12 17:48 etcbc4b.txt\r\n" ] } ], "source": [ "!ls -lh /Users/dirk/SURFdrive/laf-fabric-data/r" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Now check how fast this loads: it takes roughly half the time of the `read.table` call above." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[1] 1436858 76\n" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%R\n", "etcbc = readRDS(\n", " file='/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.rds'\n", ")\n", "dim(etcbc)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Copy it to the GitHub directory." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "!cp '/Users/dirk/SURFdrive/laf-fabric-data/r/etcbc4b.rds' '/Users/dirk/SURFdrive/current/demos/github/laf-fabric-data/etcbc4b.rds'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The result is in the [laf-fabric-data GitHub repository](https://github.com/ETCBC/laf-fabric-data)." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }