{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Kosti\u0107, \u0110. 1999. Frekvencijski re\u010dnik savremenog srpskog jezika (Frequency Dictionary of Contemporary Serbian Language). Institute for Experimental Phonetics and Speech Pathology & Laboratory of Experimental Psychology, University of Belgrade, Serbia.\n", "\n", "Baayen, R. H., Milin, P., Filipovi\u0107 \u0110ur\u0111evi\u0107, D., Hendrix, P. and Marelli, M. 2011. \"An amorphous model for morphological processing in visual comprehension based on naive discriminative learning.\" *Psychological Review* 118:438-482.\n" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import pandas.rpy.common as com\n", "import numpy as np\n", "from sklearn.feature_extraction import DictVectorizer\n", "\n", "%load_ext autoreload\n", "%autoreload 2\n", "\n", "%load_ext rmagic\n", "\n", "%precision 2" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 1, "text": [ "u'%.2f'" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "from ndl import *" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "%%R\n", "library(ndl)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "display_data", "text": [ "This is ndl version 0.2.16. \n", "For an overview of the package, type 'help(\"ndl.package\")'.\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "serbian = com.load_data('serbian')\n", "serbian['Cues'] = orthoCoding(serbian.WordForm,grams=2)\n", "serbian['Outcomes'] = [tuple(c.split('_')) for c in serbian.LemmaCase]\n", "serbian.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | WordForm | \n", "LemmaCase | \n", "Frequency | \n", "Cues | \n", "Outcomes | \n", "
---|---|---|---|---|---|
1 | \n", "yena | \n", "yena_nom_Sg | \n", "576 | \n", "(#y, ye, en, na, a#) | \n", "(yena, nom, Sg) | \n", "
2 | \n", "yene | \n", "yena_gen_Sg | \n", "229 | \n", "(#y, ye, en, ne, e#) | \n", "(yena, gen, Sg) | \n", "
3 | \n", "yeni | \n", "yena_dat_Sg | \n", "55 | \n", "(#y, ye, en, ni, i#) | \n", "(yena, dat, Sg) | \n", "
4 | \n", "yenu | \n", "yena_acc_Sg | \n", "167 | \n", "(#y, ye, en, nu, u#) | \n", "(yena, acc, Sg) | \n", "
5 | \n", "yenom | \n", "yena_ins_Sg | \n", "39 | \n", "(#y, ye, en, no, om, m#) | \n", "(yena, ins, Sg) | \n", "
5 rows \u00d7 5 columns
\n", "\n", " | Pl | \n", "Sg | \n", "acc | \n", "akademija | \n", "aparat | \n", "bitka | \n", "bog | \n", "boja | \n", "bol | \n", "bor | \n", "borac | \n", "brazda | \n", "brdo | \n", "brid | \n", "briga | \n", "brigada | \n", "brod | \n", "bura | \n", "cena | \n", "cesta | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
#a | \n", "-0.467714 | \n", "0.952442 | \n", "0.326035 | \n", "0.471536 | \n", "0.506614 | \n", "-0.001006 | \n", "-0.032020 | \n", "0.037598 | \n", "-0.028174 | \n", "0.015433 | \n", "-0.011195 | \n", "0.006800 | \n", "0.003051 | \n", "0.000134 | \n", "0.015739 | \n", "-0.039569 | \n", "0.017378 | \n", "-0.002766 | \n", "0.030046 | \n", "-0.052284 | \n", "... | \n", "
#b | \n", "-0.069955 | \n", "0.556007 | \n", "0.095385 | \n", "-0.005611 | \n", "-0.012985 | \n", "0.079436 | \n", "0.121653 | \n", "0.066181 | \n", "0.254183 | \n", "0.097360 | \n", "0.109091 | \n", "0.027724 | \n", "-0.003189 | \n", "0.009077 | \n", "0.036710 | \n", "0.022223 | \n", "0.106195 | \n", "0.000795 | \n", "-0.000225 | \n", "-0.019009 | \n", "... | \n", "
#c | \n", "-0.099439 | \n", "0.585776 | \n", "0.110130 | \n", "0.002622 | \n", "-0.021386 | \n", "0.010436 | \n", "-0.004947 | \n", "0.016928 | \n", "-0.016833 | \n", "-0.005711 | \n", "-0.021122 | \n", "-0.009816 | \n", "0.002113 | \n", "-0.007284 | \n", "0.023586 | \n", "-0.012067 | \n", "0.007555 | \n", "0.000627 | \n", "0.577554 | \n", "0.344233 | \n", "... | \n", "
#d | \n", "-0.017994 | \n", "0.500962 | \n", "0.077627 | \n", "-0.014584 | \n", "-0.004215 | \n", "-0.002597 | \n", "0.015613 | \n", "-0.035485 | \n", "0.024504 | \n", "0.006509 | \n", "0.007348 | \n", "-0.012459 | \n", "-0.045132 | \n", "-0.002656 | \n", "0.027050 | \n", "-0.018789 | \n", "0.013864 | \n", "0.000444 | \n", "-0.025700 | \n", "0.006292 | \n", "... | \n", "
#e | \n", "-0.465341 | \n", "0.925997 | \n", "-0.093245 | \n", "0.038236 | \n", "-0.056175 | \n", "0.024248 | \n", "0.112225 | \n", "-0.042413 | \n", "-0.031564 | \n", "-0.030352 | \n", "-0.037280 | \n", "-0.028162 | \n", "0.002051 | \n", "0.010243 | \n", "0.078581 | \n", "0.000173 | \n", "-0.073940 | \n", "0.000415 | \n", "-0.023138 | \n", "0.004618 | \n", "... | \n", "
5 rows \u00d7 278 columns
\n", "\n", " | WordForm | \n", "LemmaCase | \n", "Frequency | \n", "Cues | \n", "Outcomes | \n", "Predicted | \n", "
---|---|---|---|---|---|---|
1 | \n", "yena | \n", "yena_nom_Sg | \n", "576 | \n", "(#y, ye, en, na, a#) | \n", "(yena, nom, Sg) | \n", "(yena, nom, Sg) | \n", "
2 | \n", "yene | \n", "yena_gen_Sg | \n", "229 | \n", "(#y, ye, en, ne, e#) | \n", "(yena, gen, Sg) | \n", "(yena, nom, Pl) | \n", "
3 | \n", "yeni | \n", "yena_dat_Sg | \n", "55 | \n", "(#y, ye, en, ni, i#) | \n", "(yena, dat, Sg) | \n", "(yena, nom, Sg) | \n", "
4 | \n", "yenu | \n", "yena_acc_Sg | \n", "167 | \n", "(#y, ye, en, nu, u#) | \n", "(yena, acc, Sg) | \n", "(yena, acc, Sg) | \n", "
5 | \n", "yenom | \n", "yena_ins_Sg | \n", "39 | \n", "(#y, ye, en, no, om, m#) | \n", "(yena, ins, Sg) | \n", "(yena, ins, Sg) | \n", "
6 | \n", "yeni | \n", "yena_loc_Sg | \n", "16 | \n", "(#y, ye, en, ni, i#) | \n", "(yena, loc, Sg) | \n", "(yena, nom, Sg) | \n", "
7 | \n", "yene | \n", "yena_nom_Pl | \n", "415 | \n", "(#y, ye, en, ne, e#) | \n", "(yena, nom, Pl) | \n", "(yena, nom, Pl) | \n", "
8 | \n", "yena | \n", "yena_gen_Pl | \n", "336 | \n", "(#y, ye, en, na, a#) | \n", "(yena, gen, Pl) | \n", "(yena, nom, Sg) | \n", "
9 | \n", "yenama | \n", "yena_dat_Pl | \n", "33 | \n", "(#y, ye, en, na, am, ma, a#) | \n", "(yena, dat, Pl) | \n", "(yena, loc, Pl) | \n", "
10 | \n", "yene | \n", "yena_acc_Pl | \n", "136 | \n", "(#y, ye, en, ne, e#) | \n", "(yena, acc, Pl) | \n", "(yena, nom, Pl) | \n", "
11 | \n", "yenama | \n", "yena_ins_Pl | \n", "24 | \n", "(#y, ye, en, na, am, ma, a#) | \n", "(yena, ins, Pl) | \n", "(yena, loc, Pl) | \n", "
12 | \n", "yenama | \n", "yena_loc_Pl | \n", "4 | \n", "(#y, ye, en, na, am, ma, a#) | \n", "(yena, loc, Pl) | \n", "(yena, loc, Pl) | \n", "
13 | \n", "yeqa | \n", "yeqa_nom_Sg | \n", "179 | \n", "(#y, ye, eq, qa, a#) | \n", "(yeqa, nom, Sg) | \n", "(yeqa, nom, Sg) | \n", "
14 | \n", "yeqe | \n", "yeqa_gen_Sg | \n", "54 | \n", "(#y, ye, eq, qe, e#) | \n", "(yeqa, gen, Sg) | \n", "(yeqa, gen, Sg) | \n", "
15 | \n", "yeqi | \n", "yeqa_dat_Sg | \n", "7 | \n", "(#y, ye, eq, qi, i#) | \n", "(yeqa, dat, Sg) | \n", "(yeqa, loc, Sg) | \n", "
16 | \n", "yequ | \n", "yeqa_acc_Sg | \n", "95 | \n", "(#y, ye, eq, qu, u#) | \n", "(yeqa, acc, Sg) | \n", "(yeqa, acc, Sg) | \n", "
17 | \n", "yeqom | \n", "yeqa_ins_Sg | \n", "30 | \n", "(#y, ye, eq, qo, om, m#) | \n", "(yeqa, ins, Sg) | \n", "(yeqa, ins, Sg) | \n", "
18 | \n", "yeqi | \n", "yeqa_loc_Sg | \n", "43 | \n", "(#y, ye, eq, qi, i#) | \n", "(yeqa, loc, Sg) | \n", "(yeqa, loc, Sg) | \n", "
19 | \n", "yeqe | \n", "yeqa_nom_Pl | \n", "102 | \n", "(#y, ye, eq, qe, e#) | \n", "(yeqa, nom, Pl) | \n", "(yeqa, gen, Sg) | \n", "
20 | \n", "yeqa | \n", "yeqa_gen_Pl | \n", "164 | \n", "(#y, ye, eq, qa, a#) | \n", "(yeqa, gen, Pl) | \n", "(yeqa, nom, Sg) | \n", "
21 | \n", "yeqama | \n", "yeqa_dat_Pl | \n", "3 | \n", "(#y, ye, eq, qa, am, ma, a#) | \n", "(yeqa, dat, Pl) | \n", "(yeqa, loc, Pl) | \n", "
22 | \n", "yeqe | \n", "yeqa_acc_Pl | \n", "84 | \n", "(#y, ye, eq, qe, e#) | \n", "(yeqa, acc, Pl) | \n", "(yeqa, gen, Sg) | \n", "
23 | \n", "yeqama | \n", "yeqa_ins_Pl | \n", "14 | \n", "(#y, ye, eq, qa, am, ma, a#) | \n", "(yeqa, ins, Pl) | \n", "(yeqa, loc, Pl) | \n", "
24 | \n", "yeqama | \n", "yeqa_loc_Pl | \n", "7 | \n", "(#y, ye, eq, qa, am, ma, a#) | \n", "(yeqa, loc, Pl) | \n", "(yeqa, loc, Pl) | \n", "
25 | \n", "yivot | \n", "yivot_nom_Sg | \n", "991 | \n", "(#y, yi, iv, vo, ot, t#) | \n", "(yivot, nom, Sg) | \n", "(yivot, nom, Sg) | \n", "
26 | \n", "yivota | \n", "yivot_gen_Sg | \n", "1004 | \n", "(#y, yi, iv, vo, ot, ta, a#) | \n", "(yivot, gen, Sg) | \n", "(yivot, gen, Sg) | \n", "
27 | \n", "yivotu | \n", "yivot_dat_Sg | \n", "100 | \n", "(#y, yi, iv, vo, ot, tu, u#) | \n", "(yivot, dat, Sg) | \n", "(yivot, loc, Sg) | \n", "
28 | \n", "yivot | \n", "yivot_acc_Sg | \n", "799 | \n", "(#y, yi, iv, vo, ot, t#) | \n", "(yivot, acc, Sg) | \n", "(yivot, nom, Sg) | \n", "
29 | \n", "yivotom | \n", "yivot_ins_Sg | \n", "142 | \n", "(#y, yi, iv, vo, ot, to, om, m#) | \n", "(yivot, ins, Sg) | \n", "(yivot, ins, Sg) | \n", "
30 | \n", "yivotu | \n", "yivot_loc_Sg | \n", "248 | \n", "(#y, yi, iv, vo, ot, tu, u#) | \n", "(yivot, loc, Sg) | \n", "(yivot, loc, Sg) | \n", "
31 | \n", "yivoti | \n", "yivot_nom_Pl | \n", "22 | \n", "(#y, yi, iv, vo, ot, ti, i#) | \n", "(yivot, nom, Pl) | \n", "(yivot, gen, Sg) | \n", "
32 | \n", "yivota | \n", "yivot_gen_Pl | \n", "30 | \n", "(#y, yi, iv, vo, ot, ta, a#) | \n", "(yivot, gen, Pl) | \n", "(yivot, gen, Sg) | \n", "
33 | \n", "yivotima | \n", "yivot_dat_Pl | \n", "3 | \n", "(#y, yi, iv, vo, ot, ti, im, ma, a#) | \n", "(yivot, dat, Pl) | \n", "(yivot, ins, Pl) | \n", "
34 | \n", "yivote | \n", "yivot_acc_Pl | \n", "52 | \n", "(#y, yi, iv, vo, ot, te, e#) | \n", "(yivot, acc, Pl) | \n", "(yivot, gen, Sg) | \n", "
35 | \n", "yivotima | \n", "yivot_ins_Pl | \n", "5 | \n", "(#y, yi, iv, vo, ot, ti, im, ma, a#) | \n", "(yivot, ins, Pl) | \n", "(yivot, ins, Pl) | \n", "
36 | \n", "yivotima | \n", "yivot_loc_Pl | \n", "2 | \n", "(#y, yi, iv, vo, ot, ti, im, ma, a#) | \n", "(yivot, loc, Pl) | \n", "(yivot, ins, Pl) | \n", "
37 | \n", "{etwa | \n", "{etwa_nom_Sg | \n", "33 | \n", "(#{, {e, et, tw, wa, a#) | \n", "({etwa, nom, Sg) | \n", "({etwa, gen, Sg) | \n", "
38 | \n", "{etwe | \n", "{etwa_gen_Sg | \n", "10 | \n", "(#{, {e, et, tw, we, e#) | \n", "({etwa, gen, Sg) | \n", "({etwa, nom, Sg) | \n", "
39 | \n", "{etwi | \n", "{etwa_dat_Sg | \n", "1 | \n", "(#{, {e, et, tw, wi, i#) | \n", "({etwa, dat, Sg) | \n", "({etwa, nom, Pl) | \n", "
40 | \n", "{etwu | \n", "{etwa_acc_Sg | \n", "29 | \n", "(#{, {e, et, tw, wu, u#) | \n", "({etwa, acc, Sg) | \n", "({etwa, loc, Sg) | \n", "
41 | \n", "{etwom | \n", "{etwa_ins_Sg | \n", "5 | \n", "(#{, {e, et, tw, wo, om, m#) | \n", "({etwa, ins, Sg) | \n", "({etwa, ins, Sg) | \n", "
42 | \n", "{etwi | \n", "{etwa_loc_Sg | \n", "12 | \n", "(#{, {e, et, tw, wi, i#) | \n", "({etwa, loc, Sg) | \n", "({etwa, nom, Pl) | \n", "
43 | \n", "{etwe | \n", "{etwa_nom_Pl | \n", "6 | \n", "(#{, {e, et, tw, we, e#) | \n", "({etwa, nom, Pl) | \n", "({etwa, nom, Sg) | \n", "
44 | \n", "{etwi | \n", "{etwa_gen_Pl | \n", "5 | \n", "(#{, {e, et, tw, wi, i#) | \n", "({etwa, gen, Pl) | \n", "({etwa, nom, Pl) | \n", "
45 | \n", "{etwama | \n", "{etwa_dat_Pl | \n", "1 | \n", "(#{, {e, et, tw, wa, am, ma, a#) | \n", "({etwa, dat, Pl) | \n", "({etwa, ins, Pl) | \n", "
46 | \n", "{etwe | \n", "{etwa_acc_Pl | \n", "11 | \n", "(#{, {e, et, tw, we, e#) | \n", "({etwa, acc, Pl) | \n", "({etwa, nom, Sg) | \n", "
47 | \n", "{etwama | \n", "{etwa_ins_Pl | \n", "2 | \n", "(#{, {e, et, tw, wa, am, ma, a#) | \n", "({etwa, ins, Pl) | \n", "({etwa, ins, Pl) | \n", "
48 | \n", "{etwama | \n", "{etwa_loc_Pl | \n", "2 | \n", "(#{, {e, et, tw, wa, am, ma, a#) | \n", "({etwa, loc, Pl) | \n", "({etwa, ins, Pl) | \n", "
49 | \n", "{irina | \n", "{irina_nom_Sg | \n", "16 | \n", "(#{, {i, ir, ri, in, na, a#) | \n", "({irina, nom, Sg) | \n", "({irina, gen, Sg) | \n", "
50 | \n", "{irine | \n", "{irina_gen_Sg | \n", "28 | \n", "(#{, {i, ir, ri, in, ne, e#) | \n", "({irina, gen, Sg) | \n", "({irina, acc, Pl) | \n", "
51 | \n", "{irini | \n", "{irina_dat_Sg | \n", "3 | \n", "(#{, {i, ir, ri, in, ni, i#) | \n", "({irina, dat, Sg) | \n", "({irina, loc, Sg) | \n", "
52 | \n", "{irinu | \n", "{irina_acc_Sg | \n", "17 | \n", "(#{, {i, ir, ri, in, nu, u#) | \n", "({irina, acc, Sg) | \n", "({irina, acc, Sg) | \n", "
53 | \n", "{irinom | \n", "{irina_ins_Sg | \n", "20 | \n", "(#{, {i, ir, ri, in, no, om, m#) | \n", "({irina, ins, Sg) | \n", "({irina, ins, Sg) | \n", "
54 | \n", "{irini | \n", "{irina_loc_Sg | \n", "17 | \n", "(#{, {i, ir, ri, in, ni, i#) | \n", "({irina, loc, Sg) | \n", "({irina, loc, Sg) | \n", "
55 | \n", "{irine | \n", "{irina_nom_Pl | \n", "11 | \n", "(#{, {i, ir, ri, in, ne, e#) | \n", "({irina, nom, Pl) | \n", "({irina, acc, Pl) | \n", "
56 | \n", "{irina | \n", "{irina_gen_Pl | \n", "12 | \n", "(#{, {i, ir, ri, in, na, a#) | \n", "({irina, gen, Pl) | \n", "({irina, gen, Sg) | \n", "
57 | \n", "{irinama | \n", "{irina_dat_Pl | \n", "2 | \n", "(#{, {i, ir, ri, in, na, am, ma, a#) | \n", "({irina, dat, Pl) | \n", "({irina, loc, Pl) | \n", "
58 | \n", "{irine | \n", "{irina_acc_Pl | \n", "23 | \n", "(#{, {i, ir, ri, in, ne, e#) | \n", "({irina, acc, Pl) | \n", "({irina, acc, Pl) | \n", "
59 | \n", "{irinama | \n", "{irina_ins_Pl | \n", "2 | \n", "(#{, {i, ir, ri, in, na, am, ma, a#) | \n", "({irina, ins, Pl) | \n", "({irina, loc, Pl) | \n", "
60 | \n", "{irinama | \n", "{irina_loc_Pl | \n", "3 | \n", "(#{, {i, ir, ri, in, na, am, ma, a#) | \n", "({irina, loc, Pl) | \n", "({irina, loc, Pl) | \n", "
\n", " | ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
3240 rows \u00d7 6 columns
\n", "