In [1]:
import pandas as pd
import spacy
import os
import re

In [2]:
vf_files = [
    'VF_Argonautica_1.csv',
    'VF_Argonautica_2.csv',
    'VF_Argonautica_3.csv',
    'VF_Argonautica_4.csv',
    'VF_Argonautica_5.csv',
    'VF_Argonautica_6.csv',
    'VF_Argonautica_7.csv',
    'VF_Argonautica_8.csv',
]

nlp = spacy.load('la_core_web_lg')

In [3]:
tables = []
cur_id = 7000

for file in vf_files:
    print(file)
    df = pd.read_csv(os.path.join('data', 'vf', file))

    speech_ids = []
    last_spkr = None
    for speaker in df.speaker:
        if pd.isna(speaker):
            id = None
        else:
            if speaker != last_spkr:
                cur_id += 1
            id = cur_id
        last_spkr = speaker
        speech_ids.append(id)
        
    df['speech'] = speech_ids
    df = df.loc[~df.speaker.isna()]
    df = df.loc[~df.perseus_text.isna()]
    df = df.loc[~df.pc_bounds.isna()]    
    df['speech'] = df['speech'].astype(int)
    df['file'] = file[:-4]
    df['line_id'] = df['file'] + ':' + df['perseus_n'].astype(str)
    df['speaker'] = df['speaker'].str.replace('[\[\]\']', '', regex=True)
    df['tokens'] = df['perseus_text'].apply(lambda s: [tok for tok in nlp(s)])

    tables.append(df)

df = pd.concat(tables)
df['line_id'] = pd.Categorical(df['line_id'], categories=pd.unique(df['line_id']), ordered=True)

VF_Argonautica_1.csv
VF_Argonautica_2.csv
VF_Argonautica_3.csv
VF_Argonautica_4.csv
VF_Argonautica_5.csv
VF_Argonautica_6.csv
VF_Argonautica_7.csv
VF_Argonautica_8.csv


In [4]:
def normalize(s):
    return re.sub(r'[^a-z]', '', s.lower()).replace('jv', 'iu')


def getElided(df):
    '''extract elided tokens from a line-array table'''
    rows = []
    row_count = 0
    
    for row in df.itertuples():
        elided = [False] * len(row.tokens)
        row_count += 1
        
        if 'SY' in row.pc_bounds:
            bounds = row.pc_bounds[1:-1].split(',')
            pc_toks = row.pc_text.split()
            no_punct = [tok for tok in row.tokens if tok.pos_ != 'PUNCT']
            
            if len(no_punct) == len(bounds):
                for bound, tok in zip(bounds, no_punct):
                    if 'SY' in bound:
                        idx = row.tokens.index(tok)
                        elided[idx] = True
            else:
                if len(bounds) == len(pc_toks):
                    for bound, pc_tok in zip(bounds, pc_toks):
                        if 'SY' in bound:
                            normalized_pc = normalize(pc_tok)
                            normalized_toks = [normalize(tok.text) for tok in row.tokens]
                            count = normalized_toks.count(normalized_pc)
                            if count == 1:
                                idx = normalized_toks.index(normalized_pc)
                            elif pc_tok.endswith('que') and normalized_toks.count('que') == 1:
                                idx = normalized_toks.index('que')
                            else:
                                print(f'[{row_count}]\t' + ' '.join([f'{i}.{tok.text}' for i, tok in enumerate(row.tokens)]))
                                print(' '.join(bounds))
                                idx = int(input(f'Which word is {pc_tok}? '))
                            elided[idx] = True
        rows.append(elided)
    return rows

In [5]:
# df['elided'] = getElided(df)

[24]	0.da 1.Scythiam 2.Phasim 3.que 4.mihi 5.; 6.tu 7.que 8., 9.innuba 10.Pallas 11.,
'CM'  'CM'  'CF'  'CM'  'SY'  'DI'  None


Which word is tuque,?  7


[31]	0.iam 1.iam 2.ego 3.et 4.inviti 5.torsissem 6.coniugis 7.ignem 8..
'SY'  'SY'  'DI'  'CM'  'DI'  'DI'  None


Which word is Iamiam?  1


[38]	0.non 1.iuvenem 2.in 3.casus 4.eademn 5.que 6.pericula 7.Acastum
'CM'  'SY'  'CM'  'CM'  'CF'  'SY'  None


Which word is iuuenem?  1


[99]	0.hanc 1.vero 2., 3.socii 4., 5.venientem 6.litore 7.laeti
'CM'  'CM'  'CM'  'SY'  'DI'  'DI'  None


Which word is uenientem?  5


[176]	0.inde 1.meae 2.quercus 3.tripodes 4.que 5.animae 6.que 7.parentum
'CF'  'CM'  'CM'  'SY'  'CF'  None


Which word is tripodesque?  4


[229]	0.ut 1.superum 2.sic 3.claret 4.opus 5., 6.tolli 7.que 8.vicissim
'CM'  'CM'  'DI'  'SY'  'CM'  'CF'  None


Which word is stare?  3


[231]	0.armorum 1.que 2.hominum 3.que 4.truces 5.consurgere 6.in 7.iras
'SY'  'CF'  'CM'  'SY'  'DI'  None


Which word is Armorumque?  1


[270]	0.tu 1.que 2., 3.excite 4.parens 5.umbris 6., 7.ut 8.nostra 9.videres
'SY'  'CF'  'CM'  'CM'  'DI'  'CF'  None


Which word is Tuque,?  1


[275]	0.ultrices 1.que 2.deae 3.Fas 4.que 5.et 6.grandaeva 7.Furorum
'CF'  'CM'  'SY'  'CM'  'CF'  None


Which word is Fasque?  4


[335]	0.iam 1.que 2.aderunt 3., 4.thalamis 5.que 6.tuis 7.Threissa 8.propinquat
'SY'  'CM'  'CF'  'CM'  'CF'  None


Which word is Iamque?  1


[385]	0.Vulcani 1.que 2.' 3.ait 4.' 5.ecce 6.domos 7.: 8.date 9.vina 10.preces 11.que 12.....
'SY'  'DI'  'CF'  'CM'  'DI'  'CF'  None


Which word is Vulcanique"?  1


[482]	0.vos 1.que 2., 3.viri 4., 5.optatos 6.huc 7.adfore 8.credite 9.Colchos 10.. 11.'
'CF'  'SY'  'CM'  'DI'  'DI'  'DI'  None


Which word is uiri,?  3


[512]	0.' 1.quos 2.fugitis 3.? 4.vellem 5.hac 6.equidem 7.me 8.strage 9.meos 10.que
'CM'  'CM'  'SY'  'CM'  'CM'  'DI'  'CF'  None


Which word is uellem?  4


[754]	0.iam 1.iam 2.aliae 3.vires 4.maiora 5.que 6.sanguine 7.nostro
'SY'  'CM'  'CM'  'DI'  'DI'  None


Which word is Iamiam?  1


[824]	0.saepe 1.Iovem 2.in 3.terras 4.Argiva 5.que 6.regna 7.Pelasgum
'CF'  'SY'  'CM'  'CM'  'DI'  'CF'  None


Which word is Iouem?  1


[895]	0.exspectata 1.manus 2.nostris 3.que 4.' 5.ait 6.' 7.agnita 8.votis 9..
'CF'  'CM'  'SY'  'DI'  'DI'  None


Which word is nostrisque"?  3


[971]	0.tecta 1.vides 2.: 3.illae 4.redeunt 5., 6.illae 7.aequore 8.certant 9..
'CF'  'CM'  None  'CM'  'CM'  'SY'  'DI'  None


Which word is illae?  3


[980]	0.iam 1.que 2.alio 3.clamore 4.ruont 5., 6.omnis 7.que 8.tenetur
'SY'  'CM'  'CF'  'CM'  'CF'  None


Which word is Iamque?  1


[1002]	0.lin 1.que 2.gravem 3.fluvium 4.et 5.miseris 6.sua 7.fata 8.colonis 9.:
'CF'  'CM'  'SY'  'CM'  'CM'  'DI'  'CF'  None


Which word is fluuium?  3


[1056]	0.vela 1.fretis 2.. 3.ilium 4.in 5.sanie 6.tabo 7.que 8.recenti
'CF'  'CM'  'SY'  'CM'  'CM'  'CF'  None


Which word is illum?  3


[1207]	0.te 1.que 2.alium 3., 4.quam 5.quem 6.Pelias 7.sperat 8.que 9.cupit 10.que 11.,
'SY'  'CM'  'DI'  'CM'  'CM'  'CF'  None


Which word is Teque?  1


[1210]	0.Ossa 1.dabat 2.Pindus 3.que 4.rates 5.quot 6.que 7.ante 8.secuti
'CF'  'CM'  'CF'  'CM'  'SY'  'CF'  None


Which word is quotque?  6


[1343]	0.hic 1.labor 2.amborum 3.que 4.haec 5.sunt 6.discrimina 7.fratrum 8..
'CM'  'DI'  'SY'  'DI'  'CM'  'DI'  None


Which word is amborum<que>?  3


[1383]	0.cuncta 1.tenens 2., 3.me 4.cum 5.omnis 6.amor 7., 8.iactura 9.que 10.plaustri
'CF'  'CM'  'SY'  'CF'  'CM'  'DI'  None


Which word is mecum?  4


[1391]	0.praedari 1.que 2.iuvat 3., 4.talem 5.que 6.hanc 7.accipe 8.dextram 9.. 10.'
'CF'  'CM'  'SY'  'DI'  'DI'  None


Which word is talemque?  5


[1662]	0.' 1.ipse 2.rogat 3.certe 4.me 5.que 6.ipse 7.implorat 8.Iason 9.?
'CF'  'CM'  'CM'  'SY'  'SY'  'CF'  None


Which word is ipse?  1


[1697]	0.coeperat 1.his 2.que 3.iterum 4.compellat 5.Iasona 6.dictis 7.:
'DI'  'SY'  'CM'  'CF'  'DI'  None


Which word is his<que>?  2


[1744]	0.sit 1.mihi 2.nocturnae 3.que 4.Hecates 5.— 6.nostri 7.que 8.vigoris 9.'
'CM'  'DI'  'SY'  'CM'  'CF'  None


Which word is nocturnaeque?  3


[1766]	0.sola 1.que 2.tantarum 3.virgo 4.haut 5.indigna 6.viarum
'DI'  'CM'  'SY'  'CM'  'CF'  None


Which word is uirgo?  3


[1776]	0.sidera 1.et 2.haec 3.te 4.me 5.que 6.vident 7.. 8.te 9.cum 10.aequora 11., 12.te 13.cum
'SY'  'DI'  'CM'  'DI'  'CF'  'CM'  'SY'  'DI'  None


Which word is tecum?  9


[1786]	0.dic 1.age 2.nunc 3., 4.utrum 5.vigilanti 6.hostem 7.que 8.videnti
'CM'  'DI'  'CM'  'CM'  'SY'  'CF'  None


Which word is uigilanti?  5


[1790]	0.orbe 1.voco 2.in 3.que 4.unum 5.iubeo 6.nunc 7.ire 8.draconem 9.,
'CF'  'SY'  'SY'  'CM'  'CM'  'DI'  'CF'  None


Which word is uoco?  1


[1829]	0.ipsa 1.fugit 2.tanto 3.que 4.( 5.nefas 6.) 7.ipsa 8.ardet 9.amore 10..
'CF'  'CM'  'CF'  'CM'  'SY'  'CF'  None


Which word is ipsa?  7


[1844]	0.Cyaneas 1.que 2.vocat 3., 4.memini 5.que 6., 7.o 8.Tiphy 9., 10.tuorum
'CF'  'CM'  'SY'  'DI'  'CF'  None


Which word is meminique,?  5


[1879]	0.me 1.cum 2.adsunt 3.. 4.magni 5.virgo 6.ne 7.regia 8.Solis
'SY'  'CM'  'CM'  'CM'  'DI'  'DI'  None


Which word is Mecum?  1


[1911]	0.te 1.que 2.simul 3.me 4.cum 5.ipsa 6.traham 7.; 8.non 9.sola 10.reposcor
'CF'  'CM'  'SY'  'CF'  'CM'  'DI'  'CF'  None


Which word is mecum?  4


In [6]:
df.to_csv(os.path.join('data','vf_scanned.csv'), index=False)
df

Unnamed: 0,comp,perseus_n,perseus_text,pc_n,pc_text,pc_bounds,elision,speaker,speech,file,line_id,tokens,elided
39,0.931818,40,"'hanc mihi militiam, veterum quae pulchrior ac...",40.0,"""Hanc mihi militiam, ueterum quae pulchrior ac...","['CM', 'DI', 'CM', 'CM', 'DI', 'DI', None]",0,Pelias,7001,VF_Argonautica_1,VF_Argonautica_1:40,"[', hanc, mihi, militiam, ,, veterum, quae, pu...","[False, False, False, False, False, False, Fal..."
40,1.000000,41,adnue daque animum. nostri de sanguine Phrixus,41.0,Adnue daque animum. nostri de sanguine Phrixus,"['DI', 'SY', 'CM', 'CM', 'DI', 'DI', None]",1,Pelias,7001,VF_Argonautica_1,VF_Argonautica_1:41,"[adnue, da, que, animum, ., nostri, de, sangui...","[False, False, True, False, False, False, Fals..."
41,1.000000,42,Cretheos ut patrias audis effugerit aras.,42.0,Cretheos ut patrias audis effugerit aras.,"['DI', 'CM', 'CM', 'CM', 'DI', None]",0,Pelias,7001,VF_Argonautica_1,VF_Argonautica_1:42,"[Cretheos, ut, patrias, audis, effugerit, aras...","[False, False, False, False, False, False, False]"
42,0.928571,43,"hunc ferus Aeetes, Scythiam Phasimque rigentem",43.0,"Hunc ferus Aeetes, Scythiam Phasinque rigentem","['CM', 'DI', 'CM', 'CM', 'CF', None]",0,Pelias,7001,VF_Argonautica_1,VF_Argonautica_1:43,"[hunc, ferus, Aeetes, ,, Scythiam, Phasim, que...","[False, False, False, False, False, False, Fal..."
43,0.926829,44,"qui colit (heu magni Solis pudor), hospita vina",44.0,"Qui colit (heu magni Solis pudor), hospita uina","['CM', 'DI', 'CM', 'CM', 'CM', 'DI', 'DI', None]",0,Pelias,7001,VF_Argonautica_1,VF_Argonautica_1:44,"[qui, colit, (, heu, magni, Solis, pudor, ), ,...","[False, False, False, False, False, False, Fal..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,0.954545,441,"nescio quid tuus iste pudor? mene, optime quondam",441.0,"Nescioquid tuus iste pudor? mene, optime quondam","['CM', 'DI', 'CF', 'CM', 'SY', 'DI', None]",1,Medea,7194,VF_Argonautica_8,VF_Argonautica_8:441,"[nescio, quid, tuus, iste, pudor, ?, mene, ,, ...","[False, False, False, False, False, False, Tru..."
444,1.000000,442,"Aesonide, me ferre preces et supplicis ora",442.0,"Aesonide, me ferre preces et supplicis ora","['CM', 'DI', 'CF', 'CM', 'DI', 'DI', None]",0,Medea,7194,VF_Argonautica_8,VF_Argonautica_8:442,"[Aesonide, ,, me, ferre, preces, et, supplicis...","[False, False, False, False, False, False, Fal..."
445,1.000000,443,fas erat? haud hoc nunc genitor putat aut dare...,443.0,Fas erat? haud hoc nunc genitor putat aut dare...,"['CM', 'DI', 'CM', 'DI', 'CM', 'CM', 'DI', 'CM...",0,Medea,7194,VF_Argonautica_8,VF_Argonautica_8:443,"[fas, erat, ?, haud, hoc, nunc, genitor, putat...","[False, False, False, False, False, False, Fal..."
446,1.000000,444,iam sceleris dominumque pati.' sic fata parantem,444.0,"Iam sceleris dominumque pati."" sic fata parantem","['CM', 'CM', 'CF', 'CM', 'DI', 'CF', None]",0,Medea,7194,VF_Argonautica_8,VF_Argonautica_8:444,"[iam, sceleris, dominum, que, pati, ., ', sic,...","[False, False, False, False, False, False, Fal..."


In [7]:
def getContext(token_table):
    df = (token_table
        .assign(lemma = token_table.lemma.str.lower())
        .groupby('line_id', as_index = False)
        .agg(line_id = ('line_id', 'first'), lemmas = ('lemma', list))
    )

    r_context = (
        pd.concat(
            pd.DataFrame(dict(
                line_id = df.iloc[:-i].line_id.values,
                lemmas = df.iloc[i:].lemmas.values,
            )) for i in range(1, 3))
        .groupby('line_id', as_index=False)
        .agg(r_context=('lemmas', lambda lems: sum(lems, [])))
    )

    l_context = (
        pd.concat(
            pd.DataFrame(dict(
                line_id = df.iloc[i:].line_id.values,
                lemmas = df.iloc[:-i].lemmas.values,
            )) for i in range(1, 3))
        .groupby('line_id', as_index=False)
        .agg(l_context=('lemmas', lambda lems: sum(lems, [])))
    )

    context = pd.merge(l_context, r_context, how='outer', on='line_id')
    context = pd.merge(df, context, on='line_id')
    context['context'] = context['l_context'] + context['lemmas'] + context['r_context']
    context = context.drop(columns=['l_context', 'lemmas', 'r_context'])

    return context

In [8]:
tables = []
for label, group in df.groupby('speech'):
    print(label)
    token_table = group.explode(['tokens', 'elided']).rename(columns={'tokens':'token'})
    fulltext = ' '.join(group.perseus_text)
    token_table['token'] = [tok for tok in nlp(fulltext)]
    token_table['lemma'] = [tok.lemma_ for tok in token_table.token]
    context = getContext(token_table)
    reps = pd.merge(token_table, context, how='left', on='line_id')[['lemma', 'context']]
    reps['lemma'] = reps['lemma'].str.lower()
    token_table['reps'] = reps.apply(lambda row: row['context'].count(row['lemma']), axis=1).values

    tables.append(token_table)

7001
7002
7003


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7004
7005
7006
7007


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7008
7009
7010
7011


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7012
7013
7014
7015


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7016
7017
7018


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7019
7020
7021


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7022
7023
7024


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7025
7026
7027
7028


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7029
7030
7031
7032


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7033
7034
7035
7036


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7037
7038
7039
7040


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7041
7042
7043


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7044
7045
7046
7047


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7048
7049
7050
7051


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7052
7053
7054
7055


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7056
7057
7058
7059


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7060
7061
7062


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7063
7064
7065
7066


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7067
7068


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7069
7070
7071


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7072
7073
7074
7075


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7076
7077
7078
7079


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7080
7081
7082
7083


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7084
7085
7086
7087


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7088
7089
7090
7091


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7092
7093
7094


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7095
7096
7097


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7098
7099
7100


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7101
7102
7103
7104


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7105
7106
7107
7108


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7109
7110
7111


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7112
7113
7114
7115


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7116
7117
7118
7119


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7120
7121


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7122
7123
7124
7125


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7126
7127
7128
7129
7130


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7131
7132


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(


7133
7134
7135
7136


  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7137
7138
7139
7140
7141


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7142
7143
7144
7145


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7146
7147
7148
7149
7150


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7151
7152
7153
7154


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7155
7156
7157
7158


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7159
7160


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7161
7162
7163


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7164
7165
7166
7167


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table


7168
7169
7170


  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7171
7172
7173


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7174
7175
7176


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7177
7178
7179
7180


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7181
7182
7183
7184


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7185
7186
7187
7188


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7189
7190
7191


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


7192
7193
7194
7195


  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(
  df = (token_table
  pd.concat(
  pd.concat(


In [9]:
token_table = pd.concat(tables)

token_table['upos'] = [tok.pos_ for tok in token_table.token]
token_table['morph'] = [tok.morph.to_dict() for tok in token_table.token]
token_table['mood'] = [morph.get('Mood') for morph in token_table.morph]
token_table['tense'] = [morph.get('Tense') for morph in token_table.morph]
token_table['voice'] = [morph.get('Voice') for morph in token_table.morph]
token_table['person'] = [morph.get('Person') for morph in token_table.morph]
token_table['number'] = [morph.get('Number') for morph in token_table.morph]
token_table['case'] = [morph.get('Case') for morph in token_table.morph]
token_table['gender'] = [morph.get('Gender') for morph in token_table.morph]
token_table = token_table.drop(columns=['morph', 'perseus_text', 'pc_text', 'pc_bounds'])
token_table = token_table.loc[token_table.upos != 'PUNCT']
token_table.to_csv(os.path.join('data', 'vf_tokens.csv'), index=False)