{ "cells": [ { "cell_type": "markdown", "id": "369fa1a6-8a7b-4a42-8306-f988749d8600", "metadata": {}, "source": [ "# Pandas\n", "\n", "Export a Text-Fabric (TF) dataset as a Pandas dataframe." ] }, { "cell_type": "code", "execution_count": 1, "id": "31d95bcb-8d06-45d1-8fd4-99692a2c6e4c", "metadata": { "tags": [] }, "outputs": [], "source": [ "from tf.app import use" ] }, { "cell_type": "code", "execution_count": 2, "id": "c5e11055-74ca-4357-ba87-e5c32b05c18d", "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "**Locating corpus resources ...**" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "app: ~/github/CLARIAH/wp6-ferdinandhuyck/app" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/CLARIAH/wp6-ferdinandhuyck/tf/0.1" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " Text-Fabric: Text-Fabric API 11.2.3, CLARIAH/wp6-ferdinandhuyck/app v3, Search Reference
\n", " Data: CLARIAH - wp6-ferdinandhuyck 0.1, Character table, Feature docs
\n", "
Node types\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", 
"\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " \n", " \n", "\n", "
Name# of nodes# slots/node% coverage
text1218025.00100
body1218018.00100
div425190.90100
chapter444963.18100
fileDesc1299.000
editionStmt1268.000
p372558.0499
chunk383356.93100
lg4123.340
ebook121.000
pod121.000
note920.890
sourceDesc116.000
bibl213.000
revisionDesc112.000
q279.040
head868.400
titleStmt18.000
l1227.800
interpGrp17.000
change26.000
publicationStmt15.000
title35.000
item24.000
hi6023.501
author33.000
imprint23.000
encodingDesc12.000
notesStmt12.000
order22.000
availability31.670
name2681.210
idno91.110
blurb21.000
colofon21.000
date41.000
figure51.000
interp71.000
price21.000
pubPlace21.000
publisher21.000
respStmt21.000
titlepage21.000
xptr51.000
word2183801.00100
\n", " Sets: no custom sets
\n", " Features:
\n", "
CLARIAH - wp6-ferdinandhuyck\n", "
\n", "\n", "
\n", "
\n", "after\n", "
\n", "
str
\n", "\n", " the text after a word till the next word\n", "\n", "
\n", "\n", "
\n", "
\n", "chapter\n", "
\n", "
str
\n", "\n", " name of chapter\n", "\n", "
\n", "\n", "
\n", "
\n", "chunk\n", "
\n", "
int
\n", "\n", " number of a chunk within a file\n", "\n", "
\n", "\n", "
\n", "
\n", "curr\n", "
\n", "
str
\n", "\n", " this is TEI attribute curr\n", "\n", "
\n", "\n", "
\n", "
\n", "empty\n", "
\n", "
int
\n", "\n", " whether a slot has been inserted in an empty element\n", "\n", "
\n", "\n", "
\n", "
\n", "empty_lb\n", "
\n", "
int
\n", "\n", " empty TEI element lb follows\n", "\n", "
\n", "\n", "
\n", "
\n", "empty_link\n", "
\n", "
int
\n", "\n", " empty TEI element link follows\n", "\n", "
\n", "\n", "
\n", "
\n", "empty_pb\n", "
\n", "
int
\n", "\n", " empty TEI element pb follows\n", "\n", "
\n", "\n", "
\n", "
\n", "empty_pb_n\n", "
\n", "
str
\n", "\n", " TEI attribute n of empty element pb\n", "\n", "
\n", "\n", "
\n", "
\n", "is_meta\n", "
\n", "
str
\n", "\n", " whether a slot or word is in the teiHeader element\n", "\n", "
\n", "\n", "
\n", "
\n", "is_note\n", "
\n", "
str
\n", "\n", " whether a slot or word is in the note element\n", "\n", "
\n", "\n", "
\n", "
\n", "n\n", "
\n", "
str
\n", "\n", " this is TEI attribute n\n", "\n", "
\n", "\n", "
\n", "
\n", "otype\n", "
\n", "
str
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "place\n", "
\n", "
str
\n", "\n", " this is TEI attribute place\n", "\n", "
\n", "\n", "
\n", "
\n", "rend\n", "
\n", "
str
\n", "\n", " this is TEI attribute rend\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_1tab\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as 1tab\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_b\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as b\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_bq\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as bq\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_h2\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as h2\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_h3\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as h3\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_h4\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as h4\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_i\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as i\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_sc\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as sc\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_spat\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as spat\n", "\n", "
\n", "\n", "
\n", "
\n", "rend_sup\n", "
\n", "
int
\n", "\n", " whether text is to be rendered as sup\n", "\n", "
\n", "\n", "
\n", "
\n", "str\n", "
\n", "
str
\n", "\n", " the text of a word\n", "\n", "
\n", "\n", "
\n", "
\n", "to\n", "
\n", "
str
\n", "\n", " this is TEI attribute to\n", "\n", "
\n", "\n", "
\n", "
\n", "type\n", "
\n", "
str
\n", "\n", " this is TEI attribute type\n", "\n", "
\n", "\n", "
\n", "
\n", "value\n", "
\n", "
str
\n", "\n", " this is TEI attribute value\n", "\n", "
\n", "\n", "
\n", "
\n", "oslots\n", "
\n", "
none
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Text-Fabric API: names N F E L T S C TF directly usable

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "A = use(\"CLARIAH/wp6-ferdinandhuyck:clone\", checkout=\"clone\", hoist=globals())" ] }, { "cell_type": "code", "execution_count": 3, "id": "a524e056-cbf5-456b-a282-e5274a745354", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "218538" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "c1 = F.otype.s(\"chunk\")[100]\n", "c1" ] }, { "cell_type": "code", "execution_count": 4, "id": "d9765073-3717-4925-a20f-e822070f52d1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
&version=0.1\" title=\"Show this on the website\" sec=\"Tweede hoofdstuk.@-2\">Tweede hoofdstuk.@-2  Waarin men lezen zal, wat in en voor de herberg te Zoest voorviel.\n", "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "A.plain(c1)" ] }, { "cell_type": "code", "execution_count": 5, "id": "6bdd7ec1-ef38-4cd0-b65d-32a972a9ba55", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0.00s Create tsv file ...\n", " | 0.17s 5% 11363 nodes written\n", " | 0.33s 10% 22726 nodes written\n", " | 0.49s 15% 34089 nodes written\n", " | 0.65s 20% 45452 nodes written\n", " | 0.81s 25% 56815 nodes written\n", " | 0.97s 30% 68178 nodes written\n", " | 1.13s 35% 79541 nodes written\n", " | 1.29s 40% 90904 nodes written\n", " | 1.45s 45% 102267 nodes written\n", " | 1.61s 50% 113630 nodes written\n", " | 1.77s 55% 124993 nodes written\n", " | 1.93s 60% 136356 nodes written\n", " | 2.09s 65% 147719 nodes written\n", " | 2.25s 70% 159082 nodes written\n", " | 2.41s 75% 170445 nodes written\n", " | 2.57s 80% 181808 nodes written\n", " | 2.73s 85% 193171 nodes written\n", " | 2.89s 90% 204534 nodes written\n", " | 3.05s 95% 215897 nodes written\n", " | 3.22s 95% 227255 nodes written and done\n", " 3.22s TSV file is ~/github/CLARIAH/wp6-ferdinandhuyck/_temp/data-0.1.tsv\n", " 3.22s Columns 32:\n", " 3.22s \tnd\n", " 3.22s \totype\n", " 3.22s \tafter\n", " 3.22s \tstr\n", " 3.22s \tin_chapter\n", " 3.22s \tin_chunk\n", " 3.22s \tchapter\n", " 3.22s \tchunk\n", " 3.23s \tcurr\n", " 3.23s \tempty\n", " 3.23s \tempty_lb\n", " 3.23s \tempty_link\n", " 3.23s \tempty_pb\n", " 3.23s \tempty_pb_n\n", " 3.23s \tis_meta\n", " 3.23s \tis_note\n", " 3.23s \tn\n", " 3.23s \tplace\n", " 3.23s \trend\n", " 3.23s \trend_1tab\n", " 3.23s \trend_b\n", " 3.23s \trend_bq\n", " 3.23s \trend_h2\n", " 3.23s \trend_h3\n", " 3.23s \trend_h4\n", " 3.23s \trend_i\n", " 3.23s \trend_sc\n", " 3.23s \trend_spat\n", " 3.23s \trend_sup\n", " 3.23s \tto\n", " 3.23s \ttype\n", " 3.23s \tvalue\n", "\n", " 3.29s \t227256 rows\n", " 3.29s \t13520413 characters\n", " 3.29s Importing into Pandas ...\n", " | 0.00s 
Reading tsv file ...\n", " | 0.96s Done. Size = 7272160\n", " | 0.96s Saving as Parquet file ...\n", " | 1.12s Saved\n", " 4.41s PD in ~/github/CLARIAH/wp6-ferdinandhuyck/pandas/data-0.1.pd\n" ] } ], "source": [ "A.exportPandas(inTypes=\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "0f57e102-964e-47c0-98de-36065ce6be06", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" } }, "nbformat": 4, "nbformat_minor": 5 }