{
"cells": [
{
"cell_type": "markdown",
"id": "b5a2d5af-a461-44dc-9d44-9606a1f08375",
"metadata": {},
"source": [
"# Convert to TEI-XML"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5c343f0c-e2fc-43fb-9d96-592217f9062f",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "778e617c-08e5-4e23-8a74-3409c042ed58",
"metadata": {},
"outputs": [],
"source": [
"from xmlFromTf import Convert\n",
"from tei_validator import validate"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d7c4d66e-e295-4caa-a916-6430d3606bb2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converting TF version 1.0 to XML WITH entities\n"
]
},
{
"data": {
"text/html": [
"TF-app: ~/github/clariah/wp6-missieven/app"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"data: ~/github/clariah/wp6-missieven/tf/1.0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"data: ~/github/clariah/wp6-missieven/voc-missives/export/tf/1.0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Text-Fabric: Text-Fabric API 10.2.7, clariah/wp6-missieven/app v3, Search Reference
Data: WP6-MISSIEVEN, Character table, Feature docs
Features:
\n",
"clariah/wp6-missieven/voc-missives/export/tf
\n",
" \n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
identifier of a named entity\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
kind of a named entity\n",
"\n",
"
\n",
"\n",
"
\n",
" \n",
"\n",
"General Missives Dutch East India Company 1600-1800
\n",
" \n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
authors of the letter, surnames only\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
authors of the letter, full names\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
column number of a column in a row in a table\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
day part of the date of the letter\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word is the denominator in fraction, e.g. 4 in 1/4\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
whether a word is emphasized by typography\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
a folio reference\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word belongs to footnote text\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word is the numerator in fraction, e.g. 1 in 1/4\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word belongs to original text\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word is a numerical fraction, e.g. 1/4\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word belongs to the text of reference\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word belongs to the text of editorial remarks\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word has special typography possibly with OCR mistakes as well\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word has subscript typography possibly indicating the denominator of a fraction\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
whether a word has superscript typography possibly indicating the numerator of a fraction\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
whether a word is underlined by typography\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
footnote mark (not necessarily the same as shown on the printed page\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
month part of the date of the letter\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
number of a volume, letter, page, para, line, table\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
number of the first page of this letter in this volume\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
place from where the letter was sent\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
punctuation and/or whitespace following a wordup to the next word\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
punctuation and/or whitespace following a word,up to the next word, footnote text only\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
punctuation and/or whitespace following a word,up to the next word, original text only\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
punctuation and/or whitespace following a word,up to the next word, remark text only\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
the date the letter was sent\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
row number of a row of column in a table\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
('sequence number of this letter among the letters of the same author in this volume',)\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
status of the letter, e.g. secret, copy\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
title of the letter\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
transcription of a word\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
transcription of a word, only for footnote text\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
transcription of a word, only for original text\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
transcription of a word, only for remark text\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
volume number\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
str
\n",
"\n",
"
the page-specific part of web links for page nodes\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
column offset of a column in a row in a table\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
int
\n",
"\n",
"
year part of the date of the letter\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
none
\n",
"\n",
"
edge between a word and the footnotes associated with it\n",
"\n",
"
\n",
"\n",
"
\n",
"
\n",
"
none
\n",
"\n",
"
\n",
"\n",
"
\n",
"\n",
"
\n",
" \n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"CV = Convert(\"1.0\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ccf34f78-b029-4077-94da-379f8c7d2335",
"metadata": {},
"outputs": [],
"source": [
"volumes = CV.A.api.F.otype.s(\"volume\")"
]
},
{
"cell_type": "code",
"execution_count": 113,
"id": "69b18fa0-fae5-4288-8ba1-cadbcdde94ef",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"volume 13 => ~/github/clariah/wp6-missieven/xmlout/1.0/13.xml\n"
]
}
],
"source": [
"CV.doVolume(volumes[12], literal=False)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"id": "06e39a24-e0b4-419d-95f8-5300f4c8a81f",
"metadata": {},
"outputs": [],
"source": [
"# Validate the file produced by the single-volume conversion in the previous\n",
"# cell (volume 13); 1.xml is not written until the full conversion further down.\n",
"validate(f\"{CV.destDir}/13.xml\")"
]
},
{
"cell_type": "code",
"execution_count": 114,
"id": "96e726e4-d8f0-4409-b0f5-71faf00e0b4e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"volume 1 => ~/github/clariah/wp6-missieven/xmlout/1.0/1.xml\n",
"volume 2 => ~/github/clariah/wp6-missieven/xmlout/1.0/2.xml\n",
"volume 3 => ~/github/clariah/wp6-missieven/xmlout/1.0/3.xml\n",
"volume 4 => ~/github/clariah/wp6-missieven/xmlout/1.0/4.xml\n",
"volume 5 => ~/github/clariah/wp6-missieven/xmlout/1.0/5.xml\n",
"volume 6 => ~/github/clariah/wp6-missieven/xmlout/1.0/6.xml\n",
"volume 7 => ~/github/clariah/wp6-missieven/xmlout/1.0/7.xml\n",
"volume 8 => ~/github/clariah/wp6-missieven/xmlout/1.0/8.xml\n",
"volume 9 => ~/github/clariah/wp6-missieven/xmlout/1.0/9.xml\n",
"volume 10 => ~/github/clariah/wp6-missieven/xmlout/1.0/10.xml\n",
"volume 11 => ~/github/clariah/wp6-missieven/xmlout/1.0/11.xml\n",
"volume 12 => ~/github/clariah/wp6-missieven/xmlout/1.0/12.xml\n",
"volume 13 => ~/github/clariah/wp6-missieven/xmlout/1.0/13.xml\n",
"volume 14 => ~/github/clariah/wp6-missieven/xmlout/1.0/14.xml\n"
]
}
],
"source": [
"CV.doWork()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ea7f0d67-0d3e-4bac-a606-a6c71a0db70a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"validating 1\n",
"validating 2\n",
"validating 3\n",
"validating 4\n",
"validating 5\n",
"validating 6\n",
"validating 7\n",
"validating 8\n",
"validating 9\n",
"validating 10\n",
"validating 11\n",
"validating 12\n",
"validating 13\n",
"validating 14\n"
]
}
],
"source": [
"# Validate each generated volume file (volumes 1..14, as written by CV.doWork()).\n",
"# The file name must follow the loop variable; otherwise only 1.xml is ever checked.\n",
"for i in range(1, 15):\n",
"    print(f\"validating {i:>2}\")\n",
"    validate(f\"{CV.destDir}/{i}.xml\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "633a7cb5-dc4c-4e65-bcf8-3d7e23e8cf54",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}