{ "cells": [ { "cell_type": "markdown", "id": "b5a2d5af-a461-44dc-9d44-9606a1f08375", "metadata": {}, "source": [ "# Convert to TEI-XML" ] }, { "cell_type": "code", "execution_count": 1, "id": "5c343f0c-e2fc-43fb-9d96-592217f9062f", "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "778e617c-08e5-4e23-8a74-3409c042ed58", "metadata": {}, "outputs": [], "source": [ "from xmlFromTf import Convert\n", "from tei_validator import validate" ] }, { "cell_type": "code", "execution_count": 4, "id": "d7c4d66e-e295-4caa-a916-6430d3606bb2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Converting TF version 1.0 to XML WITH entities\n" ] }, { "data": { "text/html": [ "TF-app: ~/github/clariah/wp6-missieven/app" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/clariah/wp6-missieven/tf/1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "data: ~/github/clariah/wp6-missieven/voc-missives/export/tf/1.0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Text-Fabric: Text-Fabric API 10.2.7, clariah/wp6-missieven/app v3, Search Reference
Data: WP6-MISSIEVEN, Character table, Feature docs
Features:
\n", "
clariah/wp6-missieven/voc-missives/export/tf\n", "
\n", "\n", "
\n", "
\n", "entityId\n", "
\n", "
str
\n", "\n", " identifier of a named entity\n", "\n", "
\n", "\n", "
\n", "
\n", "entityKind\n", "
\n", "
str
\n", "\n", " kind of a named entity\n", "\n", "
\n", "\n", "
\n", "
\n", "\n", "
General Missives Dutch East India Company 1600-1800\n", "
\n", "\n", "
\n", "
\n", "author\n", "
\n", "
str
\n", "\n", " authors of the letter, surnames only\n", "\n", "
\n", "\n", "
\n", "
\n", "authorFull\n", "
\n", "
str
\n", "\n", " authors of the letter, full names\n", "\n", "
\n", "\n", "
\n", "
\n", "col\n", "
\n", "
int
\n", "\n", " column number of a column in a row in a table\n", "\n", "
\n", "\n", "
\n", "
\n", "day\n", "
\n", "
int
\n", "\n", " day part of the date of the letter\n", "\n", "
\n", "\n", "
\n", "
\n", "isden\n", "
\n", "
int
\n", "\n", " whether a word is the denominator in fraction, e.g. 4 in 1/4\n", "\n", "
\n", "\n", "
\n", "
\n", "isemph\n", "
\n", "
str
\n", "\n", " whether a word is emphasized by typography\n", "\n", "
\n", "\n", "
\n", "
\n", "isfolio\n", "
\n", "
int
\n", "\n", " a folio reference\n", "\n", "
\n", "\n", "
\n", "
\n", "isnote\n", "
\n", "
int
\n", "\n", " whether a word belongs to footnote text\n", "\n", "
\n", "\n", "
\n", "
\n", "isnum\n", "
\n", "
int
\n", "\n", " whether a word is the numerator in fraction, e.g. 1 in 1/4\n", "\n", "
\n", "\n", "
\n", "
\n", "isorig\n", "
\n", "
int
\n", "\n", " whether a word belongs to original text\n", "\n", "
\n", "\n", "
\n", "
\n", "isq\n", "
\n", "
int
\n", "\n", " whether a word is a numerical fraction, e.g. 1/4\n", "\n", "
\n", "\n", "
\n", "
\n", "isref\n", "
\n", "
int
\n", "\n", " whether a word belongs to the text of reference\n", "\n", "
\n", "\n", "
\n", "
\n", "isremark\n", "
\n", "
int
\n", "\n", " whether a word belongs to the text of editorial remarks\n", "\n", "
\n", "\n", "
\n", "
\n", "isspecial\n", "
\n", "
int
\n", "\n", " whether a word has special typography possibly with OCR mistakes as well\n", "\n", "
\n", "\n", "
\n", "
\n", "issub\n", "
\n", "
int
\n", "\n", " whether a word has subscript typography possibly indicating the denominator of a fraction\n", "\n", "
\n", "\n", "
\n", "
\n", "issuper\n", "
\n", "
int
\n", "\n", " whether a word has superscript typography possibly indicating the numerator of a fraction\n", "\n", "
\n", "\n", "
\n", "
\n", "isund\n", "
\n", "
str
\n", "\n", " whether a word is underlined by typography\n", "\n", "
\n", "\n", "
\n", "
\n", "mark\n", "
\n", "
int
\n", "\n", " footnote mark (not necessarily the same as shown on the printed page\n", "\n", "
\n", "\n", "
\n", "
\n", "month\n", "
\n", "
int
\n", "\n", " month part of the date of the letter\n", "\n", "
\n", "\n", "
\n", "
\n", "n\n", "
\n", "
int
\n", "\n", " number of a volume, letter, page, para, line, table\n", "\n", "
\n", "\n", "
\n", "
\n", "otype\n", "
\n", "
str
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n", "page\n", "
\n", "
str
\n", "\n", " number of the first page of this letter in this volume\n", "\n", "
\n", "\n", "
\n", "
\n", "place\n", "
\n", "
str
\n", "\n", " place from where the letter was sent\n", "\n", "
\n", "\n", "
\n", "
\n", "punc\n", "
\n", "
str
\n", "\n", " punctuation and/or whitespace following a wordup to the next word\n", "\n", "
\n", "\n", "
\n", "
\n", "puncn\n", "
\n", "
str
\n", "\n", " punctuation and/or whitespace following a word,up to the next word, footnote text only\n", "\n", "
\n", "\n", "
\n", "
\n", "punco\n", "
\n", "
str
\n", "\n", " punctuation and/or whitespace following a word,up to the next word, original text only\n", "\n", "
\n", "\n", "
\n", "
\n", "puncr\n", "
\n", "
str
\n", "\n", " punctuation and/or whitespace following a word,up to the next word, remark text only\n", "\n", "
\n", "\n", "
\n", "
\n", "rawdate\n", "
\n", "
str
\n", "\n", " the date the letter was sent\n", "\n", "
\n", "\n", "
\n", "
\n", "row\n", "
\n", "
int
\n", "\n", " row number of a row of column in a table\n", "\n", "
\n", "\n", "
\n", "
\n", "seq\n", "
\n", "
str
\n", "\n", " ('sequence number of this letter among the letters of the same author in this volume',)\n", "\n", "
\n", "\n", "
\n", "
\n", "status\n", "
\n", "
str
\n", "\n", " status of the letter, e.g. secret, copy\n", "\n", "
\n", "\n", "
\n", "
\n", "title\n", "
\n", "
str
\n", "\n", " title of the letter\n", "\n", "
\n", "\n", "
\n", "
\n", "trans\n", "
\n", "
str
\n", "\n", " transcription of a word\n", "\n", "
\n", "\n", "
\n", "
\n", "transn\n", "
\n", "
str
\n", "\n", " transcription of a word, only for footnote text\n", "\n", "
\n", "\n", "
\n", "
\n", "transo\n", "
\n", "
str
\n", "\n", " transcription of a word, only for original text\n", "\n", "
\n", "\n", "
\n", "
\n", "transr\n", "
\n", "
str
\n", "\n", " transcription of a word, only for remark text\n", "\n", "
\n", "\n", "
\n", "
\n", "vol\n", "
\n", "
int
\n", "\n", " volume number\n", "\n", "
\n", "\n", "
\n", "
\n", "weblink\n", "
\n", "
str
\n", "\n", " the page-specific part of web links for page nodes\n", "\n", "
\n", "\n", "
\n", "
\n", "x\n", "
\n", "
int
\n", "\n", " column offset of a column in a row in a table\n", "\n", "
\n", "\n", "
\n", "
\n", "year\n", "
\n", "
int
\n", "\n", " year part of the date of the letter\n", "\n", "
\n", "\n", "
\n", "
\n", "note\n", "
\n", "
none
\n", "\n", " edge between a word and the footnotes associated with it\n", "\n", "
\n", "\n", "
\n", "
\n", "oslots\n", "
\n", "
none
\n", "\n", " \n", "\n", "
\n", "\n", "
\n", "
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "CV = Convert(\"1.0\")" ] }, { "cell_type": "code", "execution_count": 6, "id": "ccf34f78-b029-4077-94da-379f8c7d2335", "metadata": {}, "outputs": [], "source": [ "volumes = CV.A.api.F.otype.s(\"volume\")" ] }, { "cell_type": "code", "execution_count": 113, "id": "69b18fa0-fae5-4288-8ba1-cadbcdde94ef", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "volume 13 => ~/github/clariah/wp6-missieven/xmlout/1.0/13.xml\n" ] } ], "source": [ "CV.doVolume(volumes[12], literal=False)" ] }, { "cell_type": "code", "execution_count": 112, "id": "06e39a24-e0b4-419d-95f8-5300f4c8a81f", "metadata": {}, "outputs": [], "source": [ "validate(f\"{CV.destDir}/1.xml\")" ] }, { "cell_type": "code", "execution_count": 114, "id": "96e726e4-d8f0-4409-b0f5-71faf00e0b4e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "volume 1 => ~/github/clariah/wp6-missieven/xmlout/1.0/1.xml\n", "volume 2 => ~/github/clariah/wp6-missieven/xmlout/1.0/2.xml\n", "volume 3 => ~/github/clariah/wp6-missieven/xmlout/1.0/3.xml\n", "volume 4 => ~/github/clariah/wp6-missieven/xmlout/1.0/4.xml\n", "volume 5 => ~/github/clariah/wp6-missieven/xmlout/1.0/5.xml\n", "volume 6 => ~/github/clariah/wp6-missieven/xmlout/1.0/6.xml\n", "volume 7 => ~/github/clariah/wp6-missieven/xmlout/1.0/7.xml\n", "volume 8 => ~/github/clariah/wp6-missieven/xmlout/1.0/8.xml\n", "volume 9 => ~/github/clariah/wp6-missieven/xmlout/1.0/9.xml\n", "volume 10 => ~/github/clariah/wp6-missieven/xmlout/1.0/10.xml\n", "volume 11 => ~/github/clariah/wp6-missieven/xmlout/1.0/11.xml\n", "volume 12 => ~/github/clariah/wp6-missieven/xmlout/1.0/12.xml\n", "volume 13 => 
~/github/clariah/wp6-missieven/xmlout/1.0/13.xml\n", "volume 14 => ~/github/clariah/wp6-missieven/xmlout/1.0/14.xml\n" ] } ], "source": [ "CV.doWork()" ] }, { "cell_type": "code", "execution_count": 5, "id": "ea7f0d67-0d3e-4bac-a606-a6c71a0db70a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "validating 1\n", "validating 2\n", "validating 3\n", "validating 4\n", "validating 5\n", "validating 6\n", "validating 7\n", "validating 8\n", "validating 9\n", "validating 10\n", "validating 11\n", "validating 12\n", "validating 13\n", "validating 14\n" ] } ], "source": [ "# Validate the XML file of each volume; the loop index selects the file name.\n", "for i in range(1, 15):\n", " print(f\"validating {i:>2}\")\n", " validate(f\"{CV.destDir}/{i}.xml\")" ] }, { "cell_type": "code", "execution_count": null, "id": "633a7cb5-dc4c-4e65-bcf8-3d7e23e8cf54", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.7" } }, "nbformat": 4, "nbformat_minor": 5 }