{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "<img align=\"right\" src=\"images/dans-small.png\"/>\n", "<img align=\"right\" src=\"images/tf-small.png\"/>\n", "<img align=\"right\" src=\"images/etcbc.png\"/>\n", "\n", "\n", "# Import BHSA data into Pandas\n", "\n", "This notebook contains the Pandas instructions to load the\n", "[Pandas export](export.ipynb) export of the BHSA.\n", "\n", "We then perform some simple information extracting on the data." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# How to get the Pandas file\n", "\n", "The direct download link is \n", "[data-2021.pd](https://github.com/ETCBC/bhsa/releases/download/v1.8/data-2021.pd)\n", "\n", "The pandas file is over 50 MB, a bit too large for GitHub without large file support.\n", "So I attached it to the\n", "[latest release](https://github.com/ETCBC/bhsa/releases/tag/v1.8).\n", "\n", "## Reproduction\n", "\n", "If you want to do it yourself,\n", "\n", "* clone this repo\n", "* find the [export](export.ipynb) notebook\n", "* run it in Jupyterlab\n", "* pick up the newly generated file from the `/pandas` subdirectory." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd # pip3 install pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# File locations\n", "\n", "We set up some variables for the location of the Pandas file and a location\n", "where we will save the full text of this corpus." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "VERSION = \"2021\"\n", "PANDAS_DIR = os.path.abspath(\"../pandas\")\n", "TEXT_DIR = os.path.abspath(os.path.expanduser(\"~/Downloads/text\"))\n", "TABLE_FILE_PD = f\"{PANDAS_DIR}/data-{VERSION}.pd\"\n", "TABLE_FILE_TXT = f\"{TEXT_DIR}/data-{VERSION}.txt\"\n", "\n", "if not os.path.exists(TEXT_DIR):\n", " os.makedirs(TEXT_DIR)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load the dataframe" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done. Size=104171832\n" ] } ], "source": [ "frame = pd.read_parquet(TABLE_FILE_PD, engine=\"pyarrow\")\n", "print(\"Done. Size={}\".format(frame.size))" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1446831, 72)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.shape" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>nd</th>\n", " <th>otype</th>\n", " <th>g_cons</th>\n", " <th>g_cons_utf8</th>\n", " <th>g_lex</th>\n", " <th>g_lex_utf8</th>\n", " <th>g_word</th>\n", " <th>g_word_utf8</th>\n", " <th>lex</th>\n", " <th>lex_utf8</th>\n", " <th>...</th>\n", " <th>tab</th>\n", " <th>txt</th>\n", " <th>typ</th>\n", " <th>uvf</th>\n", " <th>vbe</th>\n", " <th>vbs</th>\n", " <th>verse</th>\n", " <th>voc_lex</th>\n", " <th>vs</th>\n", " <th>vt</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>426591</td>\n", " <td>book</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>426630</td>\n", " <td>chapter</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>1414389</td>\n", " <td>verse</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>1</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>1172308</td>\n", " <td>sentence</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>1236025</td>\n", " <td>sentence_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>427559</td>\n", " <td>clause</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td>?</td>\n", " <td>xQtX</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>515690</td>\n", " <td>clause_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>xQtX</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>606394</td>\n", " <td>half_verse</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>651573</td>\n", " <td>phrase</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>PP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>904776</td>\n", " <td>phrase_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>PP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>1437602</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>B</td>\n", " <td>ב</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>B.:</td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>1</td>\n", " <td>word</td>\n", " <td>B</td>\n", " <td>ב</td>\n", " <td>B.:-</td>\n", " <td>בְּ</td>\n", " <td>B.:-</td>\n", " <td>בְּ</td>\n", " <td>B</td>\n", " <td>ב</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td>n/a</td>\n", " <td>n/a</td>\n", " <td><NA></td>\n", " <td>B.:</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>1437603</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>R>CJT/</td>\n", " <td>ראשׁית֜</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>R;>CIJT</td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>2</td>\n", " <td>word</td>\n", " <td>R>CJT</td>\n", " <td>ראשׁית</td>\n", " <td>R;>CIJT</td>\n", " <td>רֵאשִׁית</td>\n", " <td>R;>CI73JT</td>\n", " <td>רֵאשִׁ֖ית</td>\n", " <td>R>CJT/</td>\n", " <td>ראשׁית</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td>n/a</td>\n", " <td>n/a</td>\n", " <td><NA></td>\n", " <td>R;>CIJT</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>1437604</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>BR>[</td>\n", " <td>ברא</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>BR></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>651574</td>\n", " <td>phrase</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>VP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>904777</td>\n", " <td>phrase_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>VP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>3</td>\n", " <td>word</td>\n", " <td>BR></td>\n", " <td>ברא</td>\n", " <td>B.@R@></td>\n", " <td>בָּרָא</td>\n", " <td>B.@R@74></td>\n", " <td>בָּרָ֣א</td>\n", " <td>BR>[</td>\n", " <td>ברא</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td></td>\n", " <td>absent</td>\n", " <td><NA></td>\n", " <td>BR></td>\n", " <td>qal</td>\n", " <td>perf</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>1437605</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>>LHJM/</td>\n", " <td>אלהים֜</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>>:ELOHIJM</td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>651575</td>\n", " <td>phrase</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>NP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>904778</td>\n", " <td>phrase_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>NP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>4</td>\n", " <td>word</td>\n", " <td>>LHJM</td>\n", " <td>אלהים</td>\n", " <td>>:ELOH</td>\n", " <td>אֱלֹה</td>\n", " <td>>:ELOHI92JM</td>\n", " <td>אֱלֹהִ֑ים</td>\n", " <td>>LHJM/</td>\n", " <td>אלהים</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td>n/a</td>\n", " <td>n/a</td>\n", " <td><NA></td>\n", " <td>>:ELOHIJM</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>606395</td>\n", " <td>half_verse</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>651576</td>\n", " <td>phrase</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>PP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>904779</td>\n", " <td>phrase_atom</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td>PP</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>1300539</td>\n", " <td>subphrase</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>1437606</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>>T</td>\n", " <td>את</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>>;T</td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>5</td>\n", " <td>word</td>\n", " <td>>T</td>\n", " <td>את</td>\n", " <td>>;T</td>\n", " <td>אֵת</td>\n", " <td>>;71T</td>\n", " <td>אֵ֥ת</td>\n", " <td>>T</td>\n", " <td>את</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td>n/a</td>\n", " <td>n/a</td>\n", " <td><NA></td>\n", " <td>>;T</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>1437607</td>\n", " <td>lex</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>H</td>\n", " <td>ה</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td><NA></td>\n", " <td>HA</td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>6</td>\n", " <td>word</td>\n", " <td>H</td>\n", " <td>ה</td>\n", " <td>HA-</td>\n", " <td>הַ</td>\n", " <td>HA-</td>\n", " <td>הַ</td>\n", " <td>H</td>\n", " <td>ה</td>\n", " <td>...</td>\n", " <td><NA></td>\n", " <td></td>\n", " <td></td>\n", " <td>absent</td>\n", " <td>n/a</td>\n", " <td>n/a</td>\n", " <td><NA></td>\n", " <td>HA</td>\n", " <td>NA</td>\n", " <td>NA</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>30 rows × 72 columns</p>\n", "</div>" ], "text/plain": [ " nd otype g_cons g_cons_utf8 g_lex g_lex_utf8 \\\n", "0 426591 book \n", "1 426630 chapter \n", "2 1414389 verse \n", "3 1172308 sentence \n", "4 1236025 sentence_atom \n", "5 427559 clause \n", "6 515690 clause_atom \n", "7 606394 half_verse \n", "8 651573 phrase \n", "9 904776 phrase_atom \n", "10 1437602 lex \n", "11 1 word B ב B.:- בְּ \n", "12 1437603 lex \n", "13 2 word R>CJT ראשׁית R;>CIJT רֵאשִׁית \n", "14 1437604 lex \n", "15 651574 phrase \n", "16 904777 phrase_atom \n", "17 3 word BR> ברא B.@R@> בָּרָא \n", "18 1437605 lex \n", "19 651575 phrase \n", "20 904778 phrase_atom \n", "21 4 word >LHJM אלהים >:ELOH אֱלֹה \n", "22 606395 half_verse \n", "23 651576 phrase \n", "24 904779 phrase_atom \n", "25 1300539 subphrase \n", "26 1437606 lex \n", "27 5 word >T את >;T אֵת \n", "28 1437607 lex \n", "29 6 word H ה HA- הַ \n", "\n", " g_word g_word_utf8 lex lex_utf8 ... tab txt typ uvf \\\n", "0 ... <NA> \n", "1 ... <NA> \n", "2 ... <NA> \n", "3 ... <NA> \n", "4 ... <NA> \n", "5 ... <NA> ? xQtX \n", "6 ... <NA> xQtX \n", "7 ... <NA> \n", "8 ... <NA> PP \n", "9 ... <NA> PP \n", "10 B ב ... <NA> \n", "11 B.:- בְּ B ב ... <NA> absent \n", "12 R>CJT/ ראשׁית֜ ... <NA> \n", "13 R;>CI73JT רֵאשִׁ֖ית R>CJT/ ראשׁית ... <NA> absent \n", "14 BR>[ ברא ... <NA> \n", "15 ... <NA> VP \n", "16 ... <NA> VP \n", "17 B.@R@74> בָּרָ֣א BR>[ ברא ... <NA> absent \n", "18 >LHJM/ אלהים֜ ... <NA> \n", "19 ... <NA> NP \n", "20 ... <NA> NP \n", "21 >:ELOHI92JM אֱלֹהִ֑ים >LHJM/ אלהים ... <NA> absent \n", "22 ... <NA> \n", "23 ... <NA> PP \n", "24 ... <NA> PP \n", "25 ... <NA> \n", "26 >T את ... <NA> \n", "27 >;71T אֵ֥ת >T את ... <NA> absent \n", "28 H ה ... <NA> \n", "29 HA- הַ H ה ... <NA> absent \n", "\n", " vbe vbs verse voc_lex vs vt \n", "0 <NA> \n", "1 <NA> \n", "2 1 \n", "3 <NA> \n", "4 <NA> \n", "5 <NA> \n", "6 <NA> \n", "7 <NA> \n", "8 <NA> \n", "9 <NA> \n", "10 <NA> B.: \n", "11 n/a n/a <NA> B.: NA NA \n", "12 <NA> R;>CIJT \n", "13 n/a n/a <NA> R;>CIJT NA NA \n", "14 <NA> BR> \n", "15 <NA> \n", "16 <NA> \n", "17 absent <NA> BR> qal perf \n", "18 <NA> >:ELOHIJM \n", "19 <NA> \n", "20 <NA> \n", "21 n/a n/a <NA> >:ELOHIJM NA NA \n", "22 <NA> \n", "23 <NA> \n", "24 <NA> \n", "25 <NA> \n", "26 <NA> >;T \n", "27 n/a n/a <NA> >;T NA NA \n", "28 <NA> HA \n", "29 n/a n/a <NA> HA NA NA \n", "\n", "[30 rows x 72 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "frame.head(30)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['nd',\n", " 'otype',\n", " 'g_cons',\n", " 'g_cons_utf8',\n", " 'g_lex',\n", " 'g_lex_utf8',\n", " 'g_word',\n", " 'g_word_utf8',\n", " 'lex',\n", " 'lex_utf8',\n", " 'phono',\n", " 'phono_trailer',\n", " 'qere',\n", " 'qere_trailer',\n", " 'qere_trailer_utf8',\n", " 'qere_utf8',\n", " 'trailer',\n", " 'trailer_utf8',\n", " 'voc_lex_utf8',\n", " 'in_book',\n", " 'in_chapter',\n", " 'in_verse',\n", " 'in_lex',\n", " 'in_half_verse',\n", " 'in_sentence',\n", " 'in_sentence_atom',\n", " 'in_clause',\n", " 'in_clause_atom',\n", " 'in_phrase',\n", " 'in_phrase_atom',\n", " 'in_subphrase',\n", " 'in_word',\n", " 'crossref',\n", " 'mother',\n", " 'book',\n", " 'chapter',\n", " 'code',\n", " 'det',\n", " 'domain',\n", " 'freq_lex',\n", " 'function',\n", " 'gloss',\n", " 'gn',\n", " 'label',\n", " 'language',\n", " 'ls',\n", " 'nametype',\n", " 'nme',\n", " 'nu',\n", " 'number',\n", " 'pargr',\n", " 'pdp',\n", " 'pfm',\n", " 'prs',\n", " 'prs_gn',\n", " 'prs_nu',\n", " 'prs_ps',\n", " 'ps',\n", " 'rank_lex',\n", " 'rela',\n", " 'sp',\n", " 'st',\n", " 'tab',\n", " 'txt',\n", " 'typ',\n", " 'uvf',\n", " 'vbe',\n", " 'vbs',\n", " 'verse',\n", " 'voc_lex',\n", " 'vs',\n", " 'vt']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columnList = frame.columns.values.tolist()\n", "columnList" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Books" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us extract some data.\n", "First a list of the book names." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Genesis Exodus Leviticus Numeri Deuteronomium Josua Judices Samuel_I Samuel_II Reges_I Reges_II Jesaia Jeremia Ezechiel Hosea Joel Amos Obadia Jona Micha Nahum Habakuk Zephania Haggai Sacharia Maleachi Psalmi Iob Proverbia Ruth Canticum Ecclesiastes Threni Esther Daniel Esra Nehemia Chronica_I Chronica_II\n" ] } ], "source": [ "books = frame[frame.otype == \"book\"].book\n", "print(\" \".join(str(x) for x in books))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Text\n", "\n", "Now the complete text of the whole bible." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "words = frame.loc[frame.otype == \"word\"]\n", "text = words.g_word_utf8 + words.trailer_utf8\n", "\n", "with open(TABLE_FILE_TXT, \"w\") as pt:\n", " pt.write((\"\".join(text)).replace(\"\\u05C3\", \"\\u05C3\\n\").replace(\"\\\\n\", \"\\n\"))\n", " pt.write(\"\\n\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃\n", " וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃\n", " וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י אֹ֑ור וַֽיְהִי־אֹֽור׃\n", " וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָאֹ֖ור כִּי־טֹ֑וב וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָאֹ֖ור וּבֵ֥ין הַחֹֽשֶׁךְ׃\n", " וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לָאֹור֙ יֹ֔ום וְלַחֹ֖שֶׁךְ קָ֣רָא לָ֑יְלָה וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר יֹ֥ום אֶחָֽד׃\n", " פ וַיֹּ֣אמֶר אֱלֹהִ֔ים יְהִ֥י רָקִ֖יעַ בְּתֹ֣וךְ הַמָּ֑יִם וִיהִ֣י מַבְדִּ֔יל בֵּ֥ין מַ֖יִם לָמָֽיִם׃\n", " וַיַּ֣עַשׂ אֱלֹהִים֮ אֶת־הָרָקִיעַ֒ וַיַּבְדֵּ֗ל בֵּ֤ין הַמַּ֨יִם֙ אֲשֶׁר֙ מִתַּ֣חַת לָרָקִ֔יעַ וּבֵ֣ין הַמַּ֔יִם אֲשֶׁ֖ר מֵעַ֣ל לָרָקִ֑יעַ וַֽיְהִי־כֵֽן׃\n", " וַיִּקְרָ֧א אֱלֹהִ֛ים לָֽרָקִ֖יעַ שָׁמָ֑יִם וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר יֹ֥ום שֵׁנִֽי׃\n", " פ וַיֹּ֣אמֶר אֱלֹהִ֗ים יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶל־מָקֹ֣ום אֶחָ֔ד וְתֵרָאֶ֖ה הַיַּבָּשָׁ֑ה וַֽיְהִי־כֵֽן׃\n", " וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ וּלְמִקְוֵ֥ה הַמַּ֖יִם קָרָ֣א יַמִּ֑ים וַיַּ֥רְא אֱלֹהִ֖ים כִּי־טֹֽוב׃\n" ] } ], "source": [ "!head {TABLE_FILE_TXT}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Drill down to a passage" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us get the words from the first verse.\n", "\n", "How do we know the node of the first verse? See the end of the\n", "[bigTable](bigTable.ipynb) notebook." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "v1 = 1414389" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Is this really a verse?" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'verse'" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "element = frame[frame.nd == v1].otype.iloc[0]\n", "element" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<IntegerArray>\n", "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n", "Length: 11, dtype: Int64\n" ] } ], "source": [ "wordIds = frame[(frame.otype == \"word\") & (frame.in_verse == v1)].nd\n", "print(wordIds.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now the *text* of the first verse." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃\n", " \n" ] } ], "source": [ "words = frame[(frame.otype == \"word\") & (frame.in_verse == v1)]\n", "text = words.g_word_utf8 + words.trailer_utf8\n", "print((\"\".join(text)).replace(\"\\u05C3\", \"\\u05C3\\n\").replace(\"\\\\n\", \"\\n\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us get the words and text of an arbitrary passage, say Psalmi 131:2\n", "\n", "First the id of the chunk (i.e. the Text-Fabric node number):" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1431812\n" ] } ], "source": [ "verse_id = frame[\n", " (frame.otype == \"verse\")\n", " & (frame.book == \"Psalmi\")\n", " & (frame.chapter == 131)\n", " & (frame.verse == 2)\n", "].nd.iloc[0]\n", "print(verse_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now the word ids of that verse:" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<IntegerArray>\n", "[333425, 333426, 333427, 333428, 333429, 333430, 333431, 333432, 333433,\n", " 333434, 333435, 333436, 333437, 333438, 333439]\n", "Length: 15, dtype: Int64\n" ] } ], "source": [ "words = frame[(frame.otype == \"word\") & (frame.in_verse == verse_id)]\n", "print(words.nd.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And, finally, the text of those words." ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃\n", " \n" ] } ], "source": [ "text = words.g_word_utf8 + words.trailer_utf8\n", "print((\"\".join(text)).replace(\"\\u05C3\", \"\\u05C3\\n\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let us organize this in two functions: one that returns the verse object given a passage, and one that prints the texts of the words in a given object." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def object2text(nd):\n", " otype = frame[frame.nd == nd].otype.iloc[0]\n", " inelement = \"in_\" + otype\n", " words = frame[(frame.otype == \"word\") & (frame[inelement] == nd)]\n", " text = words.g_word_utf8 + words.trailer_utf8\n", " return (\"\".join(text)).replace(\"\\u05C3\", \"\\u05C3\\n\").replace(\"\\\\n\", \"\\n\")\n", "\n", "\n", "def verse2object(book, chapter, verse):\n", " return frame[\n", " (frame.otype == \"verse\")\n", " & (frame.book == book)\n", " & (frame.chapter == chapter)\n", " & (frame.verse == verse)\n", " ].nd.iloc[0]\n", "\n", "\n", "def verse2text(book, chapter, verse):\n", " return object2text(verse2object(book, chapter, verse))\n", "\n", "\n", "def chapter2object(book, chapter):\n", " return frame[\n", " (frame.otype == \"chapter\") & (frame.book == book) & (frame.chapter == chapter)\n", " ].nd.iloc[0]\n", "\n", "\n", "def chapter2text(book, chapter):\n", " return object2text(chapter2object(book, chapter))" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃\n", " \n" ] } ], "source": [ "print(verse2text(\"Psalmi\", 131, 2))" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "שִׁ֥יר הַֽמַּֽעֲלֹ֗ות לְדָ֫וִ֥ד יְהוָ֤ה׀ לֹא־גָבַ֣הּ לִ֭בִּי וְלֹא־רָמ֣וּ עֵינַ֑י וְלֹֽא־הִלַּ֓כְתִּי׀ בִּגְדֹלֹ֖ות וּבְנִפְלָאֹ֣ות מִמֶּֽנִּי׃\n", " אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃\n", " יַחֵ֣ל יִ֝שְׂרָאֵל אֶל־יְהוָ֑ה מֵֽ֝עַתָּ֗ה וְעַד־עֹולָֽם׃\n", " \n" ] } ], "source": [ "print(chapter2text(\"Psalmi\", 131))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Bi-grams\n", "\n", "We make a column of verse-bound bi-grams of lexemes. The two lexemes are separated by an underscore `_`." ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "vsNext = frame[frame.otype == \"word\"].in_verse\n", "vsPrev = frame[frame.otype == \"word\"].in_verse.shift(1)\n", "lex = frame[frame.otype == \"word\"].lex_utf8\n", "lexNext = frame[frame.otype == \"word\"].lex_utf8.shift(1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "lastInVs = vsPrev != vsNext\n", "lexNext[lastInVs] = \"\"" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "bigram = [\"{}_{}\".format(*p) for p in zip(lex, lexNext)]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['אשׁה_אם',\n", " 'מן_אשׁה',\n", " 'ארץ_מן',\n", " 'מצרים_ארץ',\n", " 'ו_',\n", " 'היה_ו',\n", " 'ב_היה',\n", " 'ה_ב',\n", " 'עת_ה',\n", " 'ה_עת',\n", " 'היא_ה',\n", " 'ו_היא',\n", " 'אמר_ו',\n", " 'אבימלך_אמר',\n", " 'ו_אבימלך',\n", " 'פיכל_ו',\n", " 'שׂר_פיכל',\n", " 'צבא_שׂר',\n", " 'אל_צבא',\n", " 'אברהם_אל',\n", " 'ל_אברהם',\n", " 'אמר_ל',\n", " 'אלהים_אמר',\n", " 'עם_אלהים',\n", " 'ב_עם',\n", " 'כל_ב',\n", " 'אשׁר_כל',\n", " 'אתה_אשׁר',\n", " 'עשׂה_אתה',\n", " 'ו_']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bigram[10_000:10_030]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "tags": [] }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.1" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 4 }