{ "cells": [ { "cell_type": "markdown", "id": "903467ca-0daa-4139-90a4-258df7fd3a79", "metadata": {}, "source": [ "# Sort picle file" ] }, { "cell_type": "markdown", "id": "851ddc59-0418-4f6e-8c67-965bd561e1db", "metadata": {}, "source": [ "load config data" ] }, { "cell_type": "code", "execution_count": 1, "id": "bcd109c6-934a-4749-aab5-bc8a453f6cbb", "metadata": { "ExecuteTime": { "end_time": "2022-10-28T03:01:34.810259Z", "start_time": "2022-10-28T03:01:25.745112Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "import re\n", "import gc\n", "from datetime import date\n", "import pickle\n", "\n", "\n", "\n", "\n", "\n", "# key: filename, [0]=book_long, [1]=book_num, [3]=book_short\n", "bo2book = {'01-matthew': ['Matthew', '1', 'Matt'],\n", " '02-mark': ['Mark', '2', 'Mark'],\n", " '03-luke': ['Luke', '3', 'Luke'],\n", " '04-john': ['John', '4', 'John'],\n", " '05-acts': ['Acts', '5', 'Acts'],\n", " '06-romans': ['Romans', '6', 'Rom'],\n", " '07-1corinthians': ['I_Corinthians', '7', '1Cor'],\n", " '08-2corinthians': ['II_Corinthians', '8', '2Cor'],\n", " '09-galatians': ['Galatians', '9', 'Gal'],\n", " '10-ephesians': ['Ephesians', '10', 'Eph'],\n", " '11-philippians': ['Philippians', '11', 'Phil'],\n", " '12-colossians': ['Colossians', '12', 'Col'],\n", " '13-1thessalonians':['I_Thessalonians', '13', '1Thess'],\n", " '14-2thessalonians':['II_Thessalonians','14', '2Thess'],\n", " '15-1timothy': ['I_Timothy', '15', '1Tim'],\n", " '16-2timothy': ['II_Timothy', '16', '2Tim'],\n", " '17-titus': ['Titus', '17', 'Titus'],\n", " '18-philemon': ['Philemon', '18', 'Phlm'],\n", " '19-hebrews': ['Hebrews', '19', 'Heb'],\n", " '20-james': ['James', '20', 'Jas'],\n", " '21-1peter': ['I_Peter', '21', '1Pet'],\n", " '22-2peter': ['II_Peter', '22', '2Pet'],\n", " '23-1john': ['I_John', '23', '1John'],\n", " '24-2john': ['II_John', '24', '2John'],\n", " '25-3john': ['III_John', '25', '3John'], \n", " '26-jude': ['Jude', '26', 'Jude'],\n", " '27-revelation': ['Revelation', '27', 'Rev']}\n" ] }, { "cell_type": "markdown", "id": "7e145368-bd60-4915-b24b-25814a5431c1", "metadata": {}, "source": [ "load and sort the data in pickle file and store in new fileset." ] }, { "cell_type": "code", "execution_count": 11, "id": "bef2bbaf-e0e4-4b5b-98f6-8438f4e183fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\01-matthew.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\02-mark.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\03-luke.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\04-john.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\05-acts.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\06-romans.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\07-1corinthians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\08-2corinthians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\09-galatians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\10-ephesians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\11-philippians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\12-colossians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\13-1thessalonians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\14-2thessalonians.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\15-1timothy.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\16-2timothy.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\17-titus.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\18-philemon.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\19-hebrews.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\20-james.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\21-1peter.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\22-2peter.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\23-1john.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\24-2john.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\25-3john.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\26-jude.pkl...\n", "\tloading C:\\Users\\tonyj\\my_new_Jupyter_folder\\test_of_xml_etree\\outputfiles\\27-revelation.pkl...\n" ] } ], "source": [ "BaseDir = 'C:\\\\Users\\\\tonyj\\\\my_new_Jupyter_folder\\\\test_of_xml_etree\\\\'\n", "source_dir = BaseDir+'outputfiles\\\\' #the input files (with 'wordjumps')\n", "output_dir = BaseDir+'outputfiles_sorted\\\\' #the output files (words in order of running text)\n", "\n", "\n", "for bo in bo2book:\n", " '''\n", " load all data into a dataframe\n", " process books in order (bookinfo is a list!)\n", " ''' \n", " InputFile = os.path.join(source_dir, f'{bo}.pkl')\n", " OutputFile = os.path.join(output_dir, f'{bo}.pkl')\n", " \n", " print(f'\\tloading {InputFile}...')\n", " pkl_file = open(InputFile, 'rb')\n", " df = pickle.load(pkl_file)\n", " pkl_file.close()\n", " \n", " # not sure if this is needed\n", " # fill dictionary of column names for this book \n", " IndexDict = {} # init an empty dictionary\n", " ItemsInRow=1\n", " for itemname in df.columns.to_list():\n", " IndexDict.update({'i_{}'.format(itemname): ItemsInRow})\n", " ItemsInRow+=1\n", " \n", " # sort by id\n", " df.sort_values(by=['nodeId'])\n", " #store the resulting DataFrame per book into a pickle file for further processing\n", " #df = df.convert_dtypes(convert_string=True)\n", " output = open(r\"{}\".format(OutputFile), 'wb')\n", " pickle.dump(df, output)\n", " output.close()\n", " \n", " " ] }, { "cell_type": "code", "execution_count": 12, "id": "cbb1154d-a72e-41df-a069-d2ec0b4aa7c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " {http://www.w3.org/XML/1998/namespace}id ref Cat Start End \\\n", "127273 n64001001001 3JN 1:1!1 det 0 0 \n", "127274 n64001001002 3JN 1:1!2 adj 1 1 \n", "127275 n64001001003 3JN 1:1!3 noun 2 2 \n", "127276 n64001001004 3JN 1:1!4 det 3 3 \n", "127277 n64001001005 3JN 1:1!5 adj 4 4 \n", "... ... ... ... ... .. \n", "127487 n64001015007 3JN 1:15!7 verb 0 0 \n", "127488 n64001015008 3JN 1:15!8 det 1 1 \n", "127489 n64001015009 3JN 1:15!9 adj 2 2 \n", "127490 n64001015010 3JN 1:15!10 prep 3 3 \n", "127491 n64001015011 3JN 1:15!11 noun 4 4 \n", "\n", " StrongNumber UnicodeLemma Gender Number FunctionalTag ... \\\n", "127273 3588 ὁ Masculine Singular T-NSM ... \n", "127274 4245 πρεσβύτερος Masculine Singular A-NSM-C ... \n", "127275 1050 Γάϊος Masculine Singular N-DSM ... \n", "127276 3588 ὁ Masculine Singular T-DSM ... \n", "127277 27 ἀγαπητός Masculine Singular A-DSM ... \n", "... ... ... ... ... ... ... \n", "127487 782 ἀσπάζομαι NaN Singular V-PNM-2S ... \n", "127488 3588 ὁ Masculine Plural T-APM ... \n", "127489 5384 φίλος Masculine Plural A-APM ... \n", "127490 2596 κατά NaN NaN PREP ... \n", "127491 3686 ὄνομα Neuter Singular N-ASN ... \n", "\n", " Parent16Name Parent16Type Parent16Cat Parent16Start Parent16End \\\n", "127273 NaN NaN NaN NaN NaN \n", "127274 NaN NaN NaN NaN NaN \n", "127275 NaN NaN NaN NaN NaN \n", "127276 NaN NaN NaN NaN NaN \n", "127277 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... \n", "127487 NaN NaN NaN NaN NaN \n", "127488 NaN NaN NaN NaN NaN \n", "127489 NaN NaN NaN NaN NaN \n", "127490 NaN NaN NaN NaN NaN \n", "127491 NaN NaN NaN NaN NaN \n", "\n", " Parent16Rule Parent16Head Parent16NodeId Parent16ClType Parent16HasDet \n", "127273 NaN NaN NaN NaN NaN \n", "127274 NaN NaN NaN NaN NaN \n", "127275 NaN NaN NaN NaN NaN \n", "127276 NaN NaN NaN NaN NaN \n", "127277 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... \n", "127487 NaN NaN NaN NaN NaN \n", "127488 NaN NaN NaN NaN NaN \n", "127489 NaN NaN NaN NaN NaN \n", "127490 NaN NaN NaN NaN NaN \n", "127491 NaN NaN NaN NaN NaN \n", "\n", "[219 rows x 198 columns]\n", " {http://www.w3.org/XML/1998/namespace}id ref Cat Start End \\\n", "127273 n64001001001 3JN 1:1!1 det 0 0 \n", "127274 n64001001002 3JN 1:1!2 adj 1 1 \n", "127275 n64001001003 3JN 1:1!3 noun 2 2 \n", "127276 n64001001004 3JN 1:1!4 det 3 3 \n", "127277 n64001001005 3JN 1:1!5 adj 4 4 \n", "... ... ... ... ... .. \n", "127487 n64001015007 3JN 1:15!7 verb 0 0 \n", "127488 n64001015008 3JN 1:15!8 det 1 1 \n", "127489 n64001015009 3JN 1:15!9 adj 2 2 \n", "127490 n64001015010 3JN 1:15!10 prep 3 3 \n", "127491 n64001015011 3JN 1:15!11 noun 4 4 \n", "\n", " StrongNumber UnicodeLemma Gender Number FunctionalTag ... \\\n", "127273 3588 ὁ Masculine Singular T-NSM ... \n", "127274 4245 πρεσβύτερος Masculine Singular A-NSM-C ... \n", "127275 1050 Γάϊος Masculine Singular N-DSM ... \n", "127276 3588 ὁ Masculine Singular T-DSM ... \n", "127277 27 ἀγαπητός Masculine Singular A-DSM ... \n", "... ... ... ... ... ... ... \n", "127487 782 ἀσπάζομαι NaN Singular V-PNM-2S ... \n", "127488 3588 ὁ Masculine Plural T-APM ... \n", "127489 5384 φίλος Masculine Plural A-APM ... \n", "127490 2596 κατά NaN NaN PREP ... \n", "127491 3686 ὄνομα Neuter Singular N-ASN ... \n", "\n", " Parent16Name Parent16Type Parent16Cat Parent16Start Parent16End \\\n", "127273 NaN NaN NaN NaN NaN \n", "127274 NaN NaN NaN NaN NaN \n", "127275 NaN NaN NaN NaN NaN \n", "127276 NaN NaN NaN NaN NaN \n", "127277 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... \n", "127487 NaN NaN NaN NaN NaN \n", "127488 NaN NaN NaN NaN NaN \n", "127489 NaN NaN NaN NaN NaN \n", "127490 NaN NaN NaN NaN NaN \n", "127491 NaN NaN NaN NaN NaN \n", "\n", " Parent16Rule Parent16Head Parent16NodeId Parent16ClType Parent16HasDet \n", "127273 NaN NaN NaN NaN NaN \n", "127274 NaN NaN NaN NaN NaN \n", "127275 NaN NaN NaN NaN NaN \n", "127276 NaN NaN NaN NaN NaN \n", "127277 NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... \n", "127487 NaN NaN NaN NaN NaN \n", "127488 NaN NaN NaN NaN NaN \n", "127489 NaN NaN NaN NaN NaN \n", "127490 NaN NaN NaN NaN NaN \n", "127491 NaN NaN NaN NaN NaN \n", "\n", "[219 rows x 198 columns]\n" ] } ], "source": [ "from pprint import pprint\n", "# dump test data\n", "\n", "BaseDir = 'C:\\\\Users\\\\tonyj\\\\my_new_Jupyter_folder\\\\test_of_xml_etree\\\\'\n", "source_dir = BaseDir+'outputfiles\\\\' #the input files (with 'wordjumps')\n", "output_dir = BaseDir+'outputfiles_sorted\\\\' #the output files (words in order of running text)\n", "InputFile = os.path.join(source_dir, f'25-3john.pkl')\n", "OutputFile = os.path.join(output_dir, f'25-3john.pkl')\n", "\n", "pkl_file = open(InputFile, 'rb')\n", "df = pickle.load(pkl_file)\n", "pkl_file.close()\n", "pprint(df)\n", "\n", "\n", "pkl_file = open(OutputFile, 'rb')\n", "df = pickle.load(pkl_file)\n", "pkl_file.close()\n", "pprint(df)" ] }, { "cell_type": "code", "execution_count": null, "id": "c47de537-5592-4c80-8b8f-e2c78bc63d77", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }