{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Downloading the Septuagint\n",
    "\n",
    "This is a script to download the plain text of the Septuagint from\n",
    "[Sacred Texts](http://sacred-texts.com/bib/sep/index.htm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os,sys,re,collections\n",
    "from lxml import html\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "base_url = 'http://sacred-texts.com/bib/sep'\n",
    "top_url = '{}/index.htm'.format(base_url)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Read the book index page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "page = requests.get(top_url)\n",
    "# Fail loudly on an HTTP error instead of parsing an error page as the index.\n",
    "page.raise_for_status()\n",
    "tree = html.fromstring(page.content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compile the list of books"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Genesis, Exodus, Leviticus, Numbers, Deuteronomy, Joshua B, Joshua A, Judges B, Judges A, Ruth, 1 Samuel, 2 Samuel, 1 Kings, 2 Kings, 1 Chronicles, 2 Chronicles, 1 Esdras, 2 Esdras, Esther, Judith, Tobit BA, Tobit S, 1 Macabees, 2 Macabees, 3 Macabees, 4 Macabees, Psalms, Odes, Proverbs, Ecclesiastes, Song of Solomon, Job, Wisdom, Sirach, Psalms of Solomon, Hosea, Micah, Amos, Joel, Jonah, Obadiah, Nahum, Habakkuk, Zephaniah, Haggai, Zechariah, Malachi, Isaiah, Jeremiah, Baruch, Epistle of Jeremiah, Lamentations, Ezekiel, Bel and the Dragon, Bel and the Dragon Th, Daniel, Daniel Th, Susanna, Susanna Th\n"
     ]
    }
   ],
   "source": [
    "# Map each book title to its URL, in the order the links appear on the index page.\n",
    "books = collections.OrderedDict()\n",
    "start = False\n",
    "for anchor in tree.iter('a'):\n",
    "    # Concatenate the text of the anchor itself and of every element nested in it.\n",
    "    link_text = ''.join(node.text or '' for node in anchor.iter())\n",
    "    # Every link that precedes the 'Genesis' link is ignored.\n",
    "    if not start:\n",
    "        if link_text != 'Genesis':\n",
    "            continue\n",
    "        start = True\n",
    "    link = anchor.get('href')\n",
    "    if link is None:\n",
    "        # An anchor without href would otherwise be stored as '<base_url>/None'.\n",
    "        continue\n",
    "    books[link_text] = '{}/{}'.format(base_url, link)\n",
    "print(', '.join(books))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get the chapters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Genesis: 50 chapters\n",
      "Exodus: 40 chapters\n",
      "Leviticus: 27 chapters\n",
      "Numbers: 36 chapters\n",
      "Deuteronomy: 34 chapters\n",
      "Joshua B: 24 chapters\n",
      "Joshua A: 19 chapters\n",
      "Judges B: 21 chapters\n",
      "Judges A: 21 chapters\n",
      "Ruth: 4 chapters\n",
      "1 Samuel: 31 chapters\n",
      "2 Samuel: 24 chapters\n",
      "1 Kings: 22 chapters\n",
      "2 Kings: 25 chapters\n",
      "1 Chronicles: 29 chapters\n",
      "2 Chronicles: 36 chapters\n",
      "1 Esdras: 9 chapters\n",
      "2 Esdras: 23 chapters\n",
      "Esther: 10 chapters\n",
      "Judith: 16 chapters\n",
      "Tobit BA: 14 chapters\n",
      "Tobit S: 14 chapters\n",
      "1 Macabees: 16 chapters\n",
      "2 Macabees: 15 chapters\n",
      "3 Macabees: 7 chapters\n",
      "4 Macabees: 18 chapters\n",
      "Psalms: 151 chapters\n",
      "Odes: 14 chapters\n",
      "Proverbs: 36 chapters\n",
      "Ecclesiastes: 12 chapters\n",
      "Song of Solomon: 8 chapters\n",
      "Job: 42 chapters\n",
      "Wisdom: 19 chapters\n",
      "Sirach: 51 chapters\n",
      "Psalms of Solomon: 18 chapters\n",
      "Hosea: 14 chapters\n",
      "Micah: 7 chapters\n",
      "Amos: 9 chapters\n",
      "Joel: 4 chapters\n",
      "Jonah: 4 chapters\n",
      "Obadiah: 1 chapters\n",
      "Nahum: 3 chapters\n",
      "Habakkuk: 3 chapters\n",
      "Zephaniah: 3 chapters\n",
      "Haggai: 2 chapters\n",
      "Zechariah: 14 chapters\n",
      "Malachi: 3 chapters\n",
      "Isaiah: 66 chapters\n",
      "Jeremiah: 52 chapters\n",
      "Baruch: 5 chapters\n",
      "Epistle of Jeremiah: 1 chapters\n",
      "Lamentations: 5 chapters\n",
      "Ezekiel: 48 chapters\n",
      "Bel and the Dragon: 1 chapters\n",
      "Bel and the Dragon Th: 1 chapters\n",
      "Daniel: 12 chapters\n",
      "Daniel Th: 12 chapters\n",
      "Susanna: 1 chapters\n",
      "Susanna Th: 1 chapters\n"
     ]
    }
   ],
   "source": [
    "# book title -> {chapter number -> chapter URL}\n",
    "chapters = collections.defaultdict(dict)\n",
    "\n",
    "def getchapters(book):\n",
    "    \"\"\"Record the chapter URLs of `book` in the global `chapters` mapping.\n",
    "\n",
    "    Downloads the page at ``books[book]``, looks at every link inside a\n",
    "    ``<p>`` element and, when the link text starts with\n",
    "    '<book> Chapter <number>', stores the link URL under\n",
    "    ``chapters[book][<number>]``. Finally prints the highest chapter number\n",
    "    recorded for the book (0 when no link matched).\n",
    "\n",
    "    Raises ``requests.HTTPError`` when the page request returns an error status.\n",
    "    \"\"\"\n",
    "    book_url = books[book]\n",
    "    page = requests.get(book_url)\n",
    "    # Do not scan an HTTP error page for chapter links.\n",
    "    page.raise_for_status()\n",
    "    tree = html.fromstring(page.content)\n",
    "    # Escape the title so it is matched literally, whatever characters it contains.\n",
    "    chfilter = re.compile(re.escape(book) + ' Chapter ([0-9]+)')\n",
    "    for p in tree.iter('p'):\n",
    "        for x in p.iter('a'):\n",
    "            # Text of the link including the text of any nested elements.\n",
    "            link_text = ''.join(y.text or '' for y in x.iter())\n",
    "            match = chfilter.match(link_text)\n",
    "            if match:\n",
    "                chnum = int(match.group(1))\n",
    "                link = x.get('href')\n",
    "                chapters[book][chnum] = '{}/{}'.format(base_url, link)\n",
    "    # The number printed is the highest chapter number, not the count of links\n",
    "    # found; default=0 avoids a ValueError when no chapter link matched.\n",
    "    print('{}: {} chapters'.format(book, max(chapters[book], default=0)))\n",
    "\n",
    "for book in books: getchapters(book)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Get the texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "writing Genesis ..................................................\n",
      "writing Exodus ........................................\n",
      "writing Leviticus ...........................\n",
      "writing Numbers ....................................\n",
      "writing Deuteronomy ..................................\n",
      "writing Joshua B ........................\n",
      "writing Joshua A ...\n",
      "writing Judges B .....................\n",
      "writing Judges A .....................\n",
      "writing Ruth ....\n",
      "writing 1 Samuel ...............................\n",
      "writing 2 Samuel ........................\n",
      "writing 1 Kings ......................\n",
      "writing 2 Kings .........................\n",
      "writing 1 Chronicles .............................\n",
      "writing 2 Chronicles ....................................\n",
      "writing 1 Esdras .........\n",
      "writing 2 Esdras .......................\n",
      "writing Esther ..........\n",
      "writing Judith ................\n",
      "writing Tobit BA ..............\n",
      "writing Tobit S ..............\n",
      "writing 1 Macabees ................\n",
      "writing 2 Macabees ...............\n",
      "writing 3 Macabees .......\n",
      "writing 4 Macabees ..................\n",
      "writing Psalms .......................................................................................................................................................\n",
      "writing Odes ..............\n",
      "writing Proverbs ...............................\n",
      "writing Ecclesiastes ............\n",
      "writing Song of Solomon ........\n",
      "writing Job ..........................................\n",
      "writing Wisdom ...................\n",
      "writing Sirach ....................................................\n",
      "writing Psalms of Solomon ..................\n",
      "writing Hosea ..............\n",
      "writing Micah .......\n",
      "writing Amos .........\n",
      "writing Joel ....\n",
      "writing Jonah ....\n",
      "writing Obadiah .\n",
      "writing Nahum ...\n",
      "writing Habakkuk ...\n",
      "writing Zephaniah ...\n",
      "writing Haggai ..\n",
      "writing Zechariah ..............\n",
      "writing Malachi ...\n",
      "writing Isaiah ..................................................................\n",
      "writing Jeremiah ....................................................\n",
      "writing Baruch .....\n",
      "writing Epistle of Jeremiah .\n",
      "writing Lamentations ......\n",
      "writing Ezekiel ................................................\n",
      "writing Bel and the Dragon .\n",
      "writing Bel and the Dragon Th .\n",
      "writing Daniel ............\n",
      "writing Daniel Th ............\n",
      "writing Susanna .\n",
      "writing Susanna Th .\n"
     ]
    }
   ],
   "source": [
    "def getchapter(book, chapter):\n",
    "    \"\"\"Download one chapter and return its text as a list of strings.\n",
    "\n",
    "    The URL is looked up in ``chapters[book][chapter]``. The first item of the\n",
    "    returned list is a '<book> <chapter>' heading surrounded by newlines; it is\n",
    "    followed by the text content of every ``<p>`` element of the page, in\n",
    "    document order.\n",
    "\n",
    "    Raises ``requests.HTTPError`` when the page request returns an error status.\n",
    "    \"\"\"\n",
    "    url = chapters[book][chapter]\n",
    "    page = requests.get(url)\n",
    "    # Do not write the body of an HTTP error page into the corpus.\n",
    "    page.raise_for_status()\n",
    "    # page.content is raw bytes, so setting page.encoding would not influence the\n",
    "    # parse; hand the UTF-8 encoding to the HTML parser explicitly instead.\n",
    "    parser = html.HTMLParser(encoding='utf-8')\n",
    "    tree = html.fromstring(page.content, parser=parser)\n",
    "    chtext = ['\\n{} {}\\n'.format(book, chapter)]\n",
    "    chtext.extend(p.text_content() for p in tree.iter('p'))\n",
    "    return chtext\n",
    "\n",
    "# Write every chapter of every book to one text file, printing one dot per chapter.\n",
    "# The encoding is explicit because the platform default may not be able to encode\n",
    "# the text, and the with-block closes the file even if a download fails.\n",
    "with open('septuagint.txt', 'w', encoding='utf-8') as sf:\n",
    "    for book in books:\n",
    "        sys.stdout.write('writing {} '.format(book))\n",
    "        sys.stdout.flush()\n",
    "        # chapters[book] is a plain dict, whose iteration order is not guaranteed\n",
    "        # on Python 3.5; sort so the chapters are written in numeric order.\n",
    "        for chapter in sorted(chapters[book]):\n",
    "            sys.stdout.write('.')\n",
    "            sys.stdout.flush()\n",
    "            sf.write('\\n'.join(getchapter(book, chapter)))\n",
    "        sys.stdout.write('\\n')\n",
    "        sys.stdout.flush()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}