{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chilcot report: download and text extraction\n",
    "\n",
    "Downloads the PDFs linked from the Iraq Inquiry report page and writes the extracted text of each PDF to a `.txt` file in the same directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import urllib.parse\n",
    "import urllib.request\n",
    "\n",
    "from PyPDF2 import PdfFileReader\n",
    "\n",
    "# Where the PDFs and the extracted text live. Set CHILCOT_DIR to run the\n",
    "# notebook on another machine without editing any cell.\n",
    "DATA_DIR = os.environ.get('CHILCOT_DIR', '/Users/muneebalam/Desktop/ipython/chilcot/')\n",
    "urlbase = 'http://www.iraqinquiry.org.uk'\n",
    "REPORT_URL = urlbase + '/the-report/'\n",
    "\n",
    "# Create the directory (with any missing parents) and work from inside it.\n",
    "os.makedirs(DATA_DIR, exist_ok=True)\n",
    "os.chdir(DATA_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def download():\n",
    "    \"\"\"Download every PDF linked from the report page into DATA_DIR.\n",
    "\n",
    "    A PDF that already exists locally is not fetched again, so re-running\n",
    "    only fetches what is missing.\n",
    "    \"\"\"\n",
    "    with urllib.request.urlopen(REPORT_URL) as r:\n",
    "        page = r.read().decode('utf-8')\n",
    "\n",
    "    # Only href targets ending in .pdf count as documents. urljoin resolves\n",
    "    # site-relative links and leaves absolute ones untouched.\n",
    "    links = re.findall(r'href=\"([^\"]+?\\.pdf)', page)\n",
    "    urls = [urllib.parse.urljoin(REPORT_URL, link) for link in links]\n",
    "\n",
    "    for n, url in enumerate(urls, 1):\n",
    "        savefile = os.path.join(DATA_DIR, url[url.rfind('/')+1:])\n",
    "        print(n, ' of ', len(urls), url)\n",
    "        if not os.path.exists(savefile):\n",
    "            with urllib.request.urlopen(url) as response:\n",
    "                data = response.read()\n",
    "            with open(savefile, 'wb') as w:\n",
    "                w.write(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Uncomment to fetch the PDFs first (already-downloaded files are skipped).\n",
    "#download()\n",
    "\n",
    "# Only PDFs: the directory also receives the .txt files written by parse().\n",
    "files = sorted(os.path.join(DATA_DIR, x) for x in os.listdir(DATA_DIR)\n",
    "               if x.lower().endswith('.pdf'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def _clean_page_text(text):\n",
    "    \"\"\"Flatten the extracted text of one page onto a single line.\"\"\"\n",
    "    # Keep a space after a line break that follows a full stop, drop every\n",
    "    # other line break, then collapse the double spaces left behind.\n",
    "    return text.replace('.\\n', '. ').replace('\\n', '').replace('  ', ' ')\n",
    "\n",
    "\n",
    "def parse(file):\n",
    "    \"\"\"Extract the text of a PDF and save it as a .txt file beside it.\n",
    "\n",
    "    Each page becomes one line of the output. A page whose text cannot be\n",
    "    extracted is reported and left out. The output path is the PDF path with\n",
    "    its extension replaced by '-<index of last page>.txt'. A PDF without\n",
    "    pages is reported and produces no output file.\n",
    "    \"\"\"\n",
    "    fbase = os.path.splitext(file)[0]\n",
    "    alltext = []\n",
    "    # Open the PDF ourselves so the handle is closed once the text is read.\n",
    "    with open(file, 'rb') as pdf:\n",
    "        r = PdfFileReader(pdf)\n",
    "        numpages = r.getNumPages()\n",
    "        for i in range(numpages):\n",
    "            try:\n",
    "                alltext.append(_clean_page_text(r.getPage(i).extractText()))\n",
    "            except Exception as e:\n",
    "                # Best effort: one unreadable page must not lose the rest.\n",
    "                print(file, 'page', i, e, e.args)\n",
    "\n",
    "    if numpages == 0:\n",
    "        print(file, 'has no pages; nothing written')\n",
    "        return\n",
    "\n",
    "    outfile = '{0:s}-{1:d}.txt'.format(fbase, numpages - 1)\n",
    "    with open(outfile, 'w', encoding='utf-8') as w:\n",
    "        w.write('\\n'.join(alltext))\n",
    "    print('Done with', file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/2016-09-06-sir-john-chilcots-public-statement.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_executive-summary.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_introduction.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-101.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-102.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-103.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-104.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-11.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-111.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-112.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-12.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-121.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-122.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-131.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-132.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-141.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-142.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-15-1.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-152.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-161.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-162.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-163.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-164.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-170.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-20.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-31.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-32.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-33.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-34.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-35.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-36.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-37.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-38.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-40.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-41.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-42.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-43.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-44.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-50.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-61.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-62.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-63.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-64.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-65.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-70.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-80.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-91.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-92.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-93.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-94.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-95.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-96.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-97.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section-98.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section_annex-1.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section_annex-2.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section_annex-3.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section_annex-4.pdf\n",
      "Done with /Users/muneebalam/Desktop/ipython/chilcot/the-report-of-the-iraq-inquiry_section_annex-5.pdf\n"
     ]
    }
   ],
   "source": [
    "for file in files:\n",
    "    parse(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}