This notebook was designed to run in Appmode (with all the code hidden).
" ], "text/plain": [ "This notebook was designed to run in Appmode (with all the code hidden).
'))\n", " display(HTML('Launch in appmode'.format(url)))\n", " else:\n", " display(HTML('This is a Jupyter notebook running in App mode (with all the code hidden). To view and edit the code, click on the Edit App button.
def is_memento(url):
    '''
    Is this url a Memento? Checks for the presence of a 14 digit timestamp,
    optionally followed by one or more of the id_/mp_/if_ modifiers, directly
    before the archived `http...` address.
    '''
    return bool(re.search(r'/\d{14}(?:id_|mp_|if_)*/http', url))


def get_timestamp(url):
    '''
    Extract the 14 digit timestamp from a Memento url.

    Raises AttributeError if the url contains no timestamp (the regex search
    returns None), so check with is_memento() first when the input is uncertain.
    '''
    return re.search(r'/(\d{14})(?:if_|mp_|id_)*/', url).group(1)


def get_dates(page_data):
    '''
    Return formatted dates of the saved pages, one per capture, in order.
    The formatting itself is delegated to format_date_from_timestamp(), which
    is given each capture's url.
    '''
    return [format_date_from_timestamp(capture['url']) for capture in page_data]


def get_html(url, timeout=60):
    '''
    Retrieve the original HTML content of an archived page.
    Follow redirects if they go to another archived page.

    Parameters:
        url: the url of the Memento to retrieve.
        timeout: seconds to wait for the archive before giving up, so that a
            stalled server cannot hang the notebook indefinitely.

    Returns a dict with the (possibly redirected) `url` from the response and
    the raw `html` content (bytes).
    '''
    # Adding the id_ hint tells the archive to give us the original harvested version, without any rewriting.
    # Any existing modifier (id_, mp_, if_) is replaced, matching the set recognised by is_memento().
    url = re.sub(r'/(\d{14})(?:id_|mp_|if_)*/http', r'/\1id_/http', url)
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Some captures might redirect themselves to live versions.
    # If the redirected url doesn't look like a Memento, rerun this without redirection.
    if not is_memento(response.url):
        response = requests.get(url, allow_redirects=False, timeout=timeout)
    return {'url': response.url, 'html': response.content}


def get_all_text(capture_data):
    '''
    Get all the human visible text from a web page, including headers, footers, and navigation.
    Does some cleaning up to remove multiple spaces, tabs, and newlines.

    Returns the cleaned text, or None if the HTML can't be parsed because of
    a TypeError (presumably when capture_data['html'] is None -- confirm).
    '''
    try:
        # Name the parser explicitly so that results don't depend on which parsers
        # happen to be installed, and to avoid BeautifulSoup's "no parser specified" warning.
        # lxml is the parser BeautifulSoup auto-selects when it is available
        # (assumes lxml is installed, as it is wherever Trafilatura is -- confirm).
        text = BeautifulSoup(capture_data['html'], 'lxml').get_text()
    except TypeError:
        return None
    # Collapse runs of blank lines into a single blank line
    text = re.sub(r'\n\s*\n', '\n\n', text)
    # Replace runs of spaces or tabs with a single space
    text = re.sub(r'( |\t){2,}', ' ', text)
    # Remove a space at the start of a line
    text = re.sub(r'\n ', '\n', text)
    # Remove newlines at the very start of the text
    text = re.sub(r'^\n*', '', text)
    return text


def get_main_text(capture_data):
    '''
    Get only the main text from a page, excluding boilerplate and navigation.
    The extraction is done by tf.extract() on the capture's HTML.
    '''
    return tf.extract(capture_data['html'])


def load_data(urls):
    '''
    Load all the content of the specified urls into the page_data list.
    Add in the text and main text.

    Captures are appended to the existing global page_data list, and the text
    fields are then (re)computed for every capture in that list.
    '''
    global page_data
    for url in urls:
        # Get the HTML of the archived page
        page_data.append(get_html(url))
    for capture in page_data:
        # Add the human-readable text
        capture['text'] = get_all_text(capture)
        # Add the main text
        capture['main_text'] = get_main_text(capture)


# METADATA

def get_page_metadata(html):
    '''
    Get the metadata from a page extracted by Trafilatura,
    converted to plain data by jsons.dump().
    '''
    return jsons.dump(tf.metadata.extract_metadata(html))


def get_metadata(page_data):
    '''
    Get metadata from all saved pages, one entry per capture, in order.
    '''
    return [get_page_metadata(capture['html']) for capture in page_data]
{display_url}
Share this: {share_url}
def clear(e):
    '''
    Reset the saved page data and empty all of the output areas.
    `e` is the event argument supplied by the caller; it is not used.
    '''
    global page_data
    page_data = []
    for output in (md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out):
        output.clear_output()


def start(e):
    '''
    Run the whole comparison: clear any previous results, work out which two
    mementos to compare, load them, then fill each of the output areas.
    `e` is the event argument supplied by the button click (or a placeholder
    when called directly); it is only passed on to clear().
    '''
    clear(e)
    # Use the two supplied urls if both are available, otherwise look the mementos up.
    if url1 and url2:
        urls = [url1, url2]
    else:
        urls = get_mementos()
    load_data(urls)
    display_metadata(page_data)
    display_summaries(page_data)
    display_links(page_data)
    display_similarities(page_data)
    display_diff('e')
    display_screenshots(urls)
    share_this(urls)


# One output area per section of the results, displayed in this order.
md_out = widgets.Output()
stats_out = widgets.Output()
links_out = widgets.Output()
sim_out = widgets.Output()
diff_out = widgets.Output()
ss_out = widgets.Output()
share_out = widgets.Output()

# Both urls are needed to skip the form and compare straight away.
# (The original tested url1 twice, so a missing url2 still took this branch.)
if url1 and url2:
    memento1 = widgets.Text(value=url1, layout=widgets.Layout(width='400px'))
    memento2 = widgets.Text(value=url2, layout=widgets.Layout(width='400px'))
    display(
        widgets.HBox([
            widgets.VBox([widgets.Label('First memento:'),
                          widgets.Label('Second memento:')
                          ]),
            widgets.VBox([memento1, memento2])], layout=widgets.Layout(padding='20px')
        )
    )
    display(md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out)
    start('e')
else:
    # No pair of urls supplied, so show a form for choosing the archive, target url and dates.
    repository = widgets.Dropdown(
        options=[('---', ''), ('UK Web Archive', 'bl'), ('National Library of Australia', 'nla'), ('National Library of New Zealand', 'nlnz'), ('Internet Archive', 'ia')],
        description='Archive:',
        disabled=False,
    )

    target_url = widgets.Text(description='Target URL:')

    first_date = widgets.DatePicker(
        description='Date 1: ',
        disabled=False
    )

    second_date = widgets.DatePicker(
        description='Date 2: ',
        disabled=False
    )

    start_button = widgets.Button(description='Start', button_style='primary')
    start_button.on_click(start)
    display(widgets.HBox([widgets.VBox([repository, first_date]), widgets.VBox([target_url, second_date])], layout=widgets.Layout(padding='20px')), widgets.HBox([start_button]))
    display(md_out, stats_out, links_out, sim_out, diff_out, ss_out, share_out)