{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a new `JournalCrawler` (soup)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m[success]\u001b[0m local driver can be built.\n", "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n", "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "import time\n", "\n", "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "from gummy.utils import get_driver\n", "# The star import supplies the remaining names used in this notebook\n", "# (canonicalize, find_text, toBLUE, toGREEN, GummyAbstJournal).\n", "from gummy.journals import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_soup(url, timeout=30):\n", "    \"\"\"Download ``url`` with ``requests`` (no browser) and parse the response.\n", "\n", "    Args:\n", "        url (str): Address of the page to fetch.\n", "        timeout (float): Seconds ``requests`` may wait on the server before raising.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)``; ``soup`` is the parsed response body and\n", "        ``cano_url`` is the result of ``canonicalize(url=url, driver=None)``.\n", "    \"\"\"\n", "    cano_url = canonicalize(url=url, driver=None)\n", "    # Without a timeout ``requests`` can block indefinitely on an unresponsive server.\n", "    response = requests.get(url, timeout=timeout)\n", "    return BeautifulSoup(response.content, \"html.parser\"), cano_url" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_soup_driver(url, wait=3):\n", "    \"\"\"Load ``url`` in the driver from ``get_driver()`` and parse the page source.\n", "\n", "    Args:\n", "        url (str): Address of the page to open.\n", "        wait (float): Seconds to pause after ``driver.get`` before reading the page source.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)``; ``soup`` is the parsed page source and\n", "        ``cano_url`` is the result of ``canonicalize(url=url, driver=driver)``.\n", "    \"\"\"\n", "    with get_driver() as driver:\n", "        driver.get(url)\n", "        time.sleep(wait)  # presumably lets client-side scripts finish rendering\n", "        html = driver.page_source.encode(\"utf-8\")\n", "        cano_url = canonicalize(url=url, driver=driver)\n", "    return BeautifulSoup(html, \"html.parser\"), cano_url" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Placeholder crawler class used to prototype the steps below.\n", "class GoogleJournal(GummyAbstJournal):\n", "    pass\n", "\n", "# The cells below reach crawler attributes and methods through the name ``self``.\n", "self = GoogleJournal()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.google.com/\n" ] } ], "source": [ "url = input()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create `get_contents_soup`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### No Driver Ver." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n" ] } ], "source": [ "soup, cano_url = get_soup(url)\n", "self._store_crawled_info(cano_url=cano_url)\n", "print(f\"canonicalized URL: {toBLUE(cano_url)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_title_from_soup`" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "title: \u001b[32m2020-08-06@23.55.12\u001b[0m\n" ] } ], "source": [ "title = find_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, not_found=self.default_title)\n", "print(f\"title: {toGREEN(title)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_sections_from_soup`" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num sections: \u001b[34m1\u001b[0m\n" ] } ], "source": [ "sections = soup.find_all(name=\"center\")\n", "print(f\"num sections: {toBLUE(len(sections))}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_contents_from_soup_sections`" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "soup_sections = sections" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1/1] None\n" ] } ], "source": [ "contents = []\n", "len_soup_sections = len(soup_sections)\n", "for i,section in enumerate(soup_sections):\n", "    headline = \"headline\"  # fallback label when the section has no input tag\n", "    inputTag = section.find(\"input\")\n", "    if inputTag is not None:\n", "        # Label the section with the input's aria-label (None if the attribute is absent).\n", "        headline = inputTag.get(\"aria-label\")\n", "        # Remove the input tag from the section before it is organized.\n", "        inputTag.decompose()\n", "    contents.extend(self.organize_soup_section(section=section, headline=headline))\n", "    if self.verbose: 
print(f\"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "***" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With Driver Ver." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n", "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n" ] } ], "source": [ "soup, cano_url = get_soup_driver(url)\n", "self._store_crawled_info(cano_url=cano_url)\n", "print(f\"canonicalized URL: {toBLUE(cano_url)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_title_from_soup`" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "title: \u001b[32m2020-08-06@23.55.12\u001b[0m\n" ] } ], "source": [ "title = find_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, not_found=self.default_title)\n", "print(f\"title: {toGREEN(title)}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_sections_from_soup`" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "num sections: \u001b[34m3\u001b[0m\n" ] } ], "source": [ "sections = soup.find_all(name=\"center\")\n", "print(f\"num sections: {toBLUE(len(sections))}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_contents_from_soup_sections`" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "soup_sections = sections" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1/3] Google 検索\n", "[2/3] Google 検索\n", "[3/3] headline\n" ] } ], "source": [ "contents = []\n", "len_soup_sections = len(soup_sections)\n", "for i,section in 
enumerate(soup_sections):\n", "    headline = \"headline\"  # fallback label when the section has no input tag\n", "    inputTag = section.find(\"input\")\n", "    if inputTag is not None:\n", "        # Label the section with the input's aria-label (None if the attribute is absent).\n", "        headline = inputTag.get(\"aria-label\")\n", "        # Remove the input tag from the section before it is organized.\n", "        inputTag.decompose()\n", "    # Collect into the `contents` list created at the top of this cell.\n", "    contents.extend(self.organize_soup_section(section=section, headline=headline))\n", "    if self.verbose: print(f\"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Confirmation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "NOTE: You also have to modify these variables:\n", "\n", "- [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)\n", "- [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from gummy import TranslationGummy" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "model = TranslationGummy()\n", "model.toPDF(url=url)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If successful, edit here too:\n", "\n", "- [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)\n", "- [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (/usr/local/bin/)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 2 }