{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make a new `JournalCrawler` (soup)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<div class=\"frame\" style=\"border: solid 1.0px #000000; padding: 0.5em 1em; margin: 2em 0;\">\n",
    "    <h3 style=\"color: #880000; text-decoration: underline\">Where you need to update</h3>\n",
    "    <ul>\n",
    "        <li><code>gummy.utils.journal_utils.py</code></li>\n",
    "        <li><code>gummy.journals.py</code></li>\n",
    "        <li><code>tests.data.py</code></li>\n",
    "        <li><a href=\"https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals\">Wiki</a></li>\n",
    "    </ul>\n",
    "</div>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```python\n",
    "# Copy HERE\n",
    "from gummy.utils import get_driver\n",
    "from gummy.journals import *\n",
    "\n",
    "self = GummyAbstJournal()\n",
    "def get_soup(url):\n",
    "    with get_driver() as driver:\n",
    "        soup = self.get_soup_source(url=url, driver=driver)\n",
    "        cano_url = canonicalize(url=url, driver=driver)\n",
    "    return soup, cano_url\n",
    "\n",
    "url = input()\n",
    "soup, cano_url = get_soup(url)\n",
    "self._store_crawling_logs(cano_url=cano_url)\n",
    "print(f\"canonicalized URL: {toBLUE(cano_url)}\")\n",
    "\n",
    "# get_title_from_soup\n",
    "title = find_target_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, default=self.default_title)\n",
    "print(f\"title: {toGREEN(title)}\")\n",
    "\n",
    "# get_sections_from_soup\n",
    "sections = soup.find_all(name=\"center\")\n",
    "print(f\"num sections: {toBLUE(len(sections))}\")\n",
    "\n",
    "# get_head_from_section\n",
    "def get_head_from_section(section):\n",
    "    head = section.find(name=\"input\")\n",
    "    return head\n",
    "self.get_head_from_section = get_head_from_section\n",
    "contens = self.get_contents_from_soup_sections(sections)\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m[success]\u001b[0m local driver can be built.\n",
      "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n",
      "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "from gummy.utils import get_driver\n",
    "from gummy.journals import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GoogleJournal(GummyAbstJournal):\n",
    "    pass\n",
    "self = GoogleJournal()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soup_driver(url):\n",
    "    with get_driver() as driver:\n",
    "        soup = self.get_soup_source(url=url, driver=driver)\n",
    "        cano_url = canonicalize(url=url, driver=driver)\n",
    "    return soup, cano_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soup(url):\n",
    "    cano_url = canonicalize(url=url, driver=None)\n",
    "    soup = self.get_soup_source(url=url, driver=None)\n",
    "    return soup, cano_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.google.com/\n"
     ]
    }
   ],
   "source": [
    "url = input()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## create `get_contents_soup`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### With Driver Ver."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Use \u001b[32mUselessGateWay\u001b[0m.\u001b[34m_pass2others\u001b[0m method.\n",
      "Wait up to 3[s] for all page elements to load.\n",
      "Scroll down to the bottom of the page.\n",
      "\n",
      "Decompose unnecessary tags to make it easy to parse.\n",
      "==============================\n",
      "Decomposed \u001b[32m<i>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<link>\u001b[0m tag (1)\n",
      "Decomposed \u001b[32m<meta>\u001b[0m tag (4)\n",
      "Decomposed \u001b[32m<noscript>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<script>\u001b[0m tag (13)\n",
      "Decomposed \u001b[32m<style>\u001b[0m tag (24)\n",
      "Decomposed \u001b[32m<sup>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<None>\u001b[0m tag (0)\n",
      "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "soup, cano_url = get_soup_driver(url)\n",
    "self._store_crawling_logs(cano_url=cano_url)\n",
    "print(f\"canonicalized URL: {toBLUE(cano_url)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_title_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title: \u001b[32mGoogle 検索は次の言語でもご利用いただけます: English\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "title = find_target_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, default=self.default_title)\n",
    "print(f\"title: {toGREEN(title)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_sections_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num sections: \u001b[34m3\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "sections = soup.find_all(name=\"center\")\n",
    "print(f\"num sections: {toBLUE(len(sections))}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_head_from_section`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "code_folding": []
   },
   "outputs": [],
   "source": [
    "def get_head_from_section(section):\n",
    "    head = section.find(name=\"input\")\n",
    "    return head\n",
    "\n",
    "self.get_head_from_section = get_head_from_section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Show contents of the paper.\n",
      "==============================\n",
      "[1/3] \n",
      "[2/3] \n",
      "[3/3] \n"
     ]
    }
   ],
   "source": [
    "contens = self.get_contents_from_soup_sections(sections)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### No Driver Ver."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get HTML content from \u001b[34mhttps://www.google.com/\u001b[0m\n",
      "\n",
      "Decompose unnecessary tags to make it easy to parse.\n",
      "==============================\n",
      "Decomposed \u001b[32m<i>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<link>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<meta>\u001b[0m tag (4)\n",
      "Decomposed \u001b[32m<noscript>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<script>\u001b[0m tag (6)\n",
      "Decomposed \u001b[32m<style>\u001b[0m tag (2)\n",
      "Decomposed \u001b[32m<sup>\u001b[0m tag (0)\n",
      "Decomposed \u001b[32m<None>\u001b[0m tag (0)\n",
      "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "soup, cano_url = get_soup(url)\n",
    "self._store_crawling_logs(cano_url=cano_url)\n",
    "print(f\"canonicalized URL: {toBLUE(cano_url)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_title_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title: \u001b[32m2020-09-30@17.42.18\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "title = find_target_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, default=self.default_title)\n",
    "print(f\"title: {toGREEN(title)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_sections_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num sections: \u001b[34m1\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "sections = soup.find_all(name=\"center\")\n",
    "print(f\"num sections: {toBLUE(len(sections))}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_head_from_section`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_head_from_section(section):\n",
    "    head = section.find(name=\"input\")\n",
    "    return head\n",
    "\n",
    "self.get_head_from_section = get_head_from_section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Show contents of the paper.\n",
      "==============================\n",
      "[1/1] \n"
     ]
    }
   ],
   "source": [
    "contens = self.get_contents_from_soup_sections(sections)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Confirmation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font color=\"red\"><b>NOTE:</b></font> You also have to modify these variables:\n",
    "\n",
    "- [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)\n",
    "- [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gummy import TranslationGummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model = TranslationGummy()\n",
    "# model.toPDF(url=url)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If successful, edit here too:\n",
    "\n",
    "- [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)\n",
    "- [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)"
   ]
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (/usr/local/bin/)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}