{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Make a new `JournalCrawler` (soup)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[32m[success]\u001b[0m local driver can be built.\n",
      "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n",
      "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "from gummy.utils import get_driver\n",
    "from gummy.journals import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soup(url):\n",
    "    \"\"\"Fetch `url` with `requests` (no web driver) and parse the response body.\n",
    "\n",
    "    Returns a `(soup, cano_url)` pair: the body parsed with `html.parser`\n",
    "    and the result of `canonicalize(url=url, driver=None)`.\n",
    "    \"\"\"\n",
    "    # Canonicalize before fetching, without handing over a driver.\n",
    "    canonical = canonicalize(url=url, driver=None)\n",
    "    response = requests.get(url)\n",
    "    parsed = BeautifulSoup(response.content, \"html.parser\")\n",
    "    return parsed, canonical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soup_driver(url, wait=3):\n",
    "    \"\"\"Open `url` in a web driver and parse the page source it reports.\n",
    "\n",
    "    Args:\n",
    "        url: Address passed to `driver.get` and to `canonicalize`.\n",
    "        wait: Seconds to sleep between `driver.get` and reading\n",
    "            `driver.page_source`. Defaults to 3, the value that used to be hard-coded.\n",
    "\n",
    "    Returns:\n",
    "        A `(soup, cano_url)` pair: the page source parsed with `html.parser`\n",
    "        and the result of `canonicalize(url=url, driver=driver)`.\n",
    "    \"\"\"\n",
    "    with get_driver() as driver:\n",
    "        driver.get(url)\n",
    "        # Pause before snapshotting the page (presumably so scripts can finish rendering).\n",
    "        time.sleep(wait)\n",
    "        html = driver.page_source.encode(\"utf-8\")\n",
    "        # Canonicalize while the driver is still open, since it is passed along.\n",
    "        cano_url = canonicalize(url=url, driver=driver)\n",
    "    return BeautifulSoup(html, \"html.parser\"), cano_url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GoogleJournal(GummyAbstJournal):\n",
    "    \"\"\"Bare `GummyAbstJournal` subclass used to prototype a crawler in this notebook.\"\"\"\n",
    "\n",
    "# The cells below call methods and read attributes on this instance through the name `self`.\n",
    "self = GoogleJournal()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.google.com/\n"
     ]
    }
   ],
   "source": [
    "url = input()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## create `get_contents_soup`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### No Driver Ver."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "soup, cano_url = get_soup(url)\n",
    "self._store_crawled_info(cano_url=cano_url)\n",
    "print(f\"canonicalized URL: {toBLUE(cano_url)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_title_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title: \u001b[32m2020-08-06@23.55.12\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "title = find_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, not_found=self.default_title)\n",
    "print(f\"title: {toGREEN(title)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_sections_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num sections: \u001b[34m1\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "sections = soup.find_all(name=\"center\")\n",
    "print(f\"num sections: {toBLUE(len(sections))}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_contents_from_soup_sections`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "soup_sections = sections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1/1] None\n"
     ]
    }
   ],
   "source": [
    "# Build `contents` by organizing every section found in the soup.\n",
    "contents = []\n",
    "len_soup_sections = len(soup_sections)\n",
    "for i,section in enumerate(soup_sections):\n",
    "    headline = \"headline\"\n",
    "    inputTag = section.find(\"input\")\n",
    "    if inputTag is not None:\n",
    "        # `.get` returns None when the <input> has no aria-label attribute;\n",
    "        # keep the default headline in that case instead of overwriting it with None.\n",
    "        headline = inputTag.get(\"aria-label\") or headline\n",
    "        # Drop the <input> from the section before it is organized.\n",
    "        inputTag.decompose()\n",
    "    contents.extend(self.organize_soup_section(section=section, headline=headline))\n",
    "    if self.verbose: print(f\"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### With Driver Ver."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n",
      "canonicalized URL: \u001b[34mhttps://www.google.com/\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "soup, cano_url = get_soup_driver(url)\n",
    "self._store_crawled_info(cano_url=cano_url)\n",
    "print(f\"canonicalized URL: {toBLUE(cano_url)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_title_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "title: \u001b[32m2020-08-06@23.55.12\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "title = find_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, not_found=self.default_title)\n",
    "print(f\"title: {toGREEN(title)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_sections_from_soup`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "num sections: \u001b[34m3\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "sections = soup.find_all(name=\"center\")\n",
    "print(f\"num sections: {toBLUE(len(sections))}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### `get_contents_from_soup_sections`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "soup_sections = sections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1/3] Google 検索\n",
      "[2/3] Google 検索\n",
      "[3/3] headline\n"
     ]
    }
   ],
   "source": [
    "# Build `contents` by organizing every section found in the driver-rendered soup.\n",
    "contents = []\n",
    "len_soup_sections = len(soup_sections)\n",
    "for i,section in enumerate(soup_sections):\n",
    "    headline = \"headline\"\n",
    "    inputTag = section.find(\"input\")\n",
    "    if inputTag is not None:\n",
    "        # `.get` returns None when the <input> has no aria-label attribute;\n",
    "        # keep the default headline in that case instead of overwriting it with None.\n",
    "        headline = inputTag.get(\"aria-label\") or headline\n",
    "        # Drop the <input> from the section before it is organized.\n",
    "        inputTag.decompose()\n",
    "    # Extend the list initialized above (the name is `contents`, not `contents_`).\n",
    "    contents.extend(self.organize_soup_section(section=section, headline=headline))\n",
    "    if self.verbose: print(f\"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Confirmation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<font color=\"red\"><b>NOTE:</b></font> You also have to modify these variables:\n",
    "\n",
    "- [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)\n",
    "- [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gummy import TranslationGummy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TranslationGummy()\n",
    "model.toPDF(url=url)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If successful, edit here too:\n",
    "\n",
    "- [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)\n",
    "- [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (/usr/local/bin/)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}