{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a new `JournalCrawler` (soup)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Where you need to update

\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```python\n", "# Copy HERE\n", "from gummy.utils import get_driver\n", "from gummy.journals import *\n", "\n", "self = GummyAbstJournal()\n", "def get_soup(url):\n", "    with get_driver() as driver:\n", "        soup = self.get_soup_source(url=url, driver=driver)\n", "        cano_url = canonicalize(url=url, driver=driver)\n", "    return soup, cano_url\n", "\n", "url = input()\n", "soup, cano_url = get_soup(url)\n", "self._store_crawling_logs(cano_url=cano_url)\n", "print(f\"canonicalized URL: {toBLUE(cano_url)}\")\n", "\n", "# get_title_from_soup\n", "title = find_target_text(soup=soup, name=\"div\", attrs={\"id\": \"SIvCob\"}, strip=True, default=self.default_title)\n", "print(f\"title: {toGREEN(title)}\")\n", "\n", "# get_sections_from_soup\n", "sections = soup.find_all(name=\"center\")\n", "print(f\"num sections: {toBLUE(len(sections))}\")\n", "\n", "# get_head_from_section\n", "def get_head_from_section(section):\n", "    head = section.find(name=\"input\")\n", "    return head\n", "self.get_head_from_section = get_head_from_section\n", "contents = self.get_contents_from_soup_sections(sections)\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**."
] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m[success]\u001b[0m local driver can be built.\n", "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n", "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "from gummy.utils import get_driver\n", "from gummy.journals import *" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class GoogleJournal(GummyAbstJournal):\n", "    \"\"\"Empty ``GummyAbstJournal`` subclass; an instance is bound to ``self`` below.\"\"\"\n", "    pass\n", "self = GoogleJournal()" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_soup_driver(url):\n", "    \"\"\"Fetch the soup and the canonicalized URL for ``url`` using a driver.\n", "\n", "    A driver is opened with ``get_driver()`` and handed to both\n", "    ``self.get_soup_source`` and ``canonicalize``.\n", "\n", "    Args:\n", "        url: Page address passed to ``self.get_soup_source`` and ``canonicalize``.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)`` -- the result of ``self.get_soup_source``\n", "        followed by the result of ``canonicalize``.\n", "    \"\"\"\n", "    # Both calls need the driver, so they stay inside the ``with`` block.\n", "    with get_driver() as driver:\n", "        soup = self.get_soup_source(url=url, driver=driver)\n", "        cano_url = canonicalize(url=url, driver=driver)\n", "    return soup, cano_url" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_soup(url):\n", "    \"\"\"Fetch the soup and the canonicalized URL for ``url`` without a driver.\n", "\n", "    Same calls as ``get_soup_driver`` but with ``driver=None``, and with\n", "    ``canonicalize`` invoked before ``self.get_soup_source``.\n", "\n", "    Args:\n", "        url: Page address passed to ``canonicalize`` and ``self.get_soup_source``.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)`` -- the result of ``self.get_soup_source``\n", "        followed by the result of ``canonicalize``.\n", "    \"\"\"\n", "    cano_url = canonicalize(url=url, driver=None)\n", "    soup = self.get_soup_source(url=url, driver=None)\n", "    return soup, cano_url" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.google.com/\n" ] } ], "source": [ "url = input()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## create `get_contents_soup`" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### With Driver Ver."
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Use \u001b[32mUselessGateWay\u001b[0m.\u001b[34m_pass2others\u001b[0m method.\n", "Wait up to 3[s] for all page elements to load.\n", "Scroll down to the bottom of the page.\n", "\n", "Decompose unnecessary tags to make it easy to parse.\n", "==============================\n", "Decomposed \u001b[32m\u001b[0m tag (0)\n", "Decomposed \u001b[32m\u001b[0m tag (1)\n", "Decomposed \u001b[32m\u001b[0m tag (4)\n", "Decomposed \u001b[32m