{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a new `JournalCrawler` (soup)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "

Where you need to update

\n", " \n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m[success]\u001b[0m local driver can be built.\n", "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n", "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "from gummy.utils import get_driver\n", "from gummy.journals import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Empty subclass: it only inherits what GummyAbstJournal already provides.\n", "class GoogleJournal(GummyAbstJournal):\n", "    pass\n", "\n", "# Bound to the name `self`, presumably so the helper cells below read like method bodies.\n", "self = GoogleJournal()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_soup_driver(url):\n", "    \"\"\"Fetch the page source and the canonical URL of ``url`` through a web driver.\n", "\n", "    One driver is opened with ``get_driver()`` and handed to both lookups.\n", "\n", "    Args:\n", "        url (str): Address of the page to load.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)``, i.e. the results of ``self.get_soup_source``\n", "        and ``canonicalize`` in that order.\n", "    \"\"\"\n", "    with get_driver() as browser:\n", "        page_soup = self.get_soup_source(url=url, driver=browser)\n", "        canonical_url = canonicalize(url=url, driver=browser)\n", "    return page_soup, canonical_url" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_soup(url):\n", "    \"\"\"Driver-free counterpart of ``get_soup_driver``.\n", "\n", "    Both helpers are called with ``driver=None``; the URL is canonicalized\n", "    before the page source is requested.\n", "\n", "    Args:\n", "        url (str): Address of the page to load.\n", "\n", "    Returns:\n", "        tuple: ``(soup, cano_url)``, i.e. the results of ``self.get_soup_source``\n", "        and ``canonicalize`` in that order.\n", "    \"\"\"\n", "    canonical_url = canonicalize(url=url, driver=None)\n", "    page_soup = self.get_soup_source(url=url, driver=None)\n", "    return page_soup, canonical_url" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.google.com/\n" ] } ], "source": [ "url = input()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## create `get_contents_soup`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With Driver Ver." 
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Use \u001b[32mUselessGateWay\u001b[0m.\u001b[34m_pass2others\u001b[0m method.\n", "Wait up to 3[s] for all page elements to load.\n", "Scroll down to the bottom of the page.\n", "\n", "Decompose unnecessary tags to make it easy to parse.\n", "==============================\n", "Decomposed \u001b[32m\u001b[0m tag (0)\n", "Decomposed \u001b[32m\u001b[0m tag (1)\n", "Decomposed \u001b[32m\u001b[0m tag (4)\n", "Decomposed \u001b[32m