{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Make a new `JournalCrawler` (soup)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can create a new `JournalCrawler` whose `crawl_type` is **\"soup\"**." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[32m[success]\u001b[0m local driver can be built.\n", "\u001b[31m[failure]\u001b[0m remote driver can't be built.\n", "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "import re\n", "import time\n", "import requests\n", "import urllib\n", "import pandas as pd\n", "from bs4 import BeautifulSoup\n", "from gummy.utils import whichJournal, canonicalize, get_driver, find_text, split_soup, split_soup_by_name" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def get_soup(url, timeout=30):\n", "    \"\"\"Fetch ``url`` with ``requests`` and parse the response body with ``html.parser``.\n", "\n", "    Args:\n", "        url (str): Address of the page to download.\n", "        timeout (float): Seconds to wait for the server before ``requests`` gives up.\n", "            Without a timeout ``requests.get`` can block forever on a stalled server.\n", "\n", "    Returns:\n", "        BeautifulSoup: Document parsed from the raw response bytes.\n", "    \"\"\"\n", "    response = requests.get(url, timeout=timeout)\n", "    return BeautifulSoup(response.content, \"html.parser\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_soup_driver(url, wait=3):\n", "    \"\"\"Open ``url`` with a driver from ``gummy.utils.get_driver`` and parse its page source.\n", "\n", "    Args:\n", "        url (str): Address of the page to open.\n", "        wait (float): Seconds to sleep after ``driver.get`` before reading ``page_source``\n", "            (presumably to let client-side scripts finish rendering).\n", "\n", "    Returns:\n", "        BeautifulSoup: Document parsed from the UTF-8 encoded ``driver.page_source``.\n", "    \"\"\"\n", "    with get_driver() as driver:\n", "        driver.get(url)\n", "        time.sleep(wait)\n", "        html = driver.page_source.encode(\"utf-8\")\n", "    return BeautifulSoup(html, \"html.parser\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## create `get_contents_soup`" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "url = \"https://www.google.com/\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "DRIVER_TYPE: \u001b[32mlocal\u001b[0m\n" ] } ], "source": [ "soup = get_soup(url)\n", "soup_driver = get_soup_driver(url)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_title_from_soup`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```py\n", " def get_title_from_soup(self, soup):\n", " title = find_text(soup=soup, 
name=\"h1\", class_=\"title\", strip=True)\n", " return title\n", "```" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[NOT FOUND]'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "find_text(soup=soup, name=\"h1\", class_=\"title\", strip=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_sections_from_soup`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```py\n", "def get_sections_from_soup(self, soup):\n", " sections = soup.find_all(name=\"h2\", class_=\"section-title\")\n", " return sections\n", "```" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup.find_all(name=\"h2\", class_=\"section-title\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "soup_driver.find_all(name=\"h2\", class_=\"section-title\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### `get_sections_from_soup` (variant: prepend the abstract as section 0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```py\n", " def get_sections_from_soup(self, soup):\n", " sections = soup.find_all(name=\"section\", attrs={\"type\" : \"other\"})\n", " abst = soup.find(name=\"div\", class_=\"art-abstract\")\n", " if abst is not None:\n", " asbt_section = soup.new_tag(name=\"section\", attrs={\"type\" : \"other\"})\n", " asbt_h2Tag = soup.new_tag(name=\"h2\")\n", " asbt_h2Tag.string = \"0. 
Abstract\"\n", " asbt_section.append(asbt_h2Tag)\n", " asbt_section.append(abst)\n", " sections.insert(0, asbt_section)\n", " sections = [e for e in soup.find_all(name=\"section\") if e.get(\"aria-labelledby\") not in self.AvoidAriaLabel]\n", " return sections\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }