## Make a new `JournalCrawler` (soup)

You can create a new `JournalCrawler` whose `crawl_type` is **"soup"**.

In [1]:
from gummy.utils import get_driver
from gummy.journals import *

[32m[success][0m local driver can be built.
[31m[failure][0m remote driver can't be built.
DRIVER_TYPE: [32mlocal[0m


In [2]:
def get_soup(url):
 cano_url = canonicalize(url=url, driver=None)
 return BeautifulSoup(requests.get(url).content, "html.parser"), cano_url

In [3]:
def get_soup_driver(url):
 with get_driver() as driver:
 driver.get(url)
 time.sleep(3)
 html = driver.page_source.encode("utf-8")
 cano_url = canonicalize(url=url, driver=driver)
 return BeautifulSoup(html, "html.parser"), cano_url

In [4]:
class GoogleJournal(GummyAbstJournal):
 pass
self = GoogleJournal()

In [5]:
url = input()

https://www.google.com/


## create `get_contents_soup`

### No Driver Ver.

In [6]:
soup, cano_url = get_soup(url)
self._store_crawled_info(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")

canonicalized URL: [34mhttps://www.google.com/[0m


#### `get_title_from_soup`

In [7]:
title = find_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, not_found=self.default_title)
print(f"title: {toGREEN(title)}")

title: [32m2020-08-06@23.55.12[0m


#### `get_sections_from_soup`

In [8]:
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")

num sections: [34m1[0m


#### `get_sections_from_soup`

In [9]:
soup_sections = sections

In [10]:
contents = []
len_soup_sections = len(soup_sections)
for i,section in enumerate(soup_sections):
 headline = "headline"
 inputTag = section.find("input")
 if inputTag is not None:
 headline = inputTag.get("aria-label")
 inputTag.decompose()
 contents.extend(self.organize_soup_section(section=section, headline=headline))
 if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}")

[1/1] None


***

### With Driver Ver.

In [11]:
soup, cano_url = get_soup_driver(url)
self._store_crawled_info(cano_url=cano_url)
print(f"canonicalized URL: {toBLUE(cano_url)}")

DRIVER_TYPE: [32mlocal[0m
canonicalized URL: [34mhttps://www.google.com/[0m


#### `get_title_from_soup`

In [12]:
title = find_text(soup=soup, name="div", attrs={"id": "SIvCob"}, strip=True, not_found=self.default_title)
print(f"title: {toGREEN(title)}")

title: [32m2020-08-06@23.55.12[0m


#### `get_sections_from_soup`

In [13]:
sections = soup.find_all(name="center")
print(f"num sections: {toBLUE(len(sections))}")

num sections: [34m3[0m


#### `get_sections_from_soup`

In [14]:
soup_sections = sections

In [15]:
contents = []
len_soup_sections = len(soup_sections)
for i,section in enumerate(soup_sections):
 headline = "headline"
 inputTag = section.find("input")
 if inputTag is not None:
 headline = inputTag.get("aria-label")
 inputTag.decompose()
 contents_.extend(self.organize_soup_section(section=section, headline=headline))
 if self.verbose: print(f"[{i+1:>0{len(str(len_soup_sections))}}/{len_soup_sections}] {headline}")

[1/3] Google 検索
[2/3] Google 検索
[3/3] headline


## Confirmation

NOTE: You also have to modify these variables:

- [`gummy.journals.TranslationGummyJournalCrawlers`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/journals.py)
- [`gummy.utils.journal_utils.DOMAIN2JOURNAL`](https://github.com/iwasakishuto/Translation-Gummy/blob/master/gummy/utils/journal_utils.py)

In [16]:
from gummy import TranslationGummy

In [17]:
model = TranslationGummy()
model.toPDF(url=url)

If successful, edit here too:

- [Wiki: Supported journals](https://github.com/iwasakishuto/Translation-Gummy/wiki/Supported-journals)
- [tests.data](https://github.com/iwasakishuto/Translation-Gummy/blob/master/tests/data.py)