import http.server import socketserver import requests import re import json import base64 import threading from http import HTTPStatus from bs4 import BeautifulSoup from urllib.parse import urlparse from urllib.parse import parse_qs from dataclasses import dataclass, field from typing import List from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry import cloudscraper # Config default values @dataclass class ForvoConfig(): port: int = 8770 language: str = 'ja' preferred_usernames: List[str] = field(default_factory=list) show_gender: bool = True def set(self, config): self.__init__(**config) _forvo_config = ForvoConfig() class Forvo(): """ Forvo web-scraper utility class that matches YomiChan's expected output for a custom audio source """ _SERVER_HOST = "https://forvo.com" _AUDIO_HTTP_HOST = "https://audio12.forvo.com" def __init__(self, config=_forvo_config): self.config = config # self._set_session() self.scraper = cloudscraper.create_scraper(delay=10, browser={'custom': 'ScraperBot/1.0',}) def _set_session(self): """ Sets the session with basic backoff retries. Put in a separate function so we can try resetting the session if something goes wrong """ retry_strategy = Retry( total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504], method_whitelist=["HEAD", "GET", "OPTIONS"] ) adapter = HTTPAdapter(max_retries=retry_strategy) self.session = requests.Session() self.session.mount("https://", adapter) self.session.mount("http://", adapter) # Use my personal user agent to try to avoid scraping detection self.session.headers.update( { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.27", "Accept-Language": "en-US,en;q=0.5", } ) def _get(self, path): """ Makes a GET request assuming base url. Creates a new session if something goes wrong """ url = self._SERVER_HOST + path try: # return self.session.get(url, timeout=10).text return self.scraper.get(url).text except Exception: self._set_session() return self.scraper.get(url, timeout=10).text # return self.session.get(url, timeout=10).text def word(self, w): """ Scrape forvo's word page for audio sources """ w = w.strip() if len(w) == 0: return [] path = f"/word/{w}/" html = self._get(path) # soup = BeautifulSoup(html, features="html.parser") soup = BeautifulSoup(html, 'lxml') # Forvo's word page returns multiple result sets grouped by langauge like: #
#
# # ... #
#
...
# # We also filter out ads results = soup.select(f"#language-container-{self.config.language}>article>ul.pronunciations-list>li:not(.li-ad)") pronunciations = [] for i in results: url = self._extract_url(i.div) # Capture the username of the user # Some users have deleted accounts which is why can't just parse it from the tag username = re.search(r"Pronunciation by([^(]+)\(", i.get_text(strip=True)).group(1).strip() pronunciation = { 'username': username, 'url': url } if self.config.show_gender: m = re.search(r"\((Male|Female)", i.get_text(strip=True)) if m: pronunciation['gender'] = m.group(1).strip() pronunciations.append(pronunciation) # Order the list based on preferred_usernames if len(self.config.preferred_usernames): keys = self.config.preferred_usernames def get_index(pronunciation): key = pronunciation['username'] if key in keys: return keys.index(key) for i in range(len(pronunciations)): if key == pronunciations[i]['username']: return i + len(keys) pronunciations = sorted(pronunciations, key=get_index) # Transform the list of pronunciations into Yomichan format audio_sources = [] for pronunciation in pronunciations: genderSymbol = { "Male": '♂', "Female": '♀', }.get(pronunciation.get("gender"), "") audio_sources.append({ "url": pronunciation['url'], "name": f"Forvo ({genderSymbol}{pronunciation['username']})", }) return audio_sources @classmethod def _extract_url(cls, element): play = element['onclick'] # We are interested in Forvo's javascript Play function which takes in some parameters to play the audio # Example: Play(3060224,'OTQyN...','OTQyN..',false,'Yy9wL2NwXzk0MjYzOTZfNzZfMzM1NDkxNS5tcDM=','Yy9wL...','h') # Match anything that isn't commas, parentheses or quotes to capture the function arguments # Regex will match something like ["Play", "3060224", ...] play_args = re.findall(r"([^',\(\)]+)", play) # Forvo has two locations for mp3, /audios/mp3 and just /mp3 # /audios/mp3 is normalized and has the filename in the 5th argument of Play base64 encoded # /mp3 is raw and has the filename in the 2nd argument of Play encoded try: file = base64.b64decode(play_args[5]).decode("utf-8") url = f"{cls._AUDIO_HTTP_HOST}/audios/mp3/{file}" # Some pronunciations don't have a normalized version so fallback to raw except: file = base64.b64decode(play_args[2]).decode("utf-8") url = f"{cls._AUDIO_HTTP_HOST}/mp3/{file}" return url def search(self, s): """ Scrape Forvo's search page for audio sources. Note that the search page omits the username """ s = s.strip() if len(s) == 0: return [] path = f"/search/{s}/{self.config.language}/" html = self._get(path) soup = BeautifulSoup(html, features="html.parser") # Forvo's search page returns two result sets like: # results = soup.select('ul.word-play-list-icon-size-l>li>div.play') audio_sources = [] for i in results: url = self._extract_url(i) audio_sources.append({"name":"Forvo Search","url":url}) return audio_sources class ForvoHandler(http.server.SimpleHTTPRequestHandler): forvo = Forvo(config=_forvo_config) # By default, SimpleHTTPRequestHandler logs to stderr # This would cause Anki to show an error, even on successful requests # log_error is still a useful function though, so replace it with the inherited log_message # Make log_message do nothing def log_error(self, *args, **kwargs): super().log_message(*args, **kwargs) def log_message(self, *args): pass def do_GET(self): # Extract 'term' and 'reading' query parameters query_components = parse_qs(urlparse(self.path).query) term = query_components["term"][0] if "term" in query_components else "" # Yomichan used to use "expression" but renamed to term. Still support "expression" for older versions expression = query_components["expression"][0] if "expression" in query_components else "" if term == "": term = expression reading = query_components["reading"][0] if "reading" in query_components else "" debug = query_components["debug"][0] if "debug" in query_components else False if debug: debug_resp = { "debug":True } debug_resp['reading'] = reading debug_resp['term'] = term debug_resp['word.term'] = self.forvo.word(term) debug_resp['word.reading'] = self.forvo.word(reading) debug_resp['search.term'] = self.forvo.search(term) debug_resp['search.reading'] = self.forvo.search(reading) self.wfile.write(bytes(json.dumps(debug_resp), "utf8")) return audio_sources = [] # Try looking for word sources for 'term' first audio_sources = self.forvo.word(term) # Try looking for word sources for 'reading' if len(audio_sources) == 0: audio_sources += self.forvo.word(reading) # Finally use forvo search to look for similar words if len(audio_sources) == 0: audio_sources += self.forvo.search(term) if len(audio_sources) == 0: audio_sources += self.forvo.search(reading) # Build JSON that yomichan requires # Ref: https://github.com/FooSoft/yomichan/blob/master/ext/data/schemas/custom-audio-list-schema.json resp = { "type": "audioSourceList", "audioSources": audio_sources, } # Writing the JSON contents with UTF-8 payload = bytes(json.dumps(resp), "utf8") self.send_response(HTTPStatus.OK) self.send_header("Content-type", "application/json") self.send_header("Content-length", str(len(payload))) self.end_headers() try: self.wfile.write(payload) except BrokenPipeError: self.log_error("BrokenPipe when sending reply") return if __name__ == "__main__": # If we're not in Anki, run the server directly and blocking for easier debugging print("Running in debug mode...") httpd = socketserver.TCPServer(('localhost', 8770), ForvoHandler) httpd.serve_forever() else: # Else, run it in a separate thread so it doesn't block # Also import Anki-specific packages here from aqt import mw _forvo_config.set(mw.addonManager.getConfig(__name__)) httpd = http.server.ThreadingHTTPServer(('localhost', _forvo_config.port), ForvoHandler) server_thread = threading.Thread(target=httpd.serve_forever) server_thread.daemon = True server_thread.start()