#!/usr/bin/env python -tt ########################################################### ## ## ## get-infinitives.py ## ## ## ## Author: Tony Fischetti ## ## tony.fischetti@gmail.com ## ## ## ########################################################### """ """ __author__ = 'Tony Fischetti' __version__ = '0.1' import sys import requests import json import re import lxml.html from lxml.cssselect import CSSSelector import os from lxml import etree import html2text import logging import http.client as http_client import urllib.parse import time import random import unicodedata PATTERN = re.compile("^\s*(\d+)\s+(.+?)$", re.UNICODE) FN_VERB_LIST = "./all-verbs-count.txt" VERB_LIST = [] # with open(FN_VERB_LIST, "r") as fh: # VERB_LIST = [item.rstrip() for item in fh.readlines()] with open(FN_VERB_LIST, "rb") as fh: VERB_LIST = [item.decode("utf-8").rstrip() for item in fh.readlines()] LENGTH = len(VERB_LIST) TRANSLATE_STUB = "http://www.spanishdict.com/translate/" BIG_DICT = {"verbs": {}} MISMATCH_CSS = CSSSelector(".mismatch") TEST1 = re.compile("represents different", re.UNICODE) GET_INF1 = re.compile("\*\*.+?\*\* represents .+? \*\*(.+?)\*\*", re.UNICODE) TEST2 = re.compile("\*\*.+?\*\* is the", re.UNICODE) GET_INF2 = re.compile("\*\*.+?\*\* is the (\w+) form of \*\*(.+?)\*\* in the (\w+ \w+) (\w+)", re.UNICODE) BK_REGEX = re.compile('
{}%".format(ind, LENGTH, get_perc(ind))) current_count, current_verb = parse_line(line) current_verb = unicodedata.normalize("NFC", current_verb) current_url = "{}{}".format(TRANSLATE_STUB, current_verb) print("Current verb: {}".format(current_verb)) print("Current verb data: {}".format(current_verb.encode("utf-8"))) print("Current count: {}".format(current_count)) print("Current url: ->{}<-".format(current_url)) try: r = requests.get(current_url) tree = lxml.html.fromstring(r.text) results = MISMATCH_CSS(tree) if results: print("# of matches: {}".format(len(results))) else: print("results is none") match = results[0] THE_TEXT = html2text.html2text(etree.tostring(match).decode("utf-8")).rstrip() the_infinitive = "" the_person = "" the_number = "" the_tense = "" AMB = False if TEST1.search(THE_TEXT): the_infinitive = GET_INF1.search(THE_TEXT).group(1) AMB = True elif TEST2.search(THE_TEXT): matches = GET_INF2.search(THE_TEXT) the_tense = matches.group(1) the_infinitive = matches.group(2) the_person = matches.group(3) the_number = matches.group(4) AMB = False print("The infinitive : >{}<".format(the_infinitive)) if AMB: BIG_DICT[current_verb] = {"count": current_count, "spanish_dict_success": True, "infinitive": the_infinitive, "line": ind+1} else: BIG_DICT[current_verb] = {"count": current_count, "spanish_dict_success": True, "infinitive": the_infinitive, "line": ind+1} except: print("UNSPECIFIED FAILURE!") BIG_DICT[current_verb] = {"count": current_count, "spanish_dict_success": False, "infinitive": None, "line": ind+1} # with open("./mistakes/{}.html".format(current_verb), "w") as fh: # fh.write(r.text) # every twenty if (ind % 20) == 0: out_str = json.dumps(BIG_DICT, sort_keys=True, indent=2, separators=(',', ': ')) with open("./running.json", "w") as fh: fh.write(out_str) print() time.sleep(random.uniform(0.5, 2)) out_str = json.dumps(BIG_DICT, sort_keys=True, indent=2, separators=(',', ': '), ensure_ascii=False) with open("./finished.json", "w") as fh: fh.write(out_str) if __name__ == '__main__': STATUS = main() sys.exit(STATUS)