#!/usr/bin/env python3
# coding: utf-8

__author__ = 'stsmith'

# isp_data_pollution: bandwidth-limited ISP data pollution

# Copyright 2017–2018 Steven T. Smith, GPL

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__version__ = '2.0.1'

import argparse as ap, datetime as dt, importlib.util, numpy as np, numpy.random as npr, os, psutil, random, re, requests, signal, sys, tarfile, time, warnings as warn
import urllib.request, urllib.robotparser as robotparser, urllib.parse as uprs
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from io import BytesIO
import fake_useragent as fake_ua

# parse User-Agent for matching distribution
ua_parse_flag = True
try:  # pip install user-agents
    import user_agents as ua
except ImportError:
    ua_parse_flag = False

# ensure pyopenssl exists to address SNI support
# https://stackoverflow.com/questions/18578439/using-requests-with-tls-doesnt-give-sni-support/18579484#18579484
if importlib.util.find_spec('OpenSSL') is None:
    msg = 'Use the pyopenssl package to enable SNI support for TLS-protected hosted domains.'
    print(msg)
    warn.warn(msg)

# headless Raspberry Pi
try:
    from pyvirtualdisplay import Display
    display = Display(visible=0, size=(1296,1018))
    display.start()
except ImportError:
    pass

# nice this process on UNIX
if hasattr(os,'nice'): os.nice(15)

gb_per_month = 100          # How many gigabytes to pollute per month
max_links_cached = 100000   # Maximum number of links to cache for download
max_links_per_page = 200    # Maximum number of links to add per page
max_links_per_domain = 400  # Maximum number of links to add per domain

wordsite_url = 'http://svnweb.freebsd.org/csrg/share/dict/words?view=co&content-type=text/plain'

timeout = 45
short_timeout = 10
browserdriver_rss_limit_mb = 1024  # Default maximum memory limit of the browserdriver (chromedriver) process (MB)
terminal_width = 80  # tty width, standard is 80 chars; add code to adapt later

blacklist_url = 'http://www.shallalist.de/Downloads/shallalist.tar.gz'
# Usage of the Shalla Blacklists:
# ===============================
#
# The Shalla Blacklists are property of Shalla Secure Services.
#
# This collection of url lists may be used for free for non
# commercial usage. This includes all kinds of private usage.
# The lists must not be given to any third party.

# property value distribution to match household
property_pvals = \
    {'DNT':  # Do Not Track HTTP header
         {True: 0.8, False: 0.2},
     'browser':
         {'Safari': 6, 'Firefox': 3, 'Chrome': 2, 'noneoftheabove': 1},
     'os':
         {r'Mac\s*OS': 3, r'iOS': 6, r'Linux': 1, r'Windows': 1, 'noneoftheabove': 1},
     'is_pc':
         {True: 4, False: 6},
     'is_touch_capable':
         {True: 6, False: 4},
     }

# project to simplex
for tlf in property_pvals:
    tot = 0.
    for f in property_pvals[tlf]:
        tot += abs(property_pvals[tlf][f])
    for f in property_pvals[tlf]:
        property_pvals[tlf][f] = abs(property_pvals[tlf][f])/tot
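# Worked example of the projection above: the 'browser' weights 6:3:2:1 sum to 12,
# so they become the probabilities 0.5, 0.25, 1/6 ≈ 0.167 and 1/12 ≈ 0.083
# (summing to one), while the 'DNT' entries {True: 0.8, False: 0.2} are already
# on the simplex and are left unchanged.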
# tell ISP that an iPad is being used
user_agent = 'Mozilla/5.0 (iPad; CPU OS 6_1 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B141 Safari/8536.25'

# Tor browser size on Linux
window_size = (1296,1018)

# bias the content with non-random, diverse, link-heavy, popular content
seed_bias_links = ['http://my.xfinity.com/news',
                   'http://my.xfinity.com/entertainment',
                   'http://my.xfinity.com/shopping',
                   'http://www.cnbc.com/',
                   'https://news.google.com',
                   'https://news.yahoo.com',
                   'http://www.huffingtonpost.com',
                   'http://www.cnn.com',
                   'http://www.foxnews.com',
                   'http://www.nbcnews.com',
                   'http://www.usatoday.com',
                   'http://www.tmz.com',
                   'http://www.deadspin.com',
                   'http://www.dailycaller.com',
                   'http://www.sports.yahoo.com',
                   'http://www.espn.com',
                   'http://www.foxsports.com',
                   'http://www.finance.yahoo.com',
                   'http://www.money.msn.com',
                   'http://www.fool.com'
                   ]

# monkeypatch the read class method in RobotFileParser;
# many sites will block access to robots.txt without a standard User-Agent header
class RobotFileParserUserAgent(robotparser.RobotFileParser):

    timeout = short_timeout  # short-term timeout

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            headers = {'User-Agent': user_agent, }
            request = urllib.request.Request(self.url, None, headers)
            f = urllib.request.urlopen(request, timeout=self.timeout)
            # f = urllib.request.urlopen(self.url)   #! original code
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

# Notes for the future:
# 1. The bandwidth usage is undoubtedly (much) smaller because gzip encoding is used
# 2. A lightweight proxy could be used for accurate bandwidth accounting and header editing

# Safe search options
class SafeWebSearch():
    """ Safe web search class with default Google parameters.
    Use unencrypted HTTP for ISP DPI. """
    def __init__(self,
                 search_url='http://www.google.com/search',  # search engine
                 query_parameter='q',               # query parameter
                 safe_parameter='safe=active',      # query parameter for safe searches
                 css_selector='div.g',              # css selector to harvest search results
                 additional_parameters='',          # additional parameters required to get results
                 result_extraction=lambda x: x):    # function to extract the link
        self.search_url = search_url
        self.query_parameter = query_parameter
        self.safe_parameter = safe_parameter
        self.css_selector = css_selector
        self.additional_parameters = additional_parameters
        self.result_extraction = result_extraction

SafeGoogle = SafeWebSearch()
SafeBing = SafeWebSearch(search_url='http://www.bing.com/search',
                         safe_parameter='adlt=strict', css_selector='li.b_algo')
yahoo_search_reprog = re.compile(r'/RU=(.+?)/R[A-Z]=')
SafeYahoo = SafeWebSearch(search_url='http://search.yahoo.com/search', query_parameter='p',
                          safe_parameter='vm=r', css_selector='div.compTitle',
                          result_extraction=lambda x: yahoo_search_reprog.findall(uprs.parse_qs(x)['_ylu'][0])[0])
SafeDuckDuckGo = SafeWebSearch(search_url='http://www.duckduckgo.com/',
                               safe_parameter='kp=1', css_selector='div.result__body')
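# Illustrative example of the search URLs these settings produce (the query string
# is assembled in ISPDataPollution.get_websearch() below); e.g. SafeGoogle with the
# query 'example query' yields:
#   http://www.google.com/search?q=example+query&safe=active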
class ISPDataPollution:
    """
    Re: https://www.eff.org/deeplinks/2017/03/senate-puts-isp-profits-over-your-privacy

    I pay my ISP a lot for data usage every month. I typically don't use all the
    bandwidth that I pay for. If my ISP is going to sell my private browsing
    habits, then I'm going to pollute my browsing with noise and use all the
    bandwidth that I pay for. This method accomplishes this.

    If everyone uses all the data they've paid for to pollute their browsing
    history, then perhaps ISPs will reconsider the business model of selling
    customers' private browsing history.

    The alternative of using a VPN or Tor merely pushes the issue onto the choice
    of VPN provider, complicates networking, and adds the real issue of navigating
    captchas when appearing as a Tor exit node.

    The crawler uses the Python requests and lxml.html libraries, is hardcoded to
    download html without javascript processing, will not download images, and
    respects robots.txt, which all provide good security.
    """

    def __init__(self, gb_per_month=gb_per_month,
                 max_links_cached=max_links_cached,
                 max_links_per_page=max_links_per_page,
                 max_links_per_domain=max_links_per_domain,
                 property_pvals=property_pvals,
                 user_agent=user_agent,
                 blacklist_url=blacklist_url,
                 wordsite_url=wordsite_url,
                 seed_bias_links=seed_bias_links,
                 timeout=timeout, diurnal_flag=True,
                 quit_driver_every_call=False,
                 blacklist=True, verbose=True):
        print(f'This is ISP Data Pollution 🐙💨, Version {__version__}')
        self.max_links_cached = max_links_cached
        self.max_links_per_page = max_links_per_page
        self.max_links_per_domain = max_links_per_domain
        self.property_pvals = property_pvals
        self.user_agent = user_agent
        self.blacklist_url = blacklist_url
        self.wordsite_url = wordsite_url
        self.seed_bias_links = seed_bias_links
        self.blacklist = blacklist
        self.verbose = verbose
        self.timeout = timeout
        self.diurnal_flag = diurnal_flag
        self.quit_driver_every_call = quit_driver_every_call
        # self.gb_per_month = gb_per_month  # set in parseArgs
        # self.debug = debug                # set in parseArgs
        self.args = self.parseArgs()
        # timeout configurable decorators
        self.chromedriver_timeout = self.block_timeout(self.chromedriver_hang_handler,
            alarm_time=self.timeout+2, errors=(self.TimeoutError,), debug=self.debug)
        self.chromedriver_short_timeout = self.block_timeout(self.chromedriver_hang_handler,
            alarm_time=short_timeout+2, errors=(self.TimeoutError, Exception), debug=self.debug)
        self.chromedriver_quit_timeout = self.block_timeout(self.chromedriver_quit_hang_handler,
            alarm_time=short_timeout+2, errors=(self.TimeoutError, Exception), debug=self.debug)
        self.robots_timeout = self.block_timeout(self.robots_hang_handler,
            alarm_time=short_timeout+2, errors=(self.TimeoutError,), debug=self.debug)
        self.check_chromedriver_version()
        self.get_useragents()
        self.hour_trigger = True
        self.twentyfour_hour_trigger = True
        self.domain_links = dict()
        self.start_time = time.time()
        self.data_usage = 0
        self.get_blacklist()
        self.get_random_words()
        self.pollute_forever()

    def parseArgs(self):
        parser = ap.ArgumentParser()
        parser.add_argument('-bw', '--gb_per_month', help="GB per month", type=int, default=gb_per_month)
        parser.add_argument('-mm', '--maxmemory',
            help="Maximum memory of chromedriver (MB); 0=>restart every link",
            type=int, default=1024)
        parser.add_argument('-P', '--chromedriver-binary-path', help="Path to chromedriver binary", type=str, default=None)
        parser.add_argument('-p', '--proxy', help="Proxy for chromedriver", type=str, default=None)
        parser.add_argument('-g', '--debug', help="Debug flag", action='store_true')
        args = parser.parse_args()
        for k in args.__dict__:
            setattr(self, k, getattr(args, k))
        self.sanity_check_arguments()
        return args
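    # Example command line (hypothetical values; the flags are defined in parseArgs above):
    #   python3 isp_data_pollution.py -bw 50 -mm 2048 -g
    # i.e. limit pollution to 50 GB/month, restart chromedriver above 2048 MB RSS,
    # and print debugging output.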
    def sanity_check_arguments(self):
        self.gb_per_month = min(2048, max(1, self.gb_per_month))  # min-max bandwidth limits
        if self.maxmemory == 0:
            self.quit_driver_every_call = True
        self.chromedriver_rss_limit_mb = min(4096, max(256, self.maxmemory))  # min-max memory limits

    def check_chromedriver_version(self, recommended_version=(2,41)):
        self.open_driver()
        if self.debug:
            print("{} version is {}, chromedriver version is {}".format(
                self.driver.capabilities["browserName"],
                self.driver.capabilities["version"],
                self.driver.capabilities["chrome"]["chromedriverVersion"]))
        chromedriver_version = tuple(int(i) for i in
            re.sub(r'([\d.]+?) .*', '\\1', self.driver.capabilities["chrome"]["chromedriverVersion"]).split('.'))
        if chromedriver_version < recommended_version:
            warn.warn("""{} version is {}; please upgrade to at least version {} from http://chromedriver.chromium.org/downloads.
""".format(self.driver.capabilities["browserName"], self.driver.capabilities["version"],
           '.'.join(str(i) for i in recommended_version)))
        self.quit_driver()

    def open_driver(self):
        self.quit_driver()
        if not hasattr(self, 'driver') or not isinstance(self.driver, webdriver.chrome.webdriver.WebDriver):
            # chromedriver
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('headless')
            chrome_options.add_argument(f'user-agent={self.user_agent}')
            chrome_options.add_argument('window-size={:d},{:d}'.format(window_size[0], window_size[1]))
            # Disable image downloads; see https://stackoverflow.com/questions/18657976/disable-images-in-selenium-google-chromedriver
            chrome_options.add_argument('blink-settings=imagesEnabled=false')
            chrome_options.add_argument('mute-audio')
            if self.proxy is not None:
                chrome_options.add_argument(f'proxy-server={self.proxy}')
            if self.chromedriver_binary_path is None:
                driver = webdriver.Chrome(options=chrome_options)
            else:
                chrome_options.binary_location = self.chromedriver_binary_path
                driver = webdriver.Chrome(self.chromedriver_binary_path, chrome_options=chrome_options)
            driver.set_window_size(window_size[0], window_size[1])
            driver.implicitly_wait(self.timeout)
            driver.set_page_load_timeout(self.timeout)
            driver.set_script_timeout(self.timeout)
            self.driver = driver
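    # open_driver() proxy example (illustrative address): running with
    #   -p http://127.0.0.1:8080
    # adds the chrome option 'proxy-server=http://127.0.0.1:8080', routing all
    # chromedriver traffic through that proxy.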
    def quit_driver(self, hard_quit=False, pid=None, chromedriver_short_timeout_decorator=None):
        """
        close, kill -9, quit, del

        :param hard_quit:
        :param pid:
        :return:
        """
        # Use original phantomjs code for chromedriver, even though chromedriver is likely far more robust
        # http://stackoverflow.com/questions/25110624/how-to-properly-stop-phantomjs-execution
        if chromedriver_short_timeout_decorator is None:
            chromedriver_short_timeout_decorator = self.chromedriver_short_timeout
        if hasattr(self, 'driver'):
            if not hard_quit:
                @chromedriver_short_timeout_decorator
                def chromedriver_close(): self.driver.close()
                chromedriver_close()
            try:
                if pid is None:
                    @chromedriver_short_timeout_decorator
                    def chromedriver_process_pid(): return self.driver.service.process.pid
                    pid = chromedriver_process_pid()
                @chromedriver_short_timeout_decorator
                def chromedriver_send_signal():
                    # Google Chrome is a child process of chromedriver
                    for c in psutil.Process(pid).children():
                        c.send_signal(signal.SIGTERM)
                    self.driver.service.process.send_signal(signal.SIGTERM)
                chromedriver_send_signal()
            except Exception as e:
                if self.debug: print(f'.send_signal() exception:\n{e}')
                if isinstance(pid, int):
                    try:
                        # Google Chrome is a child process of chromedriver
                        for c in psutil.Process(pid).children():
                            os.kill(c.pid, signal.SIGTERM)
                        os.kill(pid, signal.SIGTERM)  # overkill (pun intended)
                    except Exception as e:
                        if self.debug: print(f'.kill() exception:\n{e}')
            try:
                @chromedriver_short_timeout_decorator
                def chromedriver_quit(): self.driver.quit()
                chromedriver_quit()
            except Exception as e:
                if self.debug: print(f'.quit() exception:\n{e}')
            del self.driver

    def clear_driver(self):
        # https://sqa.stackexchange.com/questions/10466/how-to-clear-localstorage-using-selenium-and-webdriver
        if hasattr(self, 'driver'):
            try:
                @self.chromedriver_short_timeout
                def chromedriver_delete_all_cookies(): self.driver.delete_all_cookies()
                chromedriver_delete_all_cookies()
            except Exception as e:
                if self.debug: print(f'.delete_all_cookies() exception:\n{e}')
            try:
                @self.chromedriver_short_timeout
                def chromedriver_clear():
                    self.driver.execute_script('window.localStorage.clear();')
                    self.driver.execute_script('window.sessionStorage.clear();')
                chromedriver_clear()
            except Exception as e:
                if self.debug: print(f'.execute_script() exception:\n{e}')

    def get_useragents(self):
        for attempt in range(5):
            try:
                self.fake_ua = fake_ua.UserAgent()
            except (fake_ua.errors.FakeUserAgentError, urllib.error.URLError) as e:
                if self.debug: print(f'.UserAgent exception #{attempt}:\n{e}')
            else:
                break
        else:
            print('Too many .UserAgent failures. Exiting.')
            sys.exit(1)
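    # fake_useragent note: self.fake_ua.random returns a randomly chosen real-world
    # User-Agent string, e.g. (illustrative)
    #   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
    # draw_user_agent() below filters these draws to match the household property_pvals.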
    def get_blacklist(self, update_flag=False):
        blacklist_domains = getattr(self, 'blacklist_domains', set())
        blacklist_urls = getattr(self, 'blacklist_urls', set())
        self.blacklist_domains = set()
        self.blacklist_urls = set()
        try:
            if self.blacklist:  # download the blacklist or not
                if self.verbose: print('Downloading the blacklists… ', end='', flush=True)
            else:
                raise Exception('Skip downloading the blacklist.')
            self.get_shalla_blacklist()
            if self.verbose: print('Shallalist done… ', end='', flush=True)
            self.get_easylist_blacklist()
            if self.verbose: print('EasyList done.', flush=True)
        except Exception as e:
            if self.verbose: print(e)
        # Make sure the blacklists are not empty
        if self.blacklist:
            try:  # no fully empty collection of blacklists
                assert (self.blacklist_domains != set() or self.blacklist_urls != set()) \
                       and (not update_flag or (blacklist_domains != set() or blacklist_urls != set()))
            except AssertionError as e:
                print(e)
                if update_flag:
                    self.blacklist_domains = blacklist_domains
                    self.blacklist_urls = blacklist_urls
                    warn.warn('Blacklists not updated; falling back on the previous blacklist download.')
                else:
                    print('Empty blacklists! Exiting.')
                    sys.exit(1)
        # ignore problem urls
        self.blacklist_urls |= {'about:blank'}

    def get_shalla_blacklist(self):
        # http://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
        tgzstream = urllib.request.urlopen(urllib.request.Request(self.blacklist_url, headers={'User-Agent': self.user_agent}))
        tmpfile = BytesIO()
        while True:
            s = tgzstream.read(16384)
            if not s: break
            tmpfile.write(s)
        tgzstream.close()
        tmpfile.seek(0)
        tgz = tarfile.open(fileobj=tmpfile, mode='r:gz')
        # bash$ ls BL
        # COPYRIGHT     education       isp         recreation      updatesites
        # adv           finance         jobsearch   redirector      urlshortener
        # aggressive    fortunetelling  library     religion        violence
        # alcohol       forum           military    remotecontrol   warez
        # anonvpn       gamble          models      ringtones       weapons
        # automobile    global_usage    movies      science         webmail
        # chat          government      music       searchengines   webphone
        # costtraps     hacking         news        sex             webradio
        # dating        hobby           podcasts    shopping        webtv
        # downloads     homestyle       politics    socialnet
        # drugs         hospitals       porn        spyware
        # dynamic       imagehosting    radiotv     tracker
        for member in ['downloads', 'drugs', 'hacking', 'gamble', 'porn', 'spyware',
                       'updatesites', 'urlshortener', 'violence', 'warez', 'weapons']:
            self.blacklist_domains |= set(tgz.extractfile(f'BL/{member}/domains').read().decode('utf-8').splitlines())
            self.blacklist_urls |= set(tgz.extractfile(f'BL/{member}/urls').read().decode('utf-8').splitlines())
        tgz.close()
        tmpfile.close()

    def get_easylist_blacklist(self):
        # Malware lists from open source AdBlock and spam404.com lists
        malwaredomains_full = 'https://easylist-downloads.adblockplus.org/malwaredomains_full.txt'
        spam404_com_adblock_list = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/adblock-list.txt'
        spam404_com_main_blacklist = 'https://raw.githubusercontent.com/Dawsey21/Lists/master/main-blacklist.txt'  # not EasyList format
        download_list = list(set([malwaredomains_full, spam404_com_adblock_list, spam404_com_main_blacklist]))
        download_parse = {
            malwaredomains_full: True,
            spam404_com_adblock_list: True,
            spam404_com_main_blacklist: False,
        }
        for url in download_list:
            resp = urllib.request.urlopen(urllib.request.Request(url, headers={'User-Agent': self.user_agent}))
            for line in resp:
                line = line.decode('utf-8').rstrip()
                if download_parse[url]:
                    self.parse_and_filter_rule_urls(line)
                else:
                    self.blacklist_domains |= set([line])

    def get_random_words(self):
        try:
            reqsession = requests.Session()
            reqsession.headers.update({'User-Agent': self.user_agent})
            response = reqsession.get(self.wordsite_url, timeout=10)
            self.words = response.content.decode('utf-8').splitlines()
            reqsession.close()
        except Exception as e:
            if self.debug: print(f'requests exception:\n{e}')
            self.words = ['FUBAR']
        # if self.debug: print('There are {:d} words.'.format(len(self.words)))
    def pollute_forever(self):
        if self.verbose:
            print("""Display format:
Downloading: website.com; NNNNN links [in library], H(domain)= B bits [entropy]
Downloaded: website.com: +LLL/NNNNN links [added], H(domain)= B bits [entropy]
""")
        self.open_driver()
        self.seed_links()
        self.clear_driver()
        if self.quit_driver_every_call: self.quit_driver()
        while True:  # pollute forever, pausing only to meet the bandwidth requirement
            try:
                if (not self.diurnal_flag) or self.diurnal_cycle_test():
                    self.pollute()
                else:
                    time.sleep(self.chi2_mean_std(3., 1.))
                if npr.uniform() < 0.005: self.set_user_agent()  # reset the user agent occasionally
                self.elapsed_time = time.time() - self.start_time
                self.exceeded_bandwidth_tasks()
                self.random_interval_tasks()
                self.every_hour_tasks()
                time.sleep(self.chi2_mean_std(0.5, 0.2))
            except Exception as e:
                if self.debug: print(f'.pollute() exception:\n{e}')

    def pollute(self):
        if not self.quit_driver_every_call: self.check_chromedriver_process()
        if self.link_count() < 2000:
            if self.quit_driver_every_call: self.open_driver()
            self.seed_links()
            self.clear_driver()
            if self.quit_driver_every_call: self.quit_driver()
        url = self.pop_link()
        if self.verbose: self.print_url(url)
        if self.quit_driver_every_call: self.open_driver()
        self.get_url(url)
        self.clear_driver()
        if self.quit_driver_every_call: self.quit_driver()

    def link_count(self):
        return int(np.array([len(self.domain_links[dmn]) for dmn in self.domain_links]).sum())

    def domain_entropy(self):
        result = 0.
        domain_count = np.array([(dmn, len(self.domain_links[dmn])) for dmn in self.domain_links])
        p = np.array([float(c) for d, c in domain_count])
        count_total = p.sum()
        if count_total > 0:
            p = p / p.sum()
            result = self.entropy(p)
        return result

    def entropy(self, p):
        return -np.fromiter((self.xlgx(x) for x in p.flatten()), dtype=p.dtype).sum()

    def xlgx(self, x):
        x = np.abs(x)
        y = 0.
        if not (x == 0. or x == 1.):
            y = x*np.log2(x)
        return y

    def seed_links(self):
        # bias with non-random seed links
        self.bias_links()
        if self.link_count() < self.max_links_cached:
            num_words = max(1, npr.poisson(1.33)+1)  # 1 + Poisson(1.33) words per search
            if num_words == 1:
                word = ' '.join(random.sample(self.words, num_words))
            else:
                if npr.uniform() < 0.5:
                    word = ' '.join(random.sample(self.words, num_words))
                else:  # quote the first two words together
                    word = ' '.join(['"{}"'.format(' '.join(random.sample(self.words, 2))),
                                     ' '.join(random.sample(self.words, num_words-2))])
            if self.debug: print(f'Seeding with search for \'{word}\'…')
            self.get_websearch(word)

    def bias_links(self):
        for url in self.seed_bias_links: self.add_link(url)

    def diurnal_cycle_test(self):
        now = dt.datetime.now()
        tmhr = now.hour + now.minute/60.
        phase = npr.normal(14., 1.)
        exponent = min(0.667, self.chi2_mean_std(0.333, 0.1))
        def cospow(x, e):  # flattened cosine with e < 1
            c = np.cos(x)
            return np.sign(c) * np.power(np.abs(c), e)
        diurn = max(0., 0.5*(1.+cospow((tmhr-phase)*(2.*np.pi/24.), exponent)))
        flr = min(0.1, self.chi2_mean_std(0.02, 0.002))
        val = flr + (1.-flr)*diurn
        return npr.uniform() < val

    def chi2_mean_std(self, mean=1., std=0.1):
        """
        Chi-squared random variable with given mean and standard deviation.

        :param mean:
        :param std:
        :return:
        """
        scale = 2.*mean/std
        nu = mean*scale
        return npr.chisquare(nu)/scale
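    # Note on the chi2_mean_std() parameterization (a short derivation): if X ~ chi2(nu),
    # then E[X/scale] = nu/scale and Var(X/scale) = 2*nu/scale**2.  With scale = 2*mean/std
    # and nu = mean*scale as above, this gives E = mean and Var = std, so the draws are
    # positive and right-skewed around the requested mean.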
    def exceeded_bandwidth_tasks(self):
        if self.bandwidth_test():
            self.decimate_links(total_frac=0.81, decimate_frac=0.1)
            time.sleep(120)

    def random_interval_tasks(self, random_interval=None):
        if random_interval is None:
            random_interval = self.chi2_mean_std(2*3600., 3600.)
        def init_random_time():
            self.random_start_time = time.time()
            self.random_interval = self.random_start_time + random_interval
        if not hasattr(self, 'random_interval'): init_random_time()
        if time.time() > self.random_interval:
            init_random_time()  # reinitialize the random interval
            self.current_preferred_domain = self.draw_domain()

    def every_hour_tasks(self):
        if int(self.elapsed_time/60. % 60.) == 59:
            # reset user agent, clear out cookies, seed more links
            if self.hour_trigger:
                if hasattr(self, 'driver'):
                    self.set_user_agent()
                    if True:
                        pass
                    elif False:  # `set_user_agent` reopens chromedriver now
                        self.quit_driver()
                        self.open_driver()
                    else:
                        try:
                            @self.chromedriver_short_timeout
                            def chromedriver_delete_all_cookies(): self.driver.delete_all_cookies()
                            chromedriver_delete_all_cookies()
                        except Exception as e:
                            if self.debug: print(f'.delete_all_cookies() exception:\n{e}')
                    self.seed_links()
                else:
                    self.open_driver()
                self.hour_trigger = False
        else:
            self.hour_trigger = True
        self.every_day_tasks()
        self.every_two_weeks_tasks()

    def every_day_tasks(self):
        if int(self.elapsed_time/3600. % 24.) == 23:
            # clear out cookies every day, decimate, and seed more links
            if self.twentyfour_hour_trigger:
                if hasattr(self, 'driver'):
                    self.seed_links()
                    # restart the driver
                    self.quit_driver()
                    self.open_driver()
                else:
                    self.open_driver()
                    self.decimate_links(total_frac=0.667, decimate_frac=0.1)
                    self.seed_links()
                    if self.quit_driver_every_call: self.quit_driver()
                self.twentyfour_hour_trigger = False
        else:
            self.twentyfour_hour_trigger = True

    def every_two_weeks_tasks(self):
        if self.elapsed_time > 3600.*24*14:
            # reset bw stats and (really) decimate the stack every couple of weeks
            self.start_time = time.time()
            self.data_usage = 0
            self.decimate_links(total_frac=0.49, decimate_frac=0.333)
            self.get_blacklist(update_flag=True)  # reload the latest blacklists

    def decimate_links(self, total_frac=0.81, decimate_frac=0.1, log_sampling=False):
        """ Delete `decimate_frac` of the links if the total exceeds `total_frac` of the maximum allowed. """
        if self.link_count() > int(np.ceil(total_frac * self.max_links_cached)):
            for url in self.draw_links(n=int(np.ceil(self.link_count()*decimate_frac)), log_sampling=log_sampling):
                self.remove_link(url)

    def set_user_agent(self):
        self.draw_user_agent()
        # chromedriver cannot reset the User-Agent at runtime, so it must be restarted with a new UA
        # https://stackoverflow.com/questions/50375628/how-to-change-useragent-string-in-runtime-chromedriver-selenium/50375914#50375914
        self.open_driver()

    def draw_user_agent(self, max_draws=10000):
        """Draw a random User-Agent either uniformly (mildly susceptible to ML),
        or from a matched distribution."""
        global ua_parse_flag, user_agent
        if not ua_parse_flag:
            self.user_agent = self.fake_ua.random if npr.random() < 0.95 else user_agent
            return
        # Draw User-Agent from the pre-defined property distribution
        property_pvals = self.property_pvals
        k = 0
        while k < max_draws:
            uap = ua.parse(self.fake_ua.random)
            # print(uap.ua_string)
            p_browser = property_pvals['browser']['noneoftheabove']
            for ky in property_pvals['browser']:
                if bool(re.findall(ky, uap.browser.family, flags=re.IGNORECASE)):
                    p_browser = property_pvals['browser'][ky]
                    break
            p_os = property_pvals['os']['noneoftheabove']
            for ky in property_pvals['os']:
                if bool(re.findall(ky, uap.os.family, flags=re.IGNORECASE)):
                    p_os = property_pvals['os'][ky]
                    break
            p_pc = property_pvals['is_pc'][uap.is_pc]
            p_touch_capable = property_pvals['is_touch_capable'][uap.is_touch_capable]
            if npr.uniform() <= p_browser \
                    and npr.uniform() <= p_os \
                    and npr.uniform() <= p_pc \
                    and npr.uniform() <= p_touch_capable:
                break
            k += 1
        self.user_agent = uap.ua_string

    def draw_link(self, log_sampling=True):
        """ Draw a single, random link. """
        return self.draw_links(n=1, log_sampling=log_sampling)[0]
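    # draw_links()/draw_domain() sampling note: with log_sampling=True the domain
    # weights are log1p(count) rather than count, which biases draws toward
    # low-count domains.  E.g. (illustrative) counts (1, 99) give probabilities
    # (0.01, 0.99) under plain sampling but about (0.13, 0.87) under log-sampling.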
""" urls = [] domain_array = np.array([dmn for dmn in self.domain_links]) domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])]) p = np.array([np.float(c) for c in domain_count]) count_total = p.sum() if log_sampling: # log-sampling [log(x+1)] to bias lower count domains p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype) if count_total > 0: p = p/p.sum() cnts = npr.multinomial(n, pvals=p) if n > 1: for k in range(cnts.shape[0]): domain = domain_array[k] cnt = min(cnts[k],domain_count[k]) for url in random.sample(self.domain_links[domain],cnt): urls.append(url) else: k = int(np.nonzero(cnts)[0]) domain = domain_array[k] url = random.sample(self.domain_links[domain],1)[0] urls.append(url) return urls def draw_domain(self,log_sampling=False): """ Draw a single, random domain. """ domain = None domain_array = np.array([dmn for dmn in self.domain_links]) domain_count = np.array([len(self.domain_links[domain_array[k]]) for k in range(domain_array.shape[0])]) p = np.array([np.float(c) for c in domain_count]) count_total = p.sum() if log_sampling: # log-sampling [log(x+1)] to bias lower count domains p = np.fromiter((np.log1p(x) for x in p), dtype=p.dtype) if count_total > 0: p = p/p.sum() cnts = npr.multinomial(1, pvals=p) k = int(np.nonzero(cnts)[0]) domain = domain_array[k] return domain def draw_link_from_domain(self,domain): """ Draw a single, random link from a specific domain. """ domain_count = len(self.domain_links.get(domain,set())) url = random.sample(self.domain_links[domain],1)[0] if domain_count > 0 else None return url def pop_link(self,remove_link_fraction=0.95,current_preferred_domain_fraction=0.1): """ Pop a link from the collected list. If `self.current_preferred_domain` is defined, then a link from this domain is drawn a fraction of the time. """ url = None if hasattr(self,'current_preferred_domain') and npr.uniform() < current_preferred_domain_fraction: while url is not None and len(self.domain_links) > 0: # loop until `self.current_preferred_domain` has a url url = self.draw_link_from_domain(self.current_preferred_domain) if url is None: self.current_preferred_domain = self.draw_domain() if url is None: url = self.draw_link() if npr.uniform() < remove_link_fraction: # 95% 1 GET, ~5% 2 GETs, .2% three GETs self.remove_link(url) # pop a random item from the stack return url def add_link(self,url): result = False domain = self.domain_name(url) if self.link_count() < self.max_links_cached \ and len(self.domain_links.get(domain,[])) < self.max_links_per_domain \ and url not in self.domain_links.get(domain,set()): self.domain_links.setdefault(domain, set()) self.domain_links[domain].add(url) result = True # if self.debug: print(f'\tAdded link \'{url}\'…') return result def remove_link(self,url): result = False domain = self.domain_name(url) if url in self.domain_links.get(domain,set()): self.domain_links[domain].remove(url) if len(self.domain_links[domain]) == 0: del self.domain_links[domain] result = True return result def domain_name(self,url): return '.'.join(uprs.urlparse(url).netloc.split('.')[-2:]) def get_websearch(self,query): """ HTTP GET of a websearch, then add any embedded links. 
    def get_websearch(self, query):
        """
        HTTP GET of a websearch, then add any embedded links.

        :param query:
        :return:
        """
        self.select_random_search_engine()
        url = uprs.urlunparse(uprs.urlparse(self.SafeSearch.search_url)._replace(query='{}={}{}&{}'.format(
            self.SafeSearch.query_parameter, uprs.quote_plus(query),
            self.SafeSearch.additional_parameters, self.SafeSearch.safe_parameter)))
        if self.verbose: self.print_url(url)
        @self.chromedriver_timeout
        def chromedriver_get(): self.driver.get(url)  # selenium driver
        chromedriver_get()
        @self.chromedriver_short_timeout
        def chromedriver_page_source(): self.data_usage += len(self.driver.page_source)
        chromedriver_page_source()
        new_links = self.websearch_links()
        if self.link_count() < self.max_links_cached:
            self.add_url_links(new_links, url)

    def select_random_search_engine(self):
        self.SafeSearch = random.choice([SafeGoogle, SafeBing, SafeYahoo, SafeDuckDuckGo])
        return self.SafeSearch
    def websearch_links(self):
        """
        Webpage link format for a popular search engine.

        :return:
        """
        # https://github.com/detro/ghostdriver/issues/169
        @self.chromedriver_short_timeout
        def chromedriver_find_elements_by_css_selector():
            return WebDriverWait(self.driver, short_timeout).until(
                lambda x: x.find_elements_by_css_selector(self.SafeSearch.css_selector))
        elements = chromedriver_find_elements_by_css_selector()
        # get links in random order until max. per page
        k = 0
        links = []
        try:
            for elt in sorted(elements, key=lambda k: random.random()):
                @self.chromedriver_short_timeout
                def chromedriver_find_element_by_tag_name(): return elt.find_element_by_tag_name('a')
                a_tag = chromedriver_find_element_by_tag_name()
                @self.chromedriver_short_timeout
                def chromedriver_get_attribute(): return a_tag.get_attribute('href')
                href = chromedriver_get_attribute()
                if href is not None:
                    href = self.SafeSearch.result_extraction(href)
                    links.append(href)
                    k += 1
                if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
        except Exception as e:
            if self.debug: print(f'.find_element_by_tag_name.get_attribute() exception:\n{e}')
        return links

    def get_url(self, url):
        """
        HTTP GET of the url, and add any embedded links.

        :param url:
        :return:
        """
        if not self.check_robots(url): return  # bail out if robots.txt says to
        @self.chromedriver_timeout
        def chromedriver_get(): self.driver.get(url)  # selenium driver
        chromedriver_get()
        @self.chromedriver_short_timeout
        def chromedriver_page_source(): self.data_usage += len(self.driver.page_source)
        chromedriver_page_source()
        new_links = self.url_links()
        if self.link_count() < self.max_links_cached:
            self.add_url_links(new_links, url)

    def url_links(self):
        """Generic webpage link finder format."""
        # https://github.com/detro/ghostdriver/issues/169
        @self.chromedriver_short_timeout
        def chromedriver_find_elements_by_tag_name():
            return WebDriverWait(self.driver, short_timeout).until(lambda x: x.find_elements_by_tag_name('a'))
        elements = chromedriver_find_elements_by_tag_name()
        # get links in random order until max. per page
        k = 0
        links = []
        try:
            for a in sorted(elements, key=lambda k: random.random()):
                @self.chromedriver_short_timeout
                def chromedriver_get_attribute(): return a.get_attribute('href')
                href = chromedriver_get_attribute()
                if href is not None:
                    links.append(href)
                    k += 1
                if k > self.max_links_per_page or self.link_count() == self.max_links_cached: break
        except Exception as e:
            if self.debug: print(f'.get_attribute() exception:\n{e}')
        return links

    def check_robots(self, url):
        result = True
        url_robots = uprs.urlunparse(uprs.urlparse(url)._replace(scheme='https',
            path='/robots.txt', query='', params=''))
        @self.robots_timeout
        def robots_read():
            rp = RobotFileParserUserAgent()
            rp.set_url(url_robots)
            rp.read()
            result = rp.can_fetch(self.user_agent, url)
            del rp  # ensure self.close() in urllib
            return result
        result = robots_read()
        return result
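    # check_robots() behavior, for example: if https://example.com/robots.txt answers
    # 401/403, the monkeypatched RobotFileParserUserAgent sets disallow_all, so
    # can_fetch() returns False and get_url() skips the link; other 4xx responses
    # set allow_all and the link is fetched.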
    def add_url_links(self, links, url=''):
        k = 0
        for link in sorted(links, key=lambda k: random.random()):
            lp = uprs.urlparse(link)
            if (lp.scheme == 'http' or lp.scheme == 'https') and not self.blacklisted(link):
                if self.add_link(link): k += 1
            if k > self.max_links_per_page: break
        if self.verbose or self.debug:
            current_url = url  # default
            try:
                @self.chromedriver_short_timeout
                def chromedriver_current_url(): return self.driver.current_url
                current_url = chromedriver_current_url()
                # the current_url method breaks on a lot of sites, e.g.
                # python3 -c 'from selenium import webdriver; driver = webdriver.PhantomJS(); driver.get("https://github.com"); print(driver.title); print(driver.current_url); driver.quit()'
            except Exception as e:
                if self.debug: print(f'.current_url exception:\n{e}')
            if self.debug:
                print("{}: {:d} links added, {:d} total, {:.1f} bits domain entropy".format(
                    current_url, k, self.link_count(), self.domain_entropy()))
            elif self.verbose:
                self.print_progress(current_url, num_links=k)

    def print_url(self, url):
        if self.debug:
            print(url + ' …')
        else:
            self.print_progress(url)

    def print_progress(self, url, num_links=None):
        if num_links is not None:
            text_suffix = ': +{:d}/{:d} links, H(domain)={:.1f} b'.format(num_links, self.link_count(), self.domain_entropy())
        else:
            text_suffix = '; {:d} links, H(domain)={:.1f} b …'.format(self.link_count(), self.domain_entropy())
        self.print_truncated_line(url, text_suffix)

    def print_truncated_line(self, url, text_suffix='', terminal_width=terminal_width):
        """
        Print truncated `url` + `text_suffix` to fill `terminal_width`.

        :param url:
        :param text_suffix:
        :param terminal_width:
        :return:
        """
        chars_used = len(text_suffix)
        if text_suffix == '…':
            if len(url) >= terminal_width:
                url = url[:terminal_width-1]  # add '…' below
            elif len(url) < terminal_width-1:
                url += ' '  # add an extra space before the ellipsis
        else:
            if len(url) + chars_used > terminal_width:
                url = url[:terminal_width-chars_used-1] + '…'
        text = f"{url}{text_suffix}"
        # the added white space is necessary to overwrite the previous, possibly longer line
        text = text[:min(terminal_width, len(text))] + ' ' * max(0, terminal_width-len(text))
        print(text, end='', flush=True)
        time.sleep(0.01)
        print('\r', end='', flush=True)

    def blacklisted(self, link):
        return link in self.blacklist_urls or self.domain_name(link) in self.blacklist_domains
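    # bandwidth_test() conversion, for reference: 2**30/(3600*24*30.5) ≈ 407, so
    # dividing a bytes/second rate by 407 gives GB/month; e.g. a sustained
    # 40 kB/s corresponds to roughly 98 GB/month.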
    def bandwidth_test(self):
        running_bandwidth = self.data_usage/(self.elapsed_time+900.)
        running_bandwidth = running_bandwidth/407.  # convert bytes/s to GB/month: 2**30/(3600*24*30.5) ≈ 407
        # if self.debug: print(f'Using {running_bandwidth} GB/month')
        return running_bandwidth > self.gb_per_month

    # handle chromedriver timeouts
    # configurable decorator to timeout chromedriver and robotparser calls
    # http://stackoverflow.com/questions/15572288/general-decorator-to-wrap-try-except-in-python
    # Syntax:
    # chromedriver_timeout = block_timeout(chromedriver_hang_handler)
    # @chromedriver_timeout
    # def chromedriver_block():
    #     # chromedriver stuff
    #     pass
    # chromedriver_block()
    def block_timeout(self, hang_handler, alarm_time=timeout, errors=(Exception,), debug=False):
        def decorator(func):
            def call_func(*args, **kwargs):
                signal.signal(signal.SIGALRM, hang_handler)  # register the hang handler
                signal.alarm(alarm_time)  # set an alarm
                result = None
                try:
                    result = func(*args, **kwargs)
                except errors as e:
                    if debug: print(f'{func.__name__} exception:\n{e}')
                finally:
                    signal.alarm(0)  # cancel the alarm
                return result
            return call_func
        return decorator

    class TimeoutError(Exception):
        pass

    def chromedriver_hang_handler(self, signum, frame):
        # https://github.com/detro/ghostdriver/issues/334
        # http://stackoverflow.com/questions/492519/timeout-on-a-function-call
        if self.debug: print('Looks like chromedriver has hung.')
        try:
            self.quit_driver(chromedriver_short_timeout_decorator=self.chromedriver_quit_timeout)
        except Exception as e:
            if self.debug: print(e)
        self.open_driver()

    def chromedriver_quit_hang_handler(self, signum, frame):
        raise self.TimeoutError('chromedriver .quit() method is taking too long')

    def robots_hang_handler(self, signum, frame):
        if self.debug: print('Looks like robotparser has hung.')
        raise self.TimeoutError('robotparser is taking too long')

    def check_chromedriver_process(self):
        """
        Check if chromedriver is running.

        :return:
        """
        # Check rss and restart if too large, then check existence
        # http://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python
        try:
            if not hasattr(self, 'driver'): self.open_driver()
            pid, rss_mb = self.chromedriver_pid_and_memory()
            if rss_mb > self.chromedriver_rss_limit_mb:  # memory limit
                self.quit_driver(pid=pid)
                self.open_driver()
                pid, _ = self.chromedriver_pid_and_memory()
            # check existence
            os.kill(pid, 0)
        except (OSError, psutil.NoSuchProcess, Exception) as e:
            if self.debug: print(f'.chromedriver_pid_and_memory() exception:\n{e}')
            if issubclass(type(e), psutil.NoSuchProcess):
                raise Exception("There's a chromedriver zombie, and the thread shouldn't have reached this statement.")
            return False
        else:
            return True
""" for k in range(3): # three strikes try: @self.chromedriver_short_timeout def chromedriver_process_pid(): return self.driver.service.process.pid pid = chromedriver_process_pid() rss_mb = psutil.Process(pid).memory_info().rss / float(2 ** 20) break except (psutil.NoSuchProcess,Exception) as e: if self.debug: print(f'.service.process.pid exception:\n{e}') self.quit_driver(pid=pid) self.open_driver() else: # throw in the towel and exit if no viable chromedriver process after multiple attempts print('No viable chromedriver process after multiple attempts!') sys.exit(1) return (pid, rss_mb) def parse_and_filter_rule_urls(self,line): """Convert EasyList domain anchor rule to domain or url.""" line = line.rstrip() # filter out configuration, comment, exception lines, domain-specific, and selector rules if re_test(configuration_re, line) or re_test(comment_re, line) or re_test(exception_re, line) or re_test( domain_option_re, line) or re_test(selector_re, line): return if re_test(option_re, line): line = option_re.sub('\\1', line) # delete all the options and continue # ignore these cases # blank url case: ignore if re_test(httpempty_re, line): return # blank line case: ignore if not bool(line): return # parse all remaining rules # treat each of the these cases separately # regex case: ignore if re_test(regex_re, line): return # now that regex's are handled, delete unnecessary wildcards, e.g. /.../* line = wildcard_begend_re.sub('\\1', line) # domain anchors, || or '|http://a.b' -> domain anchor 'a.b' for regex efficiency in JS if re_test(domain_anch_re, line) or re_test(scheme_anchor_re, line): # strip off initial || or |scheme:// if re_test(domain_anch_re, line): line = domain_anch_re.sub('\\1', line) elif re_test(scheme_anchor_re, line): line = scheme_anchor_re.sub("", line) # host subcase if re_test(da_hostonly_re, line): line = da_hostonly_re.sub('\\1', line) if not re_test(wild_anch_sep_exc_re, line): # exact subsubcase if wildcard_ignore_test(line): return self.blacklist_domains |= set([line]) return line else: return # regex subsubcase # hostpath subcase if re_test(da_hostpath_re, line): line = da_hostpath_re.sub('\\1', line) if not re_test(wild_sep_exc_noanch_re, line) and re_test(pathend_re, line): # exact subsubcase line = re.sub(r'[/|]$', '', line) # strip EOL slashes and anchors if wildcard_ignore_test(line): return self.blacklist_urls |= set([line]) return line else: return # regex subsubcase # hostpathquery default case if wildcard_ignore_test(line): return self.blacklist_urls |= set([line]) return line # all other non-regex patterns in for the path parts: ignore return # EasyList regular expressions # See https://github.com/essandess/easylist-pac-privoxy comment_re = re.compile(r'^\s*?!') # ! 
# EasyList regular expressions
# See https://github.com/essandess/easylist-pac-privoxy

comment_re = re.compile(r'^\s*?!')  # ! comment
configuration_re = re.compile(r'^\s*?\[[^]]*?\]')  # [Adblock Plus 2.0]
easylist_opts = r'~?\b(?:third\-party|domain|script|image|stylesheet|object(?!-subrequest)|object\-subrequest|xmlhttprequest|subdocument|ping|websocket|webrtc|document|elemhide|generichide|genericblock|other|sitekey|match-case|collapse|donottrack|popup|media|font)\b'
option_re = re.compile(r'^(.*?)\$(' + easylist_opts + r'.*?)$')
# regex's used to exclude options for specific cases
domain_option_re = re.compile(r'\$.*?(?:domain=)')  # discards rules specific to links from specific domains
selector_re = re.compile(r'^(.*?)#\@?#*?.*?$')  # #@##div [should be #+?, but the old style is still used]
regex_re = re.compile(r'^\@{0,2}\/(.*?)\/$')
wildcard_begend_re = re.compile(r'^(?:\**?([^*]*?)\*+?|\*+?([^*]*?)\**?)$')
wild_anch_sep_exc_re = re.compile(r'[*|^@]')
wild_sep_exc_noanch_re = re.compile(r'(?:[*^@]|\|[\s\S])')
exception_re = re.compile(r'^@@(.*?)$')
httpempty_re = re.compile(r'^\|?https?://$')
pathend_re = re.compile(r'(?i)(?:[/|]$|\.(?:jsp?|php|xml|jpe?g|png|p?gif|img|swf|flv|[sp]?html?|f?cgi|pl?|aspx|ashx|css|jsonp?|asp|search|cfm|ico|act|act(?:ion)?|spy|do|stm|cms|txt|imu|dll|io|smjs|xhr|ount|bin|py|dyn|gne|mvc|lv|nap|jam|nhn))', re.IGNORECASE)

domain_anch_re = re.compile(r'^\|\|(.+?)$')
# omit the scheme from the start of a rule -- this will also be done in JS for efficiency
scheme_anchor_re = re.compile(r'^(\|?(?:[\w*+-]{1,15})?://)')  # e.g. '|http://' at start

# (Almost) fully-qualified domain name extraction (with EasyList wildcards)
# Example case: banner.3ddownloads.com^
da_hostonly_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?)(?:$|[/^?])$')
da_hostpath_re = re.compile(r'^((?:[\w*-]+\.)+[a-zA-Z0-9*-]{1,24}\.?[\w~%./^*-]+?)\??$')


def re_test(regex, string):
    if isinstance(regex, str): regex = re.compile(regex)
    return bool(regex.search(string))

def wildcard_ignore_test(rule):
    return bool(wild_anch_sep_exc_re.search(rule))


if __name__ == "__main__":
    ISPDataPollution()