#!/usr/bin/env python # -------------------------------------------------- # Parse Domain TLD: Check Valid TLD # from https://github.com/lsemel/python-parse-domain # fork https://github.com/hmgle/python-parse-domain # note: compare with 2 list # IMPORTANT: USE PYTHON 3X # ------------------------------------------------- from urllib.parse import urlparse import re def parse_domain(url, levels=2): """ Given a URL or hostname, returns the domain to the given level (level 1 is the top-level domain). Uses a list of active top-level domains to ensure long TLD's such as ".co.uk" are correctly treated as a single TLD. If the domain has an unrecognizable TLD, assumes it is one level. """ if levels < 1 or not url: return None # Parse the hostname from the url parsed = urlparse(url) hostname = getattr(parsed,'hostname',url) partial_domains = [] partial_domain = "" if hostname is None: return None for section in reversed(hostname.split(".")): partial_domain = "." + section + partial_domain partial_domains.append(partial_domain) # Find the longest matching TLD, recording its index tld_idx = 0 for idx, item in enumerate(partial_domains): if item in clean: tld_idx = idx # Add the desired number of levels to the tld index, # counting the TLD itself as the first level try: domain = partial_domains[tld_idx + levels - 1] except IndexError: domain = partial_domains[-1] # Remove the initial dot return domain[1:] clean = set(d.strip() for d in open("tlds.txt").readlines()) valid = set(d.strip() for d in open('urls.txt').readlines()) filename = 'capture' domains = [d.strip('.\n') for d in open(filename).readlines()] domains = [d for d in domains if '.'+d not in valid] D = dict() for domain in domains: D[parse_domain('http://'+domain)] = 0 for d in D: if d: print (d)