#!/usr/bin/env python2
# -*- coding: utf-8 -*-
# Coded by Sam (alex@0xdeadcode.se)
# http://0xdeadcode.se
# Twitter: @_tmp0
import threading, Queue, urllib2, StringIO, re, sys, os, optparse, inspect, signal
reload(sys)
sys.setdefaultencoding("latin-1")
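# make implicit str/unicode conversions use latin-1 instead of ascii so non-ASCII
# page content does not raise UnicodeDecodeError (Python 2 hack)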
# Add Wikipedia links you do not wish the crawler to visit here
bad_urls = ['/wiki/Wikipedia:Tutorial/Registration', '/wiki/Wikipedia:Authority_control', \
'/wiki/Help:Contents', \
'/wiki/Wikipedia:External_links', '/wiki/Wikipedia:Contact_us', \
'/wiki/Wikipedia:Contact_us_-_Readers', '/wiki/Wikipedia:Contact_us_-_Press', \
'/wiki/Press/Contact/Chapters', '/wiki/Wikipedia:General_disclaimer', \
'/wiki/Wikipedia:Stub', '/wiki/Wikipedia:Subpages', '/wiki/Wikipedia', \
'/wiki/Wikipedia_talk:Do_not_use_subpages', '/wiki/Terms_of_Use']
# Add/Remove any word or character you wish to skip
banned = [':', ';', '\\', '/', '[', '{', ']', '}', '&', '*', '(', ')', '.', ',', '\'', '"', '#', '=', '”', '’',\
'Deltagarportalen', 'Wikipedia', 'Wikimedia', 'diskussionssidan', 'Utskriftsvänlig', 'engelskspråkiga', \
'användarvillkor', 'Grundprinciperna', 'ქართული', '日本語', 'oʻzbekcha', 'العربية', '⇒', '·', \
'ifwindowmw', '한국어', 'മലയാളം', 'việt', 'тыла', 'Перем', 'kreyòl', 'ไทย', '粵語', 'پنجابی', 'Български', 'کوردی', \
'தமிழ்', 'བོད་ཡིག', 'Հայերեն', 'Авар', 'ᨅᨔ', 'తెలుగు', 'avañeẽ', 'Српски', 'српскохрватски', 'မြန်မာဘာသာ', \
'ಕನ್ನಡ', 'Коми', 'Эрзянь', 'Чӑвашла', 'Беларуская', 'ଓଡ଼ିଆ', '⇐', 'documentwriteu003cdiv', 'Русский',\
'Македонски', 'Лакку', 'ܐܪܡܝܐ', 'فارسی', 'ትግርኛ', '客家語hak-kâ-ngî', 'ייִדיש', 'اردو', 'Ελληνικά', 'მარგალური', \
'ᨕᨘᨁᨗ', '贛語', 'Аҧсшәа', 'עברית', 'Українська', '中文', 'ગુજરાતી', 'тарашкевіца', '文言', 'Ирон', 'नेपाल']
firstlayerqueue = Queue.Queue()
secondlayerqueue = Queue.Queue()
wordqueue = Queue.Queue()
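# Worker thread: pulls one URL from firstlayerqueue, downloads the page, pushes the
# words it finds onto wordqueue and any new /wiki/ links onto secondlayerqueue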
class Crawl(threading.Thread):
def __init__(self, firstlayerqueue, secondlayerqueue, wordqueue):
threading.Thread.__init__(self)
self.firstlayerqueue = firstlayerqueue
self.secondlayerqueue = secondlayerqueue
self.wordqueue = wordqueue
def run(self):
self.url = self.firstlayerqueue.get()
#print 'IN THREAD: ' + self.url # uncomment to watch a lot of spam
try:
            self.req = urllib2.Request(self.url, headers={'User-Agent' : "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)"}) # :)
self.con = urllib2.urlopen(self.req)
self.data = self.con.read()
except:
self.firstlayerqueue.task_done()
return 1
self.urls = self.getUrls(self.data)
self.data = self.getWords(self.data)
self.wordqueue.put(self.data)
for url in self.urls:
self.secondlayerqueue.put(url)
self.firstlayerqueue.task_done()
# please dont read this part :(
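    # strip HTML tags, split the page into words, scrub banned characters/words,
    # keep only words between min and max length, and return them de-duplicated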
def getWords(self, test):
global banned, min, max
self.rv = []
self.test = test
self.skip = True
for lines in StringIO.StringIO(self.test):
lines = lines.strip('\n').strip('\n\r').strip('\t')
self.testa = re.sub('<.*?>', ' ', lines).split(' ')
for word in self.testa:
                # skip words that still carry tag debris (assumed markers: stray '<' or '>'
                # left over from tags that span several lines)
                if word.find('<') == -1 and word.find('>') == -1:
                    self.skip = False
                else:
                    self.skip = True
                if len(word) >= min and len(word) <= max and self.skip == False:
for ban in banned:
try:
while 1:
word = word.replace(ban, '')
if word.find(ban) == -1:
break
except:
pass
if word == '' or word == ' ' or len(word) < min:
continue
else:
self.rv.append(word.lower())
else:
pass
return list(set(self.rv))
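    # collect /wiki/ links from the page, make them absolute with URLVALUE, and
    # drop images and Wikipedia meta pages, returning a de-duplicated list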
def getUrls(self, data):
global bad_urls
self.test = data
self.rv = []
for lineA in StringIO.StringIO(self.test):
            # candidate article links on this line (assumed pattern: plain href="/wiki/..." anchors)
            match = re.findall(r'href="/wiki/[^"]*"', lineA)
            if match:
                match2 = re.findall(r'href="/wiki/[^"]*"', lineA)
                if match2:
for i in match:
try:
reg = re.compile('/wiki/.*?"')
self.urlvalue = reg.search(i).group(0)
                            self.urlvalue = self.urlvalue.replace('"', '')
self.urlvalue = str(URLVALUE) + str(self.urlvalue).strip('"')
                            # drop image links, Wikipedia meta pages and anything listed in bad_urls
                            if self.urlvalue.endswith(('.jpg', '.svg', '.png', '.gif')):
                                pass
                            elif '/wiki/Wikipedia:' in self.urlvalue or '/wiki/Portal:' in self.urlvalue or '/wiki/Special:' in self.urlvalue or '%' in self.urlvalue or '/wiki/Template' in self.urlvalue or self.urlvalue in bad_urls:
                                pass
pass
else:
self.rv.append(self.urlvalue)
except Exception, e:
pass
else:
pass
return list(set(self.rv))
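# drain wordqueue and append every word to the output file, counting what is written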
def writeWords():
global outputfile, words, wordqueue
while 1:
data = wordqueue.get()
for line in data:
try:
line_encoded = line.encode('ISO-8859-1')
                #line_encoded = line.encode('UTF-8') # uncomment this instead for UTF-8 output
except:
continue
f = open(outputfile, 'a')
f.write(line_encoded.lower() + '\n')
f.close()
words += 1
if wordqueue.empty():
break
##################
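# SIGINT handler: flush whatever is still queued to the output file before exiting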
def handler(signum, frame): # http://stackoverflow.com/questions/1112343/how-do-i-capture-sigint-in-python
global words, outputfile
if not wordqueue.empty():
print '\nHold on cowboy, let me finish the running threads and dump the words into %s' % outputfile
writeWords()
print 'Done. Wrote %i words into %s' % (words, outputfile)
quit()
signal.signal(signal.SIGINT, handler)
###################
filename = os.path.split(inspect.getfile(inspect.currentframe()))
parser = optparse.OptionParser('Usage: ' + filename[1] + ' ' + '\nWikipedia Wordlist Generator by @_tmp0\nURL must be formatted as follows (most subdomains should work): '
'http://en.wikipedia.org/wiki/wikipage\n\nExample: python %s -u http://en.wikipedia.org/wiki/Europe -o wordlist.txt -t 5\nIf no minimum or maximum length is set the script will save words between 6 and 30 characters in length'
'\n\nctrl+c to break\n\nI suggest doing something like this to clean duplicates out of the wordlist:'
' sort -u wordlist.txt >> n_wordlist.txt' % filename[1])
parser.add_option('-u', dest='starturl', type='string', help='Wikipedia URL to use as start for the crawler')
parser.add_option('-t', dest='nrthreads', type='int', help='Amount of threads')
parser.add_option('-o', dest='outputfile', type='string', help='File to write output to')
parser.add_option('-m', dest='min', type='int', help='Minimum length of words')
parser.add_option('-M', dest='max', type='int', help='Maximum length of words')
(options, args) = parser.parse_args()
nrthreads = options.nrthreads
starturl = options.starturl
outputfile = options.outputfile
min = options.min
max = options.max
if starturl == None or outputfile == None or nrthreads == None:
    parser.print_help()
quit(0)
if min == None:
print '[!] No minimum length supplied. Setting minimum length to 6'
min = 6
if max == None:
print '[!] No maximum length supplied. Setting maximum length to 30'
max = 30
words = 0
# scheme + host of the start URL, e.g. 'http://en.wikipedia.org'
URLVALUE = starturl.split('/wiki')[0]
bad_urls = [URLVALUE + bad_url for bad_url in bad_urls]
firstlayerqueue.put(starturl)
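# crawl the start URL once so secondlayerqueue gets its first batch of links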
while 1: # generate first crawl content
thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
thread.daemon = True
thread.start()
if thread.isAlive():
break
int_count = 0
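# main loop: when firstlayerqueue runs dry, refill it from secondlayerqueue and dump
# the words collected so far, then start up to nrthreads workers per pass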
while 1:
if firstlayerqueue.empty():
while 1:
firstlayerqueue.put(secondlayerqueue.get())
if secondlayerqueue.empty():
writeWords()
print '\nWrote %i words to %s. Queue empty, filling...' % (words, outputfile)
words = 0
break
if not firstlayerqueue.empty():
alivethread = 0
for i in range(nrthreads):
if not firstlayerqueue.empty():
alivethread += 1
thread = Crawl(firstlayerqueue, secondlayerqueue, wordqueue)
thread.daemon = True
thread.start()
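        # wait up to 5 seconds per started worker before the next pass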
for i in range(alivethread):
thread.join(5)
int_count += 1
if int_count == 2:
print 'Joined %i threads. Queue size: %i' % (alivethread, firstlayerqueue.qsize())
int_count = 0
continue