'''
Created on May 22, 2015

@author: sprasa7
'''
from bs4 import BeautifulSoup
import urllib2
from logging import info, debug, error, INFO, getLogger
import logging
#from nltk.corpus import stopwords

logging.basicConfig(level=INFO)
getLogger('dedup').setLevel(INFO)

HN_URL = "https://news.ycombinator.com/news?"
#cachedStopWords = stopwords.words("english")


class Article(object):
    """A Hacker News story: the target URL plus the headline text."""

    def __init__(self, url, text):
        self.url = url
        self.text = text

    def __str__(self):
        return str(self.url)

    def __eq__(self, other):
        return self.url == other.url

    def __hash__(self):
        # Needed alongside __eq__ so Article instances behave in sets/dicts.
        return hash(self.url)


class DuplicateArticlePair(object):
    """An unordered pair of articles judged to be duplicates."""

    def __init__(self, articleA, articleB):
        self.articleA = articleA
        self.articleB = articleB

    def __str__(self):
        return "(" + str(self.articleA) + " , " + str(self.articleB) + ")"

    def __eq__(self, other):
        # Pairs are unordered: (A, B) equals (B, A).
        return (self.articleA == other.articleA and self.articleB == other.articleB) \
            or (self.articleA == other.articleB and self.articleB == other.articleA)

    def __hash__(self):
        # Order-insensitive hash to match __eq__; without this,
        # set(duplicates) would not collapse mirrored (A, B)/(B, A) pairs.
        return hash(frozenset((self.articleA.url, self.articleB.url)))


def compute_jaccard_index(set_1, set_2):
    """Jaccard similarity: |intersection| / |union|, a value in [0, 1]."""
    return len(set_1.intersection(set_2)) / float(len(set_1.union(set_2)))


def are_duplicates(textA, textB):
    """Treat two headlines as duplicates if their word sets overlap enough."""
    #cleaned_textA = ' '.join([word for word in textA.split() if word not in cachedStopWords])
    #cleaned_textB = ' '.join([word for word in textB.split() if word not in cachedStopWords])
    set_1 = set(textA.lower().split())  # bare split() also collapses repeated whitespace
    set_2 = set(textB.lower().split())
    score = compute_jaccard_index(set_1, set_2)
    if score > 0.2:
        debug("Jaccard score for [%s, %s] => %f", textA, textB, score)
    if score > 0.5:
        info("Jaccard score for [%s, %s] => %f", textA, textB, score)
        return True
    return False


def find_duplicates(articles):
    """Compare every pair of headlines and return the set of duplicate pairs."""
    duplicates = []
    for article_outer in sorted(articles, key=lambda article: article.text):
        debug("Analyzing article : %s", article_outer)
        for article_inner in articles:
            debug("Comparing %s with %s", article_inner.text, article_outer.text)
            if article_outer is article_inner:
                # Skip only self-comparison; identical titles on different
                # pages are genuine duplicates and must still be scored.
                continue
            if are_duplicates(article_outer.text, article_inner.text):
                dup_pair = DuplicateArticlePair(article_outer, article_inner)
                duplicates.append(dup_pair)
                debug("Duplicate Pair : %s", str(dup_pair))
    # The set collapses the mirrored (A, B)/(B, A) pairs produced above.
    return set(duplicates)


def get_articles_to_analyze(url):
    """Fetch one HN listing page and extract its story links."""
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    title_cells = soup.find_all('td', {'class': 'title'})
    article_list = []
    for article_wrapper in title_cells:
        for article_link in article_wrapper.find_all('a', href=True):
            article_header = article_link.text
            if not is_hn_article_to_be_skipped(article_header):
                article_list.append(Article(article_link['href'], article_header))
    info("Found %d articles for url %s", len(article_list), url)
    return article_list


def is_hn_article_to_be_skipped(article_header):
    """Ignore the 'More' pager link and Show HN / Ask HN posts."""
    return (article_header == "More"
            or "Show HN:" in article_header
            or "Ask HN:" in article_header)


if __name__ == '__main__':
    articles = []
    count = 0
    for index in range(1, 25):
        try:
            url = HN_URL + "p=" + str(index)
            index_articles = get_articles_to_analyze(url)
            articles.extend(index_articles)
            count += 1
        except Exception as e:
            error("Failed for index %d. Reason => %s", index, str(e))
    duplicates = find_duplicates(articles)
    info("Found %d duplicates in top %d HN pages", len(duplicates), count)
    for duplicate in duplicates:
        info("%s", str(duplicate))
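

# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script and never invoked by
# it: a tiny demonstration of how the strict `> 0.5` Jaccard threshold in
# are_duplicates() behaves. The headline strings below are invented purely
# for illustration.
def _demo_jaccard_threshold():
    # 4 shared words out of 6 distinct words = 0.666..., strictly above 0.5,
    # so are_duplicates() would flag this pair as duplicates.
    near_dup = compute_jaccard_index(
        set("a fast html parser".split()),
        set("a fast html parser in rust".split()))
    print near_dup  # 0.6666666666666666

    # 2 shared words out of 4 distinct words = 0.5 exactly; the comparison
    # is strict (> 0.5), so this borderline pair would NOT be flagged.
    borderline = compute_jaccard_index(
        set("python packaging tips".split()),
        set("rust packaging tips".split()))
    print borderline  # 0.5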