# How many of the most frequent tags get_top_tags() reports.
TOP_NUMBER = 10
# Feed file name; not referenced by get_tags() or get_top_tags() below.
RSS_FEED = 'rss.xml'
# Similarity ratio above which two tags count as near-duplicates.
SIMILAR = 0.87


def get_tags():
    """Return every tag term found in the live PyBites RSS feed.

    The feed at https://pybit.es/feeds/all.rss.xml is downloaded and parsed
    with feedparser on every call.  Terms are returned exactly as the parsed
    feed provides them, one list item per (entry, tag) occurrence, so
    duplicates are kept.  Entries that carry no tags are skipped.

    NOTE(review): the earlier docstring said "Replace dash with whitespace",
    but no such replacement was ever performed here; confirm whether callers
    expect it before adding it.

    Returns:
        list: tag terms in feed order.
    """
    # Local import: the parser is only needed when the feed is fetched.
    import feedparser

    blog_feed = feedparser.parse('https://pybit.es/feeds/all.rss.xml')
    return [
        tag['term']
        for entry in blog_feed.entries
        # .get() rather than attribute access: an entry without tags would
        # otherwise raise AttributeError.
        for tag in entry.get('tags', [])
    ]


def get_top_tags(tags):
    """Return the TOP_NUMBER most common tags together with their counts.

    Tags are compared case-insensitively: each one is lower-cased before it
    is counted, and the lower-cased form is what appears in the result.

    Args:
        tags: iterable of tag strings, e.g. the list built by get_tags().

    Returns:
        list: up to TOP_NUMBER ``(tag, count)`` tuples, highest count first;
        tags with equal counts keep their first-seen order.
    """
    from collections import Counter

    # One counting pass; calling list.count() for every tag was quadratic.
    counts = Counter(tag.lower() for tag in tags)
    return counts.most_common(TOP_NUMBER)