import re
import os
import logging
def remove_long_titles(filename, *, output_path='/root/shortshowerthoughts.rss',
                       max_length=68):
    """Remove over-long and placeholder titles from an RSS file.

    Reads ``filename``, deletes every ``<title>...</title>`` element whose
    text is longer than ``max_length`` characters or whose text is
    "showerthoughts" (ignoring case and surrounding whitespace), writes the
    result back to ``filename`` and appends the same text to ``output_path``.

    Args:
        filename: Path of the RSS file to filter in place.
        output_path: File the filtered text is appended to. Defaults to the
            path that was previously hard-coded in this function.
        max_length: Longest title text (in characters) that is kept.

    Returns:
        None.

    Raises:
        OSError: If either file cannot be opened.
    """
    # NOTE(review): the pattern in the reviewed source had lost its markup
    # (it was split over two lines and could only capture empty strings).
    # It is reconstructed here as a <title> element based on the function
    # and variable names -- confirm against the feed being processed.
    title_regex = r'<title>(.*?)</title>'

    def _drop_unwanted(match):
        # Return '' to delete the whole element, or the match to keep it.
        headline = match.group(1)
        too_long = len(headline) > max_length
        is_feed_name = headline.strip().lower() == "showerthoughts"
        return '' if too_long or is_feed_name else match.group(0)

    # Explicit encoding so the result does not depend on the process locale.
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()

    data = re.sub(title_regex, _drop_unwanted, data)
    # NOTE(review): as written this replaces "&" with itself and therefore
    # changes nothing; one side presumably held an XML entity that was lost.
    # Left untouched until the intended direction is confirmed.
    data = data.replace("&", "&")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(data)
    # Append mode: the short feed accumulates the output of every run.
    with open(output_path, 'a', encoding='utf-8') as f:
        f.write(data)
def check_for_duplicates(filename):
    """Remove repeated titles from an RSS file, keeping the first of each.

    Reads ``filename``, keeps the first ``<title>...</title>`` element for
    each distinct title text, deletes every later element with the same
    text, and writes the result back to ``filename``.

    Deleting every copy (including the first) would empty the feed whenever
    the same content had been appended to the file twice, so only the
    repeats are removed.

    Args:
        filename: Path of the RSS file to de-duplicate in place.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE(review): the pattern in the reviewed source had lost its markup
    # (it could only capture empty strings). It is reconstructed here as a
    # <title> element based on the variable name -- confirm against the feed.
    title_regex = r'<title>(.*?)</title>'
    seen = set()

    def _keep_first(match):
        # Return '' to delete a repeated element, or the match to keep it.
        headline = match.group(1)
        if headline in seen:
            return ''
        seen.add(headline)
        return match.group(0)

    # Explicit encoding so the result does not depend on the process locale.
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()

    data = re.sub(title_regex, _keep_first, data)
    # NOTE(review): as written this replaces "&" with itself and therefore
    # changes nothing; one side presumably held an XML entity that was lost.
    # Left untouched until the intended direction is confirmed.
    data = data.replace("&", "&")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(data)
if __name__ == '__main__':
    # The root logger defaults to WARNING, so without this call the
    # completion message at the end would never be emitted.
    logging.basicConfig(level=logging.INFO)

    # Filter the raw feed first; that step also appends its output to the
    # short feed, which is then de-duplicated.
    filename = '/root/showerthoughts.rss'
    remove_long_titles(filename)
    check_for_duplicates('/root/shortshowerthoughts.rss')
    logging.info('Finished removing long headlines and duplicates from RSS feed.')