import requests
from bs4 import BeautifulSoup
from csv import reader, DictReader, DictWriter
from collections import OrderedDict


# Known data-channel names; a name's position in this list is its numeric code.
dcm_keys = [
    'misc',
    'lifestyle',
    'entertainment',
    'bus',
    'socmed',
    'tech',
    'world',
    'culture',
    'u.s.',
    'social-good',
]

# Code that will be handed to the next previously unseen channel name.
dcm_vals_next = len(dcm_keys)

# Channel name -> numeric code, in the same order as dcm_keys.
dcm = OrderedDict((name, code) for code, name in enumerate(dcm_keys))
    
def get_data_channel(url, timeout=30):
    """Fetch *url* and return the numeric code of the page's data channel.

    The channel name is read from the ``data-channel`` attribute of the
    first ``<hgroup>`` element in the page.  A name that has not been seen
    before is registered in ``dcm`` / ``dcm_keys`` with the next free code
    (``dcm_vals_next``, which is then incremented).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a single stalled connection cannot hang the whole run.

    Returns:
        The integer code for the channel, or ``0`` when the page has no
        ``<hgroup>`` element or that element has no ``data-channel``
        attribute.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including on timeout).
    """
    global dcm_vals_next
    response = requests.get(url, timeout=timeout)
    hgroup = BeautifulSoup(response.content, 'html.parser').find('hgroup')
    if hgroup is None:
        return 0

    # Tag.get() returns None instead of raising KeyError when the attribute
    # is absent; treat that the same as a missing <hgroup>.
    dc = hgroup.get('data-channel')
    if dc is None:
        return 0

    # dcm and dcm_keys are kept in sync, so test membership on the dict
    # (O(1)) rather than scanning the list.
    if dc not in dcm:
        dcm[dc] = dcm_vals_next
        dcm_keys.append(dc)
        dcm_vals_next += 1
    return dcm[dc]

# Rows whose index is below this value are skipped (presumably so that an
# interrupted run can be resumed; the value is also part of the output name).
start = 7984

# The csv module requires files to be opened with newline='' so that it can
# handle line endings itself; otherwise embedded newlines in quoted fields
# are misread and extra blank lines are written on Windows.
with open('data_channel_df.csv', 'r', newline='') as ifile, \
        open(f'data_channel_cleaned_{start}_df.csv', 'w', newline='') as ofile:
    df = DictReader(ifile)
    out_df = DictWriter(ofile, fieldnames=df.fieldnames)
    out_df.writeheader()
    for row in df:
        # The column with an empty header holds the row index
        # (presumably an unnamed index column from an earlier export).
        if int(row['']) < start:
            print(f"Ignoring row<{row['']}>")
            continue
        if row['data_channel'] == '0':
            # Channel is unknown: look it up from the article page.
            dc = get_data_channel(row['url'])
            row['data_channel'] = str(dc)
            row['data_channel_name'] = dcm_keys[dc]
            print(f"row<{row['']}>")
            # NOTE(review): only rows that needed a lookup are written;
            # rows with a non-zero data_channel are dropped from the
            # output. Confirm this is intended.
            out_df.writerow(row)