import glob
import csv
import os
import re
import pprint
import zipfile
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
from bs4 import Comment
from url_normalize import url_normalize
from langdetect import detect_langs
from langdetect import DetectorFactory
from urllib import parse

####

# Fix the langdetect seed so language detection is reproducible across runs.
DetectorFactory.seed = 123456789

scrapedurls = pd.read_csv('scrapedURLs.csv', low_memory=False, dtype=str)

i = 0
with zipfile.ZipFile('HTML.zip', 'r') as zf:
    with open('input_categorization_allpages.csv', 'w', encoding='utf-8', newline='') as createfile:
        csvwriter = csv.writer(createfile)
        csvwriter.writerow(['gvkey', 'year', 'id', 'site', 'HTML_title', 'path', 'path_cleaned'])
        # Skip the archive's first entry (the enclosing HTML/ directory).
        for filename in zf.namelist()[1:]:
            with zf.open(filename) as file:
                file_text = file.read().decode('utf-8')
                # Remove HTML comments, which the lxml parser often parses incorrectly.
                file_text = re.sub(r'<!--.*?-->', '', file_text, flags=re.DOTALL)

                # File names follow the pattern HTML/year_gvkey_level_number.html;
                # split out the components to build the page id.
                splitname = filename.split('\\')[-1].split('_')
                year = splitname[0].replace('HTML/', '')
                gvkey = splitname[1].replace('.txt', '')
                gvkeywithslash = '/' + gvkey
                level = splitname[2]
                number = splitname[3].replace('.html', '')
                id_page = str(year) + '_' + str(gvkey) + '_' + str(level) + '_' + str(number)

                soup_original = BeautifulSoup(file_text, 'lxml')

                # Write page title and path for further classification.
                # Look up the page's website in scrapedURLs.csv (VLOOKUP-style), and clean it.
                df_focalpage = scrapedurls[scrapedurls['id'].astype(str) == str(id_page)]
                website = df_focalpage['site'].iloc[0]
                try:
                    # Retrieve the title tag's content, stripping leading/trailing whitespace.
                    title = soup_original.title.string.strip()
                except AttributeError:
                    # If no title is present, store the title as missing.
                    title = 'missing'
                # Collapse whitespace, tabs, and new lines in the title.
                title = re.sub(r'\s+', ' ', title).strip()

                path = parse.urlsplit(website).path
                # Remove slashes, underscores, file endings (e.g., .html), and
                # leading/trailing whitespace from the path.
                path_cleaned = re.sub('/', ' ', path)
                path_cleaned = re.sub('_', ' ', path_cleaned)
                path_cleaned = re.sub(r'\..*', '', path_cleaned)
                path_cleaned = path_cleaned.strip()
                csvwriter.writerow([gvkey, year, id_page, website, str(title), path, path_cleaned])

                soup = soup_original
                # Find and remove the title tag.
                title_tag = soup.find('title')
                if title_tag:
                    title_tag.decompose()
                # Keep the text in links: unwrap anchor tags so the link text
                # stays while the tag itself is removed.
                for link in soup(['a']):
                    link.unwrap()
                # Remove HTML sections by tag name.
                for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'sidebar',
                                 'meta', 'button', 'hidden', 'hide', 'visuallyhidden',
                                 'code', 'pre', 'samp', 'option']):
                    tag.decompose()
                # Remove divs with certain classes.
                for div in soup('div', {'class': ['header', 'footer', 'nav', 'sidebar',
                                                  'meta', 'button', 'hidden', 'hide',
                                                  'visuallyhidden', 'd-none']}):
                    div.decompose()
                # Remove code blocks.
                for widget in soup('p', {'class': ['widgetState']}):
                    widget.decompose()
                # Identify elements with CSS properties that hide content.
                hidden_elements = soup.find_all(
                    style=re.compile(r'display:\s*none|visibility:\s*hidden|opacity:\s*0'))
                # Identify elements with a hidden attribute or aria-hidden="true".
                hidden_elements += soup.find_all(
                    lambda tag: tag.has_attr('hidden') or tag.get('aria-hidden') == 'true')
                # Remove the identified hidden elements.
                for element in hidden_elements:
                    element.decompose()
                # Find and remove divs with style="display: none;" or "display: none".
                # These are hidden and contain UI/UX info.
                hidden_divs = soup.find_all('div', style='display: none;')
                # Remove each hidden div.
                for div in hidden_divs:
                    div.decompose()
                # Remove any remaining CSS rules or attributes that may hide content.
                style_tags = soup.find_all('style')
                for style_tag in style_tags:
                    style_content = style_tag.get_text()
                    style_content = re.sub(
                        r'display:\s*none|visibility:\s*hidden|opacity:\s*0', '', style_content)
                    style_tag.string = style_content

                text = soup.get_text(separator=' ')
                # Strip Unicode replacement characters (U+FFFD) left by decoding errors.
                text = text.replace('\ufffd', ' ')
                # Remove any lingering URLs (only removes content from a / to the next space).
                text = re.sub(r'/[^ ]+ ', ' ', text)
                # Clean curly brackets, accounting for potentially unbalanced brackets.
                while re.search(r'{[^{}]*}', text):
                    # Remove curly-bracket groups, innermost first.
                    text = re.sub(r'{[^{}]*}', ' ', text)
                text = text.replace('-\n', '')
                # Collapse whitespace, tabs, and new lines.
                text = re.sub(r'\s+', ' ', text).strip()
                text = ' ' + text + ' '
                # Remove leftover code <*>.
                text = re.sub('<[^>]+>', ' ', text)
                text = text.replace('-\n', '')
                text = text.replace('[ ]', ' ')
                # Collapse whitespace again after the removals above.
                text = re.sub(r'\s+', ' ', text).strip()
                # Keep letters, whitespace, punctuation marks, and numbers.
                text = re.sub(r'[^\w\s.,?!:;\-()[\]\'"\d]+', ' ', text)
                # Safer to replace dashes with a space to prevent concatenated words.
                text = text.replace('—', ' ')
                text = text.replace('–', ' ')
                text = text.replace('-', ' ')
                # Remove anything non-alphanumeric:
                # text = re.sub(r'[^A-Za-z\s]+', ' ', text)
                text = unicodedata.normalize('NFKD', text)
                ### text = text.encode('ascii', 'ignore').decode('utf-8') ###
                # Remove underscores (which the character filter above keeps).
                text = text.replace('_', ' ')
                # Collapse any double spaces introduced by the replacements above.
                text = re.sub(' +', ' ', text).strip()

                with open(os.path.join('TXT_uncleaned', str(id_page) + '.txt'),
                          'w', encoding='utf-8') as textfile:
                    textfile.write(text + ' \n\n')

            i += 1
            if i % 10000 == 0:
                print(i)