import glob
import csv
import os
import re
from bs4 import BeautifulSoup
from bs4 import Comment
import pprint
import pandas as pd
from url_normalize import url_normalize
from langdetect import detect_langs
from langdetect import DetectorFactory
from urllib import parse
import zipfile
import unicodedata

###############################################################################
### USER INPUT: Adjust the desired cleaning steps below
###############################################################################

###############################################################################
### DECISION 1: Duplicates
# Remove duplicates? Yes / No
# Explanation: Some website addresses are associated with more than one firm
#              identifier. We manually decided on the primary firm identifier
#              that each website address belongs to. Activating this cleaning
#              rule removes all of the non-primary, duplicate observations.
# Recommended: Yes.

setting_duplicate = "Yes"

###############################################################################
### DECISION 2: Invalid pages
# Remove invalid pages? Yes / No
# Explanation: Some webpages are invalid as they are, for example, only
#              placeholders. We manually checked all webpages without links
#              (which is a reliable sign of an invalid website) and kept track
#              of all clearly invalid pages. Activating this cleaning rule
#              removes these webpages.
# Recommended: Yes.

setting_invalidpages = "Yes"

###############################################################################
### DECISION 3: Invalid sentences
# Remove invalid sentences? Yes / No
# Explanation: Even valid pages can contain invalid sentences, for example,
#              information on which browser to access the focal website with.
#              We created a list of thousands of such faulty sentences.
#              Activating this cleaning rule removes these sentences from the
#              website texts, leaving behind the valid textual content.
# Recommended: Yes.

setting_invalid_sentences = "Yes"

###############################################################################
### DECISION 4: Non-English pages
# Remove non-English pages? Yes / No
# Explanation: Website texts can be multi-lingual, but many natural language
#              processing applications are sensitive to multi-lingual texts,
#              which can negatively affect their outputs. This cleaning rule
#              identifies the language that each text is written in and removes
#              texts that are not English with a user-specified degree of
#              certainty.
# Recommended: Yes.
# If yes: what threshold?
# Recommended: 0.85 (i.e., 85%)

setting_english = "Yes"
setting_english_threshold = 0.85

###############################################################################
### DECISION 5: Short texts
# Remove content below a length threshold? Yes / No
# Explanation: Text length is one of the most reliable indicators of faulty
#              website texts. This cleaning rule removes texts at or below a
#              user-specified length threshold (measured in words).
# Recommended: Yes.
# If yes: what threshold?
# Recommended: TBD

setting_length = "Yes"
setting_length_threshold = 10

###############################################################################
### DECISION 6: Include page titles in plaintext
# Include page title into the plaintext? Yes / No
# Explanation: It is possible to include the title of the HTML page in the
#              processed website texts. This would add additional content
#              to these texts. However, titles often contain highly repetitive
#              information across pages (e.g., always starting with the full
#              company name).
#              Activating this rule will add the titles parsed from the .html
#              files to the plaintext files.
# Recommended: No.

setting_incltitle = "No"

###############################################################################
### DECISION 7: Exclude specific categories of pages
# Remove specific page types? Yes / No
# Explanation: Certain types of pages are (for most research purposes) less
#              relevant than others. For instance, site functionality pages
#              typically contain technical details but not substantive content.
#              We inductively identified 16 types of pages and used GPT to
#              classify the pages.
#
# We identified the following page types:
#
# 1. Products & Services: Products or services offered, and example projects
#    or case studies
# 2. News & Events: Articles, news updates, press releases, media kits,
#    upcoming events, or blog posts related to the organization or its industry
# 3. Home: Main page or landing page
# 4. Contact & Locations: Contact information like address, email, contact
#    or feedback forms, or information on physical locations or branches
# 5. Investor Relations: Financial reports, corporate governance information,
#    or other shareholder information
# 6. About Us: Information about the organization or individual(s) behind the
#    website
# 7. Legal: Site's terms and conditions, privacy policy, or other legal
#    information
# 8. Resources, Support & Documentation: Assistance, troubleshooting guides,
#    or customer support, and related resources available for download;
#    FAQs; collections of images or videos
# 9. Sustainability & Social Responsibility: Sustainability-related and social
#    efforts by the organization
# 10. Site Functionality: Website functionality, such as sitemaps, search
#     functionality, user authentication or registration, the site's user forum
#     or community, the e-commerce cart, etc.
# 11. Donate & Support: Asking for donations or support for a cause
# 12. Jobs & Opportunities: Information about job opportunities or career paths
#     at the organization
# 13. Partners & Affiliates: Partners, sponsors, or affiliates of the organization
# 14. Team & Leadership: Team members, executives, or key personnel
# 15. Testimonials & Reviews: Customer testimonials or reviews about the
#     offering of the organization
# 16. Other: Pages that don't fit into any of the other categories
#
# Activating this rule will filter out pages whose type is included in the
# list below.
#
# Recommended: Yes, page types 3 (Home), 7 (Legal), 8 (Resources, Support &
#              Documentation), and 10 (Site Functionality)

setting_pagetype = "Yes"
setting_pagetypelist = ['3', '7', '8', '10']

###############################################################################
### END OF USER INPUT SECTION
###############################################################################

# Create a set containing the overall GVKEYs for which pages need to be
# excluded (here: duplicate websites).
if setting_duplicate == "Yes":
    gvkeys_todrop = pd.read_excel("exclusion_list.xlsx", sheet_name='gvkeys')
    gvkeys_todrop = gvkeys_todrop['gvkey_withslash'].tolist()
    gvkeys_todrop = set(gvkeys_todrop)
else:
    gvkeys_todrop = set([])

# Create a set containing the individual pages that have been manually
# identified as invalid.
if setting_invalidpages == "Yes":
    pages_todrop = pd.read_excel("exclusion_list.xlsx", sheet_name='pages')
    pages_todrop = pages_todrop['id_page'].tolist()
    pages_todrop = set(pages_todrop)
else:
    pages_todrop = set([])
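
# Optional sanity check (illustrative addition, not part of the original
# cleaning pipeline): report how many exclusion entries were just loaded from
# exclusion_list.xlsx, which helps confirm the workbook was read as expected.
print("GVKEYs flagged as duplicates:", len(gvkeys_todrop))
print("Pages flagged as invalid:", len(pages_todrop))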
# Create a list containing junk sentences.
if setting_invalid_sentences == "Yes":
    sentences_to_remove = pd.read_excel("exclusion_list.xlsx", sheet_name='sentences')
    sentences_to_remove = sentences_to_remove['sentence'].tolist()
    # Remove the longest sentences first, so that shorter sentences that are
    # substrings of longer ones do not break the longer matches.
    sentences_to_remove = sorted(sentences_to_remove, key=len, reverse=True)
else:
    sentences_to_remove = []

# Load the page categorization.
# Note that, for cost reasons, this categorization was only applied to pages
# that meet the above settings.
if setting_pagetype == "Yes":
    gpt_df = pd.read_csv("categorization_applied.csv", low_memory=False, dtype=str)
    pagetype_droplist = gpt_df[gpt_df['classification_GPT'].isin(setting_pagetypelist)]
    pagetype_droplist = pagetype_droplist['id'].tolist()
    pagetype_droplist = set(pagetype_droplist)
else:
    pagetype_droplist = set([])

# Load the HTML titles of all pages (used only if setting_incltitle == "Yes").
titles_file = pd.read_csv('input_categorization_allpages.csv', low_memory=False, dtype=str)

i = 0
with open('metadata.csv', "w", encoding="utf-8", newline='') as createfile:
    csvwriter = csv.writer(createfile)
    csvwriter.writerow(['gvkey', 'gvkeywithslash', 'year', 'level', 'pagenr',
                        'id', 'numberofwords', 'duplicate', 'invalid_page',
                        'drop_pagetype', 'primarylang', 'primarylang_conf',
                        'languagelist', 'included_final'])

    with zipfile.ZipFile('TXT_uncleaned.zip', 'r') as zip:
        for filename in zip.namelist()[1:]:
            with zip.open(filename) as file:
                text = file.read().decode('utf-8')

            # Parse the year, GVKEY, crawl level, and page number from the file name.
            splitname = filename.split("\\")
            splitname = splitname[-1]
            splitname = splitname.replace("TXT_uncleaned/", "")
            splitname = splitname.split("_")
            year = splitname[0]
            gvkey = splitname[1]
            gvkeywithslash = "/" + gvkey
            level = splitname[2]
            number = splitname[3]
            number = number.replace(".txt", "")
            id_page = str(year) + "_" + str(gvkey) + "_" + str(level) + "_" + str(number)

            if setting_incltitle == "Yes":
                # VLOOKUP-style lookup of the page title for this page id.
                df_focalpage = titles_file[(titles_file['id'].astype(str) == str(id_page))]
                title = df_focalpage['HTML_title'].iloc[0]
                title = title + " "
            else:
                title = ""

            # Exclude if duplicate
            if gvkeywithslash in gvkeys_todrop:
                duplicatepage = 1
            else:
                duplicatepage = 0

            # Exclude if invalid page
            if id_page in pages_todrop:
                invalid_page = 1
            else:
                invalid_page = 0

            # Exclude if page type not to be included
            if id_page in pagetype_droplist:
                drop_pagetype = 1
            else:
                drop_pagetype = 0

            # Remove junk sentences
            if setting_invalid_sentences == "Yes":
                for sentence in sentences_to_remove:
                    # text = re.sub(re.escape(sentence), " ", text, flags=re.IGNORECASE)
                    text = text.replace(sentence, " ")
                # Collapse the runs of spaces left behind by the removals.
                text = re.sub(r' {2,}', ' ', text)
                text = text.strip()

            # Classify the language
            langlist = []
            DetectorFactory.seed = 123456789
            try:
                langlist = detect_langs(text)
                # Use the top-ranked guess; .lang and .prob are the language code
                # and its probability.
                primarylang = langlist[0].lang
                primarylang_conf = float(langlist[0].prob)
            except Exception as e:
                primarylang = "error"
                primarylang_conf = 0
                langlist = str(e)

            # Keep only letters and whitespace (drop digits, punctuation, and symbols).
            text_alpha = re.sub(r'[^A-Za-z\s]+', '', text)
            # Remove underscores as well (defensive; the pattern above already drops them).
            text_alpha = text_alpha.replace('_', '')
            # Note: this is done like this because otherwise numbers and the like are
            # counted as well. This is a more sound representation of text length for
            # usual applications.
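            # Worked example (illustrative): "Call us at 555-1234!" becomes
            # "Call us at ", which yields a word count of 3 below.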
            numberofwords = len(text_alpha.split())

            include = 1

            # Use settings to determine keeping vs. dropping:
            if setting_duplicate == "Yes" and duplicatepage == 1:
                include = 0
            if setting_invalidpages == "Yes" and invalid_page == 1:
                include = 0
            if setting_english == "Yes" and ((primarylang == "en" and primarylang_conf < setting_english_threshold) or primarylang != "en"):
                include = 0
            if setting_pagetype == "Yes" and drop_pagetype == 1:
                include = 0
            if setting_length == "Yes" and numberofwords <= setting_length_threshold:
                include = 0
            if setting_length == "No" and numberofwords <= 1:
                include = 0

            csvwriter.writerow([gvkey, gvkeywithslash, year, level, number, id_page,
                                numberofwords, duplicatepage, invalid_page, drop_pagetype,
                                primarylang, primarylang_conf, langlist, include])

            if include == 1:
                with open("TXT_cleaned\\" + str(id_page) + ".txt", 'w', encoding='utf-8') as textfile:
                    textfile.write(title + text + " \n\n")
                with open("TXT_combined\\" + str(year) + "_" + str(gvkey) + ".txt", 'a', encoding='utf-8') as textfile:
                    textfile.write(title + text + " \n\n")

            i += 1
            if i % 10000 == 0:
                print(i)
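
# Optional post-run summary (illustrative addition, not part of the original
# cleaning pipeline): read back the metadata file written above and report how
# many pages were kept after applying all activated cleaning rules.
metadata = pd.read_csv("metadata.csv", low_memory=False)
print("Pages processed:", len(metadata))
print("Pages kept:", int(metadata["included_final"].sum()))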