"""Classify scraped web pages into content categories with the OpenAI chat API.

The script reads 'input_categorization_allpages.csv', skips excluded firms,
front pages and pages that were already categorized in a previous run, asks
gpt-3.5-turbo for a category number for every remaining page, and appends the
answer to 'categorization_applied.csv'.  API failures are logged to
'categorization_errors.csv' and retried with an increasing delay; the script
aborts after ten consecutive failures on the same page.

Because results are flushed to disk after every page, the script can be
interrupted and restarted: it resumes after the last page written.
"""
import csv
import os
import sys
import time

import openai
import pandas as pd
import tiktoken

### NOTE: This script uses the openai package version 0.27.8. More recent versions of the package may not work with this script.

OUTPUT_FILE = 'categorization_applied.csv'
ERROR_FILE = 'categorization_errors.csv'

# Abort once a single page has failed this many times in a row.
MAX_RETRIES = 10
# Pause briefly once this many tokens have been consumed since the last pause.
TOKEN_BUDGET = 89250

# Remove duplicates? Yes / No
# Explanation: Some website addresses are associated with more than one firm
#              identifier. We manually decided on the primary firm identifier
#              that each website address belongs to. Activating this cleaning
#              rule removes all of the non-primary, duplicate observations.
# Recommended: Yes.
setting_duplicate = "Yes"

# Create a set containing the overall GVKEYs for which pages need to be excluded (here: duplicate websites)
if setting_duplicate == "Yes":
    gvkeys_todrop = pd.read_excel("exclusion_list.xlsx", sheet_name='gvkeys')
    gvkeys_todrop = set(gvkeys_todrop['gvkey_withslash'].tolist())
else:
    gvkeys_todrop = set()

# This is what GPT3.5 / 4 use.
encoding = tiktoken.get_encoding("cl100k_base")

openai.api_key = "FILL IN YOUR OWN API KEY HERE"

# NOTE(review): the category list is laid out with one category per line;
# confirm these line breaks against the prompt used for earlier runs, because
# any change to the prompt text can change the model's answers.
list_definition = """1. Products & Services: Products or services offered, and example projects or case studies
2. News & Events: Articles, news updates, press releases, media kits, upcoming events, or blog posts related to the organization or its industry
3. Home: Main page or landing page
4. Contact & Locations: Contact information like address, email, contact or feedback forms, or information on physical locations or branches
5. Investor Relations: Financial reports, corporate governance information, or other shareholder information
6. About Us: Information about the organization or individual(s) behind the website
7. Legal: Site's terms and conditions, privacy policy, or other legal information
8. Resources, Support & Documentation: Assistance, troubleshooting guides, or customer support, and related resources available for download; FAQs; collections of images or videos
9. Sustainability & Social Responsibility: Sustainability-related and social efforts by the organization
10. Site Functionality: Website functionality, such as sitemaps, search functionality, user authentication or registration, the site's user forum or community, the e-commerce cart, etc.
11. Donate & Support: Asking for donations or support for a cause
12. Jobs & Opportunities: Information about job opportunities or career paths at the organization
13. Partners & Affiliates: Partners, sponsors, or affiliates of the organization
14. Team & Leadership: Team members, executives, or key personnel
15. Testimonials & Reviews: Customer testimonials or reviews about the offering of the organization
16. Other: Pages that don't fit into any of the other categories"""

# The system prompt is identical for every page, so build it once.
# NOTE(review): there is no separator between the end of the category list and
# "You will be provided ...".  It is left untouched on purpose so that the
# prompt sent to the model stays exactly as before.
system_prompt = "You are a research assistant tasked with classifying web pages into one of the following categories: "+list_definition+"You will be provided with the title of the HTML page and the URL, with the domain name removed. Please select only one of these categories and report the category and nothing else. If you cannot determine the category based on the information provided, please use the '16. Other' category. Only provide the category's number and nothing else."


def _ensure_csv(path, header, message):
    """Create *path* holding only the *header* row unless it already exists.

    *message* is printed when the file is created.  Checking for existence
    (instead of opening the file for reading) avoids leaving a file handle open.
    """
    if os.path.exists(path):
        return
    with open(path, "w", encoding="utf-8", newline='') as handle:
        csv.writer(handle).writerow(header)
    print(message)


df = pd.read_csv('input_categorization_allpages.csv', dtype=str)
df['gvkey_withslash'] = df['gvkey'].apply(lambda x: '/' + x)

_ensure_csv(
    OUTPUT_FILE,
    ['gvkey','year','id','site','HTML_title', 'path', 'path_cleaned','classification_GPT'],
    "Output file created",
)
_ensure_csv(
    ERROR_FILE,
    ['gvkey','year','id','site','HTML_title', 'path', 'path_cleaned','error','retries'],
    "Error file created",
)

# Read the tracking file and remember which page ids are already done.
# A set keeps the per-row "already categorized?" test O(1); on a first run the
# file only holds the header row, which yields an empty set.
csvreader = pd.read_csv(OUTPUT_FILE, low_memory=False)
previously_done = csvreader['id'].values.astype(str).tolist()
tracker = len(previously_done)
processed_files = set(previously_done)

totaltokens = 0
retries = 0

print("Starting categorization")

with open(OUTPUT_FILE, "a", encoding="utf-8", newline='') as f:
    csvwriter = csv.writer(f)
    for _, row in df.iterrows():
        # Do not categorize duplicates
        if row["gvkey_withslash"] in gvkeys_todrop:
            continue
        # Do not categorize front pages
        if row["id"].split("_")[2] == "0":
            continue
        # Do not categorize pages already categorized
        if row["id"] in processed_files:
            continue

        # Only send the title when there is a real one.  A missing cell is read
        # as NaN and is treated like the "missing" / "Untitled" placeholders
        # rather than being sent to the model as the text "nan".
        title = row["HTML_title"]
        if pd.isna(title) or title in ("missing", "Untitled"):
            content = "URL: "+str(row["path"])
        else:
            content = "Title: "+str(title)+", URL: "+str(row["path"])

        retries = 0
        while True:
            if totaltokens >= TOKEN_BUDGET:
                print("Close to limit; sleeping for two seconds")
                time.sleep(2)
                totaltokens = 0

            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": content},
                    ],
                    # We set a low temperature, which is more appropriate for classification tasks that require consistency rather than creativity.
                    # "Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
                    # We want it to be as replicable as possible.
                    # Limiting output tokens to two minimizes variation in output formatting.
                    temperature=0,
                    max_tokens=2,
                    request_timeout=5,
                    top_p=1,
                    frequency_penalty=0,
                    presence_penalty=0,
                )
            except (openai.error.APIError, openai.error.ServiceUnavailableError, openai.error.RateLimitError, openai.error.Timeout) as error:
                print(str(error)+" "+str(row["id"]))
                print(retries)
                # Log the failure before waiting so it is on disk even if the
                # script is interrupted during the back-off.
                with open(ERROR_FILE, "a", encoding="utf-8", newline='') as f2:
                    csv.writer(f2).writerow([row["gvkey"],row["year"],row['id'],row["site"],row["HTML_title"],row["path"],row["path_cleaned"],error,retries])
                backoff = 0.5 + retries * 2
                retries += 1
                if retries == MAX_RETRIES:
                    print("Ten retries: ending script.")
                    # Non-zero status: the run was aborted, not completed.
                    sys.exit(1)
                time.sleep(backoff)
                continue

            # The model answers like "3." - keep only the bare number.
            chat_response = response['choices'][0]['message']['content']
            chat_response = chat_response.replace(".","").strip()

            usage = response["usage"]
            totaltokens += usage["prompt_tokens"] + usage["completion_tokens"]

            csvwriter.writerow([row["gvkey"],row["year"],row['id'],row["site"],row["HTML_title"],row["path"],row["path_cleaned"],chat_response])
            # Flush after every page so an interrupted run loses nothing.
            f.flush()
            processed_files.add(row["id"])
            tracker += 1
            timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            print(str(row['id'])+" --- "+str(tracker)+" "+timestamp)
            break

# `retries` holds the number of failed attempts for the last page categorized.
print("Done, retries = "+str(retries)+" retries at end, "+str(tracker)+" rows completed out of "+str(len(df)))