import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import csv
import os
from urllib3.exceptions import HTTPError as BaseHTTPError
from url_normalize import url_normalize
import pyautogui
import datetime
from json.decoder import JSONDecodeError
from requests.exceptions import ConnectionError

pd.set_option('display.max_columns', None)

################################################################################
### Set-up
################################################################################

###########################################
### The block below will create a new file if not yet present.
### Here, we create the file that we use to track what has been scraped.
###########################################
try:
    createfile = open("scrapedURLs.csv", "r", encoding="utf-8")
    createfile.close()
except IOError:
    with open("scrapedURLs.csv", "w", encoding="utf-8", newline='') as createfile:
        csvwriter = csv.writer(createfile)
        csvwriter.writerow(['gvkey', 'year', 'level', 'nr', 'site', 'id', 'valid_scrape', 'timestamp', 'filename'])
    print("Tracking file created")

###########################################
# Read the input data (the full list of URLs one click deeper).
df = pd.read_csv("URLs_1_deeper.csv", low_memory=False, converters={'gvkey': str})
# If you want to make a selection of years, uncomment the following:
# df = df[(df['year'].astype(int) > 2013)]
print("Data loaded")

###########################################
# Read the tracking file and create a list of already-scraped ids from it.
csvreader = pd.read_csv("scrapedURLs.csv", low_memory=False)
newurls_keeptracklist = csvreader['id'].values.astype(str).tolist()
# If this is the first time running, start from an empty list.
if newurls_keeptracklist is None:
    newurls_keeptracklist = []

################################################################################
### Scraping
################################################################################

# Set up headers for the requests package (otherwise some sites block the request).
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

# Newly scraped pages are appended to the scrapedURLs.csv tracking file.
with open("scrapedURLs.csv", "a", encoding="utf-8", newline='') as file:
    csvwriter = csv.writer(file)
    i = 0
    # Loop through the rows of the input file.
    for index, row in df.iterrows():
        # Read relevant data from the row.
        year = row['year']
        gvkey = str(row['gvkey'])
        # We take the middle day of the year as the reference point.
        # This ensures that if there is any snapshot in the year, it will be grabbed,
        # since the Wayback Machine returns the page 'closest' to the reference point.
        timestamp = datetime.date(year=int(year), month=7, day=2).strftime("%Y%m%d")
        # Grab the URL and clean it, to be sure.
        website = row['deeperlink']
        website = website.lower()
        website = website.replace('http://', '')
        website = website.replace('https://', '')
        website = website.replace('www.', '')
        website = "https://www." + website
        # Remove the right-most '/'.
        if website[-1:] == '/':
            website = website[:-1]
        # Check if the URL can be normalized; if not, skip and document.
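        # url_normalize() returns a canonical form of the URL (lower-cased scheme and host,
        # normalized percent-encoding); it can fail on malformed links, which is why the
        # broad except below logs the row and moves on.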
        try:
            website = url_normalize(website)
        except Exception as e:
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "URL normalization error", 0, ""])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue
        if website[-1:] == '/':
            website = website[:-1]

        # If the page has not yet been collected, collect it.
        if newurls_keeptracklist.count(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr'])) > 0:
            i += 1
            continue
        else:
            print(str(i)+" "+str(year)+" "+str(gvkey)+" "+str(row['level'])+" "+str(row['nr'])+" "+website)

        # We first check if and which pages are available on the Wayback Machine.
        waybackurl = "http://archive.org/wayback/available?url="+website+"&timestamp="+str(timestamp)
        while True:
            try:
                response = requests.get(waybackurl)
                snapshots = response.json()["archived_snapshots"]
            except ConnectionError:
                print("-------------------------------------------")
                print("Connection error: sleeping for one minute  ")
                print("-------------------------------------------")
                time.sleep(60)
                continue
            except JSONDecodeError:
                print("-------------------------------------------")
                print("JSON decode error: sleeping for one minute ")
                print("-------------------------------------------")
                time.sleep(60)
                continue
            break

        # Check if there are any snapshots.
        # If not, document and continue.
        if len(snapshots) == 0:
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "No snapshot available", str(timestamp)])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue
        else:
            snapshot = snapshots["closest"]
            snapshotyear = snapshot['timestamp']

        # Check if the closest snapshot is in the same year.
        # If not, document and continue.
        if str(snapshotyear[0:4]) != str(year):
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "No snapshot in year", str(timestamp)])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue
        else:
            snapshot = snapshot["url"]

        # Scrape the archived page.
        try:
            page = requests.get(snapshot, allow_redirects=True, timeout=60, headers=headers)
        # Many different errors are possible here.
        # Catch them all and document; these rows can be revisited later (often, retrying later solves the issue).
        except Exception as e:
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), str(e), str(timestamp)])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue

        # If the status code is not valid, document and continue. Can also be retried later.
        if page.status_code != 200:
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), page.status_code, str(timestamp)])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue

        # Process the collected website and save it as HTML.
        try:
            webpage = page.content
            # Catch HTML processing errors.
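            # 'html.parser' is Python's built-in parser; malformed or non-HTML snapshot
            # content can make parsing fail, so the two error types handled below are
            # logged and the row is skipped rather than stopping the run.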
            try:
                soup = BeautifulSoup(webpage, "html.parser")
            except UnboundLocalError:
                csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "HTML parse error", str(timestamp)])
                newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
                i += 1
                continue
            except TypeError:
                csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "HTML type error", str(timestamp)])
                newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
                i += 1
                continue

            # In some cases, the HTML is completely empty.
            if len(str(soup)) == 0:
                csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "Empty HTML", str(timestamp)])
                newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
                i += 1
                continue

            # Write the HTML and document successful collection.
            # In extremely rare cases (1 out of 3.2 million in our case), there could be an encoding issue that is only found when writing the file.
            htmlerror = 0
            with open("HTML\\"+str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr'])+".html", "w", encoding="utf-8") as htmlcode_out:
                try:
                    htmlcode_out.write(str(soup))
                    csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "Collected", str(timestamp)])
                    newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
                except UnicodeEncodeError:
                    csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "HTML encode error", str(timestamp)])
                    newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
                    htmlerror = 1
            # The block below removes the empty created file in case of the extremely rare encoding issue.
            if htmlerror == 1 and os.path.exists("HTML\\"+str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr'])+".html"):
                os.remove("HTML\\"+str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr'])+".html")
                htmlerror = 0
                print("Removed "+str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr'])+".html")
        # Catch issues with the data collection. Can also be retried later.
        except RecursionError:
            csvwriter.writerow([str(gvkey), str(year), str(row['level']), str(row['nr']), website, str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']), "Recursion error", str(timestamp)])
            newurls_keeptracklist.append(str(year)+"_"+str(gvkey)+"_"+str(row['level'])+"_"+str(row['nr']))
            i += 1
            continue

        # Update the CSV file.
        file.flush()
        i += 1
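
################################################################################
### Optional post-run summary (a minimal sketch): re-read the tracking file and
### tally the outcomes recorded in its 'valid_scrape' column, to show how many
### pages were collected and how many rows may be worth retrying later.
################################################################################
summary = pd.read_csv("scrapedURLs.csv", low_memory=False)
print(summary['valid_scrape'].value_counts())
print("Collected: "+str((summary['valid_scrape'] == "Collected").sum())+" of "+str(len(summary))+" tracked pages")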