"Open

In [None]:
!pip install beautifulsoup4 google-api-python-client google-auth google-auth-httplib2 httplib2 requests
!pip install UserAgent
!pip install newspaper3k
!pip install openai
!pip install transformers


In [None]:
import requests
import requests.exceptions
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import deque
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import newspaper
from newspaper import Article
import openai
import transformers
from transformers import GPT2Tokenizer
import time


# Placeholder credential: replace the string with a real OpenAI API key before
# running. Avoid committing a real key to version control.
openai.api_key = "Your OpenAI Key"


def truncate_string_to_max_tokens(input_string, max_tokens=1700):
    """Return the leading part of *input_string*, capped at *max_tokens* characters.

    NOTE: despite the name, no tokenizer is involved. The limit is applied to
    characters, which is only a rough stand-in for a real token budget.

    Args:
        input_string: Text to shorten. ``None`` is treated as empty text.
        max_tokens: Maximum number of characters to keep. ``None`` disables
            truncation; zero or a negative value yields an empty string.

    Returns:
        ``input_string`` unchanged when it already fits, otherwise its first
        ``max_tokens`` characters.
    """
    if input_string is None:
        return ""
    if max_tokens is None:
        return input_string
    # Clamp at zero: a negative bound would make the slice drop characters
    # from the *end* of the text instead of limiting its length.
    return input_string[:max(max_tokens, 0)]




def _fetch_article_text(url):
    """Download and parse *url* with newspaper; return '' if that fails."""
    try:
        article = Article(url)
        article.download()
        article.parse()
    except Exception:
        # Best effort: a page that cannot be downloaded or parsed still
        # produces an entry, just with empty content.
        print(f"Error downloading or parsing page: {url}")
        return ''
    return article.text


def _recommend_anchor_text(content):
    """Ask the chat model for an anchor-text suggestion based on *content*."""
    prompt = f"Content to analyze: \n ### {content} ### \n Recommended Anchor Text:"
    gpt_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    " Based on the following content, suggest an appropriate "
                    "anchor text with a focus on SEO, intent, and clickability.\n "
                ),
            },
            {
                "role": "user",
                "content": prompt,
            },
        ],
        max_tokens=20,
        n=1,
        stop=None,
        temperature=0.5,
    )
    return gpt_response["choices"][0]["message"]["content"].strip()


def _append_entry_to_csv(entry):
    """Append one entry as a header-less row to 'output.csv'."""
    # newline='' lets the CSV writer control line endings (avoids blank rows
    # on Windows); UTF-8 matches pandas' default encoding for CSV output.
    with open('output.csv', mode='a', newline='', encoding='utf-8') as f:
        pd.DataFrame([entry]).to_csv(f, header=False, index=False)


def process_url(current_url, base_url):
    """Collect the links on one page and recommend anchor text for each.

    Fetches ``current_url``, prints its title, and for every ``<a>`` whose
    resolved URL contains ``base_url`` downloads the linked page, truncates
    its text, asks the chat model for a recommended anchor text, and appends
    the result as a row to 'output.csv'.

    Args:
        current_url: Page to fetch and scan for links.
        base_url: Site root; only links whose absolute URL contains this
            string are processed.

    Returns:
        A list of dicts with the keys ``url``, ``anchor_text``,
        ``page_title``, ``linked_content`` and ``recommended_anchor_text``.
        The list is empty when ``current_url`` itself cannot be fetched.
    """
    data = []
    try:
        # The timeout keeps an unresponsive server from blocking a worker
        # thread forever.
        response = requests.get(current_url, timeout=30)
    except requests.exceptions.RequestException as exc:
        # Covers SSL errors as well as connection failures and timeouts.
        print(f"Error fetching page: {current_url} ({exc})")
        return data

    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('title')
    page_title = title_tag.text.strip() if title_tag else "No Title"
    print(page_title)

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href or href.startswith(('#', 'mailto:')):
            continue

        # Relative links are relative to the page they appear on, not to the
        # site root.
        full_url = urljoin(current_url, href)
        # NOTE(review): substring match, so an external URL that merely
        # contains base_url (e.g. in a query string) also passes.
        if base_url not in full_url:
            continue

        content = truncate_string_to_max_tokens(
            _fetch_article_text(full_url), max_tokens=2000
        )
        anchor_text_recommendation = _recommend_anchor_text(content)
        print(anchor_text_recommendation)

        entry = {
            'url': full_url,
            'anchor_text': link.text.strip(),
            'page_title': page_title,
            'linked_content': content,
            'recommended_anchor_text': anchor_text_recommendation,
        }
        data.append(entry)

        # Persist each row immediately so partial results survive a crash.
        _append_entry_to_csv(entry)

    return data


def crawl_website(base_url, max_pages=None, max_workers=5):
    """Crawl a site breadth-first, processing pages on a thread pool.

    Starts at ``base_url``, hands every not-yet-visited queued URL to
    ``process_url`` on a worker thread, and queues the link URLs it returns
    for a later round. 'output.csv' is (re)created with a header row before
    crawling starts.

    Args:
        base_url: Starting URL; also passed to ``process_url`` as the site
            root.
        max_pages: Maximum number of pages to submit for processing, or
            ``None`` for no limit.
        max_workers: Number of worker threads in the pool.

    Returns:
        A list with every entry returned by ``process_url`` for the crawled
        pages (duplicates included).
    """
    visited_links = set()
    links_to_visit = deque([base_url])
    data = []
    pages_crawled = 0

    def budget_left():
        return max_pages is None or pages_crawled < max_pages

    # Initialize CSV output file with headers
    df_header = pd.DataFrame(
        columns=['url', 'anchor_text', 'page_title', 'linked_content',
                 'recommended_anchor_text']
    )
    df_header.to_csv('output.csv', index=False)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        while links_to_visit and budget_left():
            # Submit the whole current frontier, remembering which URL each
            # future belongs to so failures can be reported.
            futures = {}
            while links_to_visit and budget_left():
                current_url = links_to_visit.popleft()
                if current_url in visited_links:
                    continue
                visited_links.add(current_url)
                future = executor.submit(process_url, current_url, base_url)
                futures[future] = current_url
                pages_crawled += 1

            for future in as_completed(futures):
                try:
                    new_data = future.result()
                except Exception as exc:
                    # Worker boundary: one failing page must not abort the
                    # crawl and discard everything collected so far.
                    print(f"Error processing page: {futures[future]} ({exc})")
                    continue
                for entry in new_data:
                    full_url = entry['url']
                    if full_url not in visited_links:
                        links_to_visit.append(full_url)
                    data.append(entry)

    return data

def main(base_url='https://frac.tl', max_crawl_limit=3):
    """Crawl a site and return its de-duplicated link entries as a DataFrame.

    Args:
        base_url: Site to crawl.
        max_crawl_limit: Maximum number of pages passed to ``crawl_website``
            as ``max_pages``.

    Returns:
        A pandas DataFrame with one row per unique
        ``(url, anchor_text, page_title)`` combination; entries with an empty
        anchor text are dropped. The frame is also printed.
    """
    data = crawl_website(base_url, max_pages=max_crawl_limit)

    # Deduplicate data and drop blanks, keeping the first occurrence.
    deduped_data = []
    seen_keys = set()
    for entry in data:
        unique_key = (entry['url'], entry['anchor_text'], entry['page_title'])
        if unique_key in seen_keys or not entry['anchor_text']:
            continue
        seen_keys.add(unique_key)
        deduped_data.append(entry)

    df = pd.DataFrame(deduped_data)
    print(df)
    return df

# Runs the crawl as soon as this cell/module executes and keeps the returned
# DataFrame in `result`.
result = main()