"Open

In [None]:
!pip install openai
!pip install google-search-results
!pip install newspaper3k
!pip install sentence-transformers
!pip install sklearn


In [None]:
import openai
import pandas as pd
import numpy as np
import requests
import json
import os
from serpapi import GoogleSearch
from newspaper import Article
from newspaper import Article, ArticleException

import nltk
import concurrent.futures
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
from transformers import GPT2Tokenizer

# GPT-2 tokenizer; download_and_parse_article uses it to count and truncate tokens.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


nltk.download('punkt')

# Prefer credentials from the environment so real keys never have to be typed into
# (and saved with) the notebook; the original placeholders remain the fallback.
serpapi_key = os.environ.get('SERPAPI_API_KEY', 'Your SerpAPI Key')
openai.api_key = os.environ.get('OPENAI_API_KEY', 'Your OpenAI Api Key')

def download_and_parse_article(url, max_tokens=2000):
    """Download the article at *url* and return its text, capped at *max_tokens*.

    The text is "" when newspaper raises ``ArticleException``. Length is
    measured with the module-level GPT-2 ``tokenizer``; text that fits within
    the limit is returned unchanged, longer text is rebuilt from its first
    *max_tokens* tokens.
    """
    try:
        # The constructor stays inside the try: it is covered by the same handler.
        page = Article(url)
        page.download()
        page.parse()
    except ArticleException:
        body = ""
    else:
        body = page.text

    pieces = tokenizer.tokenize(body)
    if len(pieces) <= max_tokens:
        return body

    # Too long: keep only the leading tokens and turn them back into a string.
    return tokenizer.convert_tokens_to_string(pieces[:max_tokens])


def get_google_news_data(query, num_results=10, max_tokens=1000):
    """Fetch Google News results for *query* through the SerpAPI JSON endpoint.

    Args:
        query: Search string, sent as the ``q`` parameter.
        num_results: Number of results requested, sent as the ``num`` parameter.
        max_tokens: Unused by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``date``,
        ``source`` and ``location``. The list is empty when the response
        contains no ``news_results`` entry.

    Raises:
        requests.HTTPError: If the endpoint answers with an HTTP error status.
    """
    params = {
        "api_key": serpapi_key,
        "engine": "google",
        "q": query,
        "tbm": "nws",
        "num": num_results,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get('https://serpapi.com/search.json', params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    if 'news_results' not in data and 'error' in data:
        # Surface the API's own explanation instead of failing with a KeyError.
        print(f"SerpAPI returned no news results: {data['error']}")

    articles = []
    for result in data.get('news_results', []):
        article = {
            'title': result['title'],
            'link': result['link'],
            # Treated as optional metadata: a missing field becomes None.
            'date': result.get('date'),
            'source': result.get('source'),
            # NOTE(review): hard-coded on every article; this looks like it may have
            # been meant as a search parameter rather than article data - confirm.
            "location": "Italy",
        }
        articles.append(article)
        print(article['title'])
    return articles


def extract_main_themes(text):
    """Ask gpt-3.5-turbo for a short overview of the main themes in *text*.

    Args:
        text: Article text to analyze; it is interpolated verbatim into the
            user prompt.

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace. The completion is limited to 50 tokens.
    """
    # No brace escaping is needed: text substituted into an f-string is
    # inserted as-is and never re-parsed as a format template. (The previous
    # replace('{', '{') / replace('}', '}') calls changed nothing.)
    prompt = f"You are an all-knowing journalist. You have an exceptional ability to understand what matters in a story. Please provide a concise overview of the main themes and concepts present in the following text: {text}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an expert at analyzing text and extracting main themes and concepts."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=50,
        n=1,
        stop=None,
        temperature=0.5,
    )
    main_themes = response['choices'][0]['message']['content'].strip()
    return main_themes




def cluster_articles(articles, num_clusters=5):
    """Cluster articles by the sentence embedding of their ``main_themes``.

    Args:
        articles: Sequence of dicts, each with a ``main_themes`` string.
        num_clusters: Requested number of clusters. It is capped at
            ``len(articles)`` because KMeans cannot fit more clusters than
            it has samples.

    Returns:
        An array of cluster labels, one per article and in the same order.
        Empty when *articles* is empty.
    """
    if not articles:
        # Nothing to embed or fit; skip loading the model entirely.
        return np.array([], dtype=int)

    # KMeans raises ValueError when n_clusters exceeds the number of samples.
    num_clusters = min(num_clusters, len(articles))

    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode([article['main_themes'] for article in articles])
    # Fixed random_state keeps the cluster assignment reproducible between runs.
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)

    return kmeans.labels_




def create_master_evaluation(grouped_articles):
    """Summarize each cluster's themes into one "master evaluation" via gpt-4.

    Args:
        grouped_articles: Mapping of cluster id to a list of article dicts,
            each with a ``main_themes`` entry.

    Returns:
        A list with one evaluation string per cluster, in the iteration order
        of ``grouped_articles.values()``. Clusters without articles get an
        empty string so the positions stay aligned with the clusters.
    """
    master_evaluations = []

    for group in grouped_articles.values():
        if not group:
            # An empty cluster has nothing to summarize; avoid a paid API call
            # on an empty list while keeping this cluster's slot in the result.
            master_evaluations.append("")
            continue

        main_themes = [article['main_themes'] for article in group]
        prompt = f"Please provide a master evaluation that summarizes the main themes and concepts of the following articles' themes:\n\n{main_themes}"
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert at analyzing and summarizing main themes and concepts of articles."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=150,
            n=1,
            stop=None,
            temperature=0.5,
        )
        master_evaluation = response['choices'][0]['message']['content'].strip()
        master_evaluations.append(master_evaluation)

    return master_evaluations




def generate_content(grouped_articles, master_evaluations):
    """Produce one newsjacking idea per article from its cluster's evaluation.

    For every article of every cluster, gpt-4 is asked for a story title,
    description and dataset sources based on that cluster's master
    evaluation (looked up as ``master_evaluations[cluster_id]``).

    Returns a list of dicts with the keys ``cluster_id``,
    ``master_evaluation``, ``title`` and ``generated_content``. If a request
    fails, the error text is stored as the generated content, so every
    article still yields one record.
    """
    records = []

    for cluster_id, cluster_members in grouped_articles.items():
        master_evaluation = master_evaluations[cluster_id]
        # The request depends only on the cluster, so build it once per cluster.
        prompt = f"Based on the master evaluation of the cluster '{master_evaluation}', please provide a story title, description, and dataset sources for newsjacking ideation."
        messages = [
            {"role": "system", "content": "You are an expert at generating newsjacking ideas based on clustered article evaluations."},
            {"role": "user", "content": prompt},
        ]

        def make_record(title, text):
            return {
                'cluster_id': cluster_id,
                'master_evaluation': master_evaluation,
                'title': title,
                'generated_content': text,
            }

        for member in cluster_members:
            headline = member['title']
            try:
                reply = openai.ChatCompletion.create(
                    model="gpt-4",
                    messages=messages,
                    max_tokens=200,
                    n=1,
                    stop=None,
                    temperature=0.5,
                )
                idea = reply['choices'][0]['message']['content'].strip()
                records.append(make_record(headline, idea))
                print(f"Added content: {idea}")
            except Exception as exc:
                # Best effort: report the failure and keep a record for the article.
                failure = f"Error generating content: {str(exc)}"
                print(failure)
                records.append(make_record(headline, failure))

    return records











if __name__ == '__main__':
    # Pipeline: search news -> download articles -> extract themes -> cluster
    # -> summarize clusters -> generate ideas -> collect into a DataFrame.
    query = "Your Topic"
    num_results = 100  # Set the number of Google News Articles to Scrape
    num_clusters = 10  # Set the number of clusters you would like to create (usually around 10% of the number of results you ask for, but you can play with different values)
    # NOTE(review): num_ideas is not referenced by any code visible here - confirm
    # whether it was meant to control how many ideas are generated per cluster.
    num_ideas = 5  # Set the number of content ideas per cluster

    articles = get_google_news_data(query, num_results, max_tokens=2000)

    # Both steps are network-bound, so they are fanned out over threads.
    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
        article_texts = list(executor.map(download_and_parse_article, [article['link'] for article in articles]))

    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:
        main_themes_list = list(executor.map(extract_main_themes, article_texts))

    for article, main_themes in zip(articles, main_themes_list):
        article['main_themes'] = main_themes

    grouped_articles = {}
    master_evaluations = []
    content = []

    if articles:
        # Never ask for more clusters than there are articles to put in them.
        effective_clusters = min(num_clusters, len(articles))
        labels = cluster_articles(articles, effective_clusters)

        grouped_articles = {i: [] for i in range(effective_clusters)}
        for article, label in zip(articles, labels):
            grouped_articles[int(label)].append(article)

        master_evaluations = create_master_evaluation(grouped_articles)
        content = generate_content(grouped_articles, master_evaluations)
    else:
        print("No articles found; nothing to cluster.")

    content_df = pd.DataFrame(content)

    print("Generated Content DataFrame:")
    print(content_df)




In [None]:
# Save the generated-content DataFrame built by the previous cell to text.csv.
content_df.to_csv('text.csv')