{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [], "authorship_tag": "ABX9TyOpQqUlBBGpiR2NnGQVR2NV", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "M3TtyjflEtOu" }, "outputs": [], "source": [ "!pip install openai\n", "!pip install google-search-results\n", "!pip install newspaper3k\n", "!pip install sentence-transformers\n", "!pip install sklearn\n" ] }, { "cell_type": "code", "source": [ "import openai\n", "import pandas as pd\n", "import numpy as np\n", "import requests\n", "import json\n", "import os\n", "from serpapi import GoogleSearch\n", "from newspaper import Article\n", "from newspaper import Article, ArticleException\n", "\n", "import nltk\n", "import concurrent.futures\n", "from sklearn.cluster import KMeans\n", "from sentence_transformers import SentenceTransformer\n", "from transformers import GPT2Tokenizer\n", "\n", "tokenizer = GPT2Tokenizer.from_pretrained('gpt2')\n", "\n", "\n", "nltk.download('punkt')\n", "\n", "serpapi_key = 'Your SerpAPI Key'\n", "openai.api_key = 'Your OpenAI Api Key'\n", "\n", "def download_and_parse_article(url, max_tokens=2000):\n", " try:\n", " article = Article(url)\n", " article.download()\n", " article.parse()\n", " article_text = article.text\n", " except ArticleException:\n", " article_text = \"\"\n", "\n", " # Truncate the article text to the maximum number of tokens\n", " tokens = tokenizer.tokenize(article_text)\n", " if len(tokens) > max_tokens:\n", " article_text = tokenizer.convert_tokens_to_string(tokens[:max_tokens])\n", "\n", " return article_text\n", "\n", "\n", "def get_google_news_data(query, num_results=10, max_tokens=1000):\n", " params = {\n", " \"api_key\": serpapi_key,\n", " \"engine\": \"google\",\n", " \"q\": query,\n", " \"tbm\": \"nws\",\n", " \"num\": num_results\n", " }\n", " response = requests.get('https://serpapi.com/search.json', params=params)\n", " data = json.loads(response.text)\n", " articles = []\n", " for result in data['news_results']:\n", " article = {\n", " 'title': result['title'],\n", " 'link': result['link'],\n", " 'date': result['date'],\n", " 'source': result['source']\n", " \"location\": \"Italy\",\n", " }\n", " articles.append(article)\n", " print(article['title'])\n", " return articles\n", "\n", "\n", "def extract_main_themes(text):\n", " text = text.replace('{', '{').replace('}', '}') # Add this line to replace curly braces\n", "\n", " prompt = f\"You are an all-knowing journalist. You have an exceptional ability to understand what matters in a story. 
Please provide a concise overview of the main themes and concepts present in the following text: {text}\"\n", " response = openai.ChatCompletion.create(\n", " model=\"gpt-3.5-turbo\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are an expert at analyzing text and extracting main themes and concepts.\"},\n", " {\"role\": \"user\", \"content\": prompt},\n", " ],\n", " max_tokens=50,\n", " n=1,\n", " stop=None,\n", " temperature=0.5,\n", " )\n", " main_themes = response['choices'][0]['message']['content'].strip()\n", " return main_themes\n", "\n", "\n", "\n", "\n", "def cluster_articles(articles, num_clusters=5):\n", " model = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n", " embeddings = model.encode([article['main_themes'] for article in articles])\n", " kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(embeddings)\n", "\n", " return kmeans.labels_\n", "\n", "\n", "\n", "\n", "def create_master_evaluation(grouped_articles):\n", " master_evaluations = []\n", "\n", " for group in grouped_articles.values():\n", " main_themes = [article['main_themes'] for article in group]\n", " prompt = f\"Please provide a master evaluation that summarizes the main themes and concepts of the following articles' themes:\\n\\n{main_themes}\"\n", " response = openai.ChatCompletion.create(\n", " model=\"gpt-4\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are an expert at analyzing and summarizing main themes and concepts of articles.\"},\n", " {\"role\": \"user\", \"content\": prompt},\n", " ],\n", " max_tokens=150,\n", " n=1,\n", " stop=None,\n", " temperature=0.5,\n", " )\n", " master_evaluation = response['choices'][0]['message']['content'].strip()\n", " master_evaluations.append(master_evaluation)\n", "\n", " return master_evaluations\n", "\n", "\n", "\n", "\n", "def generate_content(grouped_articles, master_evaluations):\n", " content = []\n", "\n", " for cluster_id, articles in grouped_articles.items():\n", " master_evaluation = master_evaluations[cluster_id]\n", "\n", " for article in articles:\n", " title = article['title']\n", " prompt = f\"Based on the master evaluation of the cluster '{master_evaluation}', please provide a story title, description, and dataset sources for newsjacking ideation.\"\n", "\n", " try:\n", " response = openai.ChatCompletion.create(\n", " model=\"gpt-4\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are an expert at generating newsjacking ideas based on clustered article evaluations.\"},\n", " {\"role\": \"user\", \"content\": prompt},\n", " ],\n", " max_tokens=200,\n", " n=1,\n", " stop=None,\n", " temperature=0.5,\n", " )\n", " generated_content = response['choices'][0]['message']['content'].strip()\n", " content.append({\n", " 'cluster_id': cluster_id,\n", " 'master_evaluation': master_evaluation,\n", " 'title': title,\n", " 'generated_content': generated_content\n", " })\n", " print(f\"Added content: {generated_content}\")\n", " except Exception as e:\n", " print(f\"Error generating content: {str(e)}\")\n", " content.append({\n", " 'cluster_id': cluster_id,\n", " 'master_evaluation': master_evaluation,\n", " 'title': title,\n", " 'generated_content': f\"Error generating content: {str(e)}\"\n", " })\n", "\n", " return content\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "if __name__ == '__main__':\n", " query = \"Your Topic\"\n", " num_results = 100 # Set the number of Google News Articles to Scrape\n", " num_clusters = 10 # Set the number of clusters you would like to 
        "    num_ideas = 5  # Set the number of content ideas per cluster\n",
        "\n",
        "    articles = get_google_news_data(query, num_results)\n",
        "\n",
        "    # Download article text and extract main themes concurrently\n",
        "    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:\n",
        "        article_texts = list(executor.map(download_and_parse_article, [article['link'] for article in articles]))\n",
        "\n",
        "    with concurrent.futures.ThreadPoolExecutor(max_workers=40) as executor:\n",
        "        main_themes_list = list(executor.map(extract_main_themes, article_texts))\n",
        "\n",
        "    for i, main_themes in enumerate(main_themes_list):\n",
        "        articles[i]['main_themes'] = main_themes\n",
        "\n",
        "    labels = cluster_articles(articles, num_clusters)\n",
        "\n",
        "    grouped_articles = {i: [] for i in range(num_clusters)}\n",
        "    for i, label in enumerate(labels):\n",
        "        grouped_articles[label].append(articles[i])\n",
        "\n",
        "    master_evaluations = create_master_evaluation(grouped_articles)\n",
        "    content = generate_content(grouped_articles, master_evaluations)\n",
        "    content_df = pd.DataFrame(content)\n",
        "\n",
        "    print(\"Generated Content DataFrame:\")\n",
        "    print(content_df)\n"
      ],
      "metadata": {
        "id": "Zf6ABC0EAhhY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "content_df.to_csv('text.csv')"
      ],
      "metadata": {
        "id": "-2vdNG7cHzuo"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}