{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [], "authorship_tag": "ABX9TyNEijzfvGyqBzD3laRiD3yo", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "Created by Kristin Tynski (Kristin@frac.tl) Co-Founder at Frac.tl\n" ], "metadata": { "id": "pVZkkw00XFYL" } }, { "cell_type": "code", "source": [ "!pip install bs4\n", "!pip install nltk\n", "!pip install xlsxwriter\n", "!pip install newspaper3k\n", "!pip install openai" ], "metadata": { "id": "HLKQql_rBG5g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6AoXFUxW_-J9" }, "outputs": [], "source": [ "# Import necessary libraries\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.probability import FreqDist\n", "from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder\n", "\n", "# Define a function to scrape Google search results and create a dataframe\n", "def scrape_google(query):\n", " # Set headers to mimic a web browser\n", " headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}\n", " # Build the URL with the query parameter\n", " url = f'https://www.google.com/search?q={query}'\n", " # Send a request to the URL and store the HTML content\n", " html = requests.get(url, headers=headers).content\n", " # Use BeautifulSoup to parse the HTML content\n", " soup = BeautifulSoup(html, 'html.parser')\n", " # Find all the search result elements\n", " search_results = soup.find_all('div', {'class': 'g'})\n", " # Initialize an empty list to store the search results\n", " results = []\n", " # Loop through each search result and extract the relevant information\n", " for result in search_results:\n", " try:\n", " title = result.find('h3').text\n", " url = result.find('a')['href']\n", " results.append((title, url))\n", " except:\n", " continue\n", " # Create a dataframe from the search results\n", " df = pd.DataFrame(results, columns=['Title', 'URL'])\n", " df.to_csv(\"Scraped_URLs_From_SERPS.csv\")\n", " return df\n", "\n" ] }, { "cell_type": "code", "source": [], "metadata": { "id": "QCnGWQKpBkR7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import newspaper\n", "from newspaper import Article\n", "\n", "def scrape_article(url):\n", " try:\n", " article = Article(url)\n", " article.download()\n", " article.parse()\n", " return article.text\n", " except:\n", " return \"\"\n", "\n" ], "metadata": { "id": "AVmr-Ac1Bsxz" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('tagsets')\n", "nltk.download('words')\n", "nltk.download('maxent_ne_chunker')\n", "nltk.download('vader_lexicon')\n", "nltk.download('inaugural')\n", "nltk.download('webtext')\n", "nltk.download('treebank')\n", "nltk.download('gutenberg')\n", "nltk.download('genesis')\n", "nltk.download('trigram_collocations')\n", "nltk.download('quadgram_collocations')\n", "\n", "# Define a function to 
{ "cell_type": "code", "source": [
"import nltk\n",
"from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder\n",
"from nltk.collocations import QuadgramAssocMeasures, QuadgramCollocationFinder\n",
"\n",
"# Define the main function to scrape Google search results and analyze the article text\n",
"def analyze_serps(query):\n",
"    # Scrape Google search results and create a dataframe\n",
"    df = scrape_google(query)\n",
"    # Scrape article text for each search result and store it in the dataframe\n",
"    for index, row in df.iterrows():\n",
"        url = row['URL']\n",
"        article_text = scrape_article(url)\n",
"        df.at[index, 'Article Text'] = article_text\n",
"    # Analyze the article text for each search result and store the NLP results in the dataframe\n",
"    stop_words = set(stopwords.words('english'))\n",
"    for index, row in df.iterrows():\n",
"        text = row['Article Text']\n",
"        # Tokenize the text and remove stop words plus common boilerplate terms\n",
"        tokens = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words and 'contact' not in word.lower() and 'admin' not in word.lower()]\n",
"        # Calculate the frequency distribution of the tokens\n",
"        fdist = FreqDist(tokens)\n",
"        # Calculate the 20 most common words\n",
"        most_common = fdist.most_common(20)\n",
"        # Calculate the 20 least common words\n",
"        least_common = fdist.most_common()[-20:]\n",
"        # Calculate the 20 most common bigrams\n",
"        bigram_measures = BigramAssocMeasures()\n",
"        finder = BigramCollocationFinder.from_words(tokens)\n",
"        bigrams = finder.nbest(bigram_measures.raw_freq, 20)\n",
"        # Calculate the 20 most common trigrams\n",
"        trigram_measures = TrigramAssocMeasures()\n",
"        finder = TrigramCollocationFinder.from_words(tokens)\n",
"        trigrams = finder.nbest(trigram_measures.raw_freq, 20)\n",
"        # Calculate the 20 most common quadgrams\n",
"        quadgram_measures = QuadgramAssocMeasures()\n",
"        finder = QuadgramCollocationFinder.from_words(tokens)\n",
"        quadgrams = finder.nbest(quadgram_measures.raw_freq, 20)\n",
"        # Calculate the part-of-speech tags for the filtered tokens\n",
"        pos_tags = nltk.pos_tag(tokens)\n",
"        # Store the NLP results in the dataframe\n",
"        df.at[index, 'Most Common Words'] = ', '.join([word[0] for word in most_common])\n",
"        df.at[index, 'Least Common Words'] = ', '.join([word[0] for word in least_common])\n",
"        df.at[index, 'Most Common Bigrams'] = ', '.join([f'{bigram[0]} {bigram[1]}' for bigram in bigrams])\n",
"        df.at[index, 'Most Common Trigrams'] = ', '.join([f'{trigram[0]} {trigram[1]} {trigram[2]}' for trigram in trigrams])\n",
"        df.at[index, 'Most Common Quadgrams'] = ', '.join([f'{quadgram[0]} {quadgram[1]} {quadgram[2]} {quadgram[3]}' for quadgram in quadgrams])\n",
"        df.at[index, 'POS Tags'] = ', '.join([f'{token}/{tag}' for token, tag in pos_tags])\n",
"        # Replace any remaining commas with spaces in the Article Text column\n",
"        df.at[index, 'Article Text'] = ' '.join(row['Article Text'].replace(',', ' ').split())\n",
"    # Save the final dataframe as an Excel file\n",
"    with pd.ExcelWriter('NLP_Based_SERP_Results.xlsx', engine='xlsxwriter') as writer:\n",
"        df.to_excel(writer, sheet_name='Sheet1', index=False)\n",
"    # Return the final dataframe\n",
"    return df\n",
"\n"
], "metadata": { "id": "RoDhKV-8ByKC" }, "execution_count": null, "outputs": [] },
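{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added example (not in the original notebook):* analyze_serps() ranks n-grams purely by raw frequency. The sketch below runs the same NLTK collocation finder on a tiny, made-up token list and shows how PMI scoring could be swapped in when statistically surprising phrases are more useful than merely frequent ones.\n" ] },
{ "cell_type": "code", "metadata": {}, "execution_count": null, "outputs": [], "source": [
"# Added illustrative sketch: raw-frequency vs. PMI ranking of bigrams on a toy token list,\n",
"# using the same NLTK collocation API that analyze_serps() relies on. The tokens are made up.\n",
"from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder\n",
"\n",
"toy_tokens = ['seo', 'content', 'strategy', 'seo', 'content', 'audit',\n",
"              'keyword', 'research', 'seo', 'content', 'strategy']\n",
"measures = BigramAssocMeasures()\n",
"finder = BigramCollocationFinder.from_words(toy_tokens)\n",
"print('Top bigrams by raw frequency:', finder.nbest(measures.raw_freq, 3))\n",
"print('Top bigrams by PMI:', finder.nbest(measures.pmi, 3))\n"
] },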
{ "cell_type": "code", "source": [
"import statistics\n",
"import collections\n",
"\n",
"# Define a function to summarize the NLP results from the dataframe\n",
"def summarize_nlp(df):\n",
"    # Calculate the total number of search results\n",
"    total_results = len(df)\n",
"    # Calculate the average length of the article text in characters\n",
"    avg_length = round(df['Article Text'].apply(len).mean(), 2)\n",
"    # Join the per-result strings with ', ' so entries from adjacent rows stay separated\n",
"    all_words = ', '.join(df['Most Common Words'])\n",
"    all_bigrams = ', '.join(df['Most Common Bigrams'])\n",
"    all_trigrams = ', '.join(df['Most Common Trigrams'])\n",
"    all_quadgrams = ', '.join(df['Most Common Quadgrams'])\n",
"    all_tags = ', '.join(df['POS Tags'])\n",
"    # Calculate the median number of words in the article text\n",
"    median_words = statistics.median(df['Article Text'].apply(lambda x: len(x.split())).tolist())\n",
"    # Calculate the frequency of each word, bigram, trigram and quadgram across all search results, skipping empty entries\n",
"    word_freqs = collections.Counter(w for w in all_words.split(', ') if w)\n",
"    bigram_freqs = collections.Counter(b for b in all_bigrams.split(', ') if b)\n",
"    trigram_freqs = collections.Counter(t for t in all_trigrams.split(', ') if t)\n",
"    quadgram_freqs = collections.Counter(q for q in all_quadgrams.split(', ') if q)\n",
"    # Keep the top 20% most frequent words, bigrams, trigrams and quadgrams\n",
"    top_words = ', '.join([word[0] for word in word_freqs.most_common(int(len(word_freqs) * 0.2))])\n",
"    top_bigrams = ', '.join([bigram[0] for bigram in bigram_freqs.most_common(int(len(bigram_freqs) * 0.2))])\n",
"    top_trigrams = ', '.join([trigram[0] for trigram in trigram_freqs.most_common(int(len(trigram_freqs) * 0.2))])\n",
"    top_quadgrams = ', '.join([quadgram[0] for quadgram in quadgram_freqs.most_common(int(len(quadgram_freqs) * 0.2))])\n",
"\n",
"    # Build the summary string that is passed to the outline generator\n",
"    summary = \"\"\n",
"    summary += f'Total results: {total_results}\\n'\n",
"    summary += f'Average article length: {avg_length} characters\\n'\n",
"    summary += f'Median words per article: {median_words}\\n'\n",
"    summary += f'Most common words: {top_words} ({len(word_freqs)} total words)\\n'\n",
"    summary += f'Most common bigrams: {top_bigrams} ({len(bigram_freqs)} total bigrams)\\n'\n",
"    summary += f'Most common trigrams: {top_trigrams} ({len(trigram_freqs)} total trigrams)\\n'\n",
"    summary += f'Most common quadgrams: {top_quadgrams} ({len(quadgram_freqs)} total quadgrams)\\n'\n",
"    return summary\n",
"\n"
], "metadata": { "id": "4JrKikJeJz4X" }, "execution_count": null, "outputs": [] },
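{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added example (not in the original notebook):* a tiny hand-built dataframe that mimics the columns analyze_serps() produces, so summarize_nlp() can be sanity-checked without scraping anything. All values are fabricated.\n" ] },
{ "cell_type": "code", "metadata": {}, "execution_count": null, "outputs": [], "source": [
"# Added illustrative sketch: exercise summarize_nlp() on fabricated data that follows the\n",
"# column layout produced by analyze_serps(). No scraping or API calls are involved.\n",
"demo_df = pd.DataFrame({\n",
"    'Article Text': ['seo content strategy guide', 'keyword research for seo content'],\n",
"    'Most Common Words': ['seo, content, strategy', 'seo, content, keyword'],\n",
"    'Most Common Bigrams': ['seo content, content strategy', 'keyword research, seo content'],\n",
"    'Most Common Trigrams': ['seo content strategy', 'keyword research seo'],\n",
"    'Most Common Quadgrams': ['seo content strategy guide', 'keyword research seo content'],\n",
"    'POS Tags': ['seo/NN, content/NN, strategy/NN', 'keyword/NN, research/NN, seo/NN'],\n",
"})\n",
"print(summarize_nlp(demo_df))\n"
] },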
{ "cell_type": "code", "source": [
"import openai\n",
"from IPython.display import Markdown, display\n",
"\n",
"# Replace with your own OpenAI API key\n",
"openai.api_key = \"Your API Key\"\n",
"\n",
"def generate_article_outline(nlp_summary, query):\n",
"    prompt = f\"Write a long-form, highly comprehensive and well-formatted article outline for the query: {query} in highly nested markdown. The outline should be similar to a Wikipedia table of contents, incorporate the following NLP SEO guidance from the SERP evaluation, and ignore data unrelated to the query:\\n\\n{nlp_summary}\\n\\n\"\n",
"\n",
"    response = openai.ChatCompletion.create(\n",
"        model='gpt-3.5-turbo',\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"Simulate an exceptionally talented journalist and editor. Given the following instructions, think step by step and produce the best possible output you can.\"},\n",
"            {\"role\": \"user\", \"content\": prompt}],\n",
"        max_tokens=2048,\n",
"        n=1,\n",
"        stop=None,\n",
"        temperature=0.3,\n",
"    )\n",
"\n",
"    outline = response['choices'][0]['message']['content'].strip()\n",
"    return outline\n",
"\n",
"\n",
"# Test the script on a sample query\n",
"query = 'Transgender Rights'\n",
"results = analyze_serps(query)\n",
"summary = summarize_nlp(results)\n",
"print(summary)\n",
"getoutline = generate_article_outline(summary, query)\n",
"display(Markdown(getoutline))"
], "metadata": { "id": "xeH01QSvLl3-" }, "execution_count": null, "outputs": [] }
] }