{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [], "authorship_tag": "ABX9TyNEijzfvGyqBzD3laRiD3yo", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "Created by Kristin Tynski (Kristin@frac.tl) Co-Founder at Frac.tl\n" ], "metadata": { "id": "pVZkkw00XFYL" } }, { "cell_type": "code", "source": [ "!pip install bs4\n", "!pip install nltk\n", "!pip install xlsxwriter\n", "!pip install newspaper3k\n", "!pip install openai" ], "metadata": { "id": "HLKQql_rBG5g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6AoXFUxW_-J9" }, "outputs": [], "source": [ "# Import necessary libraries\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import pandas as pd\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.probability import FreqDist\n", "from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder\n", "\n", "# Define a function to scrape Google search results and create a dataframe\n", "def scrape_google(query):\n", " # Set headers to mimic a web browser\n", " headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}\n", " # Build the URL with the query parameter\n", " url = f'https://www.google.com/search?q={query}'\n", " # Send a request to the URL and store the HTML content\n", " html = requests.get(url, headers=headers).content\n", " # Use BeautifulSoup to parse the HTML content\n", " soup = BeautifulSoup(html, 'html.parser')\n", " # Find all the search result elements\n", " search_results = soup.find_all('div', {'class': 'g'})\n", " # Initialize an empty list to store the search results\n", " results = []\n", " # Loop through each search result and extract the relevant information\n", " for result in search_results:\n", " try:\n", " title = result.find('h3').text\n", " url = result.find('a')['href']\n", " results.append((title, url))\n", " except:\n", " continue\n", " # Create a dataframe from the search results\n", " df = pd.DataFrame(results, columns=['Title', 'URL'])\n", " df.to_csv(\"Scraped_URLs_From_SERPS.csv\")\n", " return df\n", "\n" ] }, { "cell_type": "code", "source": [], "metadata": { "id": "QCnGWQKpBkR7" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import newspaper\n", "from newspaper import Article\n", "\n", "def scrape_article(url):\n", " try:\n", " article = Article(url)\n", " article.download()\n", " article.parse()\n", " return article.text\n", " except:\n", " return \"\"\n", "\n" ], "metadata": { "id": "AVmr-Ac1Bsxz" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "import nltk\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('averaged_perceptron_tagger')\n", "nltk.download('tagsets')\n", "nltk.download('words')\n", "nltk.download('maxent_ne_chunker')\n", "nltk.download('vader_lexicon')\n", "nltk.download('inaugural')\n", "nltk.download('webtext')\n", "nltk.download('treebank')\n", "nltk.download('gutenberg')\n", "nltk.download('genesis')\n", "nltk.download('trigram_collocations')\n", "nltk.download('quadgram_collocations')\n", "\n", "# Define a function to 
{ "cell_type": "code", "source": [
"import nltk\n",
"from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder\n",
"from nltk.collocations import QuadgramAssocMeasures, QuadgramCollocationFinder\n",
"\n",
"# Define the main function to scrape Google search results and analyze the article text\n",
"def analyze_serps(query):\n",
"    # Scrape Google search results and create a dataframe\n",
"    df = scrape_google(query)\n",
"    # Scrape article text for each search result and store it in the dataframe\n",
"    for index, row in df.iterrows():\n",
"        url = row['URL']\n",
"        article_text = scrape_article(url)\n",
"        df.at[index, 'Article Text'] = article_text\n",
"    # Analyze the article text for each search result and store the NLP results in the dataframe\n",
"    stop_words = set(stopwords.words('english'))\n",
"    for index, row in df.iterrows():\n",
"        text = row['Article Text']\n",
"        # Tokenize the text and remove stop words plus common boilerplate terms\n",
"        tokens = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words and 'contact' not in word.lower() and 'admin' not in word.lower()]\n",
"        # Calculate the frequency distribution of the tokens\n",
"        fdist = FreqDist(tokens)\n",
"        # Calculate the 20 most common words\n",
"        most_common = fdist.most_common(20)\n",
"        # Calculate the 20 least common words\n",
"        least_common = fdist.most_common()[-20:]\n",
"        # Calculate the 20 most common bigrams\n",
"        bigram_measures = BigramAssocMeasures()\n",
"        finder = BigramCollocationFinder.from_words(tokens)\n",
"        bigrams = finder.nbest(bigram_measures.raw_freq, 20)\n",
"        # Calculate the 20 most common trigrams\n",
"        trigram_measures = TrigramAssocMeasures()\n",
"        finder = TrigramCollocationFinder.from_words(tokens)\n",
"        trigrams = finder.nbest(trigram_measures.raw_freq, 20)\n",
"        # Calculate the 20 most common quadgrams\n",
"        quadgram_measures = QuadgramAssocMeasures()\n",
"        finder = QuadgramCollocationFinder.from_words(tokens)\n",
"        quadgrams = finder.nbest(quadgram_measures.raw_freq, 20)\n",
"        # Calculate the part-of-speech tags for the filtered tokens\n",
"        pos_tags = nltk.pos_tag(tokens)\n",
"        # Store the NLP results in the dataframe\n",
"        df.at[index, 'Most Common Words'] = ', '.join([word[0] for word in most_common])\n",
"        df.at[index, 'Least Common Words'] = ', '.join([word[0] for word in least_common])\n",
"        df.at[index, 'Most Common Bigrams'] = ', '.join([f'{bigram[0]} {bigram[1]}' for bigram in bigrams])\n",
"        df.at[index, 'Most Common Trigrams'] = ', '.join([f'{trigram[0]} {trigram[1]} {trigram[2]}' for trigram in trigrams])\n",
"        df.at[index, 'Most Common Quadgrams'] = ', '.join([f'{quadgram[0]} {quadgram[1]} {quadgram[2]} {quadgram[3]}' for quadgram in quadgrams])\n",
"        df.at[index, 'POS Tags'] = ', '.join([f'{token}/{tag}' for token, tag in pos_tags])\n",
"        # Replace any remaining commas with spaces in the Article Text column\n",
"        df.at[index, 'Article Text'] = ' '.join(row['Article Text'].replace(',', ' ').split())\n",
"    # Save the final dataframe as an Excel file\n",
"    with pd.ExcelWriter('NLP_Based_SERP_Results.xlsx', engine='xlsxwriter') as writer:\n",
"        df.to_excel(writer, sheet_name='Sheet1', index=False)\n",
"    # Return the final dataframe\n",
"    return df\n",
"\n"
], "metadata": { "id": "RoDhKV-8ByKC" }, "execution_count": null, "outputs": [] },
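{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added example (not in the original notebook):* analyze_serps() ranks n-grams purely by raw frequency. The sketch below runs the same NLTK collocation finder on a tiny, made-up token list and shows how PMI scoring could be swapped in when statistically surprising phrases are more useful than merely frequent ones.\n" ] },
{ "cell_type": "code", "metadata": {}, "execution_count": null, "outputs": [], "source": [
"# Added illustrative sketch: raw-frequency vs. PMI ranking of bigrams on a toy token list,\n",
"# using the same NLTK collocation API that analyze_serps() relies on. The tokens are made up.\n",
"from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder\n",
"\n",
"toy_tokens = ['seo', 'content', 'strategy', 'seo', 'content', 'audit',\n",
"              'keyword', 'research', 'seo', 'content', 'strategy']\n",
"measures = BigramAssocMeasures()\n",
"finder = BigramCollocationFinder.from_words(toy_tokens)\n",
"print('Top bigrams by raw frequency:', finder.nbest(measures.raw_freq, 3))\n",
"print('Top bigrams by PMI:', finder.nbest(measures.pmi, 3))\n"
] },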
{ "cell_type": "code", "source": [
"import statistics\n",
"import collections\n",
"\n",
"# Define a function to summarize the NLP results from the dataframe\n",
"def summarize_nlp(df):\n",
"    # Calculate the total number of search results\n",
"    total_results = len(df)\n",
"    # Calculate the average length of the article text in characters\n",
"    avg_length = round(df['Article Text'].apply(len).mean(), 2)\n",
"    # Join the per-result strings with ', ' so entries from adjacent rows stay separated\n",
"    all_words = ', '.join(df['Most Common Words'])\n",
"    all_bigrams = ', '.join(df['Most Common Bigrams'])\n",
"    all_trigrams = ', '.join(df['Most Common Trigrams'])\n",
"    all_quadgrams = ', '.join(df['Most Common Quadgrams'])\n",
"    all_tags = ', '.join(df['POS Tags'])\n",
"    # Calculate the median number of words in the article text\n",
"    median_words = statistics.median(df['Article Text'].apply(lambda x: len(x.split())).tolist())\n",
"    # Calculate the frequency of each word, bigram, trigram and quadgram across all search results, skipping empty entries\n",
"    word_freqs = collections.Counter(w for w in all_words.split(', ') if w)\n",
"    bigram_freqs = collections.Counter(b for b in all_bigrams.split(', ') if b)\n",
"    trigram_freqs = collections.Counter(t for t in all_trigrams.split(', ') if t)\n",
"    quadgram_freqs = collections.Counter(q for q in all_quadgrams.split(', ') if q)\n",
"    # Keep the top 20% most frequent words, bigrams, trigrams and quadgrams\n",
"    top_words = ', '.join([word[0] for word in word_freqs.most_common(int(len(word_freqs) * 0.2))])\n",
"    top_bigrams = ', '.join([bigram[0] for bigram in bigram_freqs.most_common(int(len(bigram_freqs) * 0.2))])\n",
"    top_trigrams = ', '.join([trigram[0] for trigram in trigram_freqs.most_common(int(len(trigram_freqs) * 0.2))])\n",
"    top_quadgrams = ', '.join([quadgram[0] for quadgram in quadgram_freqs.most_common(int(len(quadgram_freqs) * 0.2))])\n",
"\n",
"    # Build the summary string that is passed to the outline generator\n",
"    summary = \"\"\n",
"    summary += f'Total results: {total_results}\\n'\n",
"    summary += f'Average article length: {avg_length} characters\\n'\n",
"    summary += f'Median words per article: {median_words}\\n'\n",
"    summary += f'Most common words: {top_words} ({len(word_freqs)} total words)\\n'\n",
"    summary += f'Most common bigrams: {top_bigrams} ({len(bigram_freqs)} total bigrams)\\n'\n",
"    summary += f'Most common trigrams: {top_trigrams} ({len(trigram_freqs)} total trigrams)\\n'\n",
"    summary += f'Most common quadgrams: {top_quadgrams} ({len(quadgram_freqs)} total quadgrams)\\n'\n",
"    return summary\n",
"\n"
], "metadata": { "id": "4JrKikJeJz4X" }, "execution_count": null, "outputs": [] },
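{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added example (not in the original notebook):* a tiny hand-built dataframe that mimics the columns analyze_serps() produces, so summarize_nlp() can be sanity-checked without scraping anything. All values are fabricated.\n" ] },
{ "cell_type": "code", "metadata": {}, "execution_count": null, "outputs": [], "source": [
"# Added illustrative sketch: exercise summarize_nlp() on fabricated data that follows the\n",
"# column layout produced by analyze_serps(). No scraping or API calls are involved.\n",
"demo_df = pd.DataFrame({\n",
"    'Article Text': ['seo content strategy guide', 'keyword research for seo content'],\n",
"    'Most Common Words': ['seo, content, strategy', 'seo, content, keyword'],\n",
"    'Most Common Bigrams': ['seo content, content strategy', 'keyword research, seo content'],\n",
"    'Most Common Trigrams': ['seo content strategy', 'keyword research seo'],\n",
"    'Most Common Quadgrams': ['seo content strategy guide', 'keyword research seo content'],\n",
"    'POS Tags': ['seo/NN, content/NN, strategy/NN', 'keyword/NN, research/NN, seo/NN'],\n",
"})\n",
"print(summarize_nlp(demo_df))\n"
] },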
{ "cell_type": "code", "source": [
"import openai\n",
"from IPython.display import Markdown, display\n",
"\n",
"# Replace with your own OpenAI API key\n",
"openai.api_key = \"Your API Key\"\n",
"\n",
"def generate_article_outline(nlp_summary, query):\n",
"    prompt = f\"Write a long-form, highly comprehensive and well-formatted article outline for the query: {query} in highly nested markdown. The outline should be similar to a Wikipedia table of contents, incorporate the following NLP SEO guidance from the SERP evaluation, and ignore data unrelated to the query:\\n\\n{nlp_summary}\\n\\n\"\n",
"\n",
"    response = openai.ChatCompletion.create(\n",
"        model='gpt-3.5-turbo',\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"Simulate an exceptionally talented journalist and editor. Given the following instructions, think step by step and produce the best possible output you can.\"},\n",
"            {\"role\": \"user\", \"content\": prompt}],\n",
"        max_tokens=2048,\n",
"        n=1,\n",
"        stop=None,\n",
"        temperature=0.3,\n",
"    )\n",
"\n",
"    outline = response['choices'][0]['message']['content'].strip()\n",
"    return outline\n",
"\n",
"\n",
"# Test the script on a sample query\n",
"query = 'Transgender Rights'\n",
"results = analyze_serps(query)\n",
"summary = summarize_nlp(results)\n",
"print(summary)\n",
"getoutline = generate_article_outline(summary, query)\n",
"display(Markdown(getoutline))"
], "metadata": { "id": "xeH01QSvLl3-" }, "execution_count": null, "outputs": [] }
] }