{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "private_outputs": true, "provenance": [], "machine_shape": "hm", "authorship_tag": "ABX9TyO90FGPXlp4ao3YjIJ8BAZ0", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Install the needed libraries" ], "metadata": { "id": "A4C35T6RFP-m" } }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8HesWGeMwf4j" }, "outputs": [], "source": [ "!pip install openai\n", "!pip install -U pytube\n", "!pip install pydub\n", "!pip install transformers\n", "!pip install -U transformers\n", "!pip install transformers sentencepiece\n", "!pip install pafy\n", "!pip install youtube_dl\n", "!pip install yt-dlp" ] }, { "cell_type": "markdown", "source": [ "## Run the cell below to generate the transcription, transcription summary, Tweet thread, article outline, and full article. The files for each will be generated in the folder on the left when the script finishes running. You will need to enter your OpenAI api key (at the top of the script below) and the Youtube video you want to use (at the bottom of the script)." ], "metadata": { "id": "wJnuKnTvFVMU" } }, { "cell_type": "code", "source": [ "import openai\n", "import pandas as pd\n", "from pytube import YouTube\n", "from transformers import T5Tokenizer\n", "from transformers import T5Tokenizer, T5ForConditionalGeneration\n", "from transformers import GPT2TokenizerFast\n", "from transformers import pipeline\n", "import textwrap\n", "from concurrent.futures import ThreadPoolExecutor\n", "import logging\n", "import warnings\n", "import yt_dlp\n", "import os\n", "# Supress warnings\n", "logging.basicConfig(level=logging.CRITICAL)\n", "warnings.filterwarnings(\"ignore\")\n", "\n", "# OpenAI API key\n", "openai.api_key = \"Your OpenAI API Key\"\n", "\n", "def get_transcript(youtubelink):\n", " video_url = youtubelink\n", "\n", " # Create a yt-dlp instance\n", " ydl_opts = {\n", " 'format': 'bestaudio/best',\n", " 'extractaudio': True,\n", " 'audioformat': 'mp3',\n", " 'outtmpl': 'audio_file.mp3',\n", " 'noplaylist': True,\n", " }\n", " with yt_dlp.YoutubeDL(ydl_opts) as ydl:\n", " # Extract video information\n", " video_info = ydl.extract_info(video_url, download=False)\n", " # Download the audio\n", " ydl.download([video_url])\n", "\n", " audio_file = \"audio_file.mp3\"\n", "\n", "\n", "\n", " with open(audio_file, \"rb\") as audio:\n", " transcript = openai.Audio.translate(\"whisper-1\", audio)\n", "\n", " thetext = transcript['text']\n", "\n", " with open(\"full_transcript.txt\", \"w\") as file:\n", " file.write(thetext)\n", "\n", " # Remove the audio file after processing\n", " os.remove(audio_file)\n", "\n", " return thetext\n", "\n", "\n", "\n", "def count_tokens(input_data, max_tokens=20000, input_type='text'):\n", " tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n", "\n", " if input_type == 'text':\n", " tokens = tokenizer.tokenize(input_data)\n", " elif input_type == 'tokens':\n", " tokens = input_data\n", " else:\n", " raise ValueError(\"Invalid input_type. 
"def count_tokens(input_data, input_type='text'):\n",
"    tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
"\n",
"    if input_type == 'text':\n",
"        tokens = tokenizer.tokenize(input_data)\n",
"    elif input_type == 'tokens':\n",
"        tokens = input_data\n",
"    else:\n",
"        raise ValueError(\"Invalid input_type. Must be 'text' or 'tokens'\")\n",
"\n",
"    # Return the number of tokens\n",
"    token_count = len(tokens)\n",
"    return token_count\n",
"\n",
"\n",
"def truncate_text_by_tokens(text, max_tokens=3000):\n",
"    tokenizer = GPT2TokenizerFast.from_pretrained(\"gpt2\")\n",
"\n",
"    # Tokenize the input text\n",
"    tokens = tokenizer.tokenize(text)\n",
"\n",
"    # Truncate tokens to max_tokens\n",
"    truncated_tokens = tokens[:max_tokens]\n",
"\n",
"    trunc_token_len = count_tokens(truncated_tokens, input_type='tokens')\n",
"    print(\"Truncated summary token length: \" + str(trunc_token_len))\n",
"\n",
"    # Convert the truncated tokens back to text\n",
"    truncated_text = tokenizer.convert_tokens_to_string(truncated_tokens)\n",
"\n",
"    return truncated_text\n",
"\n",
"\n",
"def summarize_chunk(classifier, chunk):\n",
"    summary = classifier(chunk)\n",
"    return summary[0][\"summary_text\"]\n",
"\n",
"\n",
"def summarize_text(text, model_name=\"t5-small\", max_workers=8):\n",
"    classifier = pipeline(\"summarization\", model=model_name)\n",
"\n",
"    # Split the input text into smaller chunks\n",
"    chunks = textwrap.wrap(text, width=500, break_long_words=False)\n",
"\n",
"    # Parallelize the summarization of the chunks\n",
"    with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
"        summaries = executor.map(lambda chunk: summarize_chunk(classifier, chunk), chunks)\n",
"        summarized_text = \" \".join(summaries)\n",
"\n",
"    text_len_in_tokens = count_tokens(text)\n",
"    print(\"Tokens in full transcript: \" + str(text_len_in_tokens))\n",
"    summary_token_len = count_tokens(summarized_text)\n",
"    print(\"Summary token length: \" + str(summary_token_len))\n",
"\n",
"    if summary_token_len > 2500:\n",
"        summarized_text = truncate_text_by_tokens(summarized_text, max_tokens=2500)\n",
"\n",
"    with open(\"transcript_summary.txt\", \"w\") as file:\n",
"        file.write(summarized_text)\n",
"\n",
"    print(\"Summarized by T5\")\n",
"    return summarized_text.strip()\n",
"\n",
"\n",
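"# gpt-3.5-turbo has a roughly 4,096-token context window shared by the prompt\n",
"# and the completion. The driver code at the bottom only routes transcripts of\n",
"# at most 3000 tokens here, so max_tokens = 3800 - token_len keeps the request\n",
"# roughly inside that window; e.g. a 2,500-token transcript leaves\n",
"# 3800 - 2500 = 1,300 tokens for the generated summary.\n",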
"def gpt_summarize_transcript(transcript_text, token_len):\n",
"    # Generate the summary using the OpenAI ChatCompletion API\n",
"    response = openai.ChatCompletion.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are an expert at summarizing long documents into concise and comprehensive summaries. Your summaries often capture the essence of the original text.\"},\n",
"            {\"role\": \"user\", \"content\": \"I have a long transcript that I would like you to summarize for me. Please think carefully and do the best job you possibly can.\"},\n",
"            {\"role\": \"system\", \"content\": \"Absolutely, I will provide a concise and comprehensive summary of the transcript.\"},\n",
"            {\"role\": \"user\", \"content\": \"Excellent, here is the transcript: \" + transcript_text}\n",
"        ],\n",
"        max_tokens=3800 - token_len,\n",
"        n=1,\n",
"        stop=None,\n",
"        temperature=0.5,\n",
"    )\n",
"\n",
"    # Extract the generated summary from the response\n",
"    summary = response['choices'][0]['message']['content']\n",
"    print(\"Summarized by GPT-3.5\")\n",
"\n",
"    with open(\"transcript_summary.txt\", \"w\") as file:\n",
"        file.write(summary)\n",
"\n",
"    # Return the summary\n",
"    return summary.strip()\n",
"\n",
"\n",
"def generate_tweet_thread(transcript_text):\n",
"    # Generate the tweets using the OpenAI ChatCompletion API\n",
"    response = openai.ChatCompletion.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are an expert at writing tweet threads that are incredibly interesting and potentially newsworthy. You are known to go viral.\"},\n",
"            {\"role\": \"user\", \"content\": \"I have text that I would like you to use as the basis for coming up with multiple tweets for a long-form Twitter thread. Please think step by step and do the best job you possibly can. Each tweet should be on a new line.\"},\n",
"            {\"role\": \"system\", \"content\": \"Absolutely, I will provide a list of tweets on new lines for easy parsing. This tweet thread should be written to go viral. I will make sure each tweet is less than 250 characters.\"},\n",
"            {\"role\": \"user\", \"content\": \"Excellent, here is the transcript: \" + transcript_text},\n",
"            {\"role\": \"system\", \"content\": \"My list will be formatted as: Tweet 1 \\n\\n Tweet 2 \\n\\n Tweet 3 \\n\\n etc.\"}\n",
"        ],\n",
"        max_tokens=900,\n",
"        n=1,\n",
"        stop=None,\n",
"        temperature=0.5,\n",
"    )\n",
"\n",
"    # Extract the generated tweets from the response\n",
"    tweets = response['choices'][0]['message']['content']\n",
"\n",
"    # Split the tweets into separate parts\n",
"    tweets = tweets.split(\"\\n\\n\")\n",
"    print(tweets)\n",
"\n",
"    # Create a dataframe from the tweets and save it\n",
"    df = pd.DataFrame({\"tweet\": tweets})\n",
"    df.to_csv('Tweet_Thread.csv')\n",
"\n",
"    # Return the tweets as a list\n",
"    return tweets\n",
"\n",
"\n",
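"# The long-form article is generated in two passes: first a Roman-numeral\n",
"# outline of the whole article, then one ChatCompletion call per outline\n",
"# section, each given the full outline for context so sections build on each\n",
"# other instead of repeating points.\n",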
"def generate_long_form_article(transcript_text, token_len):\n",
"    # Generate the article outline using the OpenAI ChatCompletion API\n",
"    response = openai.ChatCompletion.create(\n",
"        model=\"gpt-3.5-turbo\",\n",
"        messages=[\n",
"            {\"role\": \"system\", \"content\": \"You are an expert at writing long-form article outlines that are informative, engaging, and well-researched. Your articles often go viral and are widely shared.\"},\n",
"            {\"role\": \"user\", \"content\": \"I have some text that I would like you to use as the basis for a long-form article outline. Please think carefully and do the best job you can to come up with an outline for the article.\"},\n",
"            {\"role\": \"system\", \"content\": \"Absolutely, I will provide a comprehensive and well-structured outline for the article based on the content. I will number the sections with Roman numerals.\"},\n",
"            {\"role\": \"user\", \"content\": \"Excellent, here is the transcript: \" + transcript_text},\n",
"            {\"role\": \"system\", \"content\": \"Here are the sections, numbered with Roman numerals and without any preamble:\"}\n",
"        ],\n",
"        max_tokens=3700 - token_len,\n",
"        n=1,\n",
"        stop=None,\n",
"        temperature=0.5,\n",
"    )\n",
"\n",
"    # Extract the article outline from the response\n",
"    outline = response['choices'][0]['message']['content']\n",
"    outline_token_count = count_tokens(outline)\n",
"\n",
"    # Parse the outline into [section title, [section items]] pairs\n",
"    sections = outline.strip().split(\"\\n\\n\")\n",
"    parsed_data = []\n",
"    for section in sections:\n",
"        lines = section.strip().split(\"\\n\")\n",
"        section_title = lines[0].strip()\n",
"        section_items = [item.strip() for item in lines[1:]]\n",
"        parsed_data.append([section_title, section_items])\n",
"\n",
"    with open(\"article_outline.txt\", \"w\") as file:\n",
"        file.write(outline)\n",
"\n",
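"    # Generate each section separately: every call sees the full outline plus\n",
"    # the one section to write, with a completion budget of\n",
"    # 3700 - outline_token_count tokens and a low temperature (0.2) so the\n",
"    # sections stay consistent with the outline. The sections are then joined\n",
"    # into long_form_article.txt.\n",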
"    generated_sections = []\n",
"    # Loop through each section in the outline\n",
"    for section in parsed_data:\n",
"        # Generate the section using the OpenAI ChatCompletion API\n",
"        response = openai.ChatCompletion.create(\n",
"            model=\"gpt-3.5-turbo\",\n",
"            messages=[\n",
"                {\"role\": \"system\", \"content\": \"You are an expert at writing long-form articles that are informative, engaging, and well-researched. Your articles often go viral and are widely shared. You will be given an article outline for context, and instructions on which section of the outline to complete.\"},\n",
"                {\"role\": \"user\", \"content\": \"I have a section of an article that I would like you to write for me. Please think carefully and do the best job you can to come up with a well-written and comprehensive section. Please also take into consideration the article's outline so that you can write without overlapping previous points and build on each section.\"},\n",
"                {\"role\": \"system\", \"content\": \"Absolutely, I will provide a comprehensive and well-written section based on the outline. I will provide only the section text without any additional text.\"},\n",
"                {\"role\": \"user\", \"content\": \"Excellent, here is the outline to use to understand your goal better: \" + outline + \" and the section to write: \" + str(section)}\n",
"            ],\n",
"            max_tokens=3700 - outline_token_count,\n",
"            n=1,\n",
"            stop=None,\n",
"            temperature=0.2,\n",
"        )\n",
"\n",
"        # Extract the generated section from the response\n",
"        generated_section = response['choices'][0]['message']['content']\n",
"\n",
"        # Add the generated section to the list of generated sections\n",
"        generated_sections.append(generated_section)\n",
"\n",
"    # Combine the generated sections into a finished article\n",
"    article = \"\\n\\n\".join(generated_sections)\n",
"\n",
"    # Save the article to a text file\n",
"    with open(\"long_form_article.txt\", \"w\") as file:\n",
"        file.write(article)\n",
"\n",
"    # Return the article\n",
"    return article\n",
"\n",
"\n",
"# Get the transcript from the video\n",
"transcription = get_transcript(\"Your Youtube Video URL\")\n",
"\n",
"# Get the token length of the transcript\n",
"token_count = count_tokens(transcription)\n",
"print(\"Transcript token count: \" + str(token_count))\n",
"\n",
"# Summarize with either GPT-3.5 or T5, depending on the length of the transcript\n",
"if token_count > 3000:\n",
"    summarized_text = summarize_text(transcription)\n",
"    new_token_count = count_tokens(summarized_text)\n",
"else:\n",
"    summarized_text = gpt_summarize_transcript(transcription, token_count)\n",
"    new_token_count = count_tokens(summarized_text)\n",
"\n",
"# Generate the tweet thread using the summary\n",
"tweets = generate_tweet_thread(summarized_text)\n",
"\n",
"# Generate the long-form article using the summary\n",
"article = generate_long_form_article(summarized_text, new_token_count)\n"
], "metadata": { "id": "3LbeY54zU3BI" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "tweets" ], "metadata": { "id": "agTZeRpq5G-N" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "article" ], "metadata": { "id": "ho6liusu5H2s" }, "execution_count": null, "outputs": [] } ] }