{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Trending YouTube Videos\n",
    "\n",
    "This notebook retrieves up to 200 of the top trending videos in each of the US (US), UK (GB) and Canada (CA) using the YouTube Data API v3. The channel details are also fetched for each video. A raw backup dump and a CSV file are written; the CSV has the following columns:\n",
    "\n",
    "\n",
    "Format: [regionTrending, trendingRank, timeFetched, videoId, videoTitle, videoCategoryId, videoPublishTime, videoDuration, videoTags, videoViews, videoLikes, videoDislikes, videoCommentCount, videoDescription, vieoLicenced, channelName, channelId, channelDescription, channelPublishedAt, channelViewCount, channelSubsCount, channelVideoCount]\n",
    "\n",
    "**Setup:** supply the API key through the `YOUTUBE_API_KEY` environment variable (the notebook prompts for it when the variable is unset). Never paste the key into the notebook itself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import datetime  # To timestamp the videos\n",
    "import getpass\n",
    "import os\n",
    "\n",
    "import googleapiclient.errors\n",
    "from googleapiclient.discovery import build"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The key is read from the environment (or prompted for) so that it is never\n",
    "# stored in the notebook file.\n",
    "api_key = os.environ.get(\"YOUTUBE_API_KEY\") or getpass.getpass(\"YouTube Data API key: \")\n",
    "\n",
    "REGIONS = (\"US\", \"GB\", \"CA\")  # fetched in this order; also the row order of the output\n",
    "resultsPerPage = 50  # results requested per videos.list page\n",
    "MAX_PAGES = 4  # 4 pages x 50 results = at most 200 videos per region\n",
    "CHANNEL_BATCH_SIZE = 50  # channel IDs sent per channels.list call\n",
    "\n",
    "# Naive UTC timestamp: the same value utcnow() produced, without the deprecated call.\n",
    "UTCnow = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)\n",
    "timeStamp = UTCnow.strftime(\"%d-%b-%Y-%H-%M-%S\")  # shared suffix of both output files\n",
    "\n",
    "youtube = build('youtube', 'v3', developerKey=api_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Helper functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_trending(client, region_code, fetched_at, page_size=50, max_pages=4):\n",
    "    \"\"\"Return the 'mostPopular' video resources for one region.\n",
    "\n",
    "    Pages through videos.list until `max_pages` pages were read or the response\n",
    "    carries no 'nextPageToken'. Every returned video dict is annotated in place\n",
    "    with 'timeFetched' (= `fetched_at`), 'region' (= `region_code`) and a\n",
    "    1-based 'trendingRank' that follows the order in which the API returned it.\n",
    "    \"\"\"\n",
    "    videos = []\n",
    "    page_token = None\n",
    "    for _ in range(max_pages):\n",
    "        params = {\n",
    "            'part': \"snippet,contentDetails,statistics\",\n",
    "            'chart': \"mostPopular\",\n",
    "            'regionCode': region_code,\n",
    "            'maxResults': page_size,\n",
    "        }\n",
    "        if page_token:\n",
    "            # Only sent from the second page on; the first request has no token.\n",
    "            params['pageToken'] = page_token\n",
    "        response = client.videos().list(**params).execute()\n",
    "        videos.extend(response.get('items', []))\n",
    "        page_token = response.get('nextPageToken')\n",
    "        if not page_token:\n",
    "            break\n",
    "\n",
    "    for rank, video in enumerate(videos, start=1):\n",
    "        video['timeFetched'] = fetched_at\n",
    "        video['region'] = region_code\n",
    "        video['trendingRank'] = rank\n",
    "    return videos\n",
    "\n",
    "\n",
    "def fetch_channels(client, channel_ids, batch_size=50):\n",
    "    \"\"\"Return a dict mapping channel ID to its channels.list resource.\n",
    "\n",
    "    Duplicate IDs are requested once, and IDs are sent `batch_size` at a time as\n",
    "    a comma-separated `id` filter. IDs for which the API returns no item are\n",
    "    simply absent from the result. An empty `channel_ids` makes no API call.\n",
    "    \"\"\"\n",
    "    unique_ids = list(dict.fromkeys(channel_ids))  # de-duplicate, keep first-seen order\n",
    "    channels = {}\n",
    "    for start in range(0, len(unique_ids), batch_size):\n",
    "        batch = unique_ids[start:start + batch_size]\n",
    "        response = client.channels().list(\n",
    "            part=\"snippet,statistics\",\n",
    "            id=\",\".join(batch),\n",
    "            maxResults=batch_size,\n",
    "        ).execute()\n",
    "        for channel in response.get('items', []):\n",
    "            channels[channel['id']] = channel\n",
    "    return channels\n",
    "\n",
    "\n",
    "def escape_newlines(text):\n",
    "    \"\"\"Replace line feeds and carriage returns with their two-character escaped forms.\"\"\"\n",
    "    return text.replace(\"\\n\", \"\\\\n\").replace(\"\\r\", \"\\\\r\")\n",
    "\n",
    "\n",
    "def build_row(video):\n",
    "    \"\"\"Flatten one annotated video resource into a list ordered like `headerLine`.\n",
    "\n",
    "    Every optional field is looked up on `video` itself, so a field missing from\n",
    "    this particular video (or from its channel) becomes an empty string instead\n",
    "    of raising KeyError.\n",
    "    \"\"\"\n",
    "    snippet = video['snippet']\n",
    "    details = video['contentDetails']\n",
    "    stats = video.get('statistics', {})\n",
    "    channel = video.get('channelInfoDump') or {}\n",
    "    channel_snippet = channel.get('snippet', {})\n",
    "    channel_stats = channel.get('statistics', {})\n",
    "    return [\n",
    "        video['region'],\n",
    "        video['trendingRank'],\n",
    "        video['timeFetched'],\n",
    "        video['id'],\n",
    "        snippet.get('title', ''),\n",
    "        snippet.get('categoryId', ''),\n",
    "        snippet['publishedAt'],\n",
    "        details['duration'],\n",
    "        snippet.get('tags', ''),\n",
    "        stats.get('viewCount', ''),\n",
    "        stats.get('likeCount', ''),\n",
    "        stats.get('dislikeCount', ''),\n",
    "        stats.get('commentCount', ''),\n",
    "        escape_newlines(snippet.get('description', '')),\n",
    "        details.get('licensedContent', ''),\n",
    "        channel_snippet.get('title', ''),\n",
    "        # Fall back to the ID on the video when no channel resource was returned.\n",
    "        channel.get('id', snippet.get('channelId', '')),\n",
    "        escape_newlines(channel_snippet.get('description', '')),\n",
    "        channel_snippet.get('publishedAt', ''),\n",
    "        channel_stats.get('viewCount', ''),\n",
    "        channel_stats.get('subscriberCount', ''),\n",
    "        channel_stats.get('videoCount', ''),\n",
    "    ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fetch trending videos"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Combined list: all US videos, then GB, then CA. trendingRank restarts at 1 per region.\n",
    "trendingList = []\n",
    "for regionCode in REGIONS:\n",
    "    trendingList.extend(\n",
    "        fetch_trending(youtube, regionCode, UTCnow, resultsPerPage, MAX_PAGES)\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fetch channel details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "channelsById = fetch_channels(\n",
    "    youtube,\n",
    "    [video['snippet']['channelId'] for video in trendingList],\n",
    "    CHANNEL_BATCH_SIZE,\n",
    ")\n",
    "\n",
    "for video in trendingList:\n",
    "    # An empty dict marks a channel the API did not return; build_row writes blanks for it.\n",
    "    video['channelInfoDump'] = channelsById.get(video['snippet']['channelId'], {})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Backup dump of the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dumping all collected data into a file for backup (one str(dict) per line)\n",
    "dumpFileName = 'dataDump' + timeStamp + '.txt'\n",
    "\n",
    "print(f\"File name of Data dump: {dumpFileName}\")\n",
    "with open(dumpFileName, 'w', encoding='utf-8') as dumpFile:\n",
    "    for video in trendingList:\n",
    "        dumpFile.write(str(video) + '\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build the table and write the CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names of the CSV. 'vieoLicenced' keeps its existing spelling so the\n",
    "# header stays identical to files produced by earlier runs.\n",
    "headerLine = ['regionTrending', 'trendingRank', 'timeFetched', 'videoId', 'videoTitle', 'videoCategoryId', 'videoPublishTime', 'videoDuration', 'videoTags', 'videoViews', 'videoLikes', 'videoDislikes', 'videoCommentCount', 'videoDescription', 'vieoLicenced', 'channelName', 'channelId', 'channelDescription', 'channelPublishedAt', 'channelViewCount', 'channelSubsCount', 'channelVideoCount']\n",
    "\n",
    "# List of lists: the header followed by one row per video.\n",
    "trendingFinalList = [headerLine] + [build_row(video) for video in trendingList]\n",
    "\n",
    "assert all(len(row) == len(headerLine) for row in trendingFinalList), \"row width does not match header\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Writing to a CSV file\n",
    "csvFileName = 'csvOut' + timeStamp + '.csv'\n",
    "\n",
    "print(f\"File name of CSV file: {csvFileName}\")\n",
    "\n",
    "# newline='' lets the csv module control line endings itself.\n",
    "with open(csvFileName, 'w', encoding='utf-8', newline='') as csvFile:\n",
    "    csv.writer(csvFile, dialect='excel').writerows(trendingFinalList)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7.7 64-bit ('mlEnv': conda)",
   "language": "python",
   "name": "python37764bitmlenvconda75c86b840a424a4e95d50ae2ee417e09"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}