{
"cells": [
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"from datetime import datetime\n",
"from datetime import date\n"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Collect every CSV export in the scrapedData folder (file names containing 'csvOut')\n",
"# and stack them into one frame.\n",
"# - names are sorted because os.listdir returns entries in arbitrary order\n",
"# - paths are built with os.path.join so the cell is not tied to the Windows separator\n",
"scrapedDir = 'scrapedData'\n",
"allFiles = os.listdir(scrapedDir)\n",
"csvFileNames = sorted(fileName for fileName in allFiles if 'csvOut' in fileName)\n",
"\n",
"# pd.concat raises a generic ValueError on an empty list; fail with a clearer message.\n",
"if not csvFileNames:\n",
"    raise ValueError('no csvOut files found in ' + scrapedDir)\n",
"\n",
"combined_csv = pd.concat(\n",
"    [pd.read_csv(os.path.join(scrapedDir, fileName)) for fileName in csvFileNames]\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" regionTrending | \n",
" trendingRank | \n",
" timeFetched | \n",
" videoId | \n",
" videoTitle | \n",
" videoCategoryId | \n",
" videoPublishTime | \n",
" videoDuration | \n",
" videoTags | \n",
" videoViews | \n",
" videoLikes | \n",
" videoDislikes | \n",
" videoCommentCount | \n",
" videoDescription | \n",
" vieoLicenced | \n",
" channelName | \n",
" channelId | \n",
" channelDescription | \n",
" channelPublishedAt | \n",
" channelViewCount | \n",
" channelSubsCount | \n",
" channelVideoCount | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" US | \n",
" 1 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" h0U2QUGKbSE | \n",
" Kanye West – Wash Us In The Blood feat. Travis... | \n",
" 22 | \n",
" 2020-06-30T14:00:11Z | \n",
" PT3M42S | \n",
" ['kanye', 'kanye west', 'ye', 'yeezus', 'yeezy... | \n",
" 4214043.0 | \n",
" 269209.0 | \n",
" 13107.0 | \n",
" 22665.0 | \n",
" Stream/Download “Wash Us In The Blood” ft. Tra... | \n",
" False | \n",
" Kanye West | \n",
" UCs6eXM7s8Vl5WcECcRHc2qQ | \n",
" NaN | \n",
" 2006-01-10T22:52:29Z | \n",
" 40986002 | \n",
" 6200000 | \n",
" 9 | \n",
"
\n",
" \n",
" 1 | \n",
" US | \n",
" 2 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" LiB65FQnm6w | \n",
" Cash vs Flight 1v1 Basketball! Shave Beard or ... | \n",
" 24 | \n",
" 2020-06-30T17:53:01Z | \n",
" PT25M42S | \n",
" ['cash vs flight 1v1 basketball', 'flight reac... | \n",
" 1497970.0 | \n",
" 103096.0 | \n",
" 1083.0 | \n",
" 10384.0 | \n",
" Get Your Energy Like Me Using GG! \\nI receive ... | \n",
" True | \n",
" CashNasty | \n",
" UCvyTdLw8SkVmUcHYXSDEGwA | \n",
" I make videos. I make you laugh. I be happy. | \n",
" 2013-06-22T01:48:44Z | \n",
" 758385956 | \n",
" 3630000 | \n",
" 2132 | \n",
"
\n",
" \n",
" 2 | \n",
" US | \n",
" 3 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" T8pi91qWnRw | \n",
" Moneybagg Yo – Said Sum (Official Music Video) | \n",
" 10 | \n",
" 2020-06-30T16:59:58Z | \n",
" PT2M59S | \n",
" NaN | \n",
" 689435.0 | \n",
" 51327.0 | \n",
" 916.0 | \n",
" 1915.0 | \n",
" Moneybagg Yo's new track 'Said Sum' out now: h... | \n",
" False | \n",
" MoneyBagg Yo | \n",
" UCrdPrDuDCbG8xayk5QkRLQA | \n",
" NaN | \n",
" 2016-10-06T03:39:25Z | \n",
" 344603686 | \n",
" 1300000 | \n",
" 84 | \n",
"
\n",
" \n",
" 3 | \n",
" US | \n",
" 4 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" rAxNSpO78fE | \n",
" I Surprised Brent Rivera With A Custom iPad Pr... | \n",
" 24 | \n",
" 2020-06-30T16:59:43Z | \n",
" PT13M59S | \n",
" NaN | \n",
" 2863644.0 | \n",
" 212914.0 | \n",
" 3235.0 | \n",
" 64229.0 | \n",
" This was Insane, I Can't Believe We Did This F... | \n",
" True | \n",
" ZHC | \n",
" UClQubH2NeMmGLTLgNdLBwXg | \n",
" Thanks for subscribing! Its my mission to make... | \n",
" 2013-08-07T03:22:54Z | \n",
" 914474888 | \n",
" 13700000 | \n",
" 272 | \n",
"
\n",
" \n",
" 4 | \n",
" US | \n",
" 5 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" mrF-KjnDTxc | \n",
" Genoa 1-3 Juventus | Dybala, CR7 & Douglas Cos... | \n",
" 17 | \n",
" 2020-06-30T22:30:01Z | \n",
" PT4M14S | \n",
" ['Dybala', 'CR7', 'Douglas Costa', 'Genoa', 'D... | \n",
" 4259625.0 | \n",
" 106818.0 | \n",
" 1644.0 | \n",
" 4661.0 | \n",
" Juventus re-open their four point lead at the ... | \n",
" True | \n",
" Serie A | \n",
" UCBJeMCIeLQos7wacox4hmLQ | \n",
" Welcome to the Official Serie A channel. Over ... | \n",
" 2012-10-30T13:54:30Z | \n",
" 1406914395 | \n",
" 4660000 | \n",
" 18934 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" regionTrending trendingRank timeFetched videoId \\\n",
"0 US 1 2020-07-01 15:09:45.453481 h0U2QUGKbSE \n",
"1 US 2 2020-07-01 15:09:45.453481 LiB65FQnm6w \n",
"2 US 3 2020-07-01 15:09:45.453481 T8pi91qWnRw \n",
"3 US 4 2020-07-01 15:09:45.453481 rAxNSpO78fE \n",
"4 US 5 2020-07-01 15:09:45.453481 mrF-KjnDTxc \n",
"\n",
" videoTitle videoCategoryId \\\n",
"0 Kanye West – Wash Us In The Blood feat. Travis... 22 \n",
"1 Cash vs Flight 1v1 Basketball! Shave Beard or ... 24 \n",
"2 Moneybagg Yo – Said Sum (Official Music Video) 10 \n",
"3 I Surprised Brent Rivera With A Custom iPad Pr... 24 \n",
"4 Genoa 1-3 Juventus | Dybala, CR7 & Douglas Cos... 17 \n",
"\n",
" videoPublishTime videoDuration \\\n",
"0 2020-06-30T14:00:11Z PT3M42S \n",
"1 2020-06-30T17:53:01Z PT25M42S \n",
"2 2020-06-30T16:59:58Z PT2M59S \n",
"3 2020-06-30T16:59:43Z PT13M59S \n",
"4 2020-06-30T22:30:01Z PT4M14S \n",
"\n",
" videoTags videoViews videoLikes \\\n",
"0 ['kanye', 'kanye west', 'ye', 'yeezus', 'yeezy... 4214043.0 269209.0 \n",
"1 ['cash vs flight 1v1 basketball', 'flight reac... 1497970.0 103096.0 \n",
"2 NaN 689435.0 51327.0 \n",
"3 NaN 2863644.0 212914.0 \n",
"4 ['Dybala', 'CR7', 'Douglas Costa', 'Genoa', 'D... 4259625.0 106818.0 \n",
"\n",
" videoDislikes videoCommentCount \\\n",
"0 13107.0 22665.0 \n",
"1 1083.0 10384.0 \n",
"2 916.0 1915.0 \n",
"3 3235.0 64229.0 \n",
"4 1644.0 4661.0 \n",
"\n",
" videoDescription vieoLicenced \\\n",
"0 Stream/Download “Wash Us In The Blood” ft. Tra... False \n",
"1 Get Your Energy Like Me Using GG! \\nI receive ... True \n",
"2 Moneybagg Yo's new track 'Said Sum' out now: h... False \n",
"3 This was Insane, I Can't Believe We Did This F... True \n",
"4 Juventus re-open their four point lead at the ... True \n",
"\n",
" channelName channelId \\\n",
"0 Kanye West UCs6eXM7s8Vl5WcECcRHc2qQ \n",
"1 CashNasty UCvyTdLw8SkVmUcHYXSDEGwA \n",
"2 MoneyBagg Yo UCrdPrDuDCbG8xayk5QkRLQA \n",
"3 ZHC UClQubH2NeMmGLTLgNdLBwXg \n",
"4 Serie A UCBJeMCIeLQos7wacox4hmLQ \n",
"\n",
" channelDescription channelPublishedAt \\\n",
"0 NaN 2006-01-10T22:52:29Z \n",
"1 I make videos. I make you laugh. I be happy. 2013-06-22T01:48:44Z \n",
"2 NaN 2016-10-06T03:39:25Z \n",
"3 Thanks for subscribing! Its my mission to make... 2013-08-07T03:22:54Z \n",
"4 Welcome to the Official Serie A channel. Over ... 2012-10-30T13:54:30Z \n",
"\n",
" channelViewCount channelSubsCount channelVideoCount \n",
"0 40986002 6200000 9 \n",
"1 758385956 3630000 2132 \n",
"2 344603686 1300000 84 \n",
"3 914474888 13700000 272 \n",
"4 1406914395 4660000 18934 "
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Lift the column display limit so the preview shows every column, then show the first rows.\n",
"pd.set_option('display.max_columns', None)\n",
"combined_csv.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"11400\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" regionTrending | \n",
" trendingRank | \n",
" timeFetched | \n",
" videoId | \n",
" videoTitle | \n",
" videoCategoryId | \n",
" videoPublishTime | \n",
" videoDuration | \n",
" videoTags | \n",
" videoViews | \n",
" videoLikes | \n",
" videoDislikes | \n",
" videoCommentCount | \n",
" videoDescription | \n",
" videoLicensed | \n",
" channelName | \n",
" channelId | \n",
" channelDescription | \n",
" channelPublishedAt | \n",
" channelViewCount | \n",
" channelSubsCount | \n",
" channelVideoCount | \n",
" thumbnail_link | \n",
" comments_disabled | \n",
" ratings_disabled | \n",
" video_error_or_removed | \n",
" publishedDateCorrectFormat | \n",
" trendingDateCorrectFormat | \n",
" dayDifference | \n",
" publishedZTime | \n",
" publishedZTimeFloat | \n",
" publishedDayOfWeek | \n",
" newOrOldData | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" US | \n",
" 1 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" h0U2QUGKbSE | \n",
" Kanye West – Wash Us In The Blood feat. Travis... | \n",
" 22 | \n",
" 2020-06-30T14:00:11Z | \n",
" PT3M42S | \n",
" ['kanye', 'kanye west', 'ye', 'yeezus', 'yeezy... | \n",
" 4214043.0 | \n",
" 269209.0 | \n",
" 13107.0 | \n",
" 22665.0 | \n",
" Stream/Download “Wash Us In The Blood” ft. Tra... | \n",
" False | \n",
" Kanye West | \n",
" UCs6eXM7s8Vl5WcECcRHc2qQ | \n",
" NaN | \n",
" 2006-01-10T22:52:29Z | \n",
" 40986002 | \n",
" 6200000 | \n",
" 9 | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" 30-06-20 | \n",
" 01-07-20 | \n",
" 1 | \n",
" 14:00:11 | \n",
" 14.003056 | \n",
" 3 | \n",
" new | \n",
"
\n",
" \n",
" 1 | \n",
" US | \n",
" 2 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" LiB65FQnm6w | \n",
" Cash vs Flight 1v1 Basketball! Shave Beard or ... | \n",
" 24 | \n",
" 2020-06-30T17:53:01Z | \n",
" PT25M42S | \n",
" ['cash vs flight 1v1 basketball', 'flight reac... | \n",
" 1497970.0 | \n",
" 103096.0 | \n",
" 1083.0 | \n",
" 10384.0 | \n",
" Get Your Energy Like Me Using GG! \\nI receive ... | \n",
" True | \n",
" CashNasty | \n",
" UCvyTdLw8SkVmUcHYXSDEGwA | \n",
" I make videos. I make you laugh. I be happy. | \n",
" 2013-06-22T01:48:44Z | \n",
" 758385956 | \n",
" 3630000 | \n",
" 2132 | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" 30-06-20 | \n",
" 01-07-20 | \n",
" 1 | \n",
" 17:53:01 | \n",
" 17.883611 | \n",
" 3 | \n",
" new | \n",
"
\n",
" \n",
" 2 | \n",
" US | \n",
" 3 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" T8pi91qWnRw | \n",
" Moneybagg Yo – Said Sum (Official Music Video) | \n",
" 10 | \n",
" 2020-06-30T16:59:58Z | \n",
" PT2M59S | \n",
" NaN | \n",
" 689435.0 | \n",
" 51327.0 | \n",
" 916.0 | \n",
" 1915.0 | \n",
" Moneybagg Yo's new track 'Said Sum' out now: h... | \n",
" False | \n",
" MoneyBagg Yo | \n",
" UCrdPrDuDCbG8xayk5QkRLQA | \n",
" NaN | \n",
" 2016-10-06T03:39:25Z | \n",
" 344603686 | \n",
" 1300000 | \n",
" 84 | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" 30-06-20 | \n",
" 01-07-20 | \n",
" 1 | \n",
" 16:59:58 | \n",
" 16.999444 | \n",
" 3 | \n",
" new | \n",
"
\n",
" \n",
" 3 | \n",
" US | \n",
" 4 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" rAxNSpO78fE | \n",
" I Surprised Brent Rivera With A Custom iPad Pr... | \n",
" 24 | \n",
" 2020-06-30T16:59:43Z | \n",
" PT13M59S | \n",
" NaN | \n",
" 2863644.0 | \n",
" 212914.0 | \n",
" 3235.0 | \n",
" 64229.0 | \n",
" This was Insane, I Can't Believe We Did This F... | \n",
" True | \n",
" ZHC | \n",
" UClQubH2NeMmGLTLgNdLBwXg | \n",
" Thanks for subscribing! Its my mission to make... | \n",
" 2013-08-07T03:22:54Z | \n",
" 914474888 | \n",
" 13700000 | \n",
" 272 | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" 30-06-20 | \n",
" 01-07-20 | \n",
" 1 | \n",
" 16:59:43 | \n",
" 16.995278 | \n",
" 3 | \n",
" new | \n",
"
\n",
" \n",
" 4 | \n",
" US | \n",
" 5 | \n",
" 2020-07-01 15:09:45.453481 | \n",
" mrF-KjnDTxc | \n",
" Genoa 1-3 Juventus | Dybala, CR7 & Douglas Cos... | \n",
" 17 | \n",
" 2020-06-30T22:30:01Z | \n",
" PT4M14S | \n",
" ['Dybala', 'CR7', 'Douglas Costa', 'Genoa', 'D... | \n",
" 4259625.0 | \n",
" 106818.0 | \n",
" 1644.0 | \n",
" 4661.0 | \n",
" Juventus re-open their four point lead at the ... | \n",
" True | \n",
" Serie A | \n",
" UCBJeMCIeLQos7wacox4hmLQ | \n",
" Welcome to the Official Serie A channel. Over ... | \n",
" 2012-10-30T13:54:30Z | \n",
" 1406914395 | \n",
" 4660000 | \n",
" 18934 | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" notAvailable | \n",
" 30-06-20 | \n",
" 01-07-20 | \n",
" 1 | \n",
" 22:30:01 | \n",
" 22.500278 | \n",
" 3 | \n",
" new | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" regionTrending trendingRank timeFetched videoId \\\n",
"0 US 1 2020-07-01 15:09:45.453481 h0U2QUGKbSE \n",
"1 US 2 2020-07-01 15:09:45.453481 LiB65FQnm6w \n",
"2 US 3 2020-07-01 15:09:45.453481 T8pi91qWnRw \n",
"3 US 4 2020-07-01 15:09:45.453481 rAxNSpO78fE \n",
"4 US 5 2020-07-01 15:09:45.453481 mrF-KjnDTxc \n",
"\n",
" videoTitle videoCategoryId \\\n",
"0 Kanye West – Wash Us In The Blood feat. Travis... 22 \n",
"1 Cash vs Flight 1v1 Basketball! Shave Beard or ... 24 \n",
"2 Moneybagg Yo – Said Sum (Official Music Video) 10 \n",
"3 I Surprised Brent Rivera With A Custom iPad Pr... 24 \n",
"4 Genoa 1-3 Juventus | Dybala, CR7 & Douglas Cos... 17 \n",
"\n",
" videoPublishTime videoDuration \\\n",
"0 2020-06-30T14:00:11Z PT3M42S \n",
"1 2020-06-30T17:53:01Z PT25M42S \n",
"2 2020-06-30T16:59:58Z PT2M59S \n",
"3 2020-06-30T16:59:43Z PT13M59S \n",
"4 2020-06-30T22:30:01Z PT4M14S \n",
"\n",
" videoTags videoViews videoLikes \\\n",
"0 ['kanye', 'kanye west', 'ye', 'yeezus', 'yeezy... 4214043.0 269209.0 \n",
"1 ['cash vs flight 1v1 basketball', 'flight reac... 1497970.0 103096.0 \n",
"2 NaN 689435.0 51327.0 \n",
"3 NaN 2863644.0 212914.0 \n",
"4 ['Dybala', 'CR7', 'Douglas Costa', 'Genoa', 'D... 4259625.0 106818.0 \n",
"\n",
" videoDislikes videoCommentCount \\\n",
"0 13107.0 22665.0 \n",
"1 1083.0 10384.0 \n",
"2 916.0 1915.0 \n",
"3 3235.0 64229.0 \n",
"4 1644.0 4661.0 \n",
"\n",
" videoDescription videoLicensed \\\n",
"0 Stream/Download “Wash Us In The Blood” ft. Tra... False \n",
"1 Get Your Energy Like Me Using GG! \\nI receive ... True \n",
"2 Moneybagg Yo's new track 'Said Sum' out now: h... False \n",
"3 This was Insane, I Can't Believe We Did This F... True \n",
"4 Juventus re-open their four point lead at the ... True \n",
"\n",
" channelName channelId \\\n",
"0 Kanye West UCs6eXM7s8Vl5WcECcRHc2qQ \n",
"1 CashNasty UCvyTdLw8SkVmUcHYXSDEGwA \n",
"2 MoneyBagg Yo UCrdPrDuDCbG8xayk5QkRLQA \n",
"3 ZHC UClQubH2NeMmGLTLgNdLBwXg \n",
"4 Serie A UCBJeMCIeLQos7wacox4hmLQ \n",
"\n",
" channelDescription channelPublishedAt \\\n",
"0 NaN 2006-01-10T22:52:29Z \n",
"1 I make videos. I make you laugh. I be happy. 2013-06-22T01:48:44Z \n",
"2 NaN 2016-10-06T03:39:25Z \n",
"3 Thanks for subscribing! Its my mission to make... 2013-08-07T03:22:54Z \n",
"4 Welcome to the Official Serie A channel. Over ... 2012-10-30T13:54:30Z \n",
"\n",
" channelViewCount channelSubsCount channelVideoCount thumbnail_link \\\n",
"0 40986002 6200000 9 notAvailable \n",
"1 758385956 3630000 2132 notAvailable \n",
"2 344603686 1300000 84 notAvailable \n",
"3 914474888 13700000 272 notAvailable \n",
"4 1406914395 4660000 18934 notAvailable \n",
"\n",
" comments_disabled ratings_disabled video_error_or_removed \\\n",
"0 notAvailable notAvailable notAvailable \n",
"1 notAvailable notAvailable notAvailable \n",
"2 notAvailable notAvailable notAvailable \n",
"3 notAvailable notAvailable notAvailable \n",
"4 notAvailable notAvailable notAvailable \n",
"\n",
" publishedDateCorrectFormat trendingDateCorrectFormat dayDifference \\\n",
"0 30-06-20 01-07-20 1 \n",
"1 30-06-20 01-07-20 1 \n",
"2 30-06-20 01-07-20 1 \n",
"3 30-06-20 01-07-20 1 \n",
"4 30-06-20 01-07-20 1 \n",
"\n",
" publishedZTime publishedZTimeFloat publishedDayOfWeek newOrOldData \n",
"0 14:00:11 14.003056 3 new \n",
"1 17:53:01 17.883611 3 new \n",
"2 16:59:58 16.999444 3 new \n",
"3 16:59:43 16.995278 3 new \n",
"4 22:30:01 22.500278 3 new "
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reshape the scraped rows: fix the misspelled header, add placeholder columns and\n",
"# derive the date / time columns from videoPublishTime and timeFetched.\n",
"pd.options.display.max_columns = None\n",
"\n",
"numRows = len(combined_csv.index)\n",
"print(numRows)\n",
"\n",
"# The input CSVs spell this header 'vieoLicenced'.\n",
"combined_csv = combined_csv.rename(columns={'vieoLicenced': 'videoLicensed'})\n",
"\n",
"# Columns filled with a constant placeholder for these rows.\n",
"for placeholderColumn in ('thumbnail_link', 'comments_disabled',\n",
"                          'ratings_disabled', 'video_error_or_removed'):\n",
"    combined_csv[placeholderColumn] = 'notAvailable'\n",
"\n",
"# videoPublishTime looks like 2020-06-30T14:00:11Z: split once into date and clock time.\n",
"publishSplit = combined_csv['videoPublishTime'].str.split('T', n=1)\n",
"publishedDate = pd.to_datetime(publishSplit.str[0], format='%Y-%m-%d')\n",
"publishedZTime = publishSplit.str[1].str.replace('Z', '', regex=False)\n",
"\n",
"# hh:mm:ss -> fractional hours (hours + minutes/60 + seconds/3600)\n",
"clockSplit = publishedZTime.str.split(':', n=2)\n",
"publishedZTimeFloat = (\n",
"    clockSplit.str[0].astype(float)\n",
"    + clockSplit.str[1].astype(float) / 60\n",
"    + clockSplit.str[2].astype(float) / 3600\n",
")\n",
"\n",
"# timeFetched looks like 2020-07-01 15:09:45.453481: only the date part is used.\n",
"fetchedDate = pd.to_datetime(\n",
"    combined_csv['timeFetched'].str.split(' ', n=1).str[0], format='%Y-%m-%d'\n",
")\n",
"\n",
"# NOTE(review): these two columns are dd-mm-yy strings, while the finalOldData.xlsx rows\n",
"# displayed at the end of the notebook show full timestamps in the same columns -\n",
"# confirm the mixed formats are intended before relying on them downstream.\n",
"combined_csv['publishedDateCorrectFormat'] = publishedDate.dt.strftime('%d-%m-%y')\n",
"combined_csv['trendingDateCorrectFormat'] = fetchedDate.dt.strftime('%d-%m-%y')\n",
"\n",
"# Whole calendar days between the publish date and the fetch date.\n",
"combined_csv['dayDifference'] = (fetchedDate - publishedDate).dt.days\n",
"\n",
"# publishedZTime column - format hh:mm:ss\n",
"combined_csv['publishedZTime'] = publishedZTime\n",
"\n",
"# publishedZTimeFloat column - converted to float\n",
"combined_csv['publishedZTimeFloat'] = publishedZTimeFloat\n",
"\n",
"# publishedDayOfWeek column - 1 = Sunday ... 7 = Saturday\n",
"# (Series.dt.weekday counts from Monday = 0, hence the shift)\n",
"combined_csv['publishedDayOfWeek'] = (publishedDate.dt.weekday + 1) % 7 + 1\n",
"\n",
"# newOrOldData column\n",
"combined_csv['newOrOldData'] = 'new'\n",
"\n",
"combined_csv.head()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Write the reformatted rows to disk without the row index.\n",
"combined_csv.to_csv(path_or_buf='newDataOnly_csv_newFormat.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Read finalOldData.xlsx into oldData.\n",
"oldData = pd.read_excel(io='finalOldData.xlsx')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# Stack the old rows above the new rows and write the result to disk without the row index.\n",
"combinedOldNew_csv = pd.concat(objs=[oldData, combined_csv])\n",
"combinedOldNew_csv.to_csv(path_or_buf='oldAndNewData.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" regionTrending | \n",
" trendingRank | \n",
" timeFetched | \n",
" videoId | \n",
" videoTitle | \n",
" videoCategoryId | \n",
" videoPublishTime | \n",
" videoDuration | \n",
" videoTags | \n",
" videoViews | \n",
" videoLikes | \n",
" videoDislikes | \n",
" videoCommentCount | \n",
" videoDescription | \n",
" videoLicensed | \n",
" channelName | \n",
" channelId | \n",
" channelDescription | \n",
" channelPublishedAt | \n",
" channelViewCount | \n",
" channelSubsCount | \n",
" channelVideoCount | \n",
" thumbnail_link | \n",
" comments_disabled | \n",
" ratings_disabled | \n",
" video_error_or_removed | \n",
" publishedDateCorrectFormat | \n",
" trendingDateCorrectFormat | \n",
" dayDifference | \n",
" publishedZTime | \n",
" publishedZTimeFloat | \n",
" publishedDayOfWeek | \n",
" newOrOldData | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" US | \n",
" 0 | \n",
" 2017-11-14 00:00:00 | \n",
" 2kyS6SvSYSE | \n",
" WE WANT TO TALK ABOUT OUR MARRIAGE | \n",
" 22 | \n",
" 2017-11-13T17:13:01.000Z | \n",
" notAvailable | \n",
" SHANtell martin | \n",
" 748374.0 | \n",
" 57527.0 | \n",
" 2966.0 | \n",
" 15954.0 | \n",
" SHANTELL'S CHANNEL - https://www.youtube.com/s... | \n",
" notAvailable | \n",
" CaseyNeistat | \n",
" 0 | \n",
" notAvailable | \n",
" notAvailable | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg | \n",
" False | \n",
" False | \n",
" False | \n",
" 2017-11-13 00:00:00 | \n",
" 2017-11-14 00:00:00 | \n",
" 1 | \n",
" 17:13:01 | \n",
" 17.216944 | \n",
" 2 | \n",
" old | \n",
"
\n",
" \n",
" 1 | \n",
" US | \n",
" 0 | \n",
" 2017-11-14 00:00:00 | \n",
" 1ZAPwfrtAFY | \n",
" The Trump Presidency: Last Week Tonight with J... | \n",
" 24 | \n",
" 2017-11-13T07:30:00.000Z | \n",
" notAvailable | \n",
" last week tonight trump presidency|\"last week ... | \n",
" 2418783.0 | \n",
" 97185.0 | \n",
" 6146.0 | \n",
" 12703.0 | \n",
" One year after the presidential election, John... | \n",
" notAvailable | \n",
" LastWeekTonight | \n",
" 0 | \n",
" notAvailable | \n",
" notAvailable | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg | \n",
" False | \n",
" False | \n",
" False | \n",
" 2017-11-13 00:00:00 | \n",
" 2017-11-14 00:00:00 | \n",
" 1 | \n",
" 07:30:00 | \n",
" 7.500000 | \n",
" 2 | \n",
" old | \n",
"
\n",
" \n",
" 2 | \n",
" US | \n",
" 0 | \n",
" 2017-11-14 00:00:00 | \n",
" 5qpjK5DgCt4 | \n",
" Racist Superman | Rudy Mancuso, King Bach & Le... | \n",
" 23 | \n",
" 2017-11-12T19:05:24.000Z | \n",
" notAvailable | \n",
" racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"... | \n",
" 3191434.0 | \n",
" 146033.0 | \n",
" 5339.0 | \n",
" 8181.0 | \n",
" WATCH MY PREVIOUS VIDEO â–¶ \\n\\nSUBSCRIBE â–º ... | \n",
" notAvailable | \n",
" Rudy Mancuso | \n",
" 0 | \n",
" notAvailable | \n",
" notAvailable | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg | \n",
" False | \n",
" False | \n",
" False | \n",
" 2017-11-12 00:00:00 | \n",
" 2017-11-14 00:00:00 | \n",
" 2 | \n",
" 19:05:24 | \n",
" 19.090000 | \n",
" 1 | \n",
" old | \n",
"
\n",
" \n",
" 3 | \n",
" US | \n",
" 0 | \n",
" 2017-11-14 00:00:00 | \n",
" puqaWrEC7tY | \n",
" Nickelback Lyrics: Real or Fake? | \n",
" 24 | \n",
" 2017-11-13T11:00:04.000Z | \n",
" notAvailable | \n",
" rhett and link|\"gmm\"|\"good mythical morning\"|\"... | \n",
" 343168.0 | \n",
" 10172.0 | \n",
" 666.0 | \n",
" 2146.0 | \n",
" Today we find out if Link is a Nickelback amat... | \n",
" notAvailable | \n",
" Good Mythical Morning | \n",
" 0 | \n",
" notAvailable | \n",
" notAvailable | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg | \n",
" False | \n",
" False | \n",
" False | \n",
" 2017-11-13 00:00:00 | \n",
" 2017-11-14 00:00:00 | \n",
" 1 | \n",
" 11:00:04 | \n",
" 11.001111 | \n",
" 2 | \n",
" old | \n",
"
\n",
" \n",
" 4 | \n",
" US | \n",
" 0 | \n",
" 2017-11-14 00:00:00 | \n",
" d380meD0W0M | \n",
" I Dare You: GOING BALD!? | \n",
" 24 | \n",
" 2017-11-12T18:01:41.000Z | \n",
" notAvailable | \n",
" ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"... | \n",
" 2095731.0 | \n",
" 132235.0 | \n",
" 1989.0 | \n",
" 17518.0 | \n",
" I know it's been a while since we did this sho... | \n",
" notAvailable | \n",
" nigahiga | \n",
" 0 | \n",
" notAvailable | \n",
" notAvailable | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://i.ytimg.com/vi/d380meD0W0M/default.jpg | \n",
" False | \n",
" False | \n",
" False | \n",
" 2017-11-12 00:00:00 | \n",
" 2017-11-14 00:00:00 | \n",
" 2 | \n",
" 18:01:41 | \n",
" 18.028056 | \n",
" 1 | \n",
" old | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" regionTrending trendingRank timeFetched videoId \\\n",
"0 US 0 2017-11-14 00:00:00 2kyS6SvSYSE \n",
"1 US 0 2017-11-14 00:00:00 1ZAPwfrtAFY \n",
"2 US 0 2017-11-14 00:00:00 5qpjK5DgCt4 \n",
"3 US 0 2017-11-14 00:00:00 puqaWrEC7tY \n",
"4 US 0 2017-11-14 00:00:00 d380meD0W0M \n",
"\n",
" videoTitle videoCategoryId \\\n",
"0 WE WANT TO TALK ABOUT OUR MARRIAGE 22 \n",
"1 The Trump Presidency: Last Week Tonight with J... 24 \n",
"2 Racist Superman | Rudy Mancuso, King Bach & Le... 23 \n",
"3 Nickelback Lyrics: Real or Fake? 24 \n",
"4 I Dare You: GOING BALD!? 24 \n",
"\n",
" videoPublishTime videoDuration \\\n",
"0 2017-11-13T17:13:01.000Z notAvailable \n",
"1 2017-11-13T07:30:00.000Z notAvailable \n",
"2 2017-11-12T19:05:24.000Z notAvailable \n",
"3 2017-11-13T11:00:04.000Z notAvailable \n",
"4 2017-11-12T18:01:41.000Z notAvailable \n",
"\n",
" videoTags videoViews videoLikes \\\n",
"0 SHANtell martin 748374.0 57527.0 \n",
"1 last week tonight trump presidency|\"last week ... 2418783.0 97185.0 \n",
"2 racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"... 3191434.0 146033.0 \n",
"3 rhett and link|\"gmm\"|\"good mythical morning\"|\"... 343168.0 10172.0 \n",
"4 ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"... 2095731.0 132235.0 \n",
"\n",
" videoDislikes videoCommentCount \\\n",
"0 2966.0 15954.0 \n",
"1 6146.0 12703.0 \n",
"2 5339.0 8181.0 \n",
"3 666.0 2146.0 \n",
"4 1989.0 17518.0 \n",
"\n",
" videoDescription videoLicensed \\\n",
"0 SHANTELL'S CHANNEL - https://www.youtube.com/s... notAvailable \n",
"1 One year after the presidential election, John... notAvailable \n",
"2 WATCH MY PREVIOUS VIDEO â–¶ \\n\\nSUBSCRIBE â–º ... notAvailable \n",
"3 Today we find out if Link is a Nickelback amat... notAvailable \n",
"4 I know it's been a while since we did this sho... notAvailable \n",
"\n",
" channelName channelId channelDescription channelPublishedAt \\\n",
"0 CaseyNeistat 0 notAvailable notAvailable \n",
"1 LastWeekTonight 0 notAvailable notAvailable \n",
"2 Rudy Mancuso 0 notAvailable notAvailable \n",
"3 Good Mythical Morning 0 notAvailable notAvailable \n",
"4 nigahiga 0 notAvailable notAvailable \n",
"\n",
" channelViewCount channelSubsCount channelVideoCount \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 0 0 0 \n",
"\n",
" thumbnail_link comments_disabled \\\n",
"0 https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg False \n",
"1 https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg False \n",
"2 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False \n",
"3 https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg False \n",
"4 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False \n",
"\n",
" ratings_disabled video_error_or_removed publishedDateCorrectFormat \\\n",
"0 False False 2017-11-13 00:00:00 \n",
"1 False False 2017-11-13 00:00:00 \n",
"2 False False 2017-11-12 00:00:00 \n",
"3 False False 2017-11-13 00:00:00 \n",
"4 False False 2017-11-12 00:00:00 \n",
"\n",
" trendingDateCorrectFormat dayDifference publishedZTime \\\n",
"0 2017-11-14 00:00:00 1 17:13:01 \n",
"1 2017-11-14 00:00:00 1 07:30:00 \n",
"2 2017-11-14 00:00:00 2 19:05:24 \n",
"3 2017-11-14 00:00:00 1 11:00:04 \n",
"4 2017-11-14 00:00:00 2 18:01:41 \n",
"\n",
" publishedZTimeFloat publishedDayOfWeek newOrOldData \n",
"0 17.216944 2 old \n",
"1 7.500000 2 old \n",
"2 19.090000 1 old \n",
"3 11.001111 2 old \n",
"4 18.028056 1 old "
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first rows of the combined frame.\n",
"combinedOldNew_csv.head(n=5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.7 64-bit ('mlEnv': conda)",
"language": "python",
"name": "python37764bitmlenvconda75c86b840a424a4e95d50ae2ee417e09"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}