{ "cells": [ { "cell_type": "markdown", "id": "171f9813-9b2f-4a72-afbd-79c1a207de4a", "metadata": {}, "source": [ "# Project 3\n", "\n", "## Part 1: Data Cleaning, Feature Engineering, EDA\n", "\n", "I have my post content, now I need to inspect it and clean it up if necessary." ] }, { "cell_type": "markdown", "id": "97011b79-9a03-49c8-9ba5-e4c05b9fbd77", "metadata": {}, "source": [ "### 0. Imports and Preliminaries" ] }, { "cell_type": "code", "execution_count": 1, "id": "72dde85e-0a46-465f-bfc8-b0efc957aeb1", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from nltk.tokenize import word_tokenize, regexp_tokenize\n", "import string # for punctuation\n", "import ipynb_utils as ipyutils # custom variables and utility functions" ] }, { "cell_type": "code", "execution_count": 2, "id": "e65020e5-25e3-4c7d-aaca-4d1ac6648066", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8718, 6)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# load the scraped posts; orient='index' means the JSON is keyed by row label\n", "data_path = '../data/scrapes.json'\n", "post_df = pd.read_json(path_or_buf=data_path, orient='index')\n", "post_df.shape  # (rows, columns) as a quick sanity check" ] }, { "cell_type": "code", "execution_count": 3, "id": "0b5366b2-aca4-402a-9c8f-d2ac33d63dd1", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uidtimetitlebody-textmediacomments
045Newbiequestionsaboutascendantsandborders6012022-09-05Newbie questions about ascendants and bordersI'm new to actually learning astrology, not ju...0
1100Thousandsofunchartedplanetsatyourfingertips...Thousands of uncharted planets at your fingert...True0
234Astrologyandcognitivedissonance3232022-09-05Astrology and cognitive dissonanceOpen to anyone who wouldn't mind sharing a rec...1
338whatdoy’allthinkofpersonacharts?1802022-09-05what do y’all think of persona charts?I feel a bit skeptical of them, since I feel l...0
4160RESOURCEREQUEST:Videos(orarticles)withtips/...2022-09-05RESOURCE REQUEST: Videos (or articles) with ti...I think my problem is that I don’t know the pr...2
\n", "
" ], "text/plain": [ " uid time \\\n", "0 45Newbiequestionsaboutascendantsandborders601 2022-09-05 \n", "1 100Thousandsofunchartedplanetsatyourfingertips... \n", "2 34Astrologyandcognitivedissonance323 2022-09-05 \n", "3 38whatdoy’allthinkofpersonacharts?180 2022-09-05 \n", "4 160RESOURCEREQUEST:Videos(orarticles)withtips/... 2022-09-05 \n", "\n", " title \\\n", "0 Newbie questions about ascendants and borders \n", "1 Thousands of uncharted planets at your fingert... \n", "2 Astrology and cognitive dissonance \n", "3 what do y’all think of persona charts? \n", "4 RESOURCE REQUEST: Videos (or articles) with ti... \n", "\n", " body-text media comments \n", "0 I'm new to actually learning astrology, not ju... 0 \n", "1 True 0 \n", "2 Open to anyone who wouldn't mind sharing a rec... 1 \n", "3 I feel a bit skeptical of them, since I feel l... 0 \n", "4 I think my problem is that I don’t know the pr... 2 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "post_df.head()" ] }, { "cell_type": "markdown", "id": "de32996d-24a6-41aa-a4ee-cdc71ee01938", "metadata": {}, "source": [ "### 1. EDA and Cleaning" ] }, { "cell_type": "code", "execution_count": 4, "id": "87e0fb52-7e99-459a-9d39-4595a57882ba", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uidtimetitlebody-textmediacomments
1100Thousandsofunchartedplanetsatyourfingertips...Thousands of uncharted planets at your fingert...True0
9136GetthefastestFiberInternetinthetri-statefro...Get the fastest Fiber Internet in the tri-stat...True0
17214Thebestplayersinthesoccerworldcometogetherf...The best players in the soccer world come toge...True0
25254Yougottabereadyforanythingifyouwanttokeepup...You gotta be ready for anything if you want to...True0
40105Getfiredupforthebestseasonyetwithnewsoccerc...Get fired up for the best season yet with new ...True0
\n", "
" ], "text/plain": [ " uid time \\\n", "1 100Thousandsofunchartedplanetsatyourfingertips... \n", "9 136GetthefastestFiberInternetinthetri-statefro... \n", "17 214Thebestplayersinthesoccerworldcometogetherf... \n", "25 254Yougottabereadyforanythingifyouwanttokeepup... \n", "40 105Getfiredupforthebestseasonyetwithnewsoccerc... \n", "\n", " title body-text media \\\n", "1 Thousands of uncharted planets at your fingert... True \n", "9 Get the fastest Fiber Internet in the tri-stat... True \n", "17 The best players in the soccer world come toge... True \n", "25 You gotta be ready for anything if you want to... True \n", "40 Get fired up for the best season yet with new ... True \n", "\n", " comments \n", "1 0 \n", "9 0 \n", "17 0 \n", "25 0 \n", "40 0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rows whose timestamp is an empty string - are these all ads?\n", "post_df.loc[post_df['time'].eq('')].head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "a4e5e9aa-8112-4a8b-aff0-b0426ef80677", "metadata": {}, "outputs": [], "source": [ "# keep a boolean mask for the timestamp-less rows that might be ads\n", "adfilter = post_df['time'].eq('')" ] }, { "cell_type": "code", "execution_count": 6, "id": "497e70ca-86dd-42d3-aa62-cf631df6f98e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 Thousands of uncharted planets at your fingert...\n", "9 Get the fastest Fiber Internet in the tri-stat...\n", "17 The best players in the soccer world come toge...\n", "25 You gotta be ready for anything if you want to...\n", "40 Get fired up for the best season yet with new ...\n", " ... 
\n", "2816 Fan of sports?⚽🏀🏈⚾ Enjoy the sports betting ex...\n", "2817 The only thing more nerve-racking than proposi...\n", "2818 Explore new ways to play with hundreds of game...\n", "2819 Stem cell therapy is safe over the long term a...\n", "2820 I'm developing an open-world space ARPG, comin...\n", "Name: title, Length: 305, dtype: object" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# inspect these rows in a bit more detail\n", "# (.loc selects rows and column in one step instead of chained indexing)\n", "post_df.loc[adfilter, 'title']" ] }, { "cell_type": "code", "execution_count": 7, "id": "35861fa9-bc47-44d6-83d8-747f98153a12", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8413, 6)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# conclusion - all the posts with no timestamp are ads.\n", "# Even if they aren't all ads, I'm willing to forgo this smallish subset\n", "# of entries that mostly sound suspiciously like ads.\n", "# drop() returns a new frame; rebinding the name (rather than inplace=True)\n", "# avoids mutating a frame that earlier cells have already displayed\n", "post_df = post_df.drop(index=post_df[adfilter].index)\n", "post_df.shape" ] }, { "cell_type": "code", "execution_count": 8, "id": "df4bfa0d-152f-4a44-a406-45c3ed2cc2e0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "uid object\n", "time object\n", "title object\n", "body-text object\n", "media int64\n", "comments int64\n", "dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# convert media column to a 0/1 integer flag.\n", "# Comparing on the string form accepts the 'True' strings the original\n", "# comparison looked for as well as real booleans, and keeps existing 1s\n", "# intact if this cell is run a second time (a plain `== 'True'` test\n", "# would turn an already-converted column into all zeros on a re-run).\n", "post_df['media'] = post_df['media'].astype(str).isin(['True', '1']).astype(int)\n", "post_df.dtypes" ] }, { "cell_type": "markdown", "id": "bf0a5d35-5027-40b5-b30f-c6cc260033bf", "metadata": {}, "source": [ "Columns are about as clean as can be for now." ] }, { "cell_type": "markdown", "id": "d3e70db7-bc16-46be-a22b-846d3e01d48c", "metadata": {}, "source": [ "### 2. 
Feature Engineering\n", "\n", "Some useful features to have at my fingertips: title character count (`title-cc`), title word count (`title-wc`), body character count (`body-cc`), and body word count (`body-wc`)." ] }, { "cell_type": "code", "execution_count": 9, "id": "18d8c4c9-7a0c-4951-b468-b7b59be40036", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
uidtimetitlebody-textmediacomments
045Newbiequestionsaboutascendantsandborders6012022-09-05Newbie questions about ascendants and bordersI'm new to actually learning astrology, not ju...00
234Astrologyandcognitivedissonance3232022-09-05Astrology and cognitive dissonanceOpen to anyone who wouldn't mind sharing a rec...01
338whatdoy’allthinkofpersonacharts?1802022-09-05what do y’all think of persona charts?I feel a bit skeptical of them, since I feel l...00
4160RESOURCEREQUEST:Videos(orarticles)withtips/...2022-09-05RESOURCE REQUEST: Videos (or articles) with ti...I think my problem is that I don’t know the pr...02
564peoplewhohavehadsaturntransittheir10th,whatw...2022-09-05people who have had saturn transit their 10th,...How did it affect your career? Did it impact y...011
\n", "
" ], "text/plain": [ " uid time \\\n", "0 45Newbiequestionsaboutascendantsandborders601 2022-09-05 \n", "2 34Astrologyandcognitivedissonance323 2022-09-05 \n", "3 38whatdoy’allthinkofpersonacharts?180 2022-09-05 \n", "4 160RESOURCEREQUEST:Videos(orarticles)withtips/... 2022-09-05 \n", "5 64peoplewhohavehadsaturntransittheir10th,whatw... 2022-09-05 \n", "\n", " title \\\n", "0 Newbie questions about ascendants and borders \n", "2 Astrology and cognitive dissonance \n", "3 what do y’all think of persona charts? \n", "4 RESOURCE REQUEST: Videos (or articles) with ti... \n", "5 people who have had saturn transit their 10th,... \n", "\n", " body-text media comments \n", "0 I'm new to actually learning astrology, not ju... 0 0 \n", "2 Open to anyone who wouldn't mind sharing a rec... 0 1 \n", "3 I feel a bit skeptical of them, since I feel l... 0 0 \n", "4 I think my problem is that I don’t know the pr... 0 2 \n", "5 How did it affect your career? Did it impact y... 0 11 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "post_df.head()" ] }, { "cell_type": "code", "execution_count": 10, "id": "cf74699f-189b-4a53-b814-d5dd36da757c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'(?u)\\\\b\\\\w\\\\w+\\\\b'" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ipyutils.PAT_TOKEN # settled on using the same as CountVectorizer default" ] }, { "cell_type": "code", "execution_count": 11, "id": "6a157d00-ce0b-4ec6-a6d5-62192d10335c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 6\n", "2 4\n", "Name: title-wc, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# title word count: number of PAT_TOKEN matches found in each title\n", "post_df['title-wc'] = post_df['title'].map(\n", "    lambda title: len(regexp_tokenize(title, ipyutils.PAT_TOKEN)))\n", "post_df['title-wc'].head(2)" ] }, { "cell_type": "code", "execution_count": 12, "id": "8b41255e-aedc-4a40-ac33-8541d1e0f3e4", 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 45\n", "2 34\n", "Name: title-cc, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# title character count: len() of each title string\n", "post_df['title-cc'] = post_df['title'].map(len)\n", "post_df['title-cc'].head(2)" ] }, { "cell_type": "code", "execution_count": 13, "id": "a5ab804b-40f2-4868-a6b7-c87114b2f477", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 107\n", "2 49\n", "3 29\n", "4 94\n", "5 22\n", "Name: body-wc, dtype: int64" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# body word count: number of PAT_TOKEN matches found in each post body\n", "post_df['body-wc'] = post_df['body-text'].map(\n", "    lambda body: len(regexp_tokenize(body, ipyutils.PAT_TOKEN)))\n", "post_df['body-wc'].head()" ] }, { "cell_type": "code", "execution_count": 14, "id": "b0004f82-8d03-4bc6-a42d-af09307af5b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 601\n", "2 323\n", "3 180\n", "4 597\n", "5 116\n", "Name: body-cc, dtype: int64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# body character count: len() of each post body string\n", "post_df['body-cc'] = post_df['body-text'].map(len)\n", "post_df['body-cc'].head()" ] }, { "cell_type": "markdown", "id": "3d8451b3-93db-4001-91f4-a8e73521dbab", "metadata": {}, "source": [ "### Write Clean Data To Disk" ] }, { "cell_type": "code", "execution_count": 15, "id": "cd46c88b-74e6-4aac-83aa-79a5592623f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['time', 'title', 'body-text', 'title-cc', 'title-wc', 'body-cc',\n", " 'body-wc', 'media', 'comments'],\n", " dtype='object')" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reorganize columns and remove uid column - only needed it to find duplicates\n", "post_df = post_df[['time','title','body-text',\n", " 'title-cc','title-wc','body-cc','body-wc',\n", " 'media','comments']]\n", 
"post_df.columns" ] }, { "cell_type": "code", "execution_count": 16, "id": "5caa4187-3fa3-4779-93fd-e9c86a74b428", "metadata": {}, "outputs": [], "source": [ "# Renumber the rows 0..n-1 before saving: dropping the ad rows left gaps in\n", "# the index. reset_index() returns a new frame rather than modifying\n", "# post_df, so the result has to be assigned back; drop=True discards the old\n", "# labels instead of adding them as an extra 'index' column.\n", "post_df = post_df.reset_index(drop=True)\n", "# orient='index' writes one JSON entry per row, keyed by the row label\n", "post_df.to_json('../data/scrapes-clean.json', orient='index')" ] }, { "cell_type": "code", "execution_count": 17, "id": "cd3e5565-e7cf-430c-a813-1d5a1fc1622a", "metadata": {}, "outputs": [], "source": [ "# END" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.5" } }, "nbformat": 4, "nbformat_minor": 5 }