{ "cells": [ { "cell_type": "markdown", "id": "18cce9c7-72d8-4b79-b7a6-33952df71d25", "metadata": {}, "source": [ "# Update Database Structure\n", "\n", "Changes in the data stored and format will affect how the information is processed and stored. An update method was created to change the storage.\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "e69e1dc0-e361-4626-8bb8-58e31d5c8374", "metadata": {}, "outputs": [], "source": [ "import ipywidgets as widgets\n", "from IPython.core.display import display, HTML, update_display\n", "import json, os, pickle\n", "from random import seed, randint\n", "from tweet_requester.analysis import TweetAnalyzer\n", "from tweet_requester.display import TweetInteractiveClassifier, \\\n", "JsonLInteractiveClassifier, TSess, prepare_google_credentials, PROCESSING_STAGES, logging\n", "from twitter_secrets import C_BEARER_TOKEN \n", "JL_DATA=\"./tweetsRickyRenuncia-final.jsonl\"\n", "BASE_DIR=\"./Evaluating Content\"\n", "# Update database\n", "#April 30, 2021 the RR team rehydrated with twarc their data.\n", "april302021 = 1619755200.0\n", "# git_commit=\"9219b7a01ce28f5bc0d61c913b3f914f967614fd\"\n", "git_commit=\"2ac78595cceef98a56c518c24f2187360e1527e3\"\n", "tweet_session = TSess(\n", " C_BEARER_TOKEN, \n", " compression_level=5, \n", " sleep_time=3, \n", " cache_dir=\"./tweet_cache/\", \n", " hash_split=True\n", " )\n", "google_credentials = prepare_google_credentials(\n", " credentials_file=\"./google_translate_keys.json\"\n", ")" ] }, { "cell_type": "code", "execution_count": 2, "id": "68b8894e-acb9-4dbd-959e-e08077ea00fb", "metadata": {}, "outputs": [], "source": [ "classifier = JsonLInteractiveClassifier(\n", " tweet_ids_file=\"tweetsRickyRenuncia-final.txt\", \n", " session=tweet_session, mute=True, \n", " google_credentials=google_credentials,\n", " pre_initialized=True, sqlite_db=\"tweets.db\"\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "656b2e20-7462-484a-9623-caafdfaf573c", "metadata": 
{}, "outputs": [], "source": [ "classifier.close()" ] }, { "cell_type": "code", "execution_count": 4, "id": "338b9ea0-9b85-4a9f-a5b1-c9970b132e45", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:Database version is 0.3 >= 0.2. Skipping update.\n", "WARNING:root:Database version is greater than expected 0.3 > 0.2. This update does not apply.\n" ] } ], "source": [
  "import logging\n",
  "\n",
  "# Show the warnings the update helpers log when a step does not apply.\n",
  "logging.basicConfig(level=logging.WARNING)\n",
  "\n",
  "# Run the schema updates in version order: 0.1 -> 0.2 -> 0.3 -> 0.4.\n",
  "classifier.update_database_v01_v02(dateCreated=april302021, git_commit=git_commit)\n",
  "classifier.update_database_v02_v03(git_commit=git_commit)\n",
  "classifier.update_database_v03_v04(git_commit=git_commit)"
 ] }, { "cell_type": "code", "execution_count": 4, "id": "8d5eb91e-7142-4cca-9971-55e406cef2ee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
  "         PROCESSING_STAGE | COUNT   \n",
  "------------------------- | --------\n",
  "              UNPROCESSED | 493031  \n",
  "                REVIEWING | 373     \n",
  "                FINALIZED | 68      \n",
  "     UNAVAILABLE_EMBEDING | 1328    \n",
  "                  RETWEET | 2814    \n",
  "             PREPROCESSED | 2714    \n",
  "\n",
  "\n",
  "Sample: \n",
  "\t ('1002186716046864386', 6)\n",
  "\t ('1102716035176775681', 6)\n",
  "\t ('1138785914757533696', 6)\n",
  "\t ('1148321742697504769', 6)\n",
  "\t ('1149490876592218113', 6)\n"
 ] } ], "source": [
  "# How many sample rows to print after the per-stage table.\n",
  "SAMPLE_SIZE = 5\n",
  "\n",
  "classifier.connect()\n",
  "cur = classifier.cursor()\n",
  "\n",
  "# Count the tweets in each processing stage.\n",
  "cur.execute(\"\"\"\n",
  "SELECT state, count(*) from tweet\n",
  "GROUP BY state ORDER BY state;\"\"\")\n",
  "rows = cur.fetchall()\n",
  "\n",
  "row_format = \"{:>25} | {:<8}\"\n",
  "print(row_format.format(\"PROCESSING_STAGE\", \"COUNT\"))\n",
  "print(row_format.format(\"-\"*25, \"-\"*8))\n",
  "for state, count in rows:\n",
  "    # Translate the stored numeric state into its PROCESSING_STAGES name.\n",
  "    print(row_format.format(PROCESSING_STAGES(state).name, count))\n",
  "\n",
  "# Fetch the rows that are currently in the PREPROCESSED stage.\n",
  "cur.execute(\"\"\"\n",
  "SELECT * from tweet\n",
  "WHERE tweet_id in (\n",
  "SELECT tweet_id FROM tweet\n",
  "WHERE state in (?));\"\"\",\n",
  "(PROCESSING_STAGES.PREPROCESSED.value,))\n",
  "rows_sample = cur.fetchall()\n",
  "# Everything is already fetched, so the cursor can be released before printing.\n",
  "cur.close()\n",
  "\n",
  "print(\"\\n\\nSample: \")\n",
  "for row in rows_sample[:SAMPLE_SIZE]:\n",
  "    print(\"\\t\", row)"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "3b154777-39a3-4776-966c-6f13d62f0b19", "metadata": {}, "outputs": [ { "data": { "text/html": [ "

“Lárgate para el carajo pa’ la China o pa’ el Japón” 😂😂😂#RickyRenuncia pic.twitter.com/anixxn5PUO

— JunyPR3 (@herreraevanol) July 15, 2019
\n", "\n", "

Translation to 'en':
"Get off to hell for China or for Japan" 😂😂😂 #RickyRenuncia https://t.co/anixxn5PUO\n", "

#RickyRenuncia TRENDING a nivel MUNDIAL. ¡No te quites boricua!🇵🇷✊🏼 pic.twitter.com/9oZopQC89y

— Fulanita (@YanieYampier) July 15, 2019
\n", "\n", "

Translation to 'en':
#RickyRenuncia TRENDING WORLDWIDE. Don't take off boricua! 🇵🇷✊🏼 https://t.co/9oZopQC89y\n", "

#RickyRenuncia #PUTA
Canción completa: https://t.co/iqqykDyibz pic.twitter.com/U1mtZeNwzp

— PJ Sin Suela (@pjsinsuela) July 15, 2019
\n", "\n", "

Translation to 'en':
#RickyRenuncia #PUTA Full song: https://t.co/iqqykDyibz https://t.co/U1mtZeNwzp\n", "

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "classifier.display_accepted(page=3, per_page=3)" ] }, { "cell_type": "code", "execution_count": 5, "id": "4ade538e-a06e-40b8-8654-aa3fa48ac23f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "

Thank you!

Exited from evaluation

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "classifier.StartEvaluations()" ] }, { "cell_type": "code", "execution_count": 6, "id": "dfb9563f-9c6c-466a-b686-448fc76bd9dc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "('1150839690184069127', 1)\n", "('1150842425390317573', 1)\n", "('1150848287299244036', 1)\n", "('1150849765376241664', 1)\n", "('1150850130704248833', 1)\n", "('1150857492173398018', 1)\n", "('1150859636851040256', 1)\n", "('1150862160760909824', 1)\n", "('1150862260237164545', 1)\n", "('1150863149429592064', 1)\n" ] } ], "source": [ "classifier.connect()\n", "cur = classifier.cursor()\n", "cur.execute(\"\"\"\n", "SELECT * from tweet\n", "WHERE tweet_id in (\n", "SELECT tweet_id FROM tweet\n", "WHERE state in (?));\"\"\",\n", "(PROCESSING_STAGES.REVIEWING.value,))\n", "\n", "rows = cur.fetchall()\n", "n=0\n", "cur.close()\n", "for row in rows:\n", " print(row)\n", " classifier.tweet_set_state(\n", " tweet_id=row[0],\n", " state=PROCESSING_STAGES.UNPROCESSED\n", " )\n", " n+=1\n", " if n > 9:\n", " break\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "c88fbd7b-9c59-4c00-ae92-4375ac2a8202", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "

MAÑANA todoooosss para SJ!!!! Y si no puedes ir explota tus redes con #RickyRenuncia, no se queden callados. Esto es cuestión de todos poner un granito de arena, dejen las excusas y APOYEN. pic.twitter.com/S9Jtje2KO4

— Ro (@rfxvi) July 16, 2019
\n", "\n", "

Translation to 'en':
TOMORROW todoooosss for SJ !!!! And if you can't go exploit your networks with #RickyRenuncia , do not be silent. This is a matter for everyone to put a grain of sand, leave the excuses and SUPPORT. https://t.co/S9Jtje2KO4\n", "

Las paredes ya están pintadas, el país sigue jodío y Ricky no ha renunciado #RickyRenuncia pic.twitter.com/34VfA3qQ6Z

— Edgo (@edgo787) July 16, 2019
\n", "\n", "

Translation to 'en':
The walls are already painted, the country is still screwed and Ricky has not given up #RickyRenuncia https://t.co/34VfA3qQ6Z\n", "

#RickyRenuncia https://t.co/hwPQSAbaCT

— Davinchi Almodovar (@eldavinchi) July 16, 2019
\n", "\n", "

Translation to 'en':
#RickyRenuncia https://t.co/hwPQSAbaCT\n", "

I had 8 questions for the embattled Governor of Puerto Rico. Here’s how he responded. pic.twitter.com/7OwX5Ja5B8

— David Begnaud (@DavidBegnaud) July 16, 2019
\n", "\n", "

Translation to 'en':
I had 8 questions for the embattled Governor of Puerto Rico. Here’s how he responded. https://t.co/7OwX5Ja5B8\n", "

El ambiente en Mayagüez ahora mismo. #RickyRenunciaYa #RickyRenuncia pic.twitter.com/NLKDvQ5tzO

— Mulata (@NataliaNicole) July 16, 2019
\n", "\n", "

Translation to 'en':
The atmosphere in Mayagüez right now. #RickyRenunciaYa #RickyRenuncia https://t.co/NLKDvQ5tzO\n", "

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "page=5\n", "per_page=5\n", "classifier.display_accepted(page=page, per_page=per_page)" ] }, { "cell_type": "code", "execution_count": null, "id": "2e9716bc-96bd-46df-a63f-49a0e20489a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "

Preprocessed 94

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from datetime import datetime\n", "from time import sleep\n", "import logging\n", "last_pull=datetime.now().timestamp()-900\n", "current_time=end = datetime.now().timestamp()\n", "while True:\n", " if current_time - last_pull > 900:\n", " start_pull = datetime.now().timestamp()\n", " try:\n", " classifier.preprocess_batch(n=150)\n", " except Exception as err:\n", " logging.error(err)\n", " break\n", " # Average the download time to the middle of the transaction.\n", " last_pull = (start_pull + datetime.now().timestamp())/2.0\n", " else:\n", " current_time = datetime.now().timestamp()\n", " # sleep for time left for 15 minutes\n", " sleep(900 - (current_time - last_pull))\n", " current_time = datetime.now().timestamp()" ] }, { "cell_type": "code", "execution_count": 8, "id": "f10890a6-4c44-4166-80fc-d3caf2a8cc01", "metadata": {}, "outputs": [ { "data": { "text/html": [ "

Preprocessed 250

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "classifier.preprocess_batch(n=250)" ] }, { "cell_type": "code", "execution_count": 10, "id": "9250987c-b836-49ae-94e1-5a8e0e70f555", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting tweet-requester\n", " Downloading tweet_requester-0.0.1-py3-none-any.whl (21 kB)\n", "Collecting google-cloud-translate>=3.3.1\n", " Downloading google_cloud_translate-3.3.2-py2.py3-none-any.whl (104 kB)\n", "\u001b[K |████████████████████████████████| 104 kB 367 kB/s eta 0:00:01\n", "\u001b[?25hCollecting ipython>=7.25.0\n", " Downloading ipython-7.26.0-py3-none-any.whl (786 kB)\n", "\u001b[K |████████████████████████████████| 786 kB 443 kB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: proto-plus>=0.4.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-cloud-translate>=3.3.1->tweet-requester) (1.19.0)\n", "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.3.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-cloud-translate>=3.3.1->tweet-requester) (1.7.1)\n", "Requirement already satisfied: google-api-core[grpc]<3.0.0dev,>=1.26.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-cloud-translate>=3.3.1->tweet-requester) (1.31.0)\n", "Requirement already satisfied: packaging>=14.3 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-cloud-translate>=3.3.1->tweet-requester) (20.9)\n", "Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (2.25.1)\n", "Requirement already satisfied: setuptools>=40.3.0 in 
/home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (57.4.0)\n", "Requirement already satisfied: pytz in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (2021.1)\n", "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (1.53.0)\n", "Requirement already satisfied: protobuf>=3.12.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (3.17.3)\n", "Requirement already satisfied: google-auth<2.0dev,>=1.25.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (1.33.0)\n", "Requirement already satisfied: six>=1.13.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (1.15.0)\n", "Requirement already satisfied: grpcio<2.0dev,>=1.29.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (1.38.1)\n", "Requirement already satisfied: rsa<5,>=3.1.4 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-auth<2.0dev,>=1.25.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (4.7.2)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from 
google-auth<2.0dev,>=1.25.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (0.2.8)\n", "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from google-auth<2.0dev,>=1.25.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (4.2.2)\n", "Requirement already satisfied: pexpect>4.3 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (4.8.0)\n", "Requirement already satisfied: backcall in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (0.2.0)\n", "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (3.0.14)\n", "Requirement already satisfied: traitlets>=4.2 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (5.0.5)\n", "Requirement already satisfied: jedi>=0.16 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (0.18.0)\n", "Requirement already satisfied: decorator in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (4.4.2)\n", "Collecting matplotlib-inline\n", " Using cached matplotlib_inline-0.1.2-py3-none-any.whl (8.2 kB)\n", "Requirement already satisfied: pickleshare in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (0.7.5)\n", "Requirement already satisfied: pygments in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from ipython>=7.25.0->tweet-requester) (2.7.4)\n", "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from 
jedi>=0.16->ipython>=7.25.0->tweet-requester) (0.8.1)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from packaging>=14.3->google-cloud-translate>=3.3.1->tweet-requester) (2.4.7)\n", "Requirement already satisfied: ptyprocess>=0.5 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from pexpect>4.3->ipython>=7.25.0->tweet-requester) (0.7.0)\n", "Requirement already satisfied: wcwidth in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython>=7.25.0->tweet-requester) (0.2.5)\n", "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<2.0dev,>=1.25.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (0.4.8)\n", "Requirement already satisfied: idna<3,>=2.5 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (2020.12.5)\n", "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (1.26.3)\n", "Requirement already satisfied: chardet<5,>=3.0.2 in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.18.0->google-api-core[grpc]<3.0.0dev,>=1.26.0->google-cloud-translate>=3.3.1->tweet-requester) (3.0.4)\n", "Requirement already satisfied: ipython-genutils 
in /home/torrien/.virtualenvs/jupyterhub/lib/python3.8/site-packages (from traitlets>=4.2->ipython>=7.25.0->tweet-requester) (0.2.0)\n", "Installing collected packages: matplotlib-inline, ipython, google-cloud-translate, tweet-requester\n", " Attempting uninstall: ipython\n", " Found existing installation: ipython 7.20.0\n", " Uninstalling ipython-7.20.0:\n", " Successfully uninstalled ipython-7.20.0\n", " Attempting uninstall: google-cloud-translate\n", " Found existing installation: google-cloud-translate 3.2.1\n", " Uninstalling google-cloud-translate-3.2.1:\n", " Successfully uninstalled google-cloud-translate-3.2.1\n", "Successfully installed google-cloud-translate-3.3.2 ipython-7.26.0 matplotlib-inline-0.1.2 tweet-requester-0.0.1\n", "\u001b[33mWARNING: You are using pip version 21.2.1; however, version 21.2.2 is available.\n", "You should consider upgrading via the '/home/torrien/.virtualenvs/jupyterhub/bin/python -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "# Install a pip package in the current Jupyter kernel\n", "import sys\n", "!{sys.executable} -m pip install tweet-requester" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }