{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "18cce9c7-72d8-4b79-b7a6-33952df71d25",
   "metadata": {},
   "source": [
    "# Update Database Structure\n",
    "\n",
    "Changes to the data that is stored, and to its format, affect how the information is processed and stored. Update methods were created to change the existing storage accordingly.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e69e1dc0-e361-4626-8bb8-58e31d5c8374",
   "metadata": {},
   "outputs": [],
   "source": [
    "import ipywidgets as widgets\n",
    "# Import the display helpers from `IPython.display`; the `IPython.core.display`\n",
    "# location for them is deprecated.\n",
    "from IPython.display import display, HTML, update_display\n",
    "import json, os, pickle\n",
    "from random import seed, randint\n",
    "from tweet_requester.analysis import TweetAnalyzer\n",
    "from tweet_requester.display import (\n",
    "    TweetInteractiveClassifier, JsonLInteractiveClassifier, TSess,\n",
    "    prepare_google_credentials, PROCESSING_STAGES, logging,\n",
    ")\n",
    "from twitter_secrets import C_BEARER_TOKEN\n",
    "\n",
    "JL_DATA=\"./tweetsRickyRenuncia-final.jsonl\"\n",
    "BASE_DIR=\"./Evaluating Content\"\n",
    "\n",
    "# Update database\n",
    "# On April 30, 2021 the RR team rehydrated their data with twarc.\n",
    "# Unix timestamp in seconds: 2021-04-30 04:00:00 UTC (midnight at UTC-4).\n",
    "april302021 = 1619755200.0\n",
    "# Commit hash handed to the update methods as `git_commit`.\n",
    "# git_commit=\"9219b7a01ce28f5bc0d61c913b3f914f967614fd\"\n",
    "git_commit=\"2ac78595cceef98a56c518c24f2187360e1527e3\"\n",
    "\n",
    "# Session object passed to the classifier as `session`.\n",
    "tweet_session = TSess(\n",
    "    C_BEARER_TOKEN,\n",
    "    compression_level=5,\n",
    "    sleep_time=3,\n",
    "    cache_dir=\"./tweet_cache/\",\n",
    "    hash_split=True\n",
    ")\n",
    "# Credentials object passed to the classifier as `google_credentials`.\n",
    "google_credentials = prepare_google_credentials(\n",
    "    credentials_file=\"./google_translate_keys.json\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "68b8894e-acb9-4dbd-959e-e08077ea00fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the classifier on top of the tweet id list and the SQLite file.\n",
    "classifier = JsonLInteractiveClassifier(\n",
    "    tweet_ids_file=\"tweetsRickyRenuncia-final.txt\",\n",
    "    session=tweet_session, mute=True,\n",
    "    google_credentials=google_credentials,\n",
    "    pre_initialized=True, sqlite_db=\"tweets.db\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "656b2e20-7462-484a-9623-caafdfaf573c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close the classifier before the update methods are run.\n",
    "# NOTE(review): presumably the update methods manage their own database\n",
    "# connection -- confirm against tweet_requester.\n",
    "classifier.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "338b9ea0-9b85-4a9f-a5b1-c9970b132e45",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:Database version is 0.3 >= 0.2. Skipping update.\n",
      "WARNING:root:Database version is greater than expected 0.3 > 0.2. This update does not apply.\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "# Show WARNING and above so skipped updates are reported in the cell output.\n",
    "logging.basicConfig(level=logging.WARNING)\n",
    "# Run the update methods in version order (v0.1 -> v0.2 -> v0.3 -> v0.4).\n",
    "classifier.update_database_v01_v02(dateCreated=april302021, git_commit=git_commit)\n",
    "classifier.update_database_v02_v03(git_commit=git_commit)\n",
    "classifier.update_database_v03_v04(git_commit=git_commit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8d5eb91e-7142-4cca-9971-55e406cef2ee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " PROCESSING_STAGE | COUNT \n",
      "------------------------- | --------\n",
      " UNPROCESSED | 493031 \n",
      " REVIEWING | 373 \n",
      " FINALIZED | 68 \n",
      " UNAVAILABLE_EMBEDING | 1328 \n",
      " RETWEET | 2814 \n",
      " PREPROCESSED | 2714 \n",
      "\n",
      "\n",
      "Sample: \n",
      "\t ('1002186716046864386', 6)\n",
      "\t ('1102716035176775681', 6)\n",
      "\t ('1138785914757533696', 6)\n",
      "\t ('1148321742697504769', 6)\n",
      "\t ('1149490876592218113', 6)\n"
     ]
    }
   ],
   "source": [
    "# Summarise the `tweet` table: row count per processing stage, followed by a\n",
    "# few rows that are in the PREPROCESSED stage.\n",
    "SAMPLE_SIZE = 5  # number of sample rows printed\n",
    "ROW_FORMAT = \"{:>25} | {:<8}\"\n",
    "\n",
    "classifier.connect()\n",
    "cur = classifier.cursor()\n",
    "try:\n",
    "    cur.execute(\"\"\"\n",
    "SELECT state, count(*) from tweet\n",
    "GROUP BY state ORDER BY state;\"\"\")\n",
    "    rows = cur.fetchall()\n",
    "    print(ROW_FORMAT.format(\"PROCESSING_STAGE\", \"COUNT\"))\n",
    "    print(ROW_FORMAT.format(\"-\"*25, \"-\"*8))\n",
    "    for row in rows:\n",
    "        # `state` is stored as the numeric value of a PROCESSING_STAGES member.\n",
    "        print(ROW_FORMAT.format(PROCESSING_STAGES(row[0]).name, row[1]))\n",
    "\n",
    "    # NOTE(review): the IN-subquery looks redundant if tweet_id is unique in\n",
    "    # `tweet` -- confirm before simplifying to `WHERE state = ?`.\n",
    "    cur.execute(\"\"\"\n",
    "SELECT * from tweet\n",
    "WHERE tweet_id in (\n",
    "SELECT tweet_id FROM tweet\n",
    "WHERE state in (?));\"\"\",\n",
    "    (PROCESSING_STAGES.PREPROCESSED.value,))\n",
    "    rows_sample = cur.fetchall()\n",
    "finally:\n",
    "    # Release the cursor even when one of the queries raises.\n",
    "    cur.close()\n",
    "\n",
    "print(\"\\n\\nSample: \")\n",
    "# `rows_sample` keeps every matching row; only the first few are printed.\n",
    "for row in rows_sample[:SAMPLE_SIZE]:\n",
    "    print(\"\\t\",row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b154777-39a3-4776-966c-6f13d62f0b19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "“Lárgate para el carajo pa’ la China o pa’ el Japón” 😂😂😂#RickyRenuncia pic.twitter.com/anixxn5PUO
— JunyPR3 (@herreraevanol) July 15, 2019
Translation to 'en':
"Get off to hell for China or for Japan" 😂😂😂 #RickyRenuncia https://t.co/anixxn5PUO\n",
"
\n", "\n", "#RickyRenuncia TRENDING a nivel MUNDIAL. ¡No te quites boricua!🇵🇷✊🏼 pic.twitter.com/9oZopQC89y
— Fulanita (@YanieYampier) July 15, 2019
Translation to 'en':
#RickyRenuncia TRENDING WORLDWIDE. Don't take off boricua! 🇵🇷✊🏼 https://t.co/9oZopQC89y\n",
"
\n", "\n", "#RickyRenuncia #PUTA
— PJ Sin Suela (@pjsinsuela) July 15, 2019
Canción completa: https://t.co/iqqykDyibz pic.twitter.com/U1mtZeNwzp
Translation to 'en':
#RickyRenuncia #PUTA Full song: https://t.co/iqqykDyibz https://t.co/U1mtZeNwzp\n",
"
\n", "\n", "MAÑANA todoooosss para SJ!!!! Y si no puedes ir explota tus redes con #RickyRenuncia, no se queden callados. Esto es cuestión de todos poner un granito de arena, dejen las excusas y APOYEN. pic.twitter.com/S9Jtje2KO4
— Ro (@rfxvi) July 16, 2019
Translation to 'en':
TOMORROW todoooosss for SJ !!!! And if you can't go exploit your networks with #RickyRenuncia , do not be silent. This is a matter for everyone to put a grain of sand, leave the excuses and SUPPORT. https://t.co/S9Jtje2KO4\n",
"
\n", "\n", "Las paredes ya están pintadas, el país sigue jodío y Ricky no ha renunciado #RickyRenuncia pic.twitter.com/34VfA3qQ6Z
— Edgo (@edgo787) July 16, 2019
Translation to 'en':
The walls are already painted, the country is still screwed and Ricky has not given up #RickyRenuncia https://t.co/34VfA3qQ6Z\n",
"
\n", "\n", "#RickyRenuncia https://t.co/hwPQSAbaCT
— Davinchi Almodovar (@eldavinchi) July 16, 2019
Translation to 'en':
#RickyRenuncia https://t.co/hwPQSAbaCT\n",
"
\n", "\n", "I had 8 questions for the embattled Governor of Puerto Rico. Here’s how he responded. pic.twitter.com/7OwX5Ja5B8
— David Begnaud (@DavidBegnaud) July 16, 2019
Translation to 'en':
I had 8 questions for the embattled Governor of Puerto Rico. Here’s how he responded. https://t.co/7OwX5Ja5B8\n",
"
\n", "\n", "El ambiente en Mayagüez ahora mismo. #RickyRenunciaYa #RickyRenuncia pic.twitter.com/NLKDvQ5tzO
— Mulata (@NataliaNicole) July 16, 2019
Translation to 'en':
The atmosphere in Mayagüez right now. #RickyRenunciaYa #RickyRenuncia https://t.co/NLKDvQ5tzO\n",
"
Preprocessed 94
"
],
"text/plain": [
" Preprocessed 250 "
],
"text/plain": [
"