{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# load related library" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# related library\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import geopandas as gpd\n", "\n", "import ipywidgets\n", "from ipywidgets import widgets \n", "from ipywidgets import * \n", "from IPython.display import display,clear_output\n", "\n", "from ipywidgets import Layout\n", "from traitlets import directional_link\n", "\n", "from datetime import datetime\n", "from datetime import date\n", "from dateutil import rrule\n", "\n", "from boto.s3.connection import S3Connection\n", "import requests\n", "from io import BytesIO\n", "# # when I run it in mybinder, it comes up with RuntimeWarning:\n", "# /srv/conda/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", "# return f(*args, **kwds)\n", "# I looked for information online, it said this warning could be ignored safely." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__,quilt" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#ipywidgets.__version__,matplotlib.__version__,pd.__version__,np.__version__,gpd.__version__" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# prepare data for dropdown: continent list and country list\n", "\n", "# get world info\n", "world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))\n", "# get continent list plus \"all\"\n", "continent_list = [\"all\"] + list(set(world['continent']))\n", "#continent_list.append(\"all\")\n", "\n", "continent_country_dict = {}\n", "for continent in continent_list:\n", " continent_country_dict[continent] = [\"all\"] + list(world[world['continent']== continent]['name'])\n", " #continent_country_dict[continent].append(\"all\")\n", " \n", "#print (\"continent list for dropdown:\")\n", "#print (continent_list)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# load dataset\n", "\n", "# If you want to do it locally, you can download dataset from the links below and load it using pd.read_csv just like comments did.\n", "# https://www.dropbox.com/sh/mt7by5f1wgl6n3z/AACddwkFPq5lPpH3ry83MgSDa?dl=0" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Load local dataset.\n", "a_people = pd.read_csv(\"src/article_people_score_date_geo.csv\")\n", "t_people = pd.read_csv(\"src/talk_people_score_date_geo.csv\")\n", "a_events = pd.read_csv(\"src/article_events_score_date_geo.csv\")\n", "t_events = pd.read_csv(\"src/talk_events_score_date_geo.csv\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "complete_df = {}\n", "complete_df['Articles'] = {}\n", "complete_df['Talks'] = {}\n", "complete_df['Articles']['People'] = a_people\n", "complete_df['Articles']['Events'] = a_events\n", "complete_df['Talks']['People'] = t_people\n", "complete_df['Talks']['Events'] = t_events" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "# side functions\n", "# Please run them before ploting" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def exclude_BC(df):\n", " bc_list = [one for one in df['date'] if one.startswith(\"-\")]\n", " df_withoutBC = df[~df['date'].isin(bc_list)]\n", " #print (\"There are %d dates before Christ,%d entities left after excluding\"%(len(bc_list),len(df_withoutBC)))\n", " return df_withoutBC" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def add_datetime_column(df,unit):\n", " # unit: date,month,year\n", " datetime_format = \"\"\n", " if unit == \"date\":\n", " datetime_format = \"%Y-%m-%d\"\n", " elif unit == \"month\":\n", " datetime_format = \"%Y-%m\"\n", " elif unit == \"year\":\n", " datetime_format = \"%Y\"\n", " new_list = [datetime.strptime(one,datetime_format) for one in df[unit]]\n", " if unit == \"date\":\n", " df = df.assign(datetime_date=pd.Series(new_list).values)\n", " elif unit == \"month\":\n", " df = df.assign(datetime_month=pd.Series(new_list).values)\n", " elif unit == \"year\":\n", " df = df.assign(datetime_year=pd.Series(new_list).values)\n", " return df" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def add_month_column(df):\n", " new_list = [one[:7] for one in df['date']]\n", " df = df.assign(month=pd.Series(new_list).values)\n", " return df" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def add_year_column(df):\n", " new_list = [one[:4] for one in df['date']]\n", " df = df.assign(year=pd.Series(new_list).values)\n", " return df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def score_median_group_by_column(df,tcn,scn):\n", " #print (df)\n", " out_df = df.groupby(tcn).agg({scn:np.median}).reset_index()\n", " #print (df)\n", " return out_df\n", "# df_month_score=score_median_group_by_column(monthSenti,'month','score')\n", "# print (df_month_score.head())" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def score_percentile_group_by_column(df,tcn,scn,percentile, lex):\n", " #print (df)\n", " \n", " # Debug:\n", " per_str = '{}%'.format(int(percentile*100))\n", " \n", " df_describe = df.groupby(tcn).describe()\n", " df_pos = df_describe[f'pos_score_{lex}'][[per_str]].rename(columns={per_str:f'pos_score_{lex}'})\n", " df_neg = df_describe[f'neg_score_{lex}'][[per_str]].rename(columns={per_str:f'neg_score_{lex}'})\n", " df_len = df_describe['length'][[per_str]].rename(columns={per_str:'length'})\n", " df_tot = df_describe['total'][[per_str]].rename(columns={per_str:'total'})\n", " out_df = pd.concat([df_pos, df_neg, df_len, df_tot], axis=1).reset_index()\n", " #out_df = df.groupby(tcn).quantile(percentile).reset_index()\n", " #print (tcn)\n", " #print (out_df)\n", " return out_df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# plot function\n", "def prepare_flowplot_percentiles(lexicon,sentiment,group,domain,geo,times,ra,window,unit,min_length):\n", " # lexicon,sentiment,group,domain,geo,time,ra,unit\n", " # lexicon: OL, MPQA, LIWC, ANEW\n", " # sentiment: pos, neg, total\n", " # group: [group_people,group_events,group_others] boolean value\n", " # domain: Articles, Talks\n", " # geo: [continent,country] continent,country could be 'all'\n", " # times: [start_year,start_month,end_year,end_month] int\n", " # unit: 'year' or 'month'\n", " # output_text\n", " statistics = []\n", " dict_a_people = {\"total\":\"1,146,257\",\"date\":\"775,664\",\"BC\":\"27\"}\n", " dict_a_events = {\"total\":\"54,071\",\"date\":\"22,582\",\"BC\":\"33\"}\n", " dict_t_people = {\"total\":\"415,124\",\"date\":\"289,108\",\"BC\":\"26\"}\n", " dict_t_events = {\"total\":\"21,621\",\"date\":\"10,283\",\"BC\":\"27\"}\n", " statistics_dict = {\"Articles\":{\"People\":dict_a_people,\"Events\":dict_a_events},\"Talks\":{\"People\":dict_t_people,\"Events\":dict_t_events}}\n", " birth_occurrance = {\"People\":\"birth\",\"Events\":\"occurrance\"}\n", " \n", " # prepare showed dates depending on time range\n", " # get time range\n", " start_yy,start_mm,end_yy,end_mm = times\n", " start_date = date(start_yy,start_mm,1)\n", " end_date = date(end_yy,end_mm,1)\n", " # prepare x-axis\n", " xaxis_value = [day for day in rrule.rrule(rrule.MONTHLY,dtstart=start_date,until=end_date)]\n", " #print (\"value for x-axis looks like this: \\n%s \\nlen(x-axis): %d\"%(xaxis_value[-1],len(xaxis_value)))\n", " \n", " # prepare dataframe to display\n", " df = pd.DataFrame()\n", " \n", " #group_people,group_events,group_others = group\n", " \n", " # choose corresponding dataframe depend on domain and group\n", " for one in group:\n", " if one.value:\n", " # add statistics for total entities for this group\n", " statistics.append(\"There are totally %s %s entities in Wikipedia %s.\"%(statistics_dict[domain][one.description]['total'],one.description,domain))\n", " statistics.append(\"Among them, %s have %s date information. Inside them we exclude %s entities whose date before Christ (BC).\"%\n", " (statistics_dict[domain][one.description]['date'],birth_occurrance[one.description],statistics_dict[domain][one.description]['BC']))\n", " if df.empty:\n", " df = complete_df[domain][one.description][['pos_score_'+lexicon,'neg_score_'+lexicon,'date','country','continent','length']]\n", " else:\n", " df = pd.concat([df,complete_df[domain][one.description][['pos_score_'+lexicon,'neg_score_'+lexicon,'date','country','continent','length']]])\n", "\n", " if df.empty:\n", " statistics.append(\"Please select value for Group.\")\n", " return False,False,False,False,False,False,False,statistics,False,False\n", " statistics.append(\"After all, there are {:,d} entities with AC date information in this run.\".format(len(df)))\n", " \n", " # filter length\n", " df = df[df['length']>=min_length]\n", " #print (len(df))\n", " statistics.append(\"After filtering with the length of the document, there are {:,d} entities left.\".format(len(df)))\n", " \n", " # filter out entities based on geo\n", " continent,country = geo\n", " # if continent == 'all', do nothing\n", " if not continent == 'all':\n", " if country == 'all':\n", " # filter out entities with target continent\n", " df = df[df['continent']==continent]\n", " else:\n", " if country == 'United States of America':\n", " country = 'United States'\n", " # filter out entities with target country\n", " df = df[df['country']==country]\n", " statistics.append(\"After filtering with area, there are {:,d} entities left.\".format(len(df)))\n", " \n", " df_test = df.copy()\n", " if len(df)==0:\n", " statistics.append(\"No entities fit the requirements, please change the settings.\")\n", " return False,False,False,False,False,False,False,statistics,False,False\n", " \n", " # add target unit and corresponding datetime type for entities\n", " if unit == \"month\":\n", " df = add_month_column(df)\n", " elif unit == \"year\":\n", " df = add_year_column(df)\n", "\n", " \n", " # we need datetime type because we use it to filter out entities within time range\n", " df = add_datetime_column(df,unit)\n", " \n", " # filter out entities based on time range\n", " df = df[df[\"datetime_\"+unit].isin(xaxis_value)]\n", " statistics.append(\"After filtering with date, there are {:,d} entities left, and collected in the plot.\".format(len(df)))\n", " \n", " if len(df)==0:\n", " statistics.append(\"No entities fit the requirements, please change the settings.\")\n", " return False,False,False,False,False,False,False,statistics,False,False\n", " \n", " # get score depending on lexicon and sentiment\n", " df = df.assign(total=df[\"pos_score_\"+lexicon]+df[\"neg_score_\"+lexicon])\n", " if sentiment == \"total\": \n", " score_column = \"total\"\n", " else:\n", " score_column = sentiment+\"_score_\"+lexicon\n", " \n", "\n", " # plot flowplot \n", " # get median\n", " df_median = score_median_group_by_column(df,unit,score_column)\n", " df_median = add_datetime_column(df_median,unit)\n", " df_25percentile = score_percentile_group_by_column(df,'datetime_'+unit,score_column,0.25, lexicon)\n", " df_75percentile = score_percentile_group_by_column(df,'datetime_'+unit,score_column,0.75, lexicon)\n", " #print (\"here\")\n", " #print (df_25percentile)\n", " \n", " # filling blank\n", " fill_temp = pd.DataFrame(xaxis_value,columns=['datetime_'+unit])\n", " fill_df_median = fill_temp.merge(df_median,how='left')\n", " fill_df_median.loc[fill_df_median[score_column].isnull(),score_column]=0\n", " \n", " # Debug\n", " fill_df_25percentile = fill_temp.merge(df_25percentile,how='left', left_on='datetime_month', right_on='datetime_month')\n", " fill_df_25percentile.loc[fill_df_25percentile[score_column].isnull(),score_column]=0\n", " \n", " # Debug\n", " fill_df_75percentile = fill_temp.merge(df_75percentile,how='left', left_on='datetime_month', right_on='datetime_month')\n", " fill_df_75percentile.loc[fill_df_75percentile[score_column].isnull(),score_column]=0\n", " \n", " if ra == True:\n", " # get rolling average\n", "# df_median[\"ra\"] = df_median[score_column].rolling(window,center=True).mean()\n", "# df_25percentile[\"ra\"] = df_25percentile[score_column].rolling(window,center=True).mean()\n", "# df_75percentile[\"ra\"] = df_75percentile[score_column].rolling(window,center=True).mean() \n", " fill_df_median[\"ra\"] = fill_df_median[score_column].rolling(window,center=True).mean()\n", " fill_df_25percentile[\"ra\"] = fill_df_25percentile[score_column].rolling(window,center=True).mean()\n", " fill_df_75percentile[\"ra\"] = fill_df_75percentile[score_column].rolling(window,center=True).mean()\n", " \n", " \n", " # plot flowplot\n", " #ax.plot(df_median['datetime_'+unit],df_median[score_column],'r-')\n", " temp_x = fill_df_median['datetime_'+unit]\n", " temp_x = temp_x.values\n", " #temp_x = pd.Series.values(temp_x)\n", " #ax.fill_between(temp_x,df_25percentile[score_column],df_75percentile[score_column],color='b',alpha=0.2)\n", " \n", " # Debug\n", " df_time_size = df.groupby('datetime_'+unit).size().reset_index(name='size')\n", " fill_df_time_size = fill_temp.merge(df_time_size,how='left').fillna(0)\n", " #print (df_time_size)\n", " \n", " return True, fill_df_median,fill_df_25percentile,fill_df_75percentile,score_column,temp_x,fill_df_time_size,statistics,start_date,end_date\n", "\n", "# # set ax\n", "# myfontsize = 10\n", "# ax.set_title(\"median,percentile of \"+sentiment+\" score for \"+group+\" based on \"+lexicon,fontsize=myfontsize)\n", "# ax.set_xlabel('time')\n", "# ax.set_ylabel('score')\n", "# ax.set_xlim([start_date,end_date])\n", "# [[item.set_color('b') for item in bp_dict[key]['boxes']] for key in bp.keys()]\n", "# [[item.set_color('b') for item in bp_dict[key]['whiskers']] for key in bp.keys()]\n", "# [[item.set_color('r') for item in bp_dict[key]['medians']] for key in bp.keys()]\n", " \n", "# for tick in bp_axes.get_xticklabels():\n", "# tick.set_rotation(90)\n", " #plt.show()\n", "#prepare_flowplot_percentiles(\"OL\",\"total\",\"People\",domain,geo,times,ra,window,unit) \n", "#flowplot_percentiles(\"OL\",\"total\",\"People\",[\"Europe\",\"all\"],\"Articles\",\"month\",[1940,1,1],[1960,1,1])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# widget 1: median of sentiment score for Wikipedia concepts over time grouped by month" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# define actions while click update botton\n", "def on_button_clicked(b):\n", " # prepare and filter dataset\n", " # get parameters from input\n", " # get geo\n", " continent = dropdown_continent.value\n", " country = dropdown_country.value\n", " geo = [continent,country]\n", " # get time\n", " time_start_year = dropdown_start_year.value\n", " time_start_month = dropdown_start_month.value\n", " time_end_year = dropdown_end_year.value\n", " time_end_month = dropdown_end_month.value\n", " time = [time_start_year,time_start_month,time_end_year,time_end_month]\n", " # get lexicon\n", " lexicon = radio_button_lexicon.value\n", " # get sentiment\n", " sentiment = sen_dict[radio_button_sentiment.value]\n", " # get domain\n", " domain = radio_button_domain.value\n", " # get group\n", " # group_people,group_events,group_others = cb_container.children\n", " group = cb_container.children\n", " # get rolling average\n", " ra = checkbox_ra.value\n", " window = dropdown_ra.value\n", " \n", " # get minimum length \n", " min_length = dropdown_length.value\n", " \n", " # define time unit to month\n", " unit = \"month\"\n", " \n", " # output text\n", " #output_label_list = []\n", " output_label_str_container = widgets.VBox(layout=Layout(width='100%',border='solid 1px'))\n", " \n", " exist, df50,df25,df75,score_column,percentile_x,df_time_size,output_label_list,s_date,e_date =prepare_flowplot_percentiles(lexicon,sentiment,group,domain,geo,time,ra,window,unit,min_length)\n", " \n", " if exist == False:\n", " fig = None\n", " else:\n", " # if rolling average==True, get column 'ra' instead of \n", " if ra == True:\n", " score_column = 'ra'\n", "\n", "\n", "\n", " # draw\n", " # Debug\n", " fig,axes = plt.subplots(2,1,sharex='row', figsize=(15,8))\n", " ax0,ax1 = axes.flatten()\n", " ax = [ax0,ax1]\n", " time_size_loc = np.arange(len(df_time_size['datetime_'+unit].tolist()))\n", " ax[0].bar(time_size_loc,df_time_size['size'],width=0.8, align='edge')\n", " df50.plot(x='datetime_'+unit, y=score_column, color='r', ax=ax[1])\n", " ax[1].fill_between(percentile_x,df25[score_column],df75[score_column],color='b',alpha=0.2)\n", "\n", " ax[0].set_xlim(0,time_size_loc.shape[0])\n", " ax[0].get_xaxis().set_visible(False)\n", " ax[1].get_legend().remove()\n", "\n", " # add label for plots\n", " plot_title_dict = {\"ra\":\"rolling average of median score\"}\n", " ax[0].set_title(\"Number of entities from %s to %s matching the current setting (grouped by month)\"%(s_date.strftime(\"%B %Y\"),e_date.strftime(\"%B %Y\")))\n", " ax[0].set_ylabel(\"number of entities\")\n", " ax[1].set_title(\"Corresponding %s for each month\"%(plot_title_dict.get(score_column,\"median score\")))\n", " ax[1].set_ylabel(\"score\")\n", " ax[1].set_xlabel(\"time\")\n", " \n", " #output_label_list.append(\"Totally %d entities are collected in the plot.\"%(sum(df_time_size['size'].tolist())))\n", " output_labels = [widgets.HTML(value=i) for i in output_label_list]\n", " output_label_str_container.children = [i for i in output_labels]\n", " with box_out:\n", " clear_output(wait=True)\n", " display(fig)\n", " with text_out:\n", " clear_output(wait=True)\n", " display(output_label_str_container)\n", " if fig:\n", " plt.close(fig)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5381d8d5ad7945019f9494ab3e52313c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(VBox(children=(HTML(value='

WikiSentiFlow


\\n

Introduction\\n