{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2\n", "%store -r page" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "page_name = 'The_Camp_of_the_Saints'\n", "from external.wikipedia import WikipediaDV, WikipediaAPI\n", "wikipedia_dv = WikipediaDV(WikipediaAPI(domain='en.wikipedia.org'))\n", "page = wikipedia_dv.get_page(page_name)\n", "page.to_frame('value')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.4 Editions per page with Wikiwho" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWho\n", "import pandas as pd\n", "wikiwho = WikiWho(lng='en')\n", "editions = wikiwho.dv.editions(page.page_id)\n", "editions.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = editions\n", "df['year_month'] = pd.to_datetime(df['year_month'])\n", "\n", "# Group the data by year month and page (drop the editor information)\n", "df.drop('editor_id', axis=1).groupby(['year_month','page_id']).sum()\n", "\n", "# add columns with the total actions\n", "df = df.join(pd.DataFrame(\n", " df.loc[:,'adds':'adds_stopword_count'].values +\\\n", " df.loc[:,'dels':'dels_stopword_count'].values +\\\n", " df.loc[:,'reins':'reins_stopword_count'].values, \n", " index=df.index, \n", " columns=['actions', \n", " 'actions_surv_48h', \n", " 'actions_persistent', \n", " 'actions_stopword_count']\n", "))\n", "\n", "# Visualization\n", "from visualization.editions_listener import DFListener\n", "from ipywidgets import interact\n", "listener = DFListener(df)\n", "# interact(listener.editions_per_month, \n", "# begin=df.year_month,\n", "# end=df.year_month.sort_values(ascending=False),\n", "# actions=['All Actions', 
'Additions', 'Reinsertions', 'Deletions'])\n", "actions = df.loc[:,'actions':'actions_stopword_count'].columns.append(\n", " df.loc[:,'adds':'reins_stopword_count'].columns)\n", "interact(listener.editions_per_month, \n", " begin=df.year_month,\n", " end=df.year_month.sort_values(ascending=False),\n", " granularity=['Yearly', 'Monthly'],\n", " first_action=actions,\n", " second_action=actions)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conflict Score of Editors of a Page" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "editors = editions['editor_id'].unique()\n", "editors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV\n", "from metrics.Conflict_Score import conflictScore_token_list\n", "\n", "api = WikiWhoAPI(lng='en')\n", "wikiwho_dv = WikiWhoDV(api)\n", "revisions = wikiwho_dv.rev_ids_of_article(page.page_id)\n", "tokens = wikiwho_dv.all_content(page.page_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from metrics.Conflict_Score import conflictScore_token_list\n", "conflictScore_token_list(tokens, 162969, revisions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']\n", "tokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tokens[tokens['o_editor']=='25450560']\n", "\n", "rev_ins = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={'rev_time': 'in_rev_time',\n", " 'rev_id': 'in',\n", " 'o_editor': 'in_editor'}\n", ")\n", "\n", "rev_outs = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={\n", " 
'rev_time': 'out_rev_time', \n", " 'rev_id': 'out',\n", " 'o_editor': 'out_editor'\n", " }\n", ")\n", "\n", "fulltokens = pd.merge(tokens, rev_ins, how='left', on='in')\n", "fulltokens = pd.merge(fulltokens, rev_outs, how='left', on='out')\n", "\n", "fulltokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.set_index('token_id')\n", "\n", "possible_conflicts = tokens.groupby('token_id').size() > 4\n", "possible_conflicts[possible_conflicts].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tmp = tokens[tokens['token_id'].isin(possible_conflicts[possible_conflicts].index)]\n", "\n", "tmp.groupby('token_id').shift(1)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for name, tgroup in tmp.groupby('token_id'):\n", " pass\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tgroup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tgroup.shift(-1).reset_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rgrouprn = tgroup.rename(\n", " columns={'in': 'rev_id_in', 'out':'rev_id_out'}).reset_index()\n", "rgrouprn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.wide_to_long(rgrouprn,\n", " 'rev_id', \n", " 'index',\n", " 'test', sep='_', suffix='.+')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWhoAPI, DataView as WikiWhoDV\n", "from metrics.Conflict_Score import conflictScore_token_list\n", "\n", "api = WikiWhoAPI(lng='en')\n", "wikiwho_dv = WikiWhoDV(api)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = wikiwho_dv.rev_ids_of_article('Evolution')" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = revisions.rename(columns={'o_editor': 'editor'})\n", "revisions['Evolution'] = pd.to_datetime(revisions['rev_time'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#revisions['rev_time__'] = pd.to_datetime(dups_sorted['rev_time'],format='%Y-%m-%dT%H:%M:%SZ')\n", "\n", "#revisions['rev_time__'] = dups_sorted['rev_time'].str.,format='%Y-%m-%dT%H:%M:%SZ')\n", "\n", "\n", "\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#rev_id = revisions[revisions['rev_time'] < pd.Timestamp(2016,11,1)].sort_values(\n", "# 'rev_time', ascending=False).iloc[0,:]['rev_id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spec_revision = wikiwho.dv.specific_rev_content_by_article_title(the_page, rev_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spec_tokens = pd.merge(\n", " spec_revision.drop(columns=['rev_id', 'rev_time', 'rev_editor']),\n", " tokens[['token_id', 'in', 'out']], \n", " how='left', on='token_id')\n", "spec_tokens.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tokens_alt = tokens.copy()\n", "#tokens_alt.loc[tokens_alt['in'] == -1,'in'] = tokens_alt.loc[tokens_alt['in'] == -1,'o_rev_id']\n", "#tokens_alt.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Starts here" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from wikiwho_wrapper import WikiWho\n", "the_page = 'Chicago' # \n", "#the_page = page.page_id\n", "wikiwho = WikiWho(lng='en')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "revisions = wikiwho.dv.rev_ids_of_article(the_page)" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "revisions = revisions.rename(columns={'o_editor': 'editor'})\n", "revisions['rev_time'] = pd.to_datetime(revisions['rev_time'])\n", "revisions.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens = wikiwho.dv.all_content(the_page)\n", "tokens.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tokens.loc[tokens['in'] == -1,'in'] = tokens.loc[tokens['in'] == -1,'o_rev_id']\n", "the_tokens = tokens #spec_tokens\n", "dups = the_tokens[the_tokens.duplicated(subset=['token_id'], keep=False)]\n", "#dups = the_tokens#[the_tokens['in'] != -1]\n", "dups.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stop_words = open('data/stopword_list.txt', 'r').read().split()\n", "dups = dups[~dups['token'].isin(stop_words)]\n", "dups.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_long = pd.wide_to_long(\n", " dups.rename(columns={'in': 'rev_id_in',\n", " 'out':'rev_id_out'}).reset_index(),\n", " 'rev_id','index','action', sep='_', suffix='.+'\n", ").reset_index().drop(columns='index').sort_values('token_id')\n", "dups_long.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_merged = pd.merge(dups_long, revisions[['rev_time', 'rev_id', 'editor']], \n", " how='left', on='rev_id')\n", "dups_sorted = dups_merged.sort_values(['token_id', 'rev_time'])\n", "\n", "\n", "dups_sorted['time_diff'] = dups_sorted['rev_time'] - dups_sorted.shift(2)['rev_time']\n", "to_remove = ((dups_sorted['o_rev_id'] == dups_sorted['rev_id']) | \n", " (dups_sorted.shift(1)['o_rev_id'] == dups_sorted.shift(1)['rev_id']))\n", "dups_sorted.loc[to_remove,'time_diff'] = np.nan\n", "\n", "# for testing (the bottom line is equivalent to the above 3 but slow)\n", 
"#dups_sorted['time_diff2'] = dups_sorted.groupby('token_id').apply(lambda group: group['rev_time'] - group.shift(2)['rev_time']).values\n", "#(dups_sorted['time_diff'].fillna(-1) == dups_sorted['time_diff2'].fillna(-1)).all()\n", "\n", "dups_sorted.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated = dups_sorted\n", "#dups_dated = dups_sorted[dups_sorted['rev_time'] < pd.Timestamp(2016,10,10)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# removes the last out\n", "#dups_dated = dups_dated[dups_dated['rev_id'] != -1]\n", "#dups_not_minus.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conflicts = ((dups_dated['token_id'] == dups_dated.shift(1)['token_id']) &\n", " (dups_dated['token_id'] == dups_dated.shift(2)['token_id']) &\n", " (dups_dated['editor'] != dups_dated.shift(1)['editor']) &\n", " (dups_dated['editor'] == dups_dated.shift(2)['editor']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated[conflicts].shape\n", "\n", "#dups_dated[dups_dated['token_id'] == 1760]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "c_t = 1 / (\n", " np.log(\n", " dups_dated.loc[conflicts,['token_id','time_diff']].groupby(\n", " 'token_id').sum().astype('timedelta64[s]') + 2\n", " ) / np.log(3600))\n", "\n", "c_t.sum()# / dups_not_minus.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "c_t = np.log(3600) / (\n", " np.log(\n", " dups_dated.loc[conflicts,['token_id','time_diff']].groupby(\n", " 'token_id').sum().astype('timedelta64[s]') + 2\n", " ))\n", "\n", "c_t.sum()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated['conflict'] = 0\n", "\n", 
"dups_dated.loc[conflicts, 'conflict'] = np.log(3600) / np.log(\n", " dups_dated.loc[conflicts,'time_diff'].astype('timedelta64[s]')+2)\n", "\n", "# editor = '25450560'\n", "# actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])\n", "\n", "# dups_dated.loc[conflicts & (dups_dated['editor'] == editor),'conflict'].sum() / actions\n", "\n", "dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated[dups_dated['time_diff'].notnull()])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dups_dated.loc[conflicts, 'conflict'].sum() / len(dups_dated['rev_id'] == dups_dated['o_rev_id'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "confs_ed = dups_dated.loc[conflicts, ['editor', 'conflict']].groupby('editor').sum()\n", "\n", "actions = dups_dated.loc[dups_dated['time_diff'].notnull(),['editor','action']].groupby('editor').count()\n", "\n", "joined = confs_ed.join(actions)\n", "\n", "joined['conflict_score'] = joined['conflict'] / joined['action']\n", "\n", "joined.sort_values(\n", " 'conflict_score', ascending=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "editor = '25450560'\n", "conflictse = conflicts & (dups_dated['editor'] == editor)\n", "actions = len(dups_dated[(dups_dated['editor'] == editor) & dups_dated['time_diff'].notnull()])\n", "\n", "c_t = 1 / (\n", " np.log(\n", " dups_dated.loc[conflictse,['token_id','time_diff']].groupby(\n", " 
'token_id').sum().astype('timedelta64[s]') + 2\n", " ) / np.log(3600))\n", "\n", "c_t.sum() / actions # / dups_not_minus.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", "c_t = dups_dated.loc[conflicts,:].groupby('token_id').size()\n", "\n", "c_t.sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "conflicts = (\n", " (dups_sorted['token_id'] == dups_sorted.shift(1)['token_id']) &\n", " (dups_sorted['token_id'] == dups_sorted.shift(2)['token_id']) &\n", " (dups_sorted['editor'] != dups_sorted.shift(1)['editor']) &\n", " (dups_sorted['editor'] == dups_sorted.shift(2)['editor']))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rev_ins = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={'rev_time': 'in_rev_time',\n", " 'rev_id': 'in',\n", " 'o_editor': 'in_editor'}\n", ")\n", "\n", "rev_outs = revisions[['rev_time', 'rev_id', 'o_editor']].rename(\n", " columns={\n", " 'rev_time': 'out_rev_time', \n", " 'rev_id': 'out',\n", " 'o_editor': 'out_editor'\n", " }\n", ")\n", "\n", "fulltokens = pd.merge(tokens, rev_ins, how='left', on='in')\n", "fulltokens = pd.merge(fulltokens, rev_outs, how='left', on='out')\n", "\n", "fulltokens.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", " return ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor'])).sum()\n", "\n", "res = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x)) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "res" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", " conflicts = ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor']))\n", " \n", " return tkn_group.loc[conflicts, 'time_diff'].sum()\n", "\n", "res2 = dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#from datetime import timedelta\n", "#res2[res2 > timedelta(0)]\n", "\n", "res2\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def counting_token_conflict(tkn_group):\n", "\n", " return ((tkn_group['editor'] == tkn_group.shift(2)['editor']) &\n", " (tkn_group['editor'] != tkn_group.shift(1)['editor'])).sum()\n", "\n", "dups_sorted.groupby('token_id').apply(lambda x: counting_token_conflict(x))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# (dups_sorted['action'] == dups_sorted.shift(1)['action']).sum()\n", "\n", "\n", "\n", "# for tkn, tkn_group in dups_sorted.groupby('token_id'):\n", "# pass\n", "\n", "\n", "dups_sorted[dups_sorted['token_id'] == 7162]" ] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }