{ "cells": [ { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" }, "tags": [] }, "source": [ "# Expanding the measurement of culture with a sample of two billion humans" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" }, "tags": [] }, "source": [ "## Replication Data and Code" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "This notebook provides all the steps to replicate the results of our paper [Expanding the measurement of culture with a sample of two billion humans](https://doi.org/10.1098/rsif.2022.0085) published in the *Journal of the Royal Society Interface 19:20220085* (2022)." ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "## Robustness Analysis\n", "## Analysis by Facebook Categories" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "# Setup and Pre-requisites" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Let's start by importing the required packages" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "slideshow": { "slide_type": "subslide" } }, "outputs": [], "source": [ "#%pylab --no-import-all\n", "%matplotlib inline\n", "\n", "import sys, os, time\n", "import numpy as np\n", "import pandas as pd\n", "pd.set_option('display.width', 160)\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import matplotlib.patches as mpatches\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "from statsmodels.iolib.summary2 import summary_col\n", "from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, manhattan_distances, pairwise_distances\n", "from scipy.stats import zscore\n", "from scipy.cluster.hierarchy import dendrogram, linkage\n", "from scipy import spatial, stats\n", "from scipy.stats import 
zscore\n", "import MantelTest.MantelTest as MantelTest\n", "import re\n", "import seaborn as sns" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "subslide" } }, "source": [ "Let's set up our paths" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Data locations, all relative to the notebook's working directory\n", "pathfb = './data/'\n", "pathfbor = './data/OriginalData/'\n", "pathregs = pathfb + 'Regs/'\n", "# pathfb already ends with '/', so no leading slash here\n", "pathsamples = pathfb + 'DemographicData/'\n", "\n", "# Folder holding the per-category distance files; pathshare is an alias for it\n", "pathout = pathregs + 'Categories/'\n", "pathshare = pathout\n", "\n", "# makedirs (unlike mkdir) also creates missing parent folders such as ./data/Regs/,\n", "# and exist_ok=True makes this cell safe to re-run\n", "os.makedirs(pathout, exist_ok=True)\n", "os.makedirs(pathfbor, exist_ok=True)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "## Import Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge all the category distances and compare them to original FB and other measures" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['BusinessIndustry',\n", " 'Education',\n", " 'Empty',\n", " 'FamilyRelationships',\n", " 'FitnessWellness',\n", " 'FoodDrink',\n", " 'HobbiesActivities',\n", " 'LifestyleCulture',\n", " 'NewsEntertainment',\n", " 'NonLocalBus',\n", " 'People',\n", " 'ShoppingFashion',\n", " 'SportsOutdoors',\n", " 'Technology',\n", " 'TravelPlacesEvents']" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Category names are recovered from the per-category distance files in pathout, which the\n", "# merge cell reads as 'FB' + <measure> + 'Dist_' + <category> + '.dta'. Matching the whole\n", "# file-name pattern keeps unrelated .dta files out of the list, and the set avoids\n", "# duplicate categories when files for several distance measures are present.\n", "dist_file = re.compile(r'FB[A-Za-z0-9]+Dist_([^.]+)[.]dta')\n", "mycats = sorted({hit.group(1) for hit in map(dist_file.fullmatch, os.listdir(pathout)) if hit})\n", "mycats" ] }, 
{ "cell_type": "code", "execution_count": 8, "metadata": { "slideshow": { "slide_type": "skip" } }, "outputs": [], "source": [ "# Type of distance measure\n", "m = 'Cos'\n", "\n", "# Load the other distance measures, keyed by ISO_CODE_1 / ISO_CODE_2\n", "# (re-reading the file here makes the cell safe to re-run)\n", "df = pd.read_stata(pathregs + 'AllDistsFull.dta')\n", "\n", "for c in mycats:\n", "    # Read the category file as a wide table and relabel its first column ISO_CODE_1\n", "    fbdist2 = pd.read_stata(pathout + 'FB' + m + 'Dist_' + c + '.dta')\n", "    cols = fbdist2.columns.tolist()\n", "    cols[0] = 'ISO_CODE_1'\n", "    fbdist2.columns = cols\n", "    # Wide -> long: the remaining column labels become ISO_CODE_2 and the\n", "    # cell values become FBDist_<category>\n", "    fbdist2 = fbdist2.set_index('ISO_CODE_1').stack().reset_index()\n", "    fbdist2.columns = ['ISO_CODE_1', 'ISO_CODE_2', 'FBDist_' + c]\n", "    # Left merge keeps every row already in df; pairs absent from this category get NaN\n", "    df = df.merge(fbdist2, on=['ISO_CODE_1', 'ISO_CODE_2'], how='left')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | ISO_CODE_1 | \n", "ISO_CODE_2 | \n", "CosDist1 | \n", "CosDist2 | \n", "CosDist3 | \n", "CosDist4 | \n", "CosDist5 | \n", "CosDist6 | \n", "CosDist7 | \n", "CosDist8 | \n", "... | \n", "FBDist_FoodDrink | \n", "FBDist_HobbiesActivities | \n", "FBDist_LifestyleCulture | \n", "FBDist_NewsEntertainment | \n", "FBDist_NonLocalBus | \n", "FBDist_People | \n", "FBDist_ShoppingFashion | \n", "FBDist_SportsOutdoors | \n", "FBDist_Technology | \n", "FBDist_TravelPlacesEvents | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "AD | \n", "AD | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "... | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "0.000000 | \n", "
1 | \n", "AD | \n", "AE | \n", "0.649726 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "... | \n", "0.178210 | \n", "0.256747 | \n", "0.186337 | \n", "0.083852 | \n", "0.245167 | \n", "0.733101 | \n", "0.133534 | \n", "0.283705 | \n", "0.099948 | \n", "0.519951 | \n", "
2 | \n", "AD | \n", "AF | \n", "0.027777 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "... | \n", "0.294734 | \n", "0.298884 | \n", "0.272084 | \n", "0.124117 | \n", "0.291968 | \n", "0.832636 | \n", "0.165568 | \n", "0.377505 | \n", "0.117173 | \n", "0.531510 | \n", "
3 | \n", "AD | \n", "AG | \n", "0.298230 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "... | \n", "0.251972 | \n", "0.258350 | \n", "0.154459 | \n", "0.099079 | \n", "0.234904 | \n", "0.680631 | \n", "0.143881 | \n", "0.376967 | \n", "0.085136 | \n", "0.504606 | \n", "
4 | \n", "AD | \n", "AI | \n", "0.917672 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "... | \n", "0.261493 | \n", "0.279948 | \n", "0.195226 | \n", "0.153596 | \n", "0.263495 | \n", "0.651478 | \n", "0.164097 | \n", "0.371870 | \n", "0.081088 | \n", "0.506484 | \n", "
5 rows × 69 columns
\n", "\n", " | ISO_CODE_1 | \n", "ISO_CODE_2 | \n", "CosDist1 | \n", "CosDist2 | \n", "CosDist3 | \n", "CosDist4 | \n", "CosDist5 | \n", "CosDist6 | \n", "CosDist7 | \n", "CosDist8 | \n", "... | \n", "FBDist_FoodDrink | \n", "FBDist_HobbiesActivities | \n", "FBDist_LifestyleCulture | \n", "FBDist_NewsEntertainment | \n", "FBDist_NonLocalBus | \n", "FBDist_People | \n", "FBDist_ShoppingFashion | \n", "FBDist_SportsOutdoors | \n", "FBDist_Technology | \n", "FBDist_TravelPlacesEvents | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "AD | \n", "AE | \n", "0.649726 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "... | \n", "0.178210 | \n", "0.256747 | \n", "0.186337 | \n", "0.083852 | \n", "0.245167 | \n", "0.733101 | \n", "0.133534 | \n", "0.283705 | \n", "0.099948 | \n", "0.519951 | \n", "
1 | \n", "AD | \n", "AF | \n", "0.027777 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "... | \n", "0.294734 | \n", "0.298884 | \n", "0.272084 | \n", "0.124117 | \n", "0.291968 | \n", "0.832636 | \n", "0.165568 | \n", "0.377505 | \n", "0.117173 | \n", "0.531510 | \n", "
2 | \n", "AD | \n", "AG | \n", "0.298230 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "... | \n", "0.251972 | \n", "0.258350 | \n", "0.154459 | \n", "0.099079 | \n", "0.234904 | \n", "0.680631 | \n", "0.143881 | \n", "0.376967 | \n", "0.085136 | \n", "0.504606 | \n", "
3 | \n", "AD | \n", "AI | \n", "0.917672 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "... | \n", "0.261493 | \n", "0.279948 | \n", "0.195226 | \n", "0.153596 | \n", "0.263495 | \n", "0.651478 | \n", "0.164097 | \n", "0.371870 | \n", "0.081088 | \n", "0.506484 | \n", "
4 | \n", "AD | \n", "AL | \n", "0.002674 | \n", "0.998614 | \n", "0.998062 | \n", "1.000000 | \n", "0.999967 | \n", "1.000000 | \n", "1.000000 | \n", "1.000000 | \n", "... | \n", "0.158682 | \n", "0.203463 | \n", "0.211969 | \n", "0.104971 | \n", "0.221430 | \n", "0.672669 | \n", "0.115245 | \n", "0.234657 | \n", "0.088875 | \n", "0.511526 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
27490 | \n", "YT | \n", "ZM | \n", "0.078120 | \n", "0.078460 | \n", "0.078460 | \n", "0.078427 | \n", "0.078427 | \n", "0.078427 | \n", "0.078427 | \n", "0.078427 | \n", "... | \n", "0.314180 | \n", "0.264914 | \n", "0.104575 | \n", "0.053099 | \n", "0.202428 | \n", "0.674473 | \n", "0.142165 | \n", "0.317044 | \n", "0.112804 | \n", "0.425856 | \n", "
27491 | \n", "YT | \n", "ZW | \n", "0.078103 | \n", "0.078497 | \n", "0.078497 | \n", "0.078497 | \n", "0.078497 | \n", "0.078497 | \n", "0.078497 | \n", "0.078497 | \n", "... | \n", "0.342396 | \n", "0.293738 | \n", "0.078400 | \n", "0.051409 | \n", "0.185660 | \n", "0.752413 | \n", "0.130497 | \n", "0.398470 | \n", "0.089990 | \n", "0.391816 | \n", "
27492 | \n", "ZA | \n", "ZM | \n", "0.042589 | \n", "0.040391 | \n", "0.040391 | \n", "0.021715 | \n", "0.021725 | \n", "0.021725 | \n", "0.021725 | \n", "0.021725 | \n", "... | \n", "0.131968 | \n", "0.114329 | \n", "0.076315 | \n", "0.018861 | \n", "0.104667 | \n", "0.293089 | \n", "0.065884 | \n", "0.149984 | \n", "0.051544 | \n", "0.296675 | \n", "
27493 | \n", "ZA | \n", "ZW | \n", "0.042107 | \n", "0.039633 | \n", "0.039633 | \n", "0.021641 | \n", "0.021641 | \n", "0.021641 | \n", "0.021641 | \n", "0.021641 | \n", "... | \n", "0.115236 | \n", "0.112350 | \n", "0.056001 | \n", "0.023694 | \n", "0.100517 | \n", "0.430028 | \n", "0.076627 | \n", "0.208669 | \n", "0.043190 | \n", "0.263977 | \n", "
27494 | \n", "ZM | \n", "ZW | \n", "0.000001 | \n", "0.000004 | \n", "0.000004 | \n", "0.000028 | \n", "0.000028 | \n", "0.000028 | \n", "0.000028 | \n", "0.000028 | \n", "... | \n", "0.141001 | \n", "0.092225 | \n", "0.048161 | \n", "0.022689 | \n", "0.085157 | \n", "0.248501 | \n", "0.055595 | \n", "0.109604 | \n", "0.023119 | \n", "0.239920 | \n", "
27495 rows × 61 columns
\n", "