{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b6535345",
   "metadata": {},
   "outputs": [],
   "source": [
    "# netowrks\n",
    "import networkx as nx\n",
    "import igraph as ig\n",
    "\n",
    "# data processing\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "#some functions to make our lifes easier\n",
    "import sys\n",
    "sys.path.append(\"./\")\n",
    "from common_functions import *\n",
    "\n",
    "# viz\n",
    "import pylab as plt\n",
    "import matplotlib as mpl\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "\n",
    "#Change the default options of visualization (improving the defaults)\n",
    "custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False, \"axes.spines.left\": False, \"axes.spines.bottom\":\n",
    "False,\"lines.linewidth\": 2, \"grid.color\": \"lightgray\", \"legend.frameon\": False, \"xtick.labelcolor\": \"#484848\", \"ytick.labelcolor\":\n",
    "\"#484848\", \"xtick.color\": \"#484848\", \"ytick.color\": \"#484848\",\"text.color\": \"#484848\", \"axes.labelcolor\": \"#484848\",\n",
    "\"axes.titlecolor\":\"#484848\",\"figure.figsize\": [5,3],\n",
    "\"axes.titlelocation\":\"left\",\"xaxis.labellocation\":\"left\",\"yaxis.labellocation\":\"bottom\"}\n",
    "palette = [\"#3d348b\",\"#e6af2e\",\"#191716\",\"#e0e2db\"] #use your favourite colours\n",
    "sns.set_theme(context='paper', style='white', palette=palette, font='Verdana', font_scale=1.3, color_codes=True,\n",
    "rc=custom_params)\n",
    "\n",
    "from IPython.display import display, HTML\n",
    "display(HTML(\"<style>.container { width:90% !important; }</style>\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb6649c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "path_data = \"../Data/\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4767bb68",
   "metadata": {},
   "source": [
    "# Exercise 1: Reading and visualizing graphs\n",
    "\n",
    "Goal: Get used to the `networkx` library"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9a2566d2",
   "metadata": {},
   "source": [
    "## 1.1. Basic plot\n",
    "- Read and understand the following code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17549962",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read an example data on florentine marriage families in the XV century\n",
    "G = nx.florentine_families_graph()\n",
    "\n",
    "# Show the nodes and edges\n",
    "print(\"Nodes: \", G.nodes())\n",
    "print(\"Edges: \", G.edges())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b408279",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize it\n",
    "plt.figure(figsize=(5,4)) #set up dimensions\n",
    "\n",
    "# create layout (once so we can reuse it if needed)\n",
    "pos = nx.spring_layout(G, seed = 1)\n",
    "nx.draw(G, pos = pos, with_labels = True, \n",
    "        edge_color = \"gray\", node_color = \"lightgray\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8178dbf",
   "metadata": {},
   "source": [
    "## 1.2. Read and visualize your network\n",
    "Network here: https://tinyurl.com/network-game\n",
    "(I added the direct CSV link. But you could also download it as a CSV file and move it to the folder of this notebook) \n",
    "\n",
    "Use the `nx.from_pandas_edgelist` function (you need to set up the `source` and `target` parameters)\n",
    "Print number of nodes and edges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4aad5d3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read edgelist (link)\n",
    "df = pd.read_csv(\"https://docs.google.com/spreadsheets/d/e/2PACX-1vSuZC86KjXYKSPr0Nw7mfRha4zwea3aMw-gTKliVcbRt_m3NRiCEyxbH_d5M8MBL0LWayg1WDmnqBET/pub?gid=0&single=true&output=csv\")\n",
    "display(df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09de5f10",
   "metadata": {},
   "outputs": [],
   "source": [
    "#get help\n",
    "nx.from_pandas_edgelist?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c667a9a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to networkx\n",
    "G = nx.from_pandas_edgelist(...)\n",
    "print(len(G.nodes()))\n",
    "print(len(G.edges()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea2cc766",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create figure and plot\n",
    "plt.figure(figsize=(10,8))\n",
    "\n",
    "# create layout (once so we can reuse it if needed)\n",
    "pos = nx.spring_layout(G, seed = 1)\n",
    "nx.draw(G, pos = pos, with_labels = True, node_size=10,\n",
    "        edge_color = \"lightgray\", node_color = \"gray\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1da80e98",
   "metadata": {},
   "source": [
    "## 1.3. Read and visualize the Twitter network\n",
    "\n",
    "We will be using (parts of) this network in the following days.\n",
    "\n",
    "It contains mentions, replies and quotes of user A to user B (but let's treat is as undirected)\n",
    "```\n",
    "replied_to    40255\n",
    "mentioned     10407\n",
    "quoted         3448\n",
    "```\n",
    "\n",
    "Use the nx.from_pandas_edgelist function\n",
    "Print number of nodes and edges"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0170e2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read edgelist (the separator is a TAB)\n",
    "path = f\"{path_data}/ic2s2_netsci_3.tsv\"\n",
    "\n",
    "df = pd.read_csv(...)\n",
    "display(df.head())\n",
    "\n",
    "# Convert to networkx\n",
    "G = nx.from_pandas_edgelist(df)\n",
    "G.remove_edges_from(nx.selfloop_edges(G)) #remove self-edges\n",
    "print(len(G.nodes()))\n",
    "print(len(G.edges()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5251186d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create figure and plot\n",
    "plt.figure(figsize=(20,16))\n",
    "\n",
    "# create layout (once so we can reuse it if needed)\n",
    "pos = nx.spring_layout(G, seed = 1)\n",
    "nx.draw(G, pos = pos, with_labels = False, node_size=10,\n",
    "        edge_color = \"lightgray\", node_color = \"gray\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edf6cafd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# igraph is faster. Here is how to convert to igraph\n",
    "h = ig.Graph().from_networkx(G)\n",
    "# create layout\n",
    "layout = h.layout_fruchterman_reingold()\n",
    "# create positions in the networx format (as a dictionary)\n",
    "pos = dict(zip(G.nodes(), layout.coords))\n",
    "\n",
    "# Plot as before\n",
    "plt.figure(figsize=(20, 16))\n",
    "nx.draw(G, pos = pos, with_labels = False, node_size=10,\n",
    "        edge_color = \"lightgray\", node_color = \"gray\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6b1b5a9",
   "metadata": {},
   "source": [
    "## 1.4. Read and visualize a large(r) network \n",
    "- Read network in f\"{path_data}/wiki-Vote.txt\". Careful, it is a directed network, you need to use the create_using parameter.\n",
    "\n",
    "(use iterations = 30 in the spring_layout)\n",
    "\n",
    "\n",
    "Format of the file:\n",
    "```\n",
    "# Directed graph (each unordered pair of nodes is saved once): Wiki-Vote.txt \n",
    "# Wikipedia voting on promotion to administratorship (till January 2008). Directed edge A->B means user A voted on B becoming Wikipedia administrator.\n",
    "# Nodes: 7115 Edges: 103689\n",
    "# FromNodeId\tToNodeId\n",
    "30\t1412\n",
    "30\t3352\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54c36775",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read directed graph\n",
    "path = f\"{path_data}/wiki-Vote.txt\"\n",
    "G_wiki = nx.read_edgelist(path, create_using=...)\n",
    "print(len(G_wiki.nodes()))\n",
    "print(len(G_wiki.edges()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfcc7e9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create layout (this will take a couple minutes). Networkx is a particularly slow library\n",
    "pos = nx.spring_layout(G_wiki, seed = 1, iterations=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e9a6c51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Nobody wants to see your hairball, but let's plot it anyway\n",
    "plt.figure(figsize=(20, 20))\n",
    "\n",
    "# Plot only nodes (too many lines)\n",
    "nx.draw_networkx_nodes(G_wiki, pos = pos, node_size = 1, node_color = \"k\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2ff6d51",
   "metadata": {},
   "source": [
    "# Exercise 2: Network characteristics\n",
    "\n",
    "Network example: Protein interaction (PPI) in yeast. Nodes = Proteins, Edges = Recorded interactions in experiments\n",
    "\n",
    "__Exercise__: Compare the PPI network with the Twitter network (ic2s2_netsci_3). What characteristics apply to both?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "42e9de76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read PPI network\n",
    "path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n",
    "G = nx.read_graphml(path_network, node_type=int)\n",
    "len(G.nodes()), len(G.edges())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69caf98f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# igraph is faster. Here is how to convert to igraph\n",
    "h = ig.Graph().from_networkx(G)\n",
    "# create layout\n",
    "layout = h.layout_fruchterman_reingold()\n",
    "# create positions in the networx format (as a dictionary)\n",
    "pos = dict(zip(G.nodes(), layout.coords))\n",
    "\n",
    "# To use networkx (slow), uncomment the line below\n",
    "#pos = nx.spring_layout(G, seed = 1)\n",
    "\n",
    "# Plot as before\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "#This function is defined in common_functions.py file. It adds automatic coloring and nicer defaults\n",
    "plot_network(G, a0 = ax, pos = pos, with_labels=False) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7204d45",
   "metadata": {},
   "source": [
    "## 2.1 Connectedness"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e57cfb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of connected components\n",
    "nx.components.number_connected_components(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c088402",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Size of the connected components (first 20 sorted)\n",
    "sorted([len(_) for _ in nx.connected_components(G)])[:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac07c20c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n",
    "largest_cc = max(nx.connected_components(G), key=len)\n",
    "\n",
    "# Create a subgraph of G consisting only of nodes in largest_cc\n",
    "G = G.subgraph(largest_cc).copy()\n",
    "len(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de92ae88",
   "metadata": {},
   "source": [
    "## 2.2 Small diameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1392dae1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Diameter\n",
    "nx.diameter(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "827778dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average shortest path\n",
    "nx.average_shortest_path_length(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f28e551b",
   "metadata": {},
   "source": [
    "## 2.3 Density"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eba3dd49",
   "metadata": {},
   "outputs": [],
   "source": [
    "nx.density(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7cbd38f",
   "metadata": {},
   "source": [
    "## 2.4 Transitivity and average clustering "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98a6dc93",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average at the network level (# triangles / possible number of triangles)\n",
    "print(nx.transitivity(G))\n",
    "\n",
    "# Average at the node level = Average(# triangles_i / possible triangles_i )\n",
    "print(nx.average_clustering(G))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e9ba007",
   "metadata": {},
   "source": [
    "## 2.5 Assortativity and homophily\n",
    "\n",
    "What does it mean to have a negative/positive degree assortativity?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fa3b399",
   "metadata": {},
   "outputs": [],
   "source": [
    "nx.assortativity.degree_assortativity_coefficient(G)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5b01ea4",
   "metadata": {},
   "source": [
    "## 2.6 Heavy tails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d396865",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the degree of each node\n",
    "degree_list = dict(G.degree()).values()\n",
    "\n",
    "# count the number of nodes with a specific degree, sort it\n",
    "from collections import Counter\n",
    "C = Counter(degree_list)\n",
    "deg, cnt = zip(*sorted(C.items()))\n",
    "\n",
    "# Plot\n",
    "plt.plot(deg, cnt, \"o-\")\n",
    "plt.xlabel(\"Degree\")\n",
    "plt.ylabel(\"Count (degree)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a5b0c1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defined in common_functions.py\n",
    "plot_cdf(degree_list, compl = True, xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad5df77d",
   "metadata": {},
   "source": [
    "## 2.7 Robustness to failures / Fragility to targeted attacks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8684247e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def attack_network(G, how=\"targeted\", n_remove=200):\n",
    "    \"\"\"\n",
    "    Removes nodes iteratively and keeps the (harmonic average of distance between all pairs of nodes)\n",
    "    \n",
    "    how = \"targeted\" targets the node with the highest degree\n",
    "    \"\"\"\n",
    "    n_components = []\n",
    "\n",
    "    G2 = G.copy()\n",
    "    for i in range(n_remove):\n",
    "        if how == \"targeted\":\n",
    "            # Find the node with the highest degree\n",
    "            node_to_remove = max(G2.degree(), key=lambda x: x[1])[0]\n",
    "\n",
    "        else:\n",
    "            # Random node\n",
    "            node_to_remove = np.random.choice(G2.nodes(), 1)[0]\n",
    "\n",
    "        # Remove node\n",
    "        G2.remove_node(node_to_remove)\n",
    "        # Number of nodes in giant component\n",
    "        n_comp = max([len(_) for _ in nx.connected_components(G2)])\n",
    "        n_components.append(100*n_comp/len(G2))\n",
    "        \n",
    "        \n",
    "    return n_components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00e0fd95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an ER graph (random graph)\n",
    "G_r = nx.random_graphs.gnm_random_graph(1000,len(G.edges()))\n",
    "# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n",
    "largest_cc = max(nx.connected_components(G_r), key=len)\n",
    "\n",
    "# Create a subgraph of G consisting only of nodes in largest_cc\n",
    "G_r = G_r.subgraph(largest_cc).copy()\n",
    "\n",
    "# Plot network random\n",
    "plot_network(G_r, with_labels=False)\n",
    "plt.show()\n",
    "\n",
    "# Plot network PPI\n",
    "plot_network(G, with_labels=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fe19ba3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove 200 nodes\n",
    "n_remove = 200\n",
    "n_comp_attack_t = attack_network(G, how=\"targeted\", n_remove = n_remove)\n",
    "n_comp_attack_r = attack_network(G, how=\"random\", n_remove = n_remove)\n",
    "\n",
    "ref_n_comp_attack_t = attack_network(G_r, how=\"targeted\", n_remove = n_remove)\n",
    "ref_n_comp_attack_r = attack_network(G_r, how=\"random\", n_remove = n_remove)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d51b399",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 5))\n",
    "plt.subplot(121)\n",
    "plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_t, label=\"Protein Interaction\")\n",
    "plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_t, label=\"Random network\")\n",
    "plt.xlabel(\"Share of nodes removed\")\n",
    "plt.ylabel(\"Share of nodes in giant component\")\n",
    "plt.legend()\n",
    "plt.title(\"Targeted attack\")\n",
    "plt.ylim(0,100)\n",
    "\n",
    "plt.subplot(122)\n",
    "plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_r, label=\"Protein Interaction\")\n",
    "plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_r, label=\"Random network\")\n",
    "plt.xlabel(\"Share of nodes removed\")\n",
    "plt.ylabel(\"Share of nodes in giant component\")\n",
    "plt.legend()\n",
    "plt.title(\"Random attack\")\n",
    "plt.ylim(0,100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9985571",
   "metadata": {},
   "source": [
    "# Exercise 3: Distributions (i.e. looking at characteristics of the nodes)\n",
    "- Degree\n",
    "- Number of triangles\n",
    "- Clustering (transitivity)\n",
    "- Local assortativity (homophily)\n",
    "- Path length\n",
    "\n",
    "__Goal__: Understand how to calculate distributions in `networkx`, interpret differences in the degree distribution \n",
    "\n",
    "__Exercise__: In 3.7 compare the Wikipedia network (default example), the PPI network and the Twitter network (IC2S2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16f880a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read data on florentine marriage families in the XV century\n",
    "G = nx.florentine_families_graph()\n",
    "len(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d4837fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the following function to plot the CDF of the degree distributions\n",
    "def plot_cdf(values, scale = \"log\", ax = None, cum = True, compl = False, marker = 'o-', xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\"):\n",
    "    \"\"\"\n",
    "    Calculates and plot CDF\n",
    "    \"\"\"\n",
    "    from collections import Counter\n",
    "\n",
    "    # count the number of instance per each degree, sort it\n",
    "    C = Counter(values)\n",
    "    deg, cnt = zip(*sorted(C.items()))\n",
    "    \n",
    "    # calcualte the cumulative distribution, normalize to be a probability instead of a count\n",
    "    if cum:\n",
    "        cs = np.cumsum(cnt)/np.sum(cnt)\n",
    "    else:\n",
    "        cs = cnt/np.sum(cnt)\n",
    "    \n",
    "    if compl:\n",
    "        cs = 1 - cs\n",
    "        \n",
    "    if ax is None:\n",
    "        ax = plt.subplot()\n",
    "    # plot\n",
    "    ax.plot(deg, cs, marker)\n",
    "    ax.set_ylabel(ylabel)\n",
    "    ax.set_xlabel(xlabel)\n",
    "    plt.tight_layout()\n",
    "    sns.despine(left=True, bottom=True)\n",
    "    plt.xscale(scale)\n",
    "    plt.yscale(scale)\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "998712a3",
   "metadata": {},
   "source": [
    "## 3.1 Degree distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "913ddeb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_network_distribution(G, values, mult = 1000, with_labels = True):\n",
    "    \"\"\"\n",
    "    Plots network (color and node size depends on values) and distributions\n",
    "    \"\"\"\n",
    "    import matplotlib as mpl\n",
    "    norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values), clip=True)\n",
    "    mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.coolwarm)\n",
    "\n",
    "    f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [2, 1, 1]}, figsize=(12,4))\n",
    "    \n",
    "    node_size = mult*np.array(list(values))\n",
    "    if min(node_size) < 0:\n",
    "        node_size -= min(node_size)\n",
    "        node_size += 1\n",
    "        \n",
    "    nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = with_labels, node_size = node_size, edge_color = \"gray\", \n",
    "           node_color = [mapper.to_rgba(i) for i in values], ax = a0,)\n",
    "\n",
    "\n",
    "    sns.histplot(values, ax = a1)\n",
    "    \n",
    "    \n",
    "    plot_cdf(values, ax = a2, compl = True, xlabel = \"Cent c\", ylabel = \"p(Cent > c)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3033f75e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Degree distribution\n",
    "degree = G.degree() #also nx.degree(G)\n",
    "degree_values = dict(degree).values()\n",
    "\n",
    "# Plot using sns.histplot\n",
    "sns.histplot(degree_values)\n",
    "print(sorted(degree_values))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d1faa5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot CDF in log-log scale\n",
    "plot_cdf(degree_values, scale = \"log\", ax = None, cum = True, compl = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a9dcf85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# change \"mult\" to change the size of the nodes \n",
    "plot_network_distribution(G, degree_values, mult = 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f57f36b",
   "metadata": {},
   "source": [
    "## 3.2 Distribution of number of triangles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbe7b2f0",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Distribution of the number of triangles per node\n",
    "n_triangs = nx.triangles(G).values() \n",
    "display(list(zip(G.nodes(), n_triangs))[:20])\n",
    "\n",
    "plot_network_distribution(G, n_triangs, mult = 100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ec072d6",
   "metadata": {},
   "source": [
    "## 3.3 Distribution of clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f713c1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "n_clus = ...\n",
    "display(list(zip(G.nodes(), n_clus))[:20])\n",
    "\n",
    "plot_network_distribution(G, n_clus, mult = 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3a41d71",
   "metadata": {},
   "source": [
    "### 3.4 Distribution of shortest paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c47c87af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate all shortest paths (careful, this quickly becomes unfeasible)\n",
    "path_lenghts = ...\n",
    "\n",
    "# Get results from a nested dictionary\n",
    "path_lenghts = [list(_[1].values()) for _ in path_lenghts]\n",
    "path_lenghts = [subitem for item in path_lenghts for subitem in item ]\n",
    "\n",
    "# Plot using sns.histplot\n",
    "sns.histplot(path_lenghts)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d69dcf0",
   "metadata": {},
   "source": [
    "## 3.5 Distribution of local degree assortativity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c1f70d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "attribute = [k for v,k in G.degree()]\n",
    "print(attribute)\n",
    "\n",
    "# Defined in common_functions (based on Peel et al 2018)\n",
    "local_assort = calculate_local_assort(G, attribute)\n",
    "display(list(zip(G.nodes(), local_assort))[:20])\n",
    "\n",
    "plot_network_distribution(G, local_assort, mult = 1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e863fc5",
   "metadata": {},
   "source": [
    "## 3.6 Degree distribution if ~ normal distribution (used in the lectures)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70f6f7ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Don't change this\n",
    "def plot_distributions(d, scale=\"log\"):\n",
    "    \"\"\"\n",
    "    Plot the PDF, CDF and CCDF.\n",
    "    \"\"\"\n",
    "    plt.figure(figsize=(12,4))\n",
    "    ax = plt.subplot(131)\n",
    "    plot_cdf(d, cum = False, ax = ax,  xlabel = \"Degree (d)\", ylabel = \"p(Degree)\", marker = \".\", scale=scale)\n",
    "    if scale == \"log\":\n",
    "        plt.plot([7E1, 5E2],[2.5E-3,1E-4],\"--\")\n",
    "    plt.title(\"PDF\")\n",
    "    ax = plt.subplot(132)\n",
    "    plot_cdf(d, cum = True, ax = ax,  xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\", marker=\"-\", scale=scale)\n",
    "    plt.title(\"CDF\")\n",
    "    ax = plt.subplot(133)\n",
    "    plot_cdf(d, compl = True, ax = ax,  xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\", marker=\".-\", scale=scale)\n",
    "    plt.title(\"CCDF\")\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d68e564",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Degree distribution (random normally distributed data)\n",
    "## Try changing the scale to \"log\"\n",
    "d = np.random.binomial(500, p = 30/500, size = 10000)\n",
    "plot_distributions(d, scale=\"linear\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "782ac2e0",
   "metadata": {},
   "source": [
    "## 3.7 Degree distribution in the wiki network"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "02989329",
   "metadata": {},
   "outputs": [],
   "source": [
    "## Degree distribution (wiki network)\n",
    "# Try the code using scale = \"linear\"\n",
    "G_wiki = nx.read_edgelist(f\"{path_data}/wiki-Vote.txt\", create_using=nx.DiGraph())\n",
    "d = [v for k,v in G_wiki.degree()]\n",
    "plot_distributions(d, \"log\")\n",
    "plt.xlim([100,1000])\n",
    "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29321b85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Twitter data\n",
    "df = pd.read_csv(f\"{path_data}/ic2s2_netsci_3.tsv\", sep=\"\\t\")\n",
    "G_twitter = nx.from_pandas_edgelist(df)\n",
    "G_twitter.remove_edges_from(nx.selfloop_edges(G_twitter)) #remove self-edges\n",
    "\n",
    "d = [v for k,v in G_twitter.degree()]\n",
    "plot_distributions(d, \"log\")\n",
    "plt.xlim([100,1000])\n",
    "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43b2e02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# PPI data\n",
    "path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n",
    "G_ppi = nx.read_graphml(path_network, node_type=int)\n",
    "\n",
    "d = [v for k,v in G_ppi.degree()]\n",
    "plot_distributions(d, \"log\")\n",
    "\n",
    "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "429bea65",
   "metadata": {},
   "source": [
    "# Exercise 4A \n",
    "Is the network homophilic? (are people linked to people like them?). You can use the function `nx.assortativity.attribute_assortativity_coefficient`\n",
    "\n",
    "Solution: Shuffle the node labels\n",
    "\n",
    "__Goal__: Understand how a permutation test works (4A) and how reference models work (4B)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24e6abfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create labels\n",
    "ns = ['Acciaiuoli', 'Medici', 'Castellani', 'Peruzzi', 'Strozzi', 'Barbadori', 'Ridolfi', 'Tornabuoni', 'Albizzi', 'Salviati', 'Pazzi', 'Bischeri', 'Guadagni', 'Ginori', 'Lamberteschi']\n",
    "classes = [\"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\", \"m\", \"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\"]\n",
    "nx.set_node_attributes(G, dict(zip(ns,classes)), \"classes\")\n",
    "\n",
    "# Plot\n",
    "nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = True, edge_color = \"gray\", \n",
    "       node_color = [palette[0] if c == \"m\" else palette[1] for c in classes])\n",
    "    \n",
    "\n",
    "# calculate assortativity coefficient\n",
    "assort = ...\n",
    "print(assort)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "daf87a30",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Permutenode labels\n",
    "G2 = G.copy()\n",
    "\n",
    "# Randomize classes and calculate assortativity\n",
    "iters = 10000\n",
    "values = []\n",
    "for i in range(iters):\n",
    "    # shuffle the classes\n",
    "    nx.set_node_attributes(G2, dict(zip(ns,np.random.permutation(classes))), \"classes\")\n",
    "    # calculate assortativity and keep in a list\n",
    "    values.append(nx.assortativity.attribute_assortativity_coefficient(G2, \"classes\"))\n",
    "values = np.array(values)\n",
    "\n",
    "# Plot results\n",
    "sns.histplot(values, binwidth=0.05)\n",
    "plt.plot([assort,assort],[0,iters/10], \"--\", color=\"gray\")\n",
    "\n",
    "# p-value (probability that we would observe a value equal or more extreme to the one observed given \n",
    "# that the null hyphotesis is true---i.e. the graph is the real graph and the links \n",
    "# are not correlated with the classes\n",
    "print(\"p-value\", 1-len(values[values<assort])/len(values))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8488f10e",
   "metadata": {},
   "source": [
    "# Exercise 4B \n",
    "Is the network transitive? (are your friends also friends between them?)\n",
    "Is the network assortative in terms of degree? \n",
    "\n",
    "Solution: Shuffle the edges or Grow the network\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "edb409f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read data on florentine marriage families in the XV century\n",
    "G = nx.florentine_families_graph()\n",
    "# Calculate average clustering\n",
    "print(...)\n",
    "# Calculate global clustering (transitivity)\n",
    "print(...)\n",
    "# Calculate degree assortativity\n",
    "print(...)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca861531",
   "metadata": {},
   "source": [
    "#### We can compare our network to randomly generated data (this is useful to test hypothesis/get a baseline)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a17f9599",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize random graphs (run several times)\n",
    "degree_seq = [v for k,v in G.degree()]\n",
    "G_r = nx.configuration_model(degree_seq)\n",
    "G_r = nx.Graph(G_r)\n",
    "nx.draw(G_r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3af9f377",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize random graphs (run several times)\n",
    "n = len(G)\n",
    "m = len(G.edges())\n",
    "G_r = nx.random_graphs.barabasi_albert_graph(n,int(m/n)+1)\n",
    "G_r = nx.Graph(G_r)\n",
    "nx.draw(G_r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a338cbdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize random graphs (run several times)\n",
    "n = len(G)\n",
    "m = len(G.edges())\n",
    "G_r = nx.random_graphs.gnm_random_graph(n,m)\n",
    "G_r = nx.Graph(G_r)\n",
    "nx.draw(G_r)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "387cfdee",
   "metadata": {},
   "source": [
    "#### The function conf_int (defined in common_functions.py) creates 100 random graphs and calculates the desired metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fa1594f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average local clustering coefficient\n",
    "conf_int(G, nx.average_clustering, 100)\n",
    "nx.average_clustering(G)\n",
    "#same as np.mean(list(nx.clustering(G).values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "838d9f74",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transitivity (global clustering)\n",
    "conf_int(G, nx.transitivity, 100)\n",
    "nx.transitivity(G)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f4e0f94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Assortativity\n",
    "conf_int(G, nx.assortativity.degree_assortativity_coefficient, 100)\n",
    "nx.assortativity.degree_assortativity_coefficient(G)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6dc4846",
   "metadata": {},
   "source": [
    "# Exercise 4B.2\n",
    "\n",
    "Use igraph to read the network and calculate assortativity, and compare with random model (ER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ff5ffb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "G_wiki = nx.read_edgelist(f\"{path_data}/wiki-Vote.txt\", create_using=nx.DiGraph())\n",
    "# Read directed graph\n",
    "print(len(G_wiki.nodes()))\n",
    "print(len(G_wiki.edges()))\n",
    "\n",
    "# Convert to igraph\n",
    "h = ig.Graph.from_networkx(G_wiki)\n",
    "\n",
    "# Look at how much faster it is (700 times slower in my computer)\n",
    "%timeit h.assortativity_degree()\n",
    "%timeit nx.assortativity.degree_assortativity_coefficient(G_wiki)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98b5105e",
   "metadata": {},
   "source": [
    "### Compare with random model (ER) -- igraph vs networkx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44821cd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# networkx\n",
    "def conf_dens_nx(n, m):\n",
    "    \"\"\"Create random graph and calculate assortativity in networkx\"\"\"\n",
    "    G_r = nx.random_graphs.gnm_random_graph(n,m)\n",
    "    return nx.assortativity.degree_assortativity_coefficient(G_r)\n",
    "def conf_dens_ig(n,m):\n",
    "    \"\"\"Create random graph and calculate assortativity in igraph\"\"\"\n",
    "    h_r = ig.Graph.Erdos_Renyi(n=n,m=m)\n",
    "    return h_r.assortativity_degree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55bb3feb",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Doing it in igraph 10 times\n",
    "n, m = len(G_wiki), len(G_wiki.edges())\n",
    "print(np.percentile([conf_dens_nx(n, m) for i in range(10)], [5,95]))\n",
    "\n",
    "nx.assortativity.degree_assortativity_coefficient(G_wiki)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c911d3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Doing it in igraph 100 times\n",
    "n, m = h.vcount(), h.ecount()\n",
    "print(np.percentile([conf_dens_ig(n,m) for i in range(100)], [5,95]))\n",
    "\n",
    "h.assortativity_degree()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0f82c7f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:networks]",
   "language": "python",
   "name": "conda-env-networks-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}