{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "b6535345", "metadata": {}, "outputs": [], "source": [ "# netowrks\n", "import networkx as nx\n", "import igraph as ig\n", "\n", "# data processing\n", "import pandas as pd\n", "import numpy as np\n", "\n", "#some functions to make our lifes easier\n", "import sys\n", "sys.path.append(\"./\")\n", "from common_functions import *\n", "\n", "# viz\n", "import pylab as plt\n", "import matplotlib as mpl\n", "import seaborn as sns\n", "%matplotlib inline\n", "\n", "#Change the default options of visualization (improving the defaults)\n", "custom_params = {\"axes.spines.right\": False, \"axes.spines.top\": False, \"axes.spines.left\": False, \"axes.spines.bottom\":\n", "False,\"lines.linewidth\": 2, \"grid.color\": \"lightgray\", \"legend.frameon\": False, \"xtick.labelcolor\": \"#484848\", \"ytick.labelcolor\":\n", "\"#484848\", \"xtick.color\": \"#484848\", \"ytick.color\": \"#484848\",\"text.color\": \"#484848\", \"axes.labelcolor\": \"#484848\",\n", "\"axes.titlecolor\":\"#484848\",\"figure.figsize\": [5,3],\n", "\"axes.titlelocation\":\"left\",\"xaxis.labellocation\":\"left\",\"yaxis.labellocation\":\"bottom\"}\n", "palette = [\"#3d348b\",\"#e6af2e\",\"#191716\",\"#e0e2db\"] #use your favourite colours\n", "sns.set_theme(context='paper', style='white', palette=palette, font='Verdana', font_scale=1.3, color_codes=True,\n", "rc=custom_params)\n", "\n", "from IPython.display import display, HTML\n", "display(HTML(\"\"))" ] }, { "cell_type": "code", "execution_count": null, "id": "eb6649c1", "metadata": {}, "outputs": [], "source": [ "path_data = \"../Data/\"" ] }, { "cell_type": "markdown", "id": "4767bb68", "metadata": {}, "source": [ "# Exercise 1: Reading and visualizing graphs\n", "\n", "Goal: Get used to the `networkx` library" ] }, { "cell_type": "markdown", "id": "9a2566d2", "metadata": {}, "source": [ "## 1.1. Basic plot\n", "- Read and understand the following code" ] }, { "cell_type": "code", "execution_count": null, "id": "17549962", "metadata": {}, "outputs": [], "source": [ "# Read an example data on florentine marriage families in the XV century\n", "G = nx.florentine_families_graph()\n", "\n", "# Show the nodes and edges\n", "print(\"Nodes: \", G.nodes())\n", "print(\"Edges: \", G.edges())" ] }, { "cell_type": "code", "execution_count": null, "id": "5b408279", "metadata": {}, "outputs": [], "source": [ "# Visualize it\n", "plt.figure(figsize=(5,4)) #set up dimensions\n", "\n", "# create layout (once so we can reuse it if needed)\n", "pos = nx.spring_layout(G, seed = 1)\n", "nx.draw(G, pos = pos, with_labels = True, \n", " edge_color = \"gray\", node_color = \"lightgray\")" ] }, { "cell_type": "markdown", "id": "a8178dbf", "metadata": {}, "source": [ "## 1.2. Read and visualize your network\n", "Network here: https://tinyurl.com/network-game\n", "(I added the direct CSV link. But you could also download it as a CSV file and move it to the folder of this notebook) \n", "\n", "Use the `nx.from_pandas_edgelist` function (you need to set up the `source` and `target` parameters)\n", "Print number of nodes and edges" ] }, { "cell_type": "code", "execution_count": null, "id": "4aad5d3d", "metadata": {}, "outputs": [], "source": [ "# Read edgelist (link)\n", "df = pd.read_csv(\"https://docs.google.com/spreadsheets/d/e/2PACX-1vSuZC86KjXYKSPr0Nw7mfRha4zwea3aMw-gTKliVcbRt_m3NRiCEyxbH_d5M8MBL0LWayg1WDmnqBET/pub?gid=0&single=true&output=csv\")\n", "display(df.head())" ] }, { "cell_type": "code", "execution_count": null, "id": "09de5f10", "metadata": {}, "outputs": [], "source": [ "#get help\n", "nx.from_pandas_edgelist?" ] }, { "cell_type": "code", "execution_count": null, "id": "c667a9a2", "metadata": {}, "outputs": [], "source": [ "# Convert to networkx\n", "G = nx.from_pandas_edgelist(...)\n", "print(len(G.nodes()))\n", "print(len(G.edges()))" ] }, { "cell_type": "code", "execution_count": null, "id": "ea2cc766", "metadata": {}, "outputs": [], "source": [ "# create figure and plot\n", "plt.figure(figsize=(10,8))\n", "\n", "# create layout (once so we can reuse it if needed)\n", "pos = nx.spring_layout(G, seed = 1)\n", "nx.draw(G, pos = pos, with_labels = True, node_size=10,\n", " edge_color = \"lightgray\", node_color = \"gray\")" ] }, { "cell_type": "markdown", "id": "1da80e98", "metadata": {}, "source": [ "## 1.3. Read and visualize the Twitter network\n", "\n", "We will be using (parts of) this network in the following days.\n", "\n", "It contains mentions, replies and quotes of user A to user B (but let's treat is as undirected)\n", "```\n", "replied_to 40255\n", "mentioned 10407\n", "quoted 3448\n", "```\n", "\n", "Use the nx.from_pandas_edgelist function\n", "Print number of nodes and edges" ] }, { "cell_type": "code", "execution_count": null, "id": "d0170e2d", "metadata": {}, "outputs": [], "source": [ "# Read edgelist (the separator is a TAB)\n", "path = f\"{path_data}/ic2s2_netsci_3.tsv\"\n", "\n", "df = pd.read_csv(...)\n", "display(df.head())\n", "\n", "# Convert to networkx\n", "G = nx.from_pandas_edgelist(df)\n", "G.remove_edges_from(nx.selfloop_edges(G)) #remove self-edges\n", "print(len(G.nodes()))\n", "print(len(G.edges()))" ] }, { "cell_type": "code", "execution_count": null, "id": "5251186d", "metadata": {}, "outputs": [], "source": [ "# create figure and plot\n", "plt.figure(figsize=(20,16))\n", "\n", "# create layout (once so we can reuse it if needed)\n", "pos = nx.spring_layout(G, seed = 1)\n", "nx.draw(G, pos = pos, with_labels = False, node_size=10,\n", " edge_color = \"lightgray\", node_color = \"gray\")" ] }, { "cell_type": "code", "execution_count": null, "id": "edf6cafd", "metadata": {}, "outputs": [], "source": [ "# igraph is faster. Here is how to convert to igraph\n", "h = ig.Graph().from_networkx(G)\n", "# create layout\n", "layout = h.layout_fruchterman_reingold()\n", "# create positions in the networx format (as a dictionary)\n", "pos = dict(zip(G.nodes(), layout.coords))\n", "\n", "# Plot as before\n", "plt.figure(figsize=(20, 16))\n", "nx.draw(G, pos = pos, with_labels = False, node_size=10,\n", " edge_color = \"lightgray\", node_color = \"gray\")" ] }, { "cell_type": "markdown", "id": "e6b1b5a9", "metadata": {}, "source": [ "## 1.4. Read and visualize a large(r) network \n", "- Read network in f\"{path_data}/wiki-Vote.txt\". Careful, it is a directed network, you need to use the create_using parameter.\n", "\n", "(use iterations = 30 in the spring_layout)\n", "\n", "\n", "Format of the file:\n", "```\n", "# Directed graph (each unordered pair of nodes is saved once): Wiki-Vote.txt \n", "# Wikipedia voting on promotion to administratorship (till January 2008). Directed edge A->B means user A voted on B becoming Wikipedia administrator.\n", "# Nodes: 7115 Edges: 103689\n", "# FromNodeId\tToNodeId\n", "30\t1412\n", "30\t3352\n", "```" ] }, { "cell_type": "code", "execution_count": null, "id": "54c36775", "metadata": {}, "outputs": [], "source": [ "# Read directed graph\n", "path = f\"{path_data}/wiki-Vote.txt\"\n", "G_wiki = nx.read_edgelist(path, create_using=...)\n", "print(len(G_wiki.nodes()))\n", "print(len(G_wiki.edges()))" ] }, { "cell_type": "code", "execution_count": null, "id": "cfcc7e9c", "metadata": {}, "outputs": [], "source": [ "# Create layout (this will take a couple minutes). Networkx is a particularly slow library\n", "pos = nx.spring_layout(G_wiki, seed = 1, iterations=30)" ] }, { "cell_type": "code", "execution_count": null, "id": "1e9a6c51", "metadata": {}, "outputs": [], "source": [ "# Nobody wants to see your hairball, but let's plot it anyway\n", "plt.figure(figsize=(20, 20))\n", "\n", "# Plot only nodes (too many lines)\n", "nx.draw_networkx_nodes(G_wiki, pos = pos, node_size = 1, node_color = \"k\")" ] }, { "cell_type": "markdown", "id": "f2ff6d51", "metadata": {}, "source": [ "# Exercise 2: Network characteristics\n", "\n", "Network example: Protein interaction (PPI) in yeast. Nodes = Proteins, Edges = Recorded interactions in experiments\n", "\n", "__Exercise__: Compare the PPI network with the Twitter network (ic2s2_netsci_3). What characteristics apply to both?\n" ] }, { "cell_type": "code", "execution_count": null, "id": "42e9de76", "metadata": {}, "outputs": [], "source": [ "# Read PPI network\n", "path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n", "G = nx.read_graphml(path_network, node_type=int)\n", "len(G.nodes()), len(G.edges())" ] }, { "cell_type": "code", "execution_count": null, "id": "69caf98f", "metadata": {}, "outputs": [], "source": [ "# igraph is faster. Here is how to convert to igraph\n", "h = ig.Graph().from_networkx(G)\n", "# create layout\n", "layout = h.layout_fruchterman_reingold()\n", "# create positions in the networx format (as a dictionary)\n", "pos = dict(zip(G.nodes(), layout.coords))\n", "\n", "# To use networkx (slow), uncomment the line below\n", "#pos = nx.spring_layout(G, seed = 1)\n", "\n", "# Plot as before\n", "fig, ax = plt.subplots(figsize=(10, 8))\n", "#This function is defined in common_functions.py file. It adds automatic coloring and nicer defaults\n", "plot_network(G, a0 = ax, pos = pos, with_labels=False) " ] }, { "cell_type": "markdown", "id": "b7204d45", "metadata": {}, "source": [ "## 2.1 Connectedness" ] }, { "cell_type": "code", "execution_count": null, "id": "9e57cfb8", "metadata": {}, "outputs": [], "source": [ "# Number of connected components\n", "nx.components.number_connected_components(G)" ] }, { "cell_type": "code", "execution_count": null, "id": "7c088402", "metadata": {}, "outputs": [], "source": [ "# Size of the connected components (first 20 sorted)\n", "sorted([len(_) for _ in nx.connected_components(G)])[:20]" ] }, { "cell_type": "code", "execution_count": null, "id": "ac07c20c", "metadata": {}, "outputs": [], "source": [ "# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n", "largest_cc = max(nx.connected_components(G), key=len)\n", "\n", "# Create a subgraph of G consisting only of nodes in largest_cc\n", "G = G.subgraph(largest_cc).copy()\n", "len(G)" ] }, { "cell_type": "markdown", "id": "de92ae88", "metadata": {}, "source": [ "## 2.2 Small diameters" ] }, { "cell_type": "code", "execution_count": null, "id": "1392dae1", "metadata": {}, "outputs": [], "source": [ "# Diameter\n", "nx.diameter(G)" ] }, { "cell_type": "code", "execution_count": null, "id": "827778dd", "metadata": {}, "outputs": [], "source": [ "# Average shortest path\n", "nx.average_shortest_path_length(G)" ] }, { "cell_type": "markdown", "id": "f28e551b", "metadata": {}, "source": [ "## 2.3 Density" ] }, { "cell_type": "code", "execution_count": null, "id": "eba3dd49", "metadata": {}, "outputs": [], "source": [ "nx.density(G)" ] }, { "cell_type": "markdown", "id": "c7cbd38f", "metadata": {}, "source": [ "## 2.4 Transitivity and average clustering " ] }, { "cell_type": "code", "execution_count": null, "id": "98a6dc93", "metadata": {}, "outputs": [], "source": [ "# Average at the network level (# triangles / possible number of triangles)\n", "print(nx.transitivity(G))\n", "\n", "# Average at the node level = Average(# triangles_i / possible triangles_i )\n", "print(nx.average_clustering(G))" ] }, { "cell_type": "markdown", "id": "0e9ba007", "metadata": {}, "source": [ "## 2.5 Assortativity and homophily\n", "\n", "What does it mean to have a negative/positive degree assortativity?" ] }, { "cell_type": "code", "execution_count": null, "id": "2fa3b399", "metadata": {}, "outputs": [], "source": [ "nx.assortativity.degree_assortativity_coefficient(G)" ] }, { "cell_type": "markdown", "id": "b5b01ea4", "metadata": {}, "source": [ "## 2.6 Heavy tails" ] }, { "cell_type": "code", "execution_count": null, "id": "8d396865", "metadata": {}, "outputs": [], "source": [ "# Find the degree of each node\n", "degree_list = dict(G.degree()).values()\n", "\n", "# count the number of nodes with a specific degree, sort it\n", "from collections import Counter\n", "C = Counter(degree_list)\n", "deg, cnt = zip(*sorted(C.items()))\n", "\n", "# Plot\n", "plt.plot(deg, cnt, \"o-\")\n", "plt.xlabel(\"Degree\")\n", "plt.ylabel(\"Count (degree)\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8a5b0c1f", "metadata": {}, "outputs": [], "source": [ "# Defined in common_functions.py\n", "plot_cdf(degree_list, compl = True, xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\")\n" ] }, { "cell_type": "markdown", "id": "ad5df77d", "metadata": {}, "source": [ "## 2.7 Robustness to failures / Fragility to targeted attacks" ] }, { "cell_type": "code", "execution_count": null, "id": "8684247e", "metadata": {}, "outputs": [], "source": [ "def attack_network(G, how=\"targeted\", n_remove=200):\n", " \"\"\"\n", " Removes nodes iteratively and keeps the (harmonic average of distance between all pairs of nodes)\n", " \n", " how = \"targeted\" targets the node with the highest degree\n", " \"\"\"\n", " n_components = []\n", "\n", " G2 = G.copy()\n", " for i in range(n_remove):\n", " if how == \"targeted\":\n", " # Find the node with the highest degree\n", " node_to_remove = max(G2.degree(), key=lambda x: x[1])[0]\n", "\n", " else:\n", " # Random node\n", " node_to_remove = np.random.choice(G2.nodes(), 1)[0]\n", "\n", " # Remove node\n", " G2.remove_node(node_to_remove)\n", " # Number of nodes in giant component\n", " n_comp = max([len(_) for _ in nx.connected_components(G2)])\n", " n_components.append(100*n_comp/len(G2))\n", " \n", " \n", " return n_components" ] }, { "cell_type": "code", "execution_count": null, "id": "00e0fd95", "metadata": {}, "outputs": [], "source": [ "# Create an ER graph (random graph)\n", "G_r = nx.random_graphs.gnm_random_graph(1000,len(G.edges()))\n", "# Let's keep the largest component (be careful, don't do this in your data withuot a very good reason)\n", "largest_cc = max(nx.connected_components(G_r), key=len)\n", "\n", "# Create a subgraph of G consisting only of nodes in largest_cc\n", "G_r = G_r.subgraph(largest_cc).copy()\n", "\n", "# Plot network random\n", "plot_network(G_r, with_labels=False)\n", "plt.show()\n", "\n", "# Plot network PPI\n", "plot_network(G, with_labels=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "1fe19ba3", "metadata": {}, "outputs": [], "source": [ "# Remove 200 nodes\n", "n_remove = 200\n", "n_comp_attack_t = attack_network(G, how=\"targeted\", n_remove = n_remove)\n", "n_comp_attack_r = attack_network(G, how=\"random\", n_remove = n_remove)\n", "\n", "ref_n_comp_attack_t = attack_network(G_r, how=\"targeted\", n_remove = n_remove)\n", "ref_n_comp_attack_r = attack_network(G_r, how=\"random\", n_remove = n_remove)" ] }, { "cell_type": "code", "execution_count": null, "id": "3d51b399", "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(10, 5))\n", "plt.subplot(121)\n", "plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_t, label=\"Protein Interaction\")\n", "plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_t, label=\"Random network\")\n", "plt.xlabel(\"Share of nodes removed\")\n", "plt.ylabel(\"Share of nodes in giant component\")\n", "plt.legend()\n", "plt.title(\"Targeted attack\")\n", "plt.ylim(0,100)\n", "\n", "plt.subplot(122)\n", "plt.plot(100*np.arange(n_remove)/len(G), n_comp_attack_r, label=\"Protein Interaction\")\n", "plt.plot(100*np.arange(n_remove)/len(G), ref_n_comp_attack_r, label=\"Random network\")\n", "plt.xlabel(\"Share of nodes removed\")\n", "plt.ylabel(\"Share of nodes in giant component\")\n", "plt.legend()\n", "plt.title(\"Random attack\")\n", "plt.ylim(0,100)" ] }, { "cell_type": "markdown", "id": "e9985571", "metadata": {}, "source": [ "# Exercise 3: Distributions (i.e. looking at characteristics of the nodes)\n", "- Degree\n", "- Number of triangles\n", "- Clustering (transitivity)\n", "- Local assortativity (homophily)\n", "- Path length\n", "\n", "__Goal__: Understand how to calculate distributions in `networkx`, interpret differences in the degree distribution \n", "\n", "__Exercise__: In 3.7 compare the Wikipedia network (default example), the PPI network and the Twitter network (IC2S2)" ] }, { "cell_type": "code", "execution_count": null, "id": "16f880a5", "metadata": {}, "outputs": [], "source": [ "# Read data on florentine marriage families in the XV century\n", "G = nx.florentine_families_graph()\n", "len(G)" ] }, { "cell_type": "code", "execution_count": null, "id": "3d4837fe", "metadata": {}, "outputs": [], "source": [ "# Use the following function to plot the CDF of the degree distributions\n", "def plot_cdf(values, scale = \"log\", ax = None, cum = True, compl = False, marker = 'o-', xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\"):\n", " \"\"\"\n", " Calculates and plot CDF\n", " \"\"\"\n", " from collections import Counter\n", "\n", " # count the number of instance per each degree, sort it\n", " C = Counter(values)\n", " deg, cnt = zip(*sorted(C.items()))\n", " \n", " # calcualte the cumulative distribution, normalize to be a probability instead of a count\n", " if cum:\n", " cs = np.cumsum(cnt)/np.sum(cnt)\n", " else:\n", " cs = cnt/np.sum(cnt)\n", " \n", " if compl:\n", " cs = 1 - cs\n", " \n", " if ax is None:\n", " ax = plt.subplot()\n", " # plot\n", " ax.plot(deg, cs, marker)\n", " ax.set_ylabel(ylabel)\n", " ax.set_xlabel(xlabel)\n", " plt.tight_layout()\n", " sns.despine(left=True, bottom=True)\n", " plt.xscale(scale)\n", " plt.yscale(scale)\n", " " ] }, { "cell_type": "markdown", "id": "998712a3", "metadata": {}, "source": [ "## 3.1 Degree distribution" ] }, { "cell_type": "code", "execution_count": null, "id": "913ddeb2", "metadata": {}, "outputs": [], "source": [ "def plot_network_distribution(G, values, mult = 1000, with_labels = True):\n", " \"\"\"\n", " Plots network (color and node size depends on values) and distributions\n", " \"\"\"\n", " import matplotlib as mpl\n", " norm = mpl.colors.Normalize(vmin=min(values), vmax=max(values), clip=True)\n", " mapper = mpl.cm.ScalarMappable(norm=norm, cmap=mpl.cm.coolwarm)\n", "\n", " f, (a0, a1, a2) = plt.subplots(1, 3, gridspec_kw={'width_ratios': [2, 1, 1]}, figsize=(12,4))\n", " \n", " node_size = mult*np.array(list(values))\n", " if min(node_size) < 0:\n", " node_size -= min(node_size)\n", " node_size += 1\n", " \n", " nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = with_labels, node_size = node_size, edge_color = \"gray\", \n", " node_color = [mapper.to_rgba(i) for i in values], ax = a0,)\n", "\n", "\n", " sns.histplot(values, ax = a1)\n", " \n", " \n", " plot_cdf(values, ax = a2, compl = True, xlabel = \"Cent c\", ylabel = \"p(Cent > c)\")" ] }, { "cell_type": "code", "execution_count": null, "id": "3033f75e", "metadata": {}, "outputs": [], "source": [ "# Degree distribution\n", "degree = G.degree() #also nx.degree(G)\n", "degree_values = dict(degree).values()\n", "\n", "# Plot using sns.histplot\n", "sns.histplot(degree_values)\n", "print(sorted(degree_values))" ] }, { "cell_type": "code", "execution_count": null, "id": "3d1faa5f", "metadata": {}, "outputs": [], "source": [ "# Plot CDF in log-log scale\n", "plot_cdf(degree_values, scale = \"log\", ax = None, cum = True, compl = True)" ] }, { "cell_type": "code", "execution_count": null, "id": "6a9dcf85", "metadata": {}, "outputs": [], "source": [ "# change \"mult\" to change the size of the nodes \n", "plot_network_distribution(G, degree_values, mult = 100)" ] }, { "cell_type": "markdown", "id": "3f57f36b", "metadata": {}, "source": [ "## 3.2 Distribution of number of triangles" ] }, { "cell_type": "code", "execution_count": null, "id": "cbe7b2f0", "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Distribution of the number of triangles per node\n", "n_triangs = nx.triangles(G).values() \n", "display(list(zip(G.nodes(), n_triangs))[:20])\n", "\n", "plot_network_distribution(G, n_triangs, mult = 100)" ] }, { "cell_type": "markdown", "id": "4ec072d6", "metadata": {}, "source": [ "## 3.3 Distribution of clustering" ] }, { "cell_type": "code", "execution_count": null, "id": "f713c1f6", "metadata": {}, "outputs": [], "source": [ "n_clus = ...\n", "display(list(zip(G.nodes(), n_clus))[:20])\n", "\n", "plot_network_distribution(G, n_clus, mult = 1000)" ] }, { "cell_type": "markdown", "id": "a3a41d71", "metadata": {}, "source": [ "### 3.4 Distribution of shortest paths" ] }, { "cell_type": "code", "execution_count": null, "id": "c47c87af", "metadata": {}, "outputs": [], "source": [ "# Calculate all shortest paths (careful, this quickly becomes unfeasible)\n", "path_lenghts = ...\n", "\n", "# Get results from a nested dictionary\n", "path_lenghts = [list(_[1].values()) for _ in path_lenghts]\n", "path_lenghts = [subitem for item in path_lenghts for subitem in item ]\n", "\n", "# Plot using sns.histplot\n", "sns.histplot(path_lenghts)\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "5d69dcf0", "metadata": {}, "source": [ "## 3.5 Distribution of local degree assortativity" ] }, { "cell_type": "code", "execution_count": null, "id": "2c1f70d5", "metadata": {}, "outputs": [], "source": [ "attribute = [k for v,k in G.degree()]\n", "print(attribute)\n", "\n", "# Defined in common_functions (based on Peel et al 2018)\n", "local_assort = calculate_local_assort(G, attribute)\n", "display(list(zip(G.nodes(), local_assort))[:20])\n", "\n", "plot_network_distribution(G, local_assort, mult = 1000)" ] }, { "cell_type": "markdown", "id": "8e863fc5", "metadata": {}, "source": [ "## 3.6 Degree distribution if ~ normal distribution (used in the lectures)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "70f6f7ef", "metadata": {}, "outputs": [], "source": [ "# Don't change this\n", "def plot_distributions(d, scale=\"log\"):\n", " \"\"\"\n", " Plot the PDF, CDF and CCDF.\n", " \"\"\"\n", " plt.figure(figsize=(12,4))\n", " ax = plt.subplot(131)\n", " plot_cdf(d, cum = False, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree)\", marker = \".\", scale=scale)\n", " if scale == \"log\":\n", " plt.plot([7E1, 5E2],[2.5E-3,1E-4],\"--\")\n", " plt.title(\"PDF\")\n", " ax = plt.subplot(132)\n", " plot_cdf(d, cum = True, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree < d)\", marker=\"-\", scale=scale)\n", " plt.title(\"CDF\")\n", " ax = plt.subplot(133)\n", " plot_cdf(d, compl = True, ax = ax, xlabel = \"Degree (d)\", ylabel = \"p(Degree > d)\", marker=\".-\", scale=scale)\n", " plt.title(\"CCDF\")\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "9d68e564", "metadata": {}, "outputs": [], "source": [ "## Degree distribution (random normally distributed data)\n", "## Try changing the scale to \"log\"\n", "d = np.random.binomial(500, p = 30/500, size = 10000)\n", "plot_distributions(d, scale=\"linear\")\n" ] }, { "cell_type": "markdown", "id": "782ac2e0", "metadata": {}, "source": [ "## 3.7 Degree distribution in the wiki network" ] }, { "cell_type": "code", "execution_count": null, "id": "02989329", "metadata": {}, "outputs": [], "source": [ "## Degree distribution (wiki network)\n", "# Try the code using scale = \"linear\"\n", "G_wiki = nx.read_edgelist(f\"{path_data}/wiki-Vote.txt\", create_using=nx.DiGraph())\n", "d = [v for k,v in G_wiki.degree()]\n", "plot_distributions(d, \"log\")\n", "plt.xlim([100,1000])\n", "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "29321b85", "metadata": {}, "outputs": [], "source": [ "# Twitter data\n", "df = pd.read_csv(f\"{path_data}/ic2s2_netsci_3.tsv\", sep=\"\\t\")\n", "G_twitter = nx.from_pandas_edgelist(df)\n", "G_twitter.remove_edges_from(nx.selfloop_edges(G_twitter)) #remove self-edges\n", "\n", "d = [v for k,v in G_twitter.degree()]\n", "plot_distributions(d, \"log\")\n", "plt.xlim([100,1000])\n", "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")" ] }, { "cell_type": "code", "execution_count": null, "id": "43b2e02e", "metadata": {}, "outputs": [], "source": [ "# PPI data\n", "path_network = f\"{path_data}/ppi_network_prediction.graphml\"\n", "G_ppi = nx.read_graphml(path_network, node_type=int)\n", "\n", "d = [v for k,v in G_ppi.degree()]\n", "plot_distributions(d, \"log\")\n", "\n", "plt.plot([2E2, 1E3], [2.5E-2,2.5E-4],\"--\")" ] }, { "cell_type": "markdown", "id": "429bea65", "metadata": {}, "source": [ "# Exercise 4A \n", "Is the network homophilic? (are people linked to people like them?). You can use the function `nx.assortativity.attribute_assortativity_coefficient`\n", "\n", "Solution: Shuffle the node labels\n", "\n", "__Goal__: Understand how a permutation test works (4A) and how reference models work (4B)." ] }, { "cell_type": "code", "execution_count": null, "id": "24e6abfa", "metadata": {}, "outputs": [], "source": [ "# Create labels\n", "ns = ['Acciaiuoli', 'Medici', 'Castellani', 'Peruzzi', 'Strozzi', 'Barbadori', 'Ridolfi', 'Tornabuoni', 'Albizzi', 'Salviati', 'Pazzi', 'Bischeri', 'Guadagni', 'Ginori', 'Lamberteschi']\n", "classes = [\"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\", \"m\", \"m\", \"m\", \"o\", \"o\", \"o\", \"o\", \"o\"]\n", "nx.set_node_attributes(G, dict(zip(ns,classes)), \"classes\")\n", "\n", "# Plot\n", "nx.draw(G, pos = nx.spring_layout(G, seed = 1), with_labels = True, edge_color = \"gray\", \n", " node_color = [palette[0] if c == \"m\" else palette[1] for c in classes])\n", " \n", "\n", "# calculate assortativity coefficient\n", "assort = ...\n", "print(assort)" ] }, { "cell_type": "code", "execution_count": null, "id": "daf87a30", "metadata": {}, "outputs": [], "source": [ "# Permutenode labels\n", "G2 = G.copy()\n", "\n", "# Randomize classes and calculate assortativity\n", "iters = 10000\n", "values = []\n", "for i in range(iters):\n", " # shuffle the classes\n", " nx.set_node_attributes(G2, dict(zip(ns,np.random.permutation(classes))), \"classes\")\n", " # calculate assortativity and keep in a list\n", " values.append(nx.assortativity.attribute_assortativity_coefficient(G2, \"classes\"))\n", "values = np.array(values)\n", "\n", "# Plot results\n", "sns.histplot(values, binwidth=0.05)\n", "plt.plot([assort,assort],[0,iters/10], \"--\", color=\"gray\")\n", "\n", "# p-value (probability that we would observe a value equal or more extreme to the one observed given \n", "# that the null hyphotesis is true---i.e. the graph is the real graph and the links \n", "# are not correlated with the classes\n", "print(\"p-value\", 1-len(values[values