{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Text Analytics using Graphs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "project_name = \"reco-tut-gml\"; branch = \"main\"; account = \"sparsh-ai\"\n", "project_path = os.path.join('/content', project_name)\n", "\n", "if not os.path.exists(project_path):\n", " !cp /content/drive/MyDrive/mykeys.py /content\n", " import mykeys\n", " !rm /content/mykeys.py\n", " path = \"/content/\" + project_name; \n", " !mkdir \"{path}\"\n", " %cd \"{path}\"\n", " import sys; sys.path.append(path)\n", " !git config --global user.email \"recotut@recohut.com\"\n", " !git config --global user.name \"reco-tut\"\n", " !git init\n", " !git remote add origin https://\"{mykeys.git_token}\":x-oauth-basic@github.com/\"{account}\"/\"{project_name}\".git\n", " !git pull origin \"{branch}\"\n", " !git checkout main\n", "else:\n", " %cd \"{project_path}\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%%writefile requirements.txt\n", "networkx==2.4  \n", "scikit-learn==0.24.0 \n", "stellargraph==1.2.1 \n", "spacy==3.0.3 \n", "pandas==1.1.3 \n", "numpy==1.19.2 \n", "node2vec==0.3.3 \n", "Keras==2.0.2 \n", "tensorflow==2.4.1 \n", "communities==2.2.0 \n", "gensim==3.8.3 \n", "matplotlib==3.3.4 \n", "nltk==3.5 \n", "langdetect==1.0.9\n", "fasttext==0.9.2\n", "python-louvain==0.15\n", "click==7.1.2\n", "smart-open==3.0.0" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install -r requirements.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from collections import Counter\n", "\n", "import nltk\n", "from nltk.corpus import reuters\n", "import langdetect\n", "import spacy\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "import gensim\n", "from gensim.summarization import keywords\n", "\n", "from matplotlib import pyplot as plt\n", "from spacy import displacy\n", "from sklearn.manifold import TSNE\n", "\n", "import networkx as nx\n", "from networkx.algorithms.bipartite.projection import *\n", "from node2vec import Node2Vec\n", "import community\n", "from community import community_louvain\n", "\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nltk.download('reuters')\n", "!python -m spacy download en_core_web_md\n", "\n", "default_edge_color = 'gray'\n", "default_node_color = '#407cc9'\n", "enhanced_node_color = '#f5b042'\n", "enhanced_edge_color = '#cc2f04'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dataset overview" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will use Reuters-21578 dataset. The original dataset includes a set of 21,578 news articles that were published in the financial Reuters newswire in 1987, which were assembled and indexed in categories. The original dataset has a very skewed distribution, with some categories appearing only in the training set or in the test set. For this reason, we will use a modified version, known as ApteMod, also referred to as Reuters-21578 Distribution 1.0, that has a smaller skew distribution and consistent labels between the training and test datasets. The Reuters-21578 dataset can easily be downloaded using the nltk library (which is a very useful library for post-processing documents)." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = pd.DataFrame([\n", " {\"id\": _id, \"clean_text\": reuters.raw(_id).replace(\"\\n\", \"\"), \"label\": reuters.categories(_id)}\n", " for _id in reuters.fileids()\n", "]).set_index(\"id\")\n", "\n", "corpus.head(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus.describe().T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## NLP\n", "\n", "In this section, we will extract structured information from text by using NLP techniques and models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Language detection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getLanguage(text: str):\n", " try:\n", " return langdetect.detect(text)\n", " except: \n", " return np.nan" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus[\"language\"] = corpus[\"clean_text\"].apply(getLanguage)\n", "corpus[\"language\"].value_counts().head(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### NLP enrichment" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nlp = spacy.load('en_core_web_md')\n", "corpus[\"parsed\"] = corpus[\"clean_text\"].apply(nlp)\n", "corpus.loc[\"test/14832\"][\"clean_text\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "displacy.render(corpus.loc[\"test/14832\"][\"parsed\"], style='ent', jupyter=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus[[\"clean_text\", \"label\", \"language\", \"parsed\"]].to_pickle(\"/content/corpus.p\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus.to_pickle()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus[[\"parsed\"]].to_pickle(\"/content/parsed.p\", compression='gzip')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Graph Generation\n", "In this section, we will create two different kind of graphs out of a corpus of documents:\n", "1. Knowledge base graphs, where the subject-verb-object relation will be encoded to build a semantic graph\n", "2. 
Bipartite graphs, linking documents with the entities/keywords appearing therein" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Knowledge Graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#@markdown SVO\n", "SUBJECTS = [\"nsubj\", \"nsubjpass\", \"csubj\", \"csubjpass\", \"agent\", \"expl\"]\n", "OBJECTS = [\"dobj\", \"dative\", \"attr\", \"oprd\"]\n", "\n", "def getSubsFromConjunctions(subs):\n", " moreSubs = []\n", " for sub in subs:\n", " # rights is a generator\n", " rights = list(sub.rights)\n", " rightDeps = {tok.lower_ for tok in rights}\n", " if \"and\" in rightDeps:\n", " moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == \"NOUN\"])\n", " if len(moreSubs) > 0:\n", " moreSubs.extend(getSubsFromConjunctions(moreSubs))\n", " return moreSubs\n", "\n", "def getObjsFromConjunctions(objs):\n", " moreObjs = []\n", " for obj in objs:\n", " # rights is a generator\n", " rights = list(obj.rights)\n", " rightDeps = {tok.lower_ for tok in rights}\n", " if \"and\" in rightDeps:\n", " moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == \"NOUN\"])\n", " if len(moreObjs) > 0:\n", " moreObjs.extend(getObjsFromConjunctions(moreObjs))\n", " return moreObjs\n", "\n", "def getVerbsFromConjunctions(verbs):\n", " moreVerbs = []\n", " for verb in verbs:\n", " rightDeps = {tok.lower_ for tok in verb.rights}\n", " if \"and\" in rightDeps:\n", " moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == \"VERB\"])\n", " if len(moreVerbs) > 0:\n", " moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))\n", " return moreVerbs\n", "\n", "def findSubs(tok):\n", " head = tok.head\n", " while head.pos_ != \"VERB\" and head.pos_ != \"NOUN\" and head.head != head:\n", " head = head.head\n", " if head.pos_ == \"VERB\":\n", " subs = [tok for tok in head.lefts if tok.dep_ == \"SUB\"]\n", " if len(subs) > 0:\n", " verbNegated = isNegated(head)\n", " subs.extend(getSubsFromConjunctions(subs))\n", " return subs, verbNegated\n", " elif head.head != head:\n", " return findSubs(head)\n", " elif head.pos_ == \"NOUN\":\n", " return [head], isNegated(tok)\n", " return [], False\n", "\n", "def isNegated(tok):\n", " negations = {\"no\", \"not\", \"n't\", \"never\", \"none\"}\n", " for dep in list(tok.lefts) + list(tok.rights):\n", " if dep.lower_ in negations:\n", " return True\n", " return False\n", "\n", "def findSVs(tokens):\n", " svs = []\n", " verbs = [tok for tok in tokens if tok.pos_ == \"VERB\"]\n", " for v in verbs:\n", " subs, verbNegated = getAllSubs(v)\n", " if len(subs) > 0:\n", " for sub in subs:\n", " svs.append((sub.orth_, \"!\" + v.orth_ if verbNegated else v.orth_))\n", " return svs\n", "\n", "def getObjsFromPrepositions(deps):\n", " objs = []\n", " for dep in deps:\n", " if dep.pos_ == \"ADP\" and dep.dep_ == \"prep\":\n", " objs.extend([tok for tok in dep.rights if tok.dep_ in OBJECTS or (tok.pos_ == \"PRON\" and tok.lower_ == \"me\")])\n", " return objs\n", "\n", "def getObjsFromAttrs(deps):\n", " for dep in deps:\n", " if dep.pos_ == \"NOUN\" and dep.dep_ == \"attr\":\n", " verbs = [tok for tok in dep.rights if tok.pos_ == \"VERB\"]\n", " if len(verbs) > 0:\n", " for v in verbs:\n", " rights = list(v.rights)\n", " objs = [tok for tok in rights if tok.dep_ in OBJECTS]\n", " objs.extend(getObjsFromPrepositions(rights))\n", " if len(objs) > 0:\n", " return v, objs\n", " return None, None\n", "\n", "def getObjFromXComp(deps):\n", " for dep in deps:\n", " if dep.pos_ == \"VERB\" and 
dep.dep_ == \"xcomp\":\n", " v = dep\n", " rights = list(v.rights)\n", " objs = [tok for tok in rights if tok.dep_ in OBJECTS]\n", " objs.extend(getObjsFromPrepositions(rights))\n", " if len(objs) > 0:\n", " return v, objs\n", " return None, None\n", "\n", "def getAllSubs(v):\n", " verbNegated = isNegated(v)\n", " subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != \"DET\"]\n", " if len(subs) > 0:\n", " subs.extend(getSubsFromConjunctions(subs))\n", " else:\n", " foundSubs, verbNegated = findSubs(v)\n", " subs.extend(foundSubs)\n", " return subs, verbNegated\n", "\n", "def getAllObjs(v):\n", " # rights is a generator\n", " rights = list(v.rights)\n", " objs = [tok for tok in rights if tok.dep_ in OBJECTS]\n", " objs.extend(getObjsFromPrepositions(rights))\n", "\n", " #potentialNewVerb, potentialNewObjs = getObjsFromAttrs(rights)\n", " #if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:\n", " # objs.extend(potentialNewObjs)\n", " # v = potentialNewVerb\n", "\n", " potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)\n", " if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:\n", " objs.extend(potentialNewObjs)\n", " v = potentialNewVerb\n", " if len(objs) > 0:\n", " objs.extend(getObjsFromConjunctions(objs))\n", " return v, objs\n", "\n", "def findSVOs(tokens, output=\"str\"):\n", " svos = []\n", " # verbs = [tok for tok in tokens if tok.pos_ == \"VERB\" and tok.dep_ != \"aux\"]\n", " verbs = [tok for tok in tokens if tok.dep_ != \"AUX\"]\n", " for v in verbs:\n", " subs, verbNegated = getAllSubs(v)\n", " # hopefully there are subs, if not, don't examine this verb any longer\n", " if len(subs) > 0:\n", " v, objs = getAllObjs(v)\n", " for sub in subs:\n", " for obj in objs:\n", " objNegated = isNegated(obj)\n", " \n", " if output is \"str\":\n", " element = (\n", " sub.lower_, \"!\" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_\n", " )\n", " elif output is \"obj\":\n", " element = (sub, (v, verbNegated or objNegated), obj)\n", " \n", " svos.append(element)\n", " return svos\n", "\n", "def getAbuserOntoVictimSVOs(tokens):\n", " maleAbuser = {'he', 'boyfriend', 'bf', 'father', 'dad', 'husband', 'brother', 'man'}\n", " femaleAbuser = {'she', 'girlfriend', 'gf', 'mother', 'mom', 'wife', 'sister', 'woman'}\n", " neutralAbuser = {'pastor', 'abuser', 'offender', 'ex', 'x', 'lover', 'church', 'they'}\n", " victim = {'me', 'sister', 'brother', 'child', 'kid', 'baby', 'friend', 'her', 'him', 'man', 'woman'}\n", "\n", " svos = findSVOs(tokens)\n", " wnl = WordNetLemmatizer()\n", " passed = []\n", " for s, v, o in svos:\n", " s = wnl.lemmatize(s)\n", " v = \"!\" + wnl.lemmatize(v[1:], 'v') if v[0] == \"!\" else wnl.lemmatize(v, 'v')\n", " o = \"!\" + wnl.lemmatize(o[1:]) if o[0] == \"!\" else wnl.lemmatize(o)\n", " if s in maleAbuser.union(femaleAbuser).union(neutralAbuser) and o in victim:\n", " passed.append((s, v, o))\n", " return passed\n", "\n", "def printDeps(toks):\n", " for tok in toks:\n", " print(tok.orth_, tok.dep_, tok.pos_, tok.head.orth_, [t.orth_ for t in tok.lefts], [t.orth_ for t in tok.rights])\n", "\n", "def testSVOs():\n", " tok = nlp(\"making $12 an hour? where am i going to go? 
i have no other financial assistance available and he certainly won't provide support.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')}\n", " print(svos)\n", "\n", " tok = nlp(\"i don't have other assistance\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " assert set(svos) == {('i', '!have', 'assistance')}\n", "\n", " print(\"-----------------------------------------------\")\n", " tok = nlp(\"They ate the pizza with anchovies.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('they', 'ate', 'pizza')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"I have no other financial assistance available and he certainly won't provide support.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"I have no other financial assistance available, and he certainly won't provide support.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('i', '!have', 'assistance'), ('he', '!provide', 'support')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he did not kill me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', '!kill', 'me')}\n", "\n", " #print(\"--------------------------------------------------\")\n", " #tok = nlp(\"he is an evil man that hurt my child and sister\")\n", " #svos = findSVOs(tok)\n", " #printDeps(tok)\n", " #print(svos)\n", " #assert set(svos) == {('he', 'hurt', 'child'), ('he', 'hurt', 'sister'), ('man', 'hurt', 'child'), ('man', 'hurt', 'sister')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he told me i would die alone with nothing but my career someday\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'told', 'me')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"I wanted to kill him with a hammer.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('i', 'kill', 'him')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"because he hit me and also made me so angry i wanted to kill him with a hammer.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'hit', 'me'), ('i', 'kill', 'him')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he and his brother shot me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'shot', 'me'), ('brother', 'shot', 'me')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he and his brother shot me and my sister\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'shot', 'me'), ('he', 'shot', 'sister'), ('brother', 'shot', 'me'), ('brother', 'shot', 'sister')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"the annoying person that was my boyfriend hit me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert 
set(svos) == {('person', 'was', 'boyfriend'), ('person', 'hit', 'me')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"the boy raced the girl who had a hat that had spots.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', 'had', 'spots')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he spit on me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'spit', 'me')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he didn't spit on me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', '!spit', 'me')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"the boy raced the girl who had a hat that didn't have spots.\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('boy', 'raced', 'girl'), ('who', 'had', 'hat'), ('hat', '!have', 'spots')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he is a nice man that didn't hurt my child and sister\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', 'is', 'man'), ('man', '!hurt', 'child'), ('man', '!hurt', 'sister')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he didn't spit on me and my child\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " assert set(svos) == {('he', '!spit', 'me'), ('he', '!spit', 'child')}\n", "\n", " print(\"--------------------------------------------------\")\n", " tok = nlp(\"he beat and hurt me\")\n", " svos = findSVOs(tok)\n", " printDeps(tok)\n", " print(svos)\n", " # tok = nlp(\"he beat and hurt me\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus[\"triplets\"] = corpus[\"parsed\"].apply(lambda x: findSVOs(x, output=\"obj\"))\n", "corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "edge_list = [\n", " {\"id\": _id, \"source\": source.lemma_.lower(), \"target\": target.lemma_.lower(), \"edge\": edge.lemma_.lower()}\n", " for _id, triplets in corpus[\"triplets\"].iteritems()\n", " for (source, (edge, neg), target) in triplets\n", "]\n", "\n", "edges = pd.DataFrame(edge_list)\n", "edges.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "edges[\"edge\"].value_counts().head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G=nx.from_pandas_edgelist(edges, \"source\", \"target\", \n", " edge_attr=True, create_using=nx.MultiDiGraph())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(nx.info(G))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.log10(pd.Series({k: v for k, v in nx.degree(G)}).sort_values(ascending=False)).hist()\n", "plt.yscale(\"log\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "e = edges[(edges[\"source\"]!=\" \") & (edges[\"target\"]!=\" \") & (edges[\"edge\"]==\"lend\")]\n", "G=nx.from_pandas_edgelist(e, \"source\", \"target\", \n", " 
edge_attr=True, create_using=nx.MultiDiGraph())\n", "\n", "plt.figure(figsize=(13, 6))\n", "pos = nx.spring_layout(G, k=1.2) # k regulates the distance between nodes\n", "nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos, font_size=12)\n", "plt.savefig(\"KnowledgeGraph.png\", dpi=300, format=\"png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bipartite Graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Keyword extraction" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "text = corpus[\"clean_text\"][0]\n", "keywords(text, words=10, split=True, scores=True, pos_filter=('NN', 'JJ'), lemmatize=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus[\"keywords\"] = corpus[\"clean_text\"].apply(\n", "    lambda text: keywords(text, words=10, split=True, scores=True, pos_filter=('NN', 'JJ'), lemmatize=True)\n", ")\n", "corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def extractEntities(ents, minValue=1, typeFilters=[\"GPE\", \"ORG\", \"PERSON\"]):\n", "    entities = pd.DataFrame([\n", "        {\"lemma\": e.lemma_, \"lower\": e.lemma_.lower(), \"type\": e.label_}\n", "        for e in ents if hasattr(e, \"label_\")\n", "    ])\n", "\n", "    if len(entities)==0:\n", "        return pd.DataFrame()\n", "\n", "    g = entities.groupby([\"type\", \"lower\"])\n", "\n", "    summary = pd.concat({\n", "        \"alias\": g.apply(lambda x: x[\"lemma\"].unique()), \n", "        \"count\": g[\"lower\"].count()\n", "    }, axis=1)\n", "\n", "    return summary[summary[\"count\"]>minValue].loc[pd.IndexSlice[typeFilters, :, :]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def getOrEmpty(parsed, _type):\n", "    try:\n", "        return list(parsed.loc[_type][\"count\"].sort_values(ascending=False).to_dict().items())\n", "    except Exception:\n", "        return []" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def toField(ents):\n", "    typeFilters=[\"GPE\", \"ORG\", \"PERSON\"]\n", "    parsed = extractEntities(ents, 1, typeFilters)\n", "    return pd.Series({_type: getOrEmpty(parsed, _type) for _type in typeFilters})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entities = corpus[\"parsed\"].apply(lambda x: toField(x.ents))\n", "merged = pd.concat([corpus, entities], axis=1)\n", "merged.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Entity-entity graph projection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "edges = pd.DataFrame([\n", "    {\"source\": _id, \"target\": keyword, \"weight\": score, \"type\": _type}\n", "    for _id, row in merged.iterrows()\n", "    for _type in [\"keywords\", \"GPE\", \"ORG\", \"PERSON\"]\n", "    for (keyword, score) in row[_type]\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G = nx.Graph()\n", "G.add_nodes_from(edges[\"source\"].unique(), bipartite=0)\n", "G.add_nodes_from(edges[\"target\"].unique(), bipartite=1)\n", "G.add_edges_from([\n", "    (row[\"source\"], row[\"target\"])\n", "    for _, row in edges.iterrows()\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "document_nodes = {n for n, d in G.nodes(data=True) if d[\"bipartite\"] == 0}\n", "entity_nodes = {n for 
n, d in G.nodes(data=True) if d[\"bipartite\"] == 1}\n", "nodes_with_low_degree = {n for n, d in nx.degree(G, nbunch=entity_nodes) if d<5}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(nx.info(G))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "> Warning: Following cell will take 30-40 mins to run." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dimensions = 10\n", "window = 20\n", "\n", "node2vec = Node2Vec(G, dimensions=dimensions) \n", "model = node2vec.fit(window=window) \n", "embeddings = model.wv \n", "\n", "pd.DataFrame(embeddings.vectors, index=embeddings.index2word)\\\n", " .to_pickle(f\"bipartiteGraphEmbeddings_{dimensions}_{window}.p\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "edges.to_pickle('bipartiteEdges.p')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "subGraph = G.subgraph(set(G.nodes) - nodes_with_low_degree)\n", "entityGraph = overlap_weighted_projected_graph(\n", " subGraph, \n", " {n for n, d in subGraph.nodes(data=True) if d[\"bipartite\"] == 1}\n", ")\n", "print(nx.info(entityGraph))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filteredEntityGraph = entityGraph.edge_subgraph(\n", " [edge for edge in entityGraph.edges if entityGraph.edges[edge][\"weight\"]>0.05]\n", ")\n", "print(nx.info(filteredEntityGraph))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Local and global properties of the graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "globalKpis = [{\n", " \"shortest_path\": nx.average_shortest_path_length(_graph),\n", " \"clustering_coefficient\": nx.average_clustering(_graph),\n", " \"global_efficiency\": nx.global_efficiency(_graph)\n", "} for components in nx.connected_components(filteredEntityGraph) \n", " for _graph in [nx.subgraph(filteredEntityGraph, components)]]\n", " \n", "pd.concat([\n", " pd.DataFrame(globalKpis), \n", " pd.Series([len(c) for c in nx.connected_components(filteredEntityGraph)])\n", "], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "globalKpis[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "betweeness = nx.betweenness_centrality(filteredEntityGraph)\n", "_betweeness = pd.Series(betweeness)\n", "pageRanks = pd.Series(nx.pagerank(filteredEntityGraph))\n", "degrees = pd.Series({k: v for k, v in nx.degree(filteredEntityGraph)})\n", "\n", "kpis = pd.concat({\n", " \"pageRank\": pageRanks, \n", " \"degrees\": degrees, \n", " \"betweeness\": _betweeness\n", "}, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plotDistribution(serie: pd.Series, nbins: int, minValue=None, maxValue=None):\n", " _minValue=int(np.floor(np.log10(minValue if minValue is not None else serie.min())))\n", " _maxValue=int(np.ceil(np.log10(maxValue if maxValue is not None else serie.max())))\n", " bins = [0] + list(np.logspace(_minValue, _maxValue, nbins)) + [np.inf]\n", " serie.hist(bins=bins)\n", " plt.xscale(\"log\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 5))\n", "\n", "plt.subplot(1,2,1)\n", "plt.title(\"Page rank vs degrees\")\n", "plt.plot(kpis[\"pageRank\"], 
kpis[\"degrees\"], '.', color=\"tab:blue\")\n", "plt.xlabel(\"page rank\")\n", "plt.ylabel(\"degree\")\n", "plt.xscale(\"log\")\n", "plt.yscale(\"log\")\n", "\n", "plt.subplot(1,2,2)\n", "plt.title(\"Page rank vs betweeness\")\n", "plt.plot(kpis[\"pageRank\"], kpis[\"betweeness\"], '.', color=\"tab:blue\")\n", "plt.xlabel(\"page rank\")\n", "plt.ylabel(\"betweeness\")\n", "plt.xscale(\"log\")\n", "plt.yscale(\"log\")\n", "plt.ylim([1E-5, 2E-2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 5))\n", "\n", "plt.subplot(1,2,1)\n", "plotDistribution(degrees, 13)\n", "plt.yscale(\"log\")\n", "plt.title(\"Degree Distribution\")\n", "\n", "plt.subplot(1,2,2)\n", "plotDistribution(allEdgesWeights, 20)\n", "plt.xlim([1E-2, 10])\n", "plt.yscale(\"log\")\n", "plt.title(\"Edge Weight Distribution\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Network visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Create network layout for visualizations\n", "spring_pos = nx.spring_layout(filteredEntityGraph)\n", "\n", "default_edge_color = 'gray'\n", "default_node_color = '#407cc9'\n", "enhanced_node_color = '#f5b042'\n", "enhanced_edge_color = '#cc2f04'\n", "\n", "plt.axis(\"off\")\n", "nx.draw_networkx(filteredEntityGraph, pos=spring_pos, node_color=default_node_color, \n", " edge_color=default_edge_color, with_labels=False, node_size=15)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Community detection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "communities = pd.Series(community_louvain.best_partition(filteredEntityGraph))\n", "communities.value_counts().sort_values(ascending=False).plot(kind=\"bar\", figsize=(12, 5))\n", "plt.xlabel(\"Community\")\n", "plt.ylabel(\"# Members\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nodes = communities[communities==17].index\n", "nodes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "smallGrap = nx.subgraph(filteredEntityGraph, nbunch=nodes)\n", "\n", "plt.figure(figsize=(10,10))\n", "pos = nx.spring_layout(smallGrap) # k regulates the distance between nodes\n", "nx.draw(smallGrap, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)\n", "plt.savefig(\"CloseUp.png\", dpi=300, format=\"png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bipartiteCloseup = subGraph.edge_subgraph( {e for e in subGraph.edges() if len(set(e).intersection(nodes))>0})\n", "deg = nx.degree(bipartiteCloseup)\n", "smallGrap = nx.subgraph(bipartiteCloseup, {n for n, d in bipartiteCloseup.nodes(data=True) if d[\"bipartite\"]==1 or deg[n]>1})\n", "\n", "plt.figure(figsize=(10,10))\n", "pos = nx.kamada_kawai_layout(smallGrap) # k regulates the distance between nodes\n", "node_color = [\"skyblue\" if d[\"bipartite\"]==1 else \"red\" for n, d in smallGrap.nodes(data=True)]\n", "nx.draw(smallGrap, with_labels=False, node_color=node_color, #'skyblue', \n", " node_size=150, edge_cmap=plt.cm.Blues, pos = pos)\n", "plt.savefig(\"BipartiteCloseUp.png\", dpi=300, format=\"png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "node2vec = Node2Vec(filteredEntityGraph, dimensions=5) \n", "model = node2vec.fit(window=10) \n", "embeddings = model.wv \n", "\n", "tsne=TSNE(n_components=2)\n", "embedding2d=tsne.fit_transform(embeddings.vectors)\n", "\n", "plt.plot(embedding2d[:, 0], embedding2d[:, 1], 'o')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Node2Vec allows also to compute a similarity between entities\n", "embeddings.most_similar(positive=[\"turkey\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Document-document graph projection" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documentGraph = overlap_weighted_projected_graph(G, {n for n, d in G.nodes(data=True) if d[\"bipartite\"] == 0})\n", "print(nx.info(documentGraph))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "allEdgesWeights = pd.Series({(d[0], d[1]): d[2][\"weight\"] for d in documentGraph.edges(data=True)})\n", "filteredDocumentGraph = documentGraph.edge_subgraph(\n", " allEdgesWeights[(allEdgesWeights>0.6)].index.tolist()\n", ")\n", "print(nx.info(filteredDocumentGraph))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Network visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spring_pos = nx.spring_layout(filteredDocumentGraph)\n", "\n", "plt.axis(\"off\")\n", "nx.draw_networkx(filteredDocumentGraph, pos=spring_pos, node_color=default_node_color, \n", " edge_color=default_edge_color, with_labels=False, node_size=15)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "components = pd.Series({ith: component \n", " for ith, component in enumerate(nx.connected_components(filteredDocumentGraph))})\n", "\n", "coreDocumentGraph = nx.subgraph(\n", " filteredDocumentGraph,\n", " [node for nodes in components[components.apply(len)>8].values for node in nodes]\n", ")\n", "\n", "print(nx.info(coreDocumentGraph))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spring_pos = nx.spring_layout(coreDocumentGraph)\n", "\n", "plt.axis(\"off\")\n", "nx.draw_networkx(coreDocumentGraph, pos=spring_pos, node_color=default_node_color, \n", " edge_color=default_edge_color, with_labels=False, node_size=15)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Community Detection and Topics Clustering" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "communities = pd.Series(community_louvain.best_partition(coreDocumentGraph))\n", "communities = pd.Series(community_louvain.best_partition(filteredDocumentGraph))\n", "\n", "def getTopicRatio(df):\n", " return Counter([label for labels in df[\"label\"] for label in labels])\n", "\n", "communityTopics = pd.DataFrame.from_dict({\n", " cid: getTopicRatio(corpus.loc[comm.index])\n", " for cid, comm in communities.groupby(communities)\n", "}, orient=\"index\")\n", "\n", "normalizedCommunityTopics = (communityTopics.T / communityTopics.sum(axis=1)).T\n", "\n", "topicsCorrelation = normalizedCommunityTopics.corr().fillna(0)\n", "topicsCorrelation[topicsCorrelation<0.8]=0\n", "\n", "topicsGraph = nx.from_pandas_adjacency(topicsCorrelation)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8,8))\n", "pos 
= nx.spring_layout(topicsGraph, k=0.35) # k regulates the distance between nodes\n", "nx.draw(topicsGraph, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)\n", "plt.savefig(\"TopicsAll.png\", dpi=300, format=\"png\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "filteredTopicsGraph = nx.subgraph(\n", " topicsGraph,\n", " [node for component in nx.connected_components(topicsGraph) if len(component)>3 for node in component]\n", ")\n", "\n", "plt.figure(figsize=(8,8))\n", "pos = nx.kamada_kawai_layout(filteredTopicsGraph) # k regulates the distance between nodes\n", "nx.draw(filteredTopicsGraph, with_labels=True, node_color='skyblue', node_size=1500, \n", " edge_cmap=plt.cm.Blues, pos = pos)\n", "plt.savefig(\"TopicsCore.png\", dpi=300, format=\"png\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "node2vec = Node2Vec(coreDocumentGraph, dimensions=20) \n", "model = node2vec.fit(window=10) \n", "embeddings = model.wv \n", "\n", "tsne=TSNE(n_components=2)\n", "embedding2d=tsne.fit_transform(embeddings.vectors)\n", "\n", "plt.plot(embedding2d[:, 0], embedding2d[:, 1], 'o')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pd.DataFrame(embeddings.vectors, index=embeddings.index2word)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Shallow-Learning Topic Modelling" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following we will create a topic model, using a shallow-learning approach. Here we will use the results and the embeddings obtained from the document-document projection of the bipartite graph." 
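] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Before moving on, the short cell below (a sketch, not part of the original flow) reloads one of the embedding files saved earlier (named like `bipartiteGraphEmbeddings_10_20.p`) and checks its shape and index, so that it can be aligned with the corpus in the pipeline that follows." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sanity-check a previously saved embedding file (assumes the node2vec cells above were run)\n", "import pandas as pd\n", "from glob import glob\n", "\n", "embedding_files = glob(\"./bipartiteGraphEmbeddings*\")\n", "print(embedding_files)\n", "if embedding_files:\n", "    sample_embeddings = pd.read_pickle(embedding_files[0])\n", "    print(sample_embeddings.shape)\n", "    print(sample_embeddings.head(3))"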
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from glob import glob\n", "from collections import Counter\n", "\n", "from sklearn.base import BaseEstimator\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.ensemble import RandomForestClassifier \n", "from sklearn.multioutput import MultiOutputClassifier\n", "from sklearn.model_selection import GridSearchCV\n", "from sklearn.model_selection import RandomizedSearchCV\n", "\n", "from sklearn.metrics import f1_score \n", "from sklearn.metrics import classification_report" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = pd.read_pickle(\"corpus.p\")\n", "corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "topics = Counter([label for document_labels in corpus[\"label\"] for label in document_labels]).most_common(10)\n", "topics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "topicsList = [topic[0] for topic in topics]\n", "topicsSet = set(topicsList)\n", "dataset = corpus[corpus[\"label\"].apply(lambda x: len(topicsSet.intersection(x))>0)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create a class to \"simulate\" the training of the embeddings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class EmbeddingsTransformer(BaseEstimator):\n", " \n", " def __init__(self, embeddings_file):\n", " self.embeddings_file = embeddings_file\n", " \n", " def fit(self, *args, **kwargs):\n", " self.embeddings = pd.read_pickle(self.embeddings_file)\n", " return self\n", " \n", " def transform(self, X):\n", " return self.embeddings.loc[X.index]\n", " \n", " def fit_transform(self, X, y):\n", " return self.fit().transform(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "files = glob(\"./bipartiteGraphEmbeddings*\")\n", "files" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "graphEmbeddings = EmbeddingsTransformer(files[0]).fit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train/Test split" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_labels(corpus, topicsList=topicsList):\n", " return corpus[\"label\"].apply(\n", " lambda labels: pd.Series({label: 1 for label in labels}).reindex(topicsList).fillna(0)\n", " )[topicsList]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_features(corpus):\n", " return corpus[\"parsed\"] #graphEmbeddings.transform(corpus[\"parsed\"])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_features_and_labels(corpus):\n", " return get_features(corpus), get_labels(corpus)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_test_split(corpus):\n", " graphIndex = [index for index in corpus.index if index in graphEmbeddings.embeddings.index]\n", " \n", " train_idx = [idx for idx in graphIndex if \"training/\" in idx]\n", " test_idx = [idx for idx in graphIndex if \"test/\" in idx]\n", " return corpus.loc[train_idx], corpus.loc[test_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train, test = train_test_split(dataset)" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "### Build the model and cross-validation " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = MultiOutputClassifier(RandomForestClassifier())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline([\n", " (\"embeddings\", graphEmbeddings),\n", " (\"model\", model)\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "param_grid = {\n", " \"embeddings__embeddings_file\": files,\n", " \"model__estimator__n_estimators\": [50, 100], \n", " \"model__estimator__max_features\": [0.2,0.3, \"auto\"], \n", " #\"model__estimator__max_depth\": [3, 5]\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "features, labels = get_features_and_labels(train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, \n", " scoring=lambda y_true, y_pred: f1_score(y_true, y_pred,average='weighted'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = grid_search.fit(features, labels)\n", "model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.best_params_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Evaluate performance " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_predictions(model, features):\n", " return pd.DataFrame(\n", " model.predict(features), \n", " columns=topicsList, \n", " index=features.index\n", " )" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "preds = get_predictions(model, get_features(test))\n", "labels = get_labels(test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "errors = 1 - (labels - preds).abs().sum().sum() / labels.abs().sum().sum()\n", "errors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(classification_report(labels, preds))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Graph Neural Network Topic Classifier [TODO]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In the following we will focus on building a model for topic classification based on a Graph Neural Network approach.\n", "\n", "In particular in the following we will learn how to:\n", "\n", "* Create a TF-IDF representation of the corpus, that will be used as node features in the Graph Neural Network model \n", "* Build, train a Graph Neural Network model and identify the best threshold for classifying documents \n", "* Test the performance of the model in a out-of-sample tests, following a truly inductive approach " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import nltk \n", "import numpy as np\n", "import pandas as pd\n", "import networkx as nx\n", "from collections import Counter\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import f1_score, classification_report\n", "\n", "import stellargraph as sg\n", "from stellargraph import StellarGraph, IndexedArray\n", "from stellargraph.mapper import 
GraphSAGENodeGenerator\n", "from stellargraph.layer import GraphSAGE\n", "from stellargraph.data import EdgeSplitter\n", "from stellargraph.mapper import HinSAGENodeGenerator\n", "from stellargraph.layer import HinSAGE\n", "\n", "from tensorflow.keras import layers, optimizers, losses, metrics, Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus = pd.read_pickle(\"corpus.p\")\n", "corpus.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "topics = Counter([label for document_labels in corpus[\"label\"] for label in document_labels]).most_common(10)\n", "topics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "topicsList = [topic[0] for topic in topics]\n", "topicsSet = set(topicsList)\n", "dataset = corpus[corpus[\"label\"].apply(lambda x: len(topicsSet.intersection(x))>0)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_labels(corpus, topicsList=topicsList):\n", " return corpus[\"label\"].apply(\n", " lambda labels: pd.Series({label: 1 for label in labels}).reindex(topicsList).fillna(0)\n", " )[topicsList]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "labels = get_labels(dataset)\n", "labels.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_features(corpus):\n", " return corpus[\"parsed\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_features_and_labels(corpus):\n", " return get_features(corpus), get_labels(corpus)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_test_split(corpus):\n", " train_idx = [idx for idx in corpus.index if \"training/\" in idx]\n", " test_idx = [idx for idx in corpus.index if \"test/\" in idx]\n", " return corpus.loc[train_idx], corpus.loc[test_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train, test = train_test_split(dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def my_spacy_tokenizer(pos_filter=[\"NOUN\", \"VERB\", \"PROPN\"]):\n", " def tokenizer(doc):\n", " return [token.lemma_ for token in doc if (pos_filter is None) or (token.pos_ in pos_filter)] \n", " return tokenizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cntVectorizer = TfidfVectorizer(\n", " analyzer=my_spacy_tokenizer(),\n", " max_df = 0.25, min_df = 2, max_features = 10000\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainFeatures, _ = get_features_and_labels(train)\n", "testFeatures, _ = get_features_and_labels(test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trainedTransformed = cntVectorizer.fit_transform(trainFeatures)\n", "testTransformed = cntVectorizer.transform(testFeatures)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "features = pd.concat([\n", " pd.DataFrame.sparse.from_spmatrix(trainedTransformed, index=trainFeatures.index), \n", " pd.DataFrame.sparse.from_spmatrix(testTransformed, index=testFeatures.index)\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
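"source": [ "# Sketch (not part of the original notebook): inspect the TF-IDF node features by listing\n", "# the highest-weighted terms of the first training document. get_feature_names() is\n", "# available in scikit-learn 0.24, the version pinned above.\n", "vocab = cntVectorizer.get_feature_names()\n", "first_doc = pd.Series(trainedTransformed[0].toarray().ravel(), index=vocab)\n", "first_doc.sort_values(ascending=False).head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],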
"source": [ "features.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating the Graph" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "edges = pd.read_pickle(\"bipartiteEdges.p\")\n", "entityTypes = {entity: ith for ith, entity in enumerate(edges[\"type\"].unique())}\n", "entityTypes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "documentFeatures = features.loc[set(corpus.index).intersection(features.index)] #.assign(document=1, entity=0)\n", "documentFeatures.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entities = edges.groupby([\"target\", \"type\"])[\"source\"].count().groupby(level=0).apply(\n", " lambda s: s.droplevel(0).reindex(entityTypes.keys()).fillna(0)\n", ").unstack(level=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "entityFeatures = (entities.T / entities.sum(axis=1)).T.assign(document=0, entity=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nodes = {\"entity\": entityFeatures, \n", " \"document\": documentFeatures}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stellarGraph = StellarGraph(nodes, \n", " edges[edges[\"source\"].isin(documentFeatures.index)], \n", " edge_type_column=\"type\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(stellarGraph.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "splitter = EdgeSplitter(stellarGraph)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "graphTest, samplesTest, labelsTest = splitter.train_test_split(p=0.2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(stellarGraph.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(graphTest.info())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Creating a Topic Classification Model " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by splitting the data into train, validation and test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "targets = labels.reindex(documentFeatures.index).fillna(0)\n", "#documentFeatures.drop([\"entity\", \"document\"], axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "targets.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_test_split(corpus):\n", " graphIndex = [index for index in corpus.index]\n", " \n", " train_idx = [idx for idx in graphIndex if \"training/\" in idx]\n", " test_idx = [idx for idx in graphIndex if \"test/\" in idx]\n", " return corpus.loc[train_idx], corpus.loc[test_idx]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sampled, hold_out = train_test_split(targets)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "allNeighbors = np.unique([n for node in sampled.index for n in stellarGraph.neighbors(node)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "subgraph = 
stellarGraph.subgraph(set(sampled.index).union(allNeighbors))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(subgraph.info())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the name train_test_split was rebound to the custom helper above, so we use the\n", "# scikit-learn splitter explicitly for this random train/validation/test split\n", "from sklearn.model_selection import train_test_split as sk_train_test_split\n", "\n", "train, leftOut = sk_train_test_split(\n", "    sampled,\n", "    train_size=0.1,\n", "    test_size=None,\n", "    random_state=42,\n", ")\n", "\n", "validation, test = sk_train_test_split(\n", "    leftOut, train_size=0.2, test_size=None, random_state=100,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "validation = validation[validation.sum(axis=1) > 0]\n", "test = test[test.sum(axis=1) > 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(f\"Validation: {validation.shape}\")\n", "print(f\"Test: {test.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training the Model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We start by creating the model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "batch_size = 50\n", "num_samples = [10, 5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "generator = HinSAGENodeGenerator(subgraph, batch_size, num_samples, head_node_type=\"document\")\n", "graphsage_model = HinSAGE(\n", "    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_inp, x_out = graphsage_model.in_out_tensors()\n", "prediction = layers.Dense(units=train.shape[1], activation=\"sigmoid\")(x_out)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "prediction.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = Model(inputs=x_inp, outputs=prediction)\n", "model.compile(\n", "    optimizer=optimizers.Adam(lr=0.005),\n", "    loss=losses.binary_crossentropy,\n", "    metrics=[\"acc\"],\n", ")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now train the model." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_gen = generator.flow(train.index, train, shuffle=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "val_gen = generator.flow(validation.index, validation)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "history = model.fit(\n", "    train_gen, epochs=50, validation_data=val_gen, verbose=1, shuffle=False\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sg.utils.plot_history(history)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# calling fit again continues training from the current weights for another 50 epochs\n", "history = model.fit(\n", "    train_gen, epochs=50, validation_data=val_gen, verbose=1, shuffle=False\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sg.utils.plot_history(history)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Threshold identification" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_gen = generator.flow(test.index, test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_metrics = 
model.evaluate(test_gen)\n", "print(\"\\nTest Set Metrics:\")\n", "for name, val in zip(model.metrics_names, test_metrics):\n", "    print(\"\\t{}: {:0.4f}\".format(name, val))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_predictions = pd.DataFrame(model.predict(test_gen), index=test.index, columns=test.columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_results = pd.concat({\n", "    \"target\": test, \n", "    \"preds\": test_predictions\n", "}, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "f1s = {}\n", "\n", "for th in [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]:\n", "    f1s[th] = f1_score(test_results[\"target\"], 1.0*(test_results[\"preds\"]>th), average=\"macro\")\n", "\n", "pd.Series(f1s).plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As can be seen, a threshold of about 0.2 gives the best performance. We therefore use this value to produce the classification report." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(classification_report(test_results[\"target\"], 1.0*(test_results[\"preds\"]>0.2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Inductive Prediction" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We now make truly inductive predictions: we use the full graph, together with the threshold of 0.2 identified above as the one providing the best F1-score." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "generator = HinSAGENodeGenerator(stellarGraph, batch_size, num_samples, head_node_type=\"document\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hold_out = hold_out[hold_out.sum(axis=1) > 0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hold_out_gen = generator.flow(hold_out.index, hold_out)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "hold_out_predictions = model.predict(hold_out_gen)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "preds = pd.DataFrame(1.0*(hold_out_predictions > 0.2), index=hold_out.index, columns=hold_out.columns)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = pd.concat({\n", "    \"target\": hold_out, \n", "    \"preds\": preds\n", "}, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(classification_report(results[\"target\"], results[\"preds\"]))" ] } ], "metadata": { "colab": { "authorship_tag": "ABX9TyOMTkdWsiMcIWxyEEA6y5ov", "collapsed_sections": [], "mount_file_id": "131eF0BSAZ1UCdUtlF-8obvkScRNOuJ19", "name": "rec-tut-gml-06-tag-reuters.ipynb", "provenance": [ { "file_id": "1517zDu-LicLAVdDjYvLd1PDTwSwvXhaW", "timestamp": 1628144829587 }, { "file_id": "1f05nHjML9TqPNTz_NWt_6VVUvYlPDGM6", "timestamp": 1628078002102 }, { "file_id": "18rsHbAXudxz_EspXEEFhfLLd-MT2jbhK", "timestamp": 1628066780292 }, { "file_id": "1sAKOySokSkK8dTp6GYjmIh3AjBOT1R0J", "timestamp": 1627993731574 }, { "file_id": "1FlR0Nt00zRzrjciEpl46j51IIomB0K_p", "timestamp": 1627989061002 } ], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } 
}, "nbformat": 4, "nbformat_minor": 0 }