{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "a3255fff", "metadata": {}, "outputs": [], "source": [ "from nltk import sent_tokenize, word_tokenize\n", "from nltk.stem import WordNetLemmatizer as wnl\n", "import nltk, string, glob\n", "import gensim\n", "import itertools\n", "import re\n", "import csv\n", "import scipy\n", "import warnings\n", "import numpy as np\n", "import networkx as nx\n", "warnings.simplefilter(action='ignore', category=FutureWarning)\n", "\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "\n", "#\n", "model = \"/home/jovyan/enwiki_5_ner.txt\"\n", "word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model, binary=False)\n", "\n", "#################################################\n", "# Initialize, config & define helpful functions #\n", "#################################################\n", "\n", "translator = str.maketrans('', '', string.punctuation.replace('-', '')) #filters punctuation except dash\n", "lemmatizeCondition = 1\n", "lemmatizer = wnl()\n", "stop_words = nltk.corpus.stopwords.words('english')\n", "\n", "# Function for finding index of words of interest, like 'references'\n", "\n", "def find(target):\n", " for i, word in enumerate(sents):\n", " try:\n", " j = word.index(target)\n", " except ValueError:\n", " continue\n", " yield i\n", "\n", "# Function for handling the input for gensim word2vec\n", "\n", "class FileToSent(object):\n", " def __init__(self, filename):\n", " self.filename = filename\n", "\n", " def __iter__(self):\n", " for line in open(self.filename, 'r'):\n", " ll = line.strip().split(\",\")\n", " ll = [''.join(c for c in s if c not in string.punctuation) for s in ll]\n", " ll = [num.strip() for num in ll]\n", " yield ll\n", "\n", "\n", "###################################################\n", "# Read in .txt file(s) from a specified directory #\n", "###################################################\n", "\n", "IDs = sorted(glob.glob('/home/jovyan/data/*.txt'))\n", "\n", "####################\n", "# Clean, lemmatize #\n", "####################\n", "\n", "for ID in IDs: # loop through papers\n", " print(ID)\n", " totalWords = []\n", " with open(ID, newline='') as csvfile:\n", " text = csvfile.read()\n", " text = re.sub(\"\\u2013|\\u2014\", \"-\", str(text)) # Replace em-dashes\n", " sents = sent_tokenize(text) # Split into sentences\n", " sents = [word_tokenize(s) for s in sents]\n", " sents = [[w.translate(translator) for w in s] for s in sents] # filter punctuation\n", " sents = [[re.sub(r'^[-+]?[0-9]*[\\.\\-]?[0-9]+$', 'numeric', w) for w in s] for s in sents] # replace all numerals with the holder \"number\"\n", " sents = [[w for w in s if re.search('[^a-zA-Z-0-9-]+', w) is None] for s in sents] # trips everything but alphanumeric\n", " sents = [[w.lower() for w in s] for s in sents] # make lower case\n", " sents = [s for s in sents if len(s) > 0] # remove empty lines\n", " sents = [[w for w in s if not w in stop_words] for s in sents] # filter stop words\n", " sents = [[w for w in s if len(w) > 1] for s in sents] # filters out variables, etc\n", " sents = [[w for w in s if len(w) > 2] for s in sents] # filters out variables, etc\n", " sents = [[w for w in s if len(w) > 3] for s in sents] # filters out variables and abbreviations\n", " sents = [s for s in sents if len(s) > 0] # remove empty lines\n", " words = [[lemmatizer.lemmatize(w) for w in s if lemmatizeCondition == 1] for s in sents] # lemmatize\n", " words = 
list(itertools.chain.from_iterable(words)) # join list of lists\n", " totalWords.append(words)\n", "\n", " # Write cleaned text to file\n", " with open('/home/jovyan/cleanedText.txt', 'w') as f:\n", " for _list in words:\n", " f.write(str(_list) + ' ')\n", "\n", "###############################\n", "# Construct semantic networks #\n", "###############################\n", "\n", "\"\"\"\n", "Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations)\n", "between a set of words in a gensim word2vec model.\n", "\"\"\"\n", "\n", "model = word_vectors # load\n", "\n", "# Specify words\n", "###############\n", "\n", "my_words = []\n", "text = open('/home/jovyan/cleanedText.txt').read()\n", "my_words = [word for word in word_tokenize(text)]\n", "\n", "# filter out words not in model\n", "my_words = [word for word in my_words if word in model.index_to_key]\n", "\n", "# The number of connections we want: either as a factor of the number of words or a set number\n", "num_top_conns = len(my_words) * 15\n", "\n", "# Make a list of all word-to-word distances [each as a tuple of (word1,word2,dist)]\n", "dists = []\n", "\n", "# Find similarity distances between each word pair\n", "\n", "for i1, word1 in enumerate(my_words):\n", " for i2, word2 in enumerate(my_words):\n", " if i1 >= i2: continue\n", " cosine_similarity = model.similarity(word1, word2)\n", " cosine_distance = 1 - cosine_similarity\n", " dist = (word1, word2, cosine_distance)\n", " dists.append(dist)\n", "\n", "# Sort the list by ascending distance\n", "dists.sort(key=lambda _tuple: _tuple[-1])\n", "\n", "# Get the top connections\n", "top_conns = dists[:num_top_conns]\n", "\n", "# Make a network\n", "g = nx.Graph()\n", "g.add_nodes_from(my_words)\n", "for word1, word2, dist in top_conns:\n", " weight = 1 - dist # cosine similarity for weight\n", " g.add_edge(word1, word2, weight=float(weight))\n", "\n", "# Write the network\n", "nx.write_graphml(g, \"/home/jovyan/network.graphml\") # Readable by Gephi\n", "\n", "nx.write_edgelist(g, \"/home/jovyan/edgeList.txt\", delimiter=' ', data=['weight'])\n", "\n", "text = open('/home/jovyan/edgeList.txt', 'r')\n", "lines = text.readlines()\n", "\n", "counter = 0\n", "with open('/home/jovyan/networkDynamicsLabels.txt', 'w') as f:\n", " for word1 in my_words:\n", " for line in lines:\n", " if word1 in line:\n", " for i in [i for i, x in enumerate(my_words) if x == word1]:\n", " f.write(line.replace('\\n', ' ' + str(i) + '\\n'))\n", "\n", "\n", "# A = nx.adjacency_matrix(g, nodelist=my_words, weight=float(weight))\n", "# adjmat = A.todense()\n", "#\n", "# numpy.savetxt(\"./semanticNetwork/semanticNetworkAdjmat.txt\", adjmat, delimiter=' ')\n", "#\n", "# with open('./semanticNetwork/semanticNetworkNodeLabels.txt', 'w') as f:\n", "# print(g.nodes, file=f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }