import string
from gensim import corpora
from collections import defaultdict
from scipy.spatial import distance
import numpy as np
from sklearn import manifold
from nodesAndEdges import writeNodesEdges, readLongestParagraphs
from tqdm import tqdm

npar = 30   # requested number of paragraphs per text (30 keeps the demo small)
size = []   # actual number of paragraphs per text; handed to every reader call below

# Input texts, in load order. The position of each file here must match the
# position of its entry in authorTags and novelPerAuthorTags.
sourceFiles = (
    'timeMachine.txt',                   # Herbert Wells
    'oliverTwist.txt',                   # Charles Dickens
    'adventuresOfHuckleberryFinn.txt',   # Mark Twain
    'theWarOfTheWorlds.txt',             # Herbert Wells
    'astro.txt',                         # astrophysics paper
    'brothersKaramazov.txt',             # Fyodor Dostoevsky
    'matthew.txt',                       # the last four files share author tag 6
    'mark.txt',
    'luke.txt',
    'john.txt',
)

# Collect the paragraphs of all texts into one flat list.
documents = []
for fileName in sourceFiles:
    documents += readLongestParagraphs(fileName, size, npar)

# One tag per text: equal numbers in authorTags mark texts grouped under the
# same author (both Wells novels are 1).
authorTags = [1, 2, 3, 1, 4, 5, 6, 6, 6, 6]
# Second per-text scalar; only theWarOfTheWorlds.txt (the second Wells text) is 2.
novelPerAuthorTags = [1, 1, 1, 2, 1, 1, 1, 1, 1, 1]

# Strip punctuation from every paragraph in a single pass: line breaks and
# dashes become spaces (so the words they separate stay separate), every
# other punctuation character is deleted.
cleanupTable = {ord(mark): None for mark in string.punctuation}
cleanupTable[ord('\n')] = ' '
cleanupTable[ord('-')] = ' '
# Slice assignment keeps the update in place on the existing list.
documents[:] = [paragraph.translate(cleanupTable) for paragraph in documents]

# Very common words that are dropped during tokenization.
stoplist = set('for from a of the and to in at through'.split())

# Tokenize: lower-case each paragraph, split it on whitespace and keep only
# the words that are not in the stop list.
texts = []
for document in documents:
    words = document.lower().split()
    texts.append([word for word in words if word not in stoplist])

# Tally how often each word occurs over all paragraphs together.
frequency = defaultdict(int)
for tokens in texts:
    for word in tokens:
        frequency[word] += 1

# Keep only words seen at least twice across all paragraphs.
texts = [[word for word in tokens if frequency[word] > 1] for tokens in texts]

# Build the word -> integer ID dictionary from the filtered tokens; this is
# built from scratch and does not reuse the word count computed earlier.
dictionary = corpora.Dictionary(texts)
# print(dictionary.token2id)   # uncomment to print the IDs of all words
nwords = len(dictionary.token2id)
print('built a global dictionary with', nwords, 'words')

# Sparse bag-of-words form: corpus holds one entry per paragraph, each a list
# of (wordID, wordCount) tuples.
corpus = list(map(dictionary.doc2bow, texts))

# Convert the sparse vectors to a dense count matrix with one row per
# paragraph and one column per dictionary word (length nwords).
n = sum(size)
fullCorpus = np.zeros((n, nwords), dtype=np.int32)
for row, bow in enumerate(corpus):
    # 'wordId' rather than 'id', so the builtin id() is not shadowed
    for wordId, count in bow:
        fullCorpus[row, wordId] = count

# Normalize each row by its total word count so that rows of paragraphs of
# different length are comparable. Rows with no words are skipped by the
# 'where' mask and keep the zeros they were initialized with, which avoids a
# division by zero.
numberWordsPerDocument = fullCorpus.sum(axis=1)
normalizedFullCorpus = np.zeros((n, nwords))
rowTotals = numberWordsPerDocument[:, np.newaxis]   # column vector, broadcasts over words
np.divide(fullCorpus, rowTotals, out=normalizedFullCorpus, where=rowTotals > 0)

# import matplotlib.pyplot as plt
# from matplotlib import cm
# plt.figure(figsize=(10,8))
# from scipy import ndimage
# zoomFactor = (1, 0.05)
# coarse = ndimage.zoom(normalizedFullCorpus**0.3, zoomFactor)
# plt.imshow(coarse, cmap=cm.Blues, vmin=0., interpolation='nearest')
# plt.colorbar(orientation='vertical', shrink=0.45, aspect=20)
# plt.title('Normalized word count')
# plt.xlabel('20X compressed vocabulary')
# plt.ylabel('paragraphs')
# # plt.show()
# plt.savefig('normalizedFullCorpus.png')

# Compute the symmetric matrix of pairwise Euclidean distances between
# paragraphs in nwords-dimensional space.
n = len(normalizedFullCorpus)
dist = np.zeros((n, n))
for i, d1 in enumerate(tqdm(normalizedFullCorpus)):
    # Only the upper triangle (j > i) is computed; each value is mirrored
    # into the lower triangle and the diagonal stays zero.
    for j in range(i + 1, n):
        dist[i, j] = dist[j, i] = distance.euclidean(d1, normalizedFullCorpus[j])

# Normalize the distance array so that the largest distance is 1. When every
# distance is zero (e.g. all paragraphs identical) the division is skipped:
# 0/0 would otherwise fill the matrix with NaN.
amax = np.amax(dist)
if amax > 0:
    dist /= amax

# Multidimensional scaling: place every paragraph in 3D from the precomputed
# distance matrix. The fixed random_state makes the run use the same seed
# each time.
mds = manifold.MDS(
    n_components=3,
    dissimilarity="precomputed",
    random_state=6,
    normalized_stress="auto",
)
results = mds.fit(dist)
coords = results.embedding_   # the fitted 3D positions

# Expand the per-text tags into per-paragraph lists: the tags of text number
# textIndex are repeated once for each of its size[textIndex] paragraphs.
author = []
novelPerAuthor = []
for textIndex, paragraphCount in enumerate(size):
    author.extend([authorTags[textIndex]] * paragraphCount)
    novelPerAuthor.extend([novelPerAuthorTags[textIndex]] * paragraphCount)

print(coords)
# Hand the coordinates and the two per-paragraph scalars to the project's
# writer. NOTE(review): the meaning of 'power' and the output format are
# defined in nodesAndEdges - confirm there.
writeNodesEdges(
    coords,
    scalar=[author, novelPerAuthor],
    name=['author', 'novel per author'],
    power=[1, 0.7],
    fileout='multilingual',
)
