import nltk, string, sys
from nltk.text import Text
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import *

with open('military.txt') as my_dataFile:
    my_text = my_dataFile.read()

# everything in lowercase
my_text = my_text.lower()

# prepare the set of punctuation symbols
excludedPunct = set(string.punctuation)

# prepare the set of stopwords
stopWordsList = set(stopwords.words('english'))

# tokenize the text
token_text = word_tokenize(my_text)

# remove stopwords and punctuation, building a new list with a list comprehension
clean_token_text = [w for w in token_text if w not in excludedPunct and w not in stopWordsList]

# instantiate the Text class for extracting statistics
tokens_to_be_analysed = Text(token_text)

term = 'military'
print('## total number of words:', len(tokens_to_be_analysed), '\n')
print('## lexical diversity:', len(set(tokens_to_be_analysed)) / len(tokens_to_be_analysed), '\n')
print('## occurrences of the term "' + term + '":', tokens_to_be_analysed.count(term), '\n')
print('## percentage of the total number of words:', 100 * tokens_to_be_analysed.count(term) / len(tokens_to_be_analysed))

# print every concordance line (width 75, no limit on the number of lines)
print('\n## concordance of the term ' + term + ':')
tokens_to_be_analysed.concordance(term, 75, sys.maxsize)

print('\n## words appearing in contexts similar to "' + term + '":')
tokens_to_be_analysed.similar(term)

# frequency distribution of words in the cleaned corpus
# (note: this must use clean_token_text, not the full token list)
print('## frequency distribution of the 100 most common words (without stopwords and punctuation):')
fd = FreqDist(clean_token_text)
for w, f in fd.most_common(100):
    print(w, ':', f)

# compute the frequency distribution of all bigrams in the text
bgs = nltk.bigrams(tokens_to_be_analysed)
fdist = FreqDist(bgs)
print('\n## distribution of the 50 most common bigrams:\n', fdist.most_common(50), '\n')

# collocations: bigrams that appear at least twice
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens_to_be_analysed)
finder.apply_freq_filter(2)  # keep only bigrams that occur at least twice
scored = finder.score_ngrams(bigram_measures.raw_freq)  # score bigrams by raw frequency
print("\n## collocations of words that appear at least twice:\n", scored)

# rank bigrams by likelihood ratio: how strongly the two words are associated
scored2 = finder.score_ngrams(bigram_measures.likelihood_ratio)
print("\n## collocations of words that are likely to occur together:\n", scored2)
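
# A possible extension (a sketch, not part of the original analysis): besides
# score_ngrams(), NLTK's BigramCollocationFinder provides nbest(), which returns
# only the top-n bigrams for a given association measure instead of the full
# scored list. The PMI measure and the cutoff of 10 below are illustrative
# choices, not taken from the original script.
print("\n## top 10 bigrams by pointwise mutual information (PMI):")
print(finder.nbest(bigram_measures.pmi, 10))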