from intros import (read_txt_file_to_list, write_dict_to_csvfile) import string inputfile = "brothers_karamazov.txt" search_term = 'suddenly' def strip_punctuation(text): # strip out all punctuation characters exclude_characters = set(string.punctuation) # some add additional characters we want to strip exclude_characters = exclude_characters.union(set("“”")) for character in exclude_characters: text = text.replace(character, "") return text # read our text file into a list lines = read_txt_file_to_list(inputfile) # Join all the separate lines of text into one string text_combined = ''.join(lines) # make everything lowercase text_combined = text_combined.lower() # strip punctuation text_combined = strip_punctuation(text_combined) # Create text tokens - that is, a list of all the words in th text text_tokens = text_combined.split() # create an empty list tokens_with_search_term_preceding = [] for i in range(0, len(text_tokens) - 2): tokenpair = text_tokens[i:i + 2] if search_term == tokenpair[0]: tokens_with_search_term_preceding.append(tokenpair[1]) # find unique terms in our group of terms: tokens_set = set(tokens_with_search_term_preceding) token_counts_dict = {} # count the number of occurences for each item in set for token in tokens_set: occurrences = tokens_with_search_term_preceding.count(token) token_counts_dict[token] = occurrences outputfile = "suddenly_2grams.csv" write_dict_to_csvfile(token_counts_dict, outputfile)