# Pass in a series of text documents (e.g. a pandas Series of strings).
# The ranges are decimals (e.g. .25, .50, .75) that pick cut-offs on the
# scale of distinct frequency counts.
from collections import Counter

# Tokens that are never counted as "words": English stop words plus
# punctuation tokens (including the multi-character ones NLTK emits).
_STOP_WORDS = frozenset([
    'both', 'myself', 'some', 'y', 'a', 'have', 'me', 'be', 'or', 'as',
    "shouldn't", 'by', 'but', 'they', 'you', 'same', 'yourself', 'their',
    'in', 'doesn', 'm', 'at', 'why', 'when', 'further', 'not', 'my',
    "isn't", "shan't", 'didn', 'only', 'of', 'this', 'to', 'more', 'own',
    'itself', 'ma', 'while', 's', 'theirs', 'shan', 'couldn', 'against',
    'will', 'needn', 'we', 'those', 'the', 'ain', 'ourselves', 'having',
    'most', 'such', 'i', 'that', 'through', "it's", "don't", "couldn't",
    'off', 'being', 'it', 'mustn', 'hadn', "she's", 'before', 're', 'just',
    'll', 'wouldn', 'had', "you're", "needn't", "you've", 'doing', 'she',
    'there', 'him', "wouldn't", 've', "should've", "weren't", 'than', 'do',
    'hers', 'all', "won't", 'he', 'up', 'how', 'after', 'and', 'our',
    'herself', 'few', 'does', 'his', 'can', 't', 'were', 'below', 'don',
    'about', 'isn', 'ours', 'between', 'into', 'wasn', 'has', 'o', 'am',
    'which', "you'll", 'was', "hadn't", 'during', "aren't", "doesn't", 'no',
    'because', 'each', "mustn't", 'down', 'these', 'yourselves', 'what',
    "hasn't", 'again', "didn't", "mightn't", "you'd", 'them', 'should',
    'is', 'then', 'with', 'are', "that'll", 'hasn', 'out', 'who', 'aren',
    'your', 'weren', 'whom', 'did', 'won', "haven't", 'its', 'over', 'd',
    'above', 'until', "wasn't", 'shouldn', 'on', 'been', 'her', 'where',
    'nor', 'under', 'any', 'very', 'himself', 'for', 'mightn', 'from', 'if',
    'once', 'here', 'other', 'haven', 'themselves', 'too', 'an', 'now',
    'yours', 'so',
])
_PUNCTUATION_TOKENS = frozenset([
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.',
    '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    '{', '|', '}', '~',
    "``", "''", "...",
])
_IGNORED_TOKENS = _STOP_WORDS | _PUNCTUATION_TOKENS


def _chunk_ngrams(tokens, size):
    """Join consecutive, non-overlapping groups of ``size`` tokens.

    Leftover tokens at the end that do not fill a whole group are ignored.
    The input list is never modified, so the same token list can safely be
    chunked several times with different sizes.
    """
    usable = len(tokens) - len(tokens) % size
    return [" ".join(tokens[start:start + size])
            for start in range(0, usable, size)]


def _bucket_by_frequency(counts, top_range, mid_range, low_range):
    """Split the keys of ``counts`` into ``(top, mid, low)`` lists.

    The distinct count values are sorted ascending to form a scale; each
    range fraction selects one cut-off value on that scale.  A key goes to

    * ``low`` when ``minimum < count <= low cut-off``
    * ``mid`` when ``low cut-off < count <= mid cut-off``
    * ``top`` when ``count >= top cut-off``

    so keys at the minimum count, or strictly between the mid and top
    cut-offs, may appear in no list at all.  Keys keep their first-seen
    order.  Empty ``counts`` yields three empty lists.
    """
    if not counts:
        return [], [], []

    scale = sorted(set(counts.values()))

    def cutoff(fraction):
        # Clamp so a fraction of 1.0 selects the maximum instead of
        # indexing one past the end of the scale.
        position = int(len(scale) * fraction // 1)
        return scale[min(position, len(scale) - 1)]

    minimum = scale[0]
    top_cut = cutoff(top_range)
    mid_cut = cutoff(mid_range)
    low_cut = cutoff(low_range)

    top, mid, low = [], [], []
    for key, count in counts.items():
        if minimum < count <= low_cut:
            low.append(key)
        if low_cut < count <= mid_cut:
            mid.append(key)
        if count >= top_cut:
            top.append(key)
    return top, mid, low


def gpt_nlp_prep(df_series, top_range, mid_range, low_range, tokenizer=None):
    """Bucket the bigrams, trigrams and words of a text series by frequency.

    Every document in ``df_series`` is tokenized, then

    * bigrams / trigrams are built from consecutive, non-overlapping groups
      of 2 / 3 tokens (trailing leftover tokens are ignored), and
    * words are all tokens that are not in the built-in stop-word or
      punctuation lists (the comparison is exact, hence case-sensitive).

    Each of the three vocabularies is counted across all documents and
    split into top / middle / low lists using cut-offs taken from the
    sorted scale of distinct counts (see ``_bucket_by_frequency``).

    Args:
        df_series: Iterable of strings, e.g. a pandas Series of documents.
        top_range: Fraction (e.g. ``.75``) locating the top cut-off on the
            scale of distinct counts.
        mid_range: Fraction (e.g. ``.50``) locating the middle cut-off.
        low_range: Fraction (e.g. ``.25``) locating the low cut-off.
        tokenizer: Optional callable mapping a string to an iterable of
            tokens.  Defaults to ``nltk.tokenize.word_tokenize``, which is
            imported only when no tokenizer is supplied.

    Returns:
        A dict with nine entries whose keys have the form
        ``'The top {top_range * 100}% bigram'``,
        ``'The middle {mid_range * 100}% bigram'``,
        ``'The low {low_range * 100}% bigram'`` and likewise for
        ``trigram`` and ``words``; each value is a list of strings.
        An empty ``df_series`` produces empty lists.

    Example:
        Tokenizing a series by hand looks like::

            low_pizza_ana_list = [word_tokenize(x) for x in low_pizza_ana]
    """
    if tokenizer is None:
        from nltk.tokenize import word_tokenize
        tokenizer = word_tokenize

    documents = [list(tokenizer(text)) for text in df_series]

    # Counter keeps first-seen order, which determines the order of the
    # entries inside every output list.
    vocabularies = (
        ("bigram", Counter(gram for tokens in documents
                           for gram in _chunk_ngrams(tokens, 2))),
        ("trigram", Counter(gram for tokens in documents
                            for gram in _chunk_ngrams(tokens, 3))),
        ("words", Counter(token for tokens in documents
                          for token in tokens
                          if token not in _IGNORED_TOKENS)),
    )

    top_range_output = top_range * 100
    mid_range_output = mid_range * 100
    low_range_output = low_range * 100
    labels = (
        ("top", top_range_output),
        ("middle", mid_range_output),
        ("low", low_range_output),
    )

    output_dict = {}
    for kind, counts in vocabularies:
        buckets = _bucket_by_frequency(counts, top_range, mid_range, low_range)
        for (label, percent), bucket in zip(labels, buckets):
            output_dict[f'The {label} {percent}% {kind}'] = bucket
    return output_dict