#!/usr/bin/env python
'''
Application to calculate the Type-Token Ratio from a speech sample.

Copyright (C) 2013 Steven C. Howell

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see https://www.gnu.org/licenses/.

Please report any issues on-line at:
https://github.com/StevenCHowell/type_token_ratio/issues
'''
from __future__ import print_function

import collections
import os
import sys
import string

# Characters stripped from every word before counting; a set gives O(1)
# membership tests while filtering.
_PUNCTUATION = frozenset(string.punctuation)

# License notice appended to the text returned by main(); it is not written
# to the report file.
_LICENSE_NOTICE = (
    '\n Copyright (C) 2013 Steven C. Howell\n'
    '\n'
    'This program is free software: you can redistribute it and/or modify\n'
    'it under the terms of the GNU General Public License as published by\n'
    'the Free Software Foundation, either version 3 of the License, or\n'
    '(at your option) any later version.\n'
    '\n'
    'This program is distributed in the hope that it will be useful,\n'
    'but WITHOUT ANY WARRANTY; without even the implied warranty of\n'
    'MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n'
    'GNU General Public License for more details.\n'
    '\n'
    'You should have received a copy of the GNU General Public License\n'
    'along with this program. If not, see https://www.gnu.org/licenses/.\n'
)


def _tokenize(lines):
    '''Return the lower-cased, punctuation-free words found in *lines*.

    Each line is split on whitespace.  Tokens that are empty once the
    characters in ``string.punctuation`` are removed (e.g. a lone "--")
    are dropped.
    '''
    words = []
    for line in lines:
        for token in line.split():
            word = ''.join(
                char for char in token.lower() if char not in _PUNCTUATION
            )
            if word:
                words.append(word)
    return words


def main(speech_sample):
    '''Compute the Type-Token Ratio of a speech sample and save a report.

    The report is written to ``<speech_sample without extension>_out.txt``
    and lists the ratio, the line and word totals, and every unique word
    with its count, first by frequency and then alphabetically.

    Parameters
    ----------
    speech_sample : str
        Path of the text file to analyze.

    Returns
    -------
    str
        The report text, preceded by the path it was saved to and followed
        by the license notice, ready to be printed.
    '''
    with open(speech_sample) as f:
        flines = f.readlines()

    # NOTE(review): every line is counted as an utterance, blank lines
    # included -- confirm that this is intended.
    n_lines = len(flines)

    words = _tokenize(flines)
    n_words = len(words)

    word_count = collections.Counter(words)
    unique_words = sorted(word_count)
    n_unique = len(unique_words)

    # An empty (or punctuation-only) sample has no tokens; report a ratio
    # of zero rather than dividing by zero.
    ttr = n_unique / float(n_words) if n_words else 0.0

    out_fname = '{}_out.txt'.format(os.path.splitext(speech_sample)[0])

    out_lines = [
        'Type-Token Ratio (U/T): {:0.4f}\n'.format(ttr),
        'Number of Utterances: {}\n'.format(n_lines),
        'Total Number of Words (T): {}\n'.format(n_words),
        'Total Number of Unique Words (U): {}\n'.format(n_unique),
        '\nUnique Words (frequency):\n',
    ]
    out_lines.extend(
        '{}\t{}\n'.format(count, word)
        for word, count in word_count.most_common()
    )
    out_lines.append('\nUnique Words (alphabetical):\n')
    out_lines.extend(
        '{}\t{}\n'.format(word_count[word], word) for word in unique_words
    )

    with open(out_fname, 'w') as out_file:
        out_file.writelines(out_lines)

    # The returned text wraps the saved report with the output location and
    # the license banner; neither is written to the report file.
    banner = '=' * 80
    header = 'output saved to: \n{}\n\n'.format(out_fname)
    return ''.join([header] + out_lines + [banner, _LICENSE_NOTICE, banner])


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(
            'usage: {} SPEECH_SAMPLE'.format(os.path.basename(sys.argv[0]))
        )
    output = main(sys.argv[1])
    print(output)