#!/usr/bin/env python
'''
Application to calculate the Type-Token Ratio from a speech sample.
Copyright (C) 2013 Steven C. Howell
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see https://www.gnu.org/licenses/.
Please report any issues on-line at:
https://github.com/StevenCHowell/type_token_ratio/issues
'''
from __future__ import print_function
import collections
import os
import sys
import string
# License notice appended to the text returned by main().
_LICENSE_NOTICE = (
    '\n Copyright (C) 2013 Steven C. Howell\n'
    '\n'
    'This program is free software: you can redistribute it and/or modify\n'
    'it under the terms of the GNU General Public License as published by\n'
    'the Free Software Foundation, either version 3 of the License, or\n'
    '(at your option) any later version.\n'
    '\n'
    'This program is distributed in the hope that it will be useful,\n'
    'but WITHOUT ANY WARRANTY; without even the implied warranty of\n'
    'MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n'
    'GNU General Public License for more details.\n'
    '\n'
    'You should have received a copy of the GNU General Public License\n'
    'along with this program. If not, see https://www.gnu.org/licenses/.\n'
)


def _tokenize(lines):
    '''Return the lower-cased, punctuation-free words found in lines.'''
    punctuation = frozenset(string.punctuation)
    words = []
    for line in lines:
        for raw_word in line.split():
            word = ''.join(
                ch for ch in raw_word.lower() if ch not in punctuation)
            # tokens made up only of punctuation collapse to '' and are
            # dropped so they do not count as words
            if word:
                words.append(word)
    return words


def _format_report(ttr, n_lines, n_words, word_count):
    '''Return the list of report lines that is written to the output file.'''
    report = [
        'Type-Token Ratio (U/T): {:0.4f}\n'.format(ttr),
        'Number of Utterances: {}\n'.format(n_lines),
        'Total Number of Words (T): {}\n'.format(n_words),
        'Total Number of Unique Words (U): {}\n'.format(len(word_count)),
        '\nUnique Words (frequency):\n',
    ]
    report.extend('{}\t{}\n'.format(count, word)
                  for word, count in word_count.most_common())
    report.append('\nUnique Words (alphabetical):\n')
    report.extend('{}\t{}\n'.format(word_count[word], word)
                  for word in sorted(word_count))
    return report


def main(speech_sample):
    '''
    Calculate the Type-Token Ratio of the text file speech_sample.

    Words are the whitespace-separated tokens of the file, lower-cased and
    with every character of string.punctuation removed.  Every line of the
    file, blank or not, is counted as one utterance.  The ratio is the
    number of unique words divided by the total number of words, or 0.0
    when the file contains no words.

    The report is written to '<speech_sample without extension>_out.txt'.

    Parameters
    ----------
    speech_sample : str
        Path of the text file to analyze.

    Returns
    -------
    str
        The name of the output file, followed by the report and the
        license notice.
    '''
    with open(speech_sample) as f:
        flines = f.readlines()
    n_lines = len(flines)

    words = _tokenize(flines)
    n_words = len(words)

    # count each word
    word_count = collections.Counter(words)

    # an empty or punctuation-only sample has no words; avoid dividing by 0
    ttr = len(word_count) / float(n_words) if n_words else 0.0

    out_lines = _format_report(ttr, n_lines, n_words, word_count)

    out_fname = '{}_out.txt'.format(os.path.splitext(speech_sample)[0])
    with open(out_fname, 'w') as out_file:
        out_file.writelines(out_lines)

    # the returned text wraps the saved report with the output file name
    # and the license notice; neither is written to the output file
    banner = '=' * 80
    pieces = ['output saved to: \n{}\n\n'.format(out_fname)]
    pieces.extend(out_lines)
    pieces.extend([banner, _LICENSE_NOTICE, banner])
    return ''.join(pieces)
if __name__ == '__main__':
    # The path of the speech sample is the first command-line argument;
    # exit with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} SPEECH_SAMPLE_FILE'.format(
            os.path.basename(sys.argv[0])))
    output = main(sys.argv[1])
    print(output)