#########################################################################################
# ARPABET VOWEL ANALYZER                                                                #
#                                                                                       #
# DESCRIPTION:                                                                          #
# This script (modeled on Mietta Lennes' collect_formant_data_from_files.praat          #
# available at http://www.helsinki.fi/~lennes/praat-scripts/ and distributed under the  # 
# GNU General Public License, copyright 4/7/2003) is designed to be run on a set of     #
# soundfiles and P2FA-generated TextGrids. It extracts duration (in ms), timestamps     #
# (in s), and F1/F2/F3 from all intervals containing Arpabet vowels, and extracts       #
# labels for corresponding word and preceding & following phones, along with any notes  #
# present in the TextGrid. Last, the script outputs the analyst name, date, settings,   #
# OS, and Praat version to the results file. The script can also be constrained to a    #
# user-defined set of words using the "targets" option. Before writing the results      #
# file, words are lowercased and Arpabet is converted to Unicode IPA.                   #
#                                                                                       #
# NOTES:                                                                                #
# 1) Remember to add a / or \ (depending on the OS) to the end of all paths. 2)         #
# Soundfiles and TextGrids must have identical names in order for the script to match   #
# them up. 3) To use the "targets" option, make sure your TextGrid includes a word tier #
# and the "use word tier" option is selected, then create a tab-delimited text file     #
# containing a list of the words (case-sensitive) you'd like to extract, separated by   #
# newlines, with "word" as the column header. 4) Because this script converts Arpabet   #
# to Unicode IPA, make sure Praat's text writing preferences are set to UTF-8 before    #
# running the script. If you intend to view the results file in Excel, in Windows you   #
# will have to open it from within Excel and import it specifying UTF-8 encoding in     #
# order for the characters to display correctly. On OSX, you will first have to convert #
# the file to UTF-16 Little Endian using a text editor, then import it into Excel.      #
#                                                                                       #
# CHANGELOG:                                                                            #
# 04/18/15: Fixed bug in preceding/following phone code, fixed error in                 #
#           instructions.                                                               #
# 02/08/14: Rewrote code for results file generation, added pitch extraction option,    #
#           added metadata output to the script (analyst, version, settings, etc.).     #
# 11/06/13: Corrected ARPABET-IPA conversion (thanks Daniel).                           #
# 06/26/13: Fixed bug introduced by a recent version of Praat.                          #
# 04/27/13: Fixed error in Arpabet -> IPA conversion.                                   #
# 03/10/13: Reordered formant options, fixed preceding/following phone extraction bug.  #
# 02/22/13: Added option to append data to an existing results file.                    #
# 02/21/13: Added counter for number of vowels analyzed if using targets option, and    #
#           the option to only analyze stressed vowels.                                 #
# 02/14/13: Fixed bugs involving running the script over multiple files, and extracting #
#           surrounding phonological environment. Also simplified the extraction of     #
#           sounds from longsounds.                                                     #
# 01/19/13: Added ability to select the formant measurement points.                     #
# 01/01/13: Release version.                                                            #
#                                                                                       #
# This modified script distributed under the GNU General Public License v3 or higher,   #
# copyright 1/2013, John Riebold (riebold@uw.edu).                                      #
#########################################################################################

# PROMPT THE USER FOR THE LOCATION OF THE INPUT/OUTPUT FILES, FORMANT SETTINGS, ETC.
# HOW THE SCRIPT USES THESE FIELDS:
# - THE TWO DIRECTORY FIELDS ARE CONCATENATED DIRECTLY WITH FILE NAMES, SO THEY MUST END
#   IN A PATH SEPARATOR.
# - EVERY YES/NO OPTION MENU IS TESTED AS 1 = YES, 2 = NO.
# - MEASUREMENT_POINTS IS TESTED AS 1 = MIDPOINT ONLY, 2 = 30/50/70%, 3 = 25/50/75%,
#   4 = 20/50/80%.
# - THE PHONE AND WORD TIERS ARE ALWAYS LOOKED UP BY NAME; THE NOTES TIER IS ONLY LOOKED
#   UP WHEN USE_NOTES_TIER IS "YES".
# - THE PITCH RANGE FIELDS ARE ONLY USED WHEN EXTRACT_PITCH IS "YES".
# - INITIALS IS WRITTEN TO THE "ANALYST" COLUMN OF EVERY RESULTS LINE.
form Arpabet Vowel Analyzer
	comment Paths:
	sentence Soundfile_directory 
	sentence Textgrid_directory 
	sentence Results_file results.txt
	optionmenu Use_targets_file 2
		option yes
		option no
	sentence Targets_file targets.txt
	comment Tiers:
	sentence Phone_tier phone
	sentence Word_tier word
	optionmenu Use_notes_tier: 2
		option yes
		option no
	sentence Notes_tier notes
	comment Options:
	optionmenu Analyze_unstressed_vowels: 2
		option yes
		option no
	comment Formant settings:
	optionmenu Measurement_points: 4
		option Midpoint
		option 30%/50%/70%
		option 25%/50%/75%
		option 20%/50%/80%
	positive Maximum_formant_(Hz) 5500
	integer Number_of_formants 5
	comment Pitch Settings
	optionmenu Extract_pitch: 2
		option yes
		option no
	integer left_Pitch_range_(Hz) 75
	integer right_Pitch_range_(Hz) 500
	comment Analyst:
	sentence Initials
endform

# ADDITIONAL FORMANT ANALYSIS SETTINGS (NOT IN THE FORM), CHANGE IF NECESSARY
time_step = 0.01
window_length = 0.025
preemphasis_from = 50

# NOTES LABEL STARTS OUT EMPTY; IT IS ONLY FILLED IN WHEN THE NOTES TIER IS USED
notes_label$ = ""

# COUNTERS FOR THE END-OF-SCRIPT REPORT
target_vowel_count = 0
vowel_count = 0
sound_count = 0

# RECORD RUN DATE AND OPERATING SYSTEM FOR THE METADATA COLUMNS
rundate$ = date$ ()
if windows = 1
	os$ = "Windows"
elsif macintosh = 1
	os$ = "OSX"
elsif unix = 1
	os$ = "Linux"
endif

# FORMAT THE NUMERIC PRAAT VERSION AS DOTTED TEXT (DIGIT.DIGIT.TWO DIGITS)
version$ = replace_regex$ ("'praatVersion'", "(\d)(\d)(\d{2,2})", "\1.\2.\3", 0)

# INITIALIZE RESULTS FILE
# A HEADER IS WRITTEN UNLESS THE USER CHOOSES TO APPEND TO AN EXISTING FILE
write_header = 1
if fileReadable (results_file$)
	beginPause ("Warning")
		comment ("The file 'results_file$' already exists.")
	results_choice = endPause ("Append", "Overwrite", 1)
	if results_choice = 2
		filedelete 'results_file$'
	else
		write_header = 0
	endif
endif
if write_header = 1
	call InitializeResultsFile
endif

# OPEN TARGETS FILE AND REMEMBER THE NAME OF THE RESULTING TABLE OBJECT
if use_targets_file = 1
	Read Table from tab-separated file... 'targets_file$'
	targets$ = selected$ ("Table", 1)
endif

# CREATE LIST OF SOUNDFILES IN DIRECTORY
Create Strings as file list... list 'soundfile_directory$'*.wav
numberoffiles = Get number of strings

# COUNT SOUNDFILES THAT HAVE NO MATCHING TEXTGRID (REPORTED AT THE END)
skipped_count = 0

# BUILD THE VOWEL-MATCHING PATTERN ONCE, OUTSIDE THE LOOPS.
# A LABEL COUNTS AS A VOWEL WHEN IT IS EXACTLY ONE ARPABET VOWEL CODE FOLLOWED BY A
# STRESS DIGIT: 0, 1 OR 2 WHEN UNSTRESSED VOWELS ARE ANALYZED, OTHERWISE ONLY 1 OR 2.
# "AXR" IS LISTED BEFORE "AX" SO THE LONGER CODE IS TRIED FIRST.
arpabet_vowels$ = "AXR|AO|AA|IY|UW|EH|IH|UH|AH|AX|AE|EY|AY|OW|AW|OY|ER"
if analyze_unstressed_vowels = 1
	vowel_pattern$ = "^(" + arpabet_vowels$ + ")[012]$"
else
	vowel_pattern$ = "^(" + arpabet_vowels$ + ")[12]$"
endif

# GO THROUGH EACH SOUND FILE
for ifile to numberoffiles
	select Strings list
	filename$ = Get string... ifile

	# OPEN SOUNDFILE FROM LIST
	Open long sound file... 'soundfile_directory$''filename$'
	soundfile$ = selected$ ("LongSound", 1)

	# OPEN TEXTGRID OF SAME NAME
	gridfile$ = "'textgrid_directory$''soundfile$'.TextGrid"
	if fileReadable (gridfile$)
		# INCREMENT SOUND COUNT (ONLY FILES THAT ARE ACTUALLY ANALYZED ARE COUNTED)
		sound_count = sound_count + 1

		Read from file... 'gridfile$'

		# FIND TIER NUMBER FOR PHONE AND WORD TIERS
		call GetTier 'phone_tier$' phone_tier
		call GetTier 'word_tier$' word_tier
		intervals = Get number of intervals... phone_tier

		# EXTRACT ANNOTATED PORTION OF SOUNDFILE (ORIGINAL TIMES ARE PRESERVED)
		gridstart = Get start time
		gridend = Get end time
		select LongSound 'soundfile$'
		Extract part... gridstart gridend yes

		# REMOVE LONGSOUND
		select LongSound 'soundfile$'
		Remove

		# EXTRACT FORMANT AND PITCH OBJECTS
		select Sound 'soundfile$'
		To Formant (burg)... time_step number_of_formants maximum_formant window_length preemphasis_from
		if extract_pitch = 1
			select Sound 'soundfile$'
			To Pitch... 0 left_Pitch_range right_Pitch_range
		endif

		# PASS THROUGH EACH INTERVAL IN SELECTED TIER AND GET LABEL
		for interval to intervals
			# ANALYZEVOWEL CHANGES THE SELECTION, SO RESELECT THE TEXTGRID EVERY TIME
			select TextGrid 'soundfile$'
			phone_label$ = Get label of interval... phone_tier interval

			# CHECK IF INTERVAL CONTAINS ARPABET VOWEL, IF SO, ANALYZE IT
			if index_regex (phone_label$, vowel_pattern$) > 0
				call AnalyzeVowel
			endif
		endfor

		# REMOVE THE OBJECTS CREATED FOR THIS FILE AND CONTINUE WITH NEXT FILE.
		# THIS CLEANUP MUST STAY INSIDE THE TEXTGRID CHECK: THE SOUND, FORMANT AND PITCH
		# OBJECTS ONLY EXIST WHEN A TEXTGRID WAS FOUND.
		select TextGrid 'soundfile$'
		plus Sound 'soundfile$'
		plus Formant 'soundfile$'
		if extract_pitch = 1
			plus Pitch 'soundfile$'
		endif
		Remove
	else
		# NO MATCHING TEXTGRID: CLOSE THE LONGSOUND AND SKIP THIS FILE
		skipped_count = skipped_count + 1
		select LongSound 'soundfile$'
		Remove
	endif
endfor

# REMOVE REST OF OBJECTS AND FINISH
select Strings list
if use_targets_file = 1
	plus Table 'targets$'
endif
Remove

# PRINT A REPORT
echo Done. Analyzed 'target_vowel_count' of 'vowel_count' vowels in 'sound_count' file(s).
if skipped_count > 0
	printline Skipped 'skipped_count' soundfile(s) with no matching TextGrid.
endif

# PROCEDURE TO ANALYZE VOWELS
# TAKES NO ARGUMENTS; WORKS ON SCRIPT-LEVEL VARIABLES SET BY THE CALLER:
#   interval, intervals, phone_tier, word_tier, phone_label$, soundfile$
# AND ON THE FORM SETTINGS. EXPECTS THE TEXTGRID TO BE SELECTED ON ENTRY.
# MEASURES FORMANTS (AND OPTIONALLY PITCH) FOR THE CURRENT PHONE INTERVAL, LOOKS UP THE
# WORD, NEIGHBOURING PHONES AND NOTES, AND APPENDS ONE TAB-SEPARATED LINE TO THE RESULTS
# FILE. UPDATES vowel_count AND target_vowel_count.
procedure AnalyzeVowel

	# INCREMENT VOWEL COUNT
	vowel_count = vowel_count + 1

	# GET START AND END TIMES, CALCULATE DURATION, ETC.
	start = Get starting point... phone_tier interval
	end = Get end point... phone_tier interval
	duration = (end-start)
	duration_ms = duration*1000
	midpoint = (start+end)/2

	# DETERMINE WHICH POINTS TO MEASURE (NOT NEEDED FOR MIDPOINT-ONLY MEASUREMENT)
	if measurement_points = 2
		onset = start+(duration*0.3)
		offset = end-(duration*0.3)
	elsif measurement_points = 3
		onset = start+(duration/4)
		offset = end-(duration/4)
	elsif measurement_points = 4
		onset = start+(duration/5)
		offset = end-(duration/5)
	endif

	# GET FORMANT VALUES AT INTERVAL(S)
	# SUFFIX _1 = ONSET POINT, _2 = MIDPOINT, _3 = OFFSET POINT
	select Formant 'soundfile$'
	f1_2 = Get value at time... 1 midpoint Hertz Linear
	f2_2 = Get value at time... 2 midpoint Hertz Linear
	f3_2 = Get value at time... 3 midpoint Hertz Linear
	if measurement_points != 1
		f1_1 = Get value at time... 1 onset Hertz Linear
		f2_1 = Get value at time... 2 onset Hertz Linear
		f3_1 = Get value at time... 3 onset Hertz Linear
		f1_3 = Get value at time... 1 offset Hertz Linear
		f2_3 = Get value at time... 2 offset Hertz Linear
		f3_3 = Get value at time... 3 offset Hertz Linear
	endif

	# EXTRACT PITCH AT INTERVAL(S)
	if extract_pitch = 1
		select Pitch 'soundfile$'
		f0_2 = Get value at time... midpoint Hertz Linear
		if measurement_points != 1
			f0_1 = Get value at time... onset Hertz Linear
			f0_3 = Get value at time... offset Hertz Linear
		endif
	endif

	# GET WORD VOWEL IS FROM
	select TextGrid 'soundfile$'
	word = Get interval at time... word_tier midpoint
	word_label$ = Get label of interval... word_tier word

	# GET PRECEDING AND FOLLOWING ENVIRONMENTS, SKIPPING SPACES
	# A VOWEL IN THE FIRST INTERVAL HAS NO PRECEDING PHONE; ASKING FOR INTERVAL 0 WOULD
	# ABORT THE SCRIPT, SO THE LOOKUP IS GUARDED THE SAME WAY AS THE FOLLOWING PHONE.
	preceding_label$ = ""
	if interval > 1
		preceding_label$ = Get label of interval... phone_tier (interval-1)
		if preceding_label$ = "sp" or preceding_label$ = "sil"
			if interval-2 >= 1
				preceding_label$ = Get label of interval... phone_tier (interval-2)
			else
				preceding_label$ = ""
			endif
		endif
	endif
	following_label$ = ""
	if interval < intervals
		following_label$ = Get label of interval... phone_tier (interval+1)
		if following_label$ = "sp" or following_label$ = "sil"
			if interval+2 <= intervals
				following_label$ = Get label of interval... phone_tier (interval+2)
			else
				following_label$ = ""
			endif
		endif
	endif

	# GET CONTENTS OF NOTES TIER
	if use_notes_tier = 1
		call GetTier 'notes_tier$' notes_tier
		note = Get interval at time... notes_tier midpoint
		notes_label$ = Get label of interval... notes_tier note
	endif

	# CONVERT WORDS TO LOWERCASE, ARPABET TO UNICODE IPA
	# CONVERTTEXT RECEIVES THE *NAME* OF THE VARIABLE AND REWRITES THAT VARIABLE IN PLACE
	word_label$ = replace_regex$ (word_label$, "[A-Z]", "\L&", 0)
	call ConvertText phone_label$
	call ConvertText preceding_label$
	call ConvertText following_label$

	# CREATE RESULTS LINE
	resultsline_begin$ = "'soundfile$'	'word_label$'	'phone_label$'	'preceding_label$'	'following_label$'	'start'	'end'	'duration_ms'	"
	if use_notes_tier = 1
		resultsline_end$ = "'notes_label$'	'initials$'	'rundate$'	Max formant: 'maximum_formant' Hz, Number of formants: 'number_of_formants', Window length: 'window_length' s	'version$'	'os$''newline$'"
	else
		resultsline_end$ = "'initials$'	'rundate$'	Max formant: 'maximum_formant' Hz, Number of formants: 'number_of_formants', Window length: 'window_length' s	'version$'	'os$''newline$'"
	endif
	# MEASUREMENT COLUMNS: PITCH (IF REQUESTED) COMES FIRST, THEN F1, F2, F3
	if measurement_points = 1
		resultsline_middle$ = "'f1_2'	'f2_2'	'f3_2'	"
		if extract_pitch = 1
			resultsline_middle$ = "'f0_2'	" + resultsline_middle$
		endif
	else
		resultsline_middle$ = "'f1_1'	'f1_2'	'f1_3'	'f2_1'	'f2_2'	'f2_3'	'f3_1'	'f3_2'	'f3_3'	"
		if extract_pitch = 1
			resultsline_middle$ = "'f0_1'	'f0_2'	'f0_3'	" + resultsline_middle$
		endif
	endif
	resultsline$ = resultsline_begin$ + resultsline_middle$ + resultsline_end$

	# OUTPUT TO RESULTS FILE
	# NOTE: THE TARGETS TABLE IS SEARCHED WITH THE ALREADY-LOWERCASED WORD LABEL
	if use_targets_file = 1
		select Table 'targets$'
		match = Search column... word 'word_label$'
		if match
			target_vowel_count = target_vowel_count + 1
			fileappend "'results_file$'" 'resultsline$'
		endif
	else
		target_vowel_count = target_vowel_count + 1
		fileappend "'results_file$'" 'resultsline$'
	endif
endproc

# PROCEDURE TO INITIALIZE RESULTS FILE
# APPENDS ONE TAB-SEPARATED HEADER LINE TO THE RESULTS FILE. THE COLUMNS DEPEND ON THE
# FORM SETTINGS: MEASUREMENT POINTS, PITCH EXTRACTION AND USE OF THE NOTES TIER.
procedure InitializeResultsFile
	# PERCENTAGE LABELS FOR THE EARLY AND LATE MEASUREMENT POINTS
	if measurement_points = 2
		early$ = "30%"
		late$ = "70%"
	elsif measurement_points = 3
		early$ = "25%"
		late$ = "75%"
	elsif measurement_points = 4
		early$ = "20%"
		late$ = "80%"
	endif

	# MEASUREMENT COLUMNS: ONE POINT PER TRACK FOR MIDPOINT-ONLY, OTHERWISE THREE
	if measurement_points = 1
		pitch_columns$ = "F0 50%	"
		formant_columns$ = "F1 50%	F2 50%	F3 50%	"
	else
		pitch_columns$ = "F0 'early$'	F0 50%	F0 'late$'	"
		formant_columns$ = "F1 'early$'	F1 50%	F1 'late$'	F2 'early$'	F2 50%	F2 'late$'	F3 'early$'	F3 50%	F3 'late$'	"
	endif

	# PITCH COLUMNS, WHEN REQUESTED, PRECEDE THE FORMANT COLUMNS
	header_middle$ = formant_columns$
	if extract_pitch = 1
		header_middle$ = pitch_columns$ + formant_columns$
	endif

	# THE NOTES COLUMN IS ONLY PRESENT WHEN THE NOTES TIER IS USED
	notes_column$ = ""
	if use_notes_tier = 1
		notes_column$ = "Notes	"
	endif

	header$ = "Filename	Word	Vowel	Preceding Phone	Following Phone	Begin Time (s)	End Time (s)	Duration (ms)	" + header_middle$ + notes_column$ + "Analyst	Date	Settings	Praat Version	OS'newline$'"
	fileappend "'results_file$'" 'header$'
endproc

# PROCEDURE TO FIND NUMBER OF TIER WITH GIVEN LABEL
# ARGUMENTS: name$     = TIER NAME TO LOOK FOR IN THE SELECTED TEXTGRID
#            variable$ = NAME OF THE NUMERIC VARIABLE THAT RECEIVES THE TIER NUMBER
# THE FIRST TIER WHOSE NAME EQUALS name$ WINS. IF NO TIER MATCHES, THE SCRIPT EXITS
# WITH AN ERROR MESSAGE.
procedure GetTier name$ variable$
	numberOfTiers = Get number of tiers
	tier_found = 0
	itier = 0
	# WALK THE TIERS IN ORDER AND STOP AT THE FIRST NAME MATCH
	while tier_found = 0 and itier < numberOfTiers
		itier = itier + 1
		tier$ = Get tier name... itier
		if tier$ = name$
			tier_found = itier
		endif
	endwhile
	# STORE THE RESULT IN THE CALLER'S VARIABLE (0 MEANS NOT FOUND)
	'variable$' = tier_found
	if tier_found = 0
		exit The tier 'name$' is missing from the file 'soundfile$'!
	endif
endproc

# PROCEDURE TO CONVERT ARPABET TO UNICODE IPA
# ARGUMENT: arplabel$ = THE *NAME* OF A STRING VARIABLE (E.G. phone_label$), NOT ITS
# VALUE. EVERY LINE BELOW EXPANDS 'arplabel$' TO THAT NAME, SO THE NAMED VARIABLE IS
# READ AND OVERWRITTEN IN PLACE.
# THE REPLACEMENTS RUN TOP TO BOTTOM ON THE SAME STRING, SO THEIR ORDER MATTERS.
procedure ConvertText arplabel$
	# LOWERCASE FIRST; ALL PATTERNS BELOW ARE LOWERCASE
	'arplabel$' = replace_regex$ ('arplabel$', "[A-Z]", "\L&", 0)
	# VOWELS: TWO-LETTER CODE PLUS STRESS DIGIT; THE DIGIT IS DROPPED
	'arplabel$' = replace_regex$ ('arplabel$', "ao\d", "ɔ", 0) 
	'arplabel$' = replace_regex$ ('arplabel$', "aa\d", "ɑ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "iy\d", "i", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "uw\d", "u", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "eh\d", "ɛ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ih\d", "ɪ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "uh\d", "ʊ", 0)
	# "AH" IS THE ONLY VOWEL SPLIT BY STRESS: STRESSED (1/2) VS. UNSTRESSED (0)
	'arplabel$' = replace_regex$ ('arplabel$', "ah[12]", "ʌ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ah0", "ə", 0)
	# "ax\d" DOES NOT MATCH "axr" LABELS BECAUSE IT REQUIRES A DIGIT RIGHT AFTER "ax"
	'arplabel$' = replace_regex$ ('arplabel$', "ax\d", "ə", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ae\d", "æ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ey\d", "e", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ay\d", "aj", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "ow\d", "o", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "aw\d", "aw", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "oy\d", "ɔj", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "er\d", "ɝ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "axr\d", "ɚ", 0)
	# CONSONANT DIGRAPHS
	'arplabel$' = replace_regex$ ('arplabel$', "ch", "ʧ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "jh", "ʤ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "th", "θ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "dh", "ð", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "sh", "ʃ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "zh", "ʒ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "hh", "h", 0)
	# SYLLABIC CONSONANTS AND NASALS. "en$" AND "^ng" ARE ANCHORED SO THAT NEITHER OF
	# THEM CONSUMES PART OF "eng", WHICH IS HANDLED BY ITS OWN RULE.
	'arplabel$' = replace_regex$ ('arplabel$', "em", "m̩", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "en$", "n̩", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "^ng", "ŋ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "eng", "ŋ̩", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "el", "ɫ̩", 0)
	# BARE "r" COMES AFTER THE "er"/"axr" VOWEL RULES ABOVE, WHICH ALSO CONTAIN AN "r"
	'arplabel$' = replace_regex$ ('arplabel$', "r", "ɹ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "dx", "ɾ", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "nx", "ɾ̃", 0)
	'arplabel$' = replace_regex$ ('arplabel$', "q", "ʔ", 0)
	# BARE "y" COMES LAST, AFTER THE "iy"/"ey"/"ay"/"oy" VOWEL RULES HAVE BEEN APPLIED
	'arplabel$' = replace_regex$ ('arplabel$', "y", "j", 0)
endproc