## CodeTokens.praat
## Dan Villarreal (d.vill@pitt.edu)
##
## With a folder of TextGrids+sound files and a csv file, creates an
## interface for coding tokens interactively that populates the csv.
## Meant to be used with APLS version 0.4.1 (https://apls.pitt.edu).
##
## To use:
## 1. Search for tokens of interest (https://djvill.github.io/APLS/doc/search)
## 2. _CSV Export_ with "Include annotation start/end times:" set to "(always) if manually, automatically, or default aligned"
##    - Keep all default field/layer selections
## 3. _Utterance Export_ with all defaults
## 4. _Audio Export_ with all defaults
## 5. Unzip the two .zip folders downloaded in the previous two steps
## 6. Make note of where these files and folders are saved on your computer
## 7. Run this script

##Require Praat version >= 6.4.32 for random_initializeWithSeedUnsafelyButPredictably()
@validateVersion: "6.4.32"

##Parameters
##Use GUI form to specify script parameters? (0 to use parameters specified in the following lines)
use_form = 1
##Search name
search_name$ = ""
@search_name_to_filename: search_name$
search_name$ = search_name_to_filename.search_name$
##Directory containing input csv file, and subfolders with TextGrids & sound files
windows_home$ = environment$("USERPROFILE")
if windows_home$ <> ""
	in_dir$ = replace$(windows_home$, "\", "/", 0) + "/Downloads"
else
	in_dir$ = environment$("HOME") + "/Downloads"
endif
##Directory containing TextGrids
tg_dir$ = in_dir$ + "/fragments_" + search_name$ + "/"
##Directory containing sound files
wav_dir$ = in_dir$ + "/media_" + search_name$ + "/"
##Input csv file with tokens to code
in_csv$ = in_dir$ + "/" + "results_" + search_name$ + ".csv"
##Write csv file for saving coded tokens?
write = 1
##Suffix on output csv file
out_csv_suffix$ = "_coded"
##Output csv file for saving coded tokens
out_csv$ = replace$(in_csv$, ".csv", out_csv_suffix$ + ".csv", 1)
##Name of column for storing codes
code_col$ = "HandCode"
##Variants
variants$# = {"Full vowel", "Reduced"}
##Number of times to play token when it's loaded
autoplays = 1
##Buffer in seconds between multiple autoplays (if applicable)
autoplay_buffer = 0.25
##Randomize token order?
shuffle = 1
##Input column names
transcript_column$ = "Transcript"
utterance_start_time_column$ = "Line"
utterance_end_time_column$ = "LineEnd"
word_start_time_column$ = "word start"
word_end_time_column$ = "word end"
##Special codes
unsure_code$ = "(unsure)"
excluded_code$ = "(not a token)"

##Tracking variables
file_in_progress = 0
already_visited_advanced = 0

##Optional settings dialogs
##use_form is the dialog to show next: 1 = basic, 2 = advanced, 0 = done
while use_form
	if use_form = 1
		##Basic settings
		##Unparse variants vector to string
		variants$ = variants$#[1]
		for i from 2 to size(variants$#)
			variants$ = variants$ + "," + variants$#[i]
		endfor

		##Form
		beginPause: "Token coding - basic settings"
			##Once advanced settings have been visited, paths are owned by that dialog
			if not already_visited_advanced
				comment: "Search name is at the top of the Results page"
				text: 2, "Search name", search_name$
				comment: "Folder that contains files downloaded from APLS:"
				comment: "1. CSV file"
				comment: "2. Subfolder with TextGrid files"
				comment: "3. Subfolder with wav files"
				folder: "Input folder", in_dir$
			endif
			comment: "Check the box if you're picking up where you left off on a partially coded file"
			##Boolean defaults reflect current values, so that script-level parameters
			##and earlier choices survive a round trip through the advanced dialog
			boolean: "File in progress", file_in_progress
			comment: "Column for storing codes (will be created if it doesn't exist)"
			sentence: "Code column", code_col$
			comment: "Variants (separated by commas)"
			comment: "Example: diphthong,monophthong,intermediate"
			sentence: "Variants", variants$
			comment: "Other settings"
			integer: "Number of autoplays", autoplays
			boolean: "Randomize token order", shuffle
			boolean: "Save output to csv file", write
		clicked = endPause: "Continue", "Advanced settings", 1

		##Translate/validate variables
		if search_name$ <> ""
			@search_name_to_filename: search_name$
			search_name$ = search_name_to_filename.search_name$
		else
			exitScript: "Search name must not be blank"
		endif
		if folderExists(input_folder$)
			in_dir$ = input_folder$
		else
			exitScript: "Input folder " + input_folder$ + " doesn't exist"
		endif
		if code_column$ <> ""
			code_col$ = code_column$
		else
			exitScript: "Code column must not be blank"
		endif
		if number_of_autoplays > 0
			autoplays = number_of_autoplays
		else
			autoplays = 0
		endif
		shuffle = randomize_token_order
		write = save_output_to_csv_file

		##Defaults for advanced settings
		if not already_visited_advanced
			in_dir$ = replace_regex$(in_dir$, "/$", "", 1)
			tg_dir$ = in_dir$ + "/fragments_" + search_name$ + "/"
			wav_dir$ = in_dir$ + "/media_" + search_name$ + "/"
			in_csv$ = in_dir$ + "/" + "results_" + search_name$ + ".csv"
			if file_in_progress
				##A file in progress is read from and written back to the suffixed csv
				if not endsWith(in_csv$, out_csv_suffix$ + ".csv")
					in_csv$ = replace$(in_csv$, ".csv", out_csv_suffix$ + ".csv", 1)
				endif
				out_csv$ = in_csv$
			else
				out_csv$ = replace$(in_csv$, ".csv", out_csv_suffix$ + ".csv", 1)
			endif
		endif

		##Parse variants string to vector
		##Normalize first: drop whitespace around commas, collapse repeated commas,
		##and strip leading/trailing commas so no empty or padded variant is created
		variants$ = replace_regex$(variants$, "\s*,[\s,]*", ",", 0)
		variants$ = replace_regex$(variants$, "^[\s,]+", "", 1)
		variants$ = replace_regex$(variants$, "[\s,]+$", "", 1)
		num_variants = 1
		while index(variants$, ",")
			##Peel off everything before the first comma
			v$[num_variants] = replace_regex$(variants$, ",.+", "", 1)
			variants$ = replace$(variants$, v$[num_variants] + ",", "", 1)
			num_variants = num_variants + 1
		endwhile
		v$[num_variants] = variants$
		if num_variants = 1
			exitScript: "Please specify 2 or more variants (separated with commas)"
		endif
		variants$# = empty$#(num_variants)
		for i from 1 to num_variants
			##Variants must not collide with the special codes
			if v$[i] = unsure_code$
				exitScript: "This script uses """ + unsure_code$ + """ to mark unsure tokens." + newline$ + "Please rename the """ + unsure_code$ + """ variant to something else." + newline$
			endif
			if v$[i] = excluded_code$
				exitScript: "This script uses """ + excluded_code$ + """ to mark excluded tokens." + newline$ + "Please rename the """ + excluded_code$ + """ variant to something else." + newline$
			endif
			variants$#[i] = v$[i]
		endfor

		##Determine next destination
		##Button 1 ("Continue") leaves the dialogs; button 2 opens advanced settings
		if clicked = 1
			use_form = 0
		else
			use_form = clicked
		endif
	elsif use_form = 2
		##Advanced settings
		beginPause: "Token coding - advanced settings"
			comment: "File paths"
			infile: "Input csv file (downloaded from APLS)", in_csv$
			folder: "Folder that contains TextGrid files", tg_dir$
			folder: "Folder that contains wav files", wav_dir$
			if write
				outfile: "Output csv file (will be created if it doesn't exist)", out_csv$
			endif
			comment: "Input column names"
			sentence: "Transcript", transcript_column$
			sentence: "Utterance start time", utterance_start_time_column$
			sentence: "Utterance end time", utterance_end_time_column$
			sentence: "Word start time", word_start_time_column$
			sentence: "Word end time", word_end_time_column$
			if autoplays > 1
				comment: "Other settings"
				positive: "Time between autoplays", autoplay_buffer
			endif
		clicked = endPause: "Continue", "Basic settings", 1

		##Validate/translate variables (paths are validated after the dialog loop)
		in_csv$ = input_csv_file$
		tg_dir$ = replace_regex$(folder_that_contains_TextGrid_files$, "/$", "", 1)
		wav_dir$ = replace_regex$(folder_that_contains_wav_files$, "/$", "", 1)
		if write
			if output_csv_file$ = ""
				exitScript: "Output csv file must not be blank"
			elsif not endsWith_caseInsensitive(output_csv_file$, ".csv")
				out_csv$ = output_csv_file$ + ".csv"
			else
				out_csv$ = output_csv_file$
			endif
		endif
		if transcript$ <> ""
			transcript_column$ = transcript$
		else
			exitScript: "Transcript column must not be blank"
		endif
		if utterance_start_time$ <> ""
			utterance_start_time_column$ = utterance_start_time$
		else
			exitScript: "Utterance start time column must not be blank"
		endif
		if utterance_end_time$ <> ""
			utterance_end_time_column$ = utterance_end_time$
		else
			exitScript: "Utterance end time column must not be blank"
		endif
		if word_start_time$ <> ""
			word_start_time_column$ = word_start_time$
		else
			exitScript: "Word start time column must not be blank"
		endif
		if word_end_time$ <> ""
			word_end_time_column$ = word_end_time$
		else
			exitScript: "Word end time column must not be blank"
		endif
		if autoplays > 1
			autoplay_buffer = time_between_autoplays
		endif

		##Determine next destination
		##Button 1 ("Continue") -> 0 (done); button 2 ("Basic settings") -> 1
		use_form = clicked - 1
		already_visited_advanced = 1
	endif
endwhile

##Validate paths
##Done once the dialogs are finished (or skipped with use_form = 0), so that
##the advanced dialog stays reachable when the default paths don't exist
if not fileReadable(in_csv$)
	exitScript: "Input csv file " + in_csv$ + " doesn't exist"
endif
if not folderExists(tg_dir$)
	exitScript: "TextGrid folder " + tg_dir$ + " doesn't exist"
endif
if not folderExists(wav_dir$)
	exitScript: "Wav folder " + wav_dir$ + " doesn't exist"
endif

##Ask before overwriting out_csv$
if write and fileReadable(out_csv$)
	beginPause: "Overwrite output file?"
		comment: "Output csv file already exists:"
		comment: out_csv$
		comment: ""
		comment: "Continue and overwrite it?"
	clicked = endPause: "Yes", "No", 2
	if clicked = 2
		beginPause: "Bye!"
			comment: "Bye!"
		endPause: "OK", 1, 1
		exitScript()
	endif
endif

##Set up data
table = Read Table from comma-separated file: in_csv$
num_tokens = Get number of rows
has_code_column = Get column index: code_col$

##Handle different possible data states
if not has_code_column
	Append column: code_col$
	tokens_to_code# = to#(num_tokens)
else
	##If code_col$ already exists and has entries, ask if the user wants to code all tokens (supply defaults for file_in_progress)
	blank# = List row numbers where... self$[row, code_col$] = ""
	coded# = List row numbers where... self$[row, code_col$] <> "" and self$[row, code_col$] <> unsure_code$ and self$[row, code_col$] <> excluded_code$
	unsure# = List row numbers where... self$[row, code_col$] = unsure_code$
	excluded# = List row numbers where... self$[row, code_col$] = excluded_code$
	num_blank = size(blank#)
	num_coded = size(coded#)
	num_unsure = size(unsure#)
	num_excluded = size(excluded#)

	##Defaults
	code_uncoded_tokens = 1
	recode_coded_tokens = 0
	recode_unsure_tokens = 0
	recheck_excluded_tokens = 0

	if num_blank = num_tokens
		tokens_to_code# = to#(num_tokens)
	else
		##Supply defaults for file_in_progress
		if not file_in_progress
			beginPause: "Select tokens to code"
				comment: "Currently, the " + code_col$ + " column has:"
				comment: "Uncoded tokens: " + string$(num_blank)
				comment: "Coded tokens: " + string$(num_coded)
				comment: "'Unsure' tokens: " + string$(num_unsure)
				comment: "Excluded tokens: " + string$(num_excluded)
				comment: ""
				comment: "Which tokens would you like to code?"
				##Only offer categories that actually have tokens
				if num_blank > 0
					boolean: "Code uncoded tokens", code_uncoded_tokens
				endif
				if num_coded > 0
					boolean: "Recode coded tokens", recode_coded_tokens
				endif
				if num_unsure > 0
					boolean: "Recode unsure tokens", recode_unsure_tokens
				endif
				if num_excluded > 0
					boolean: "Recheck excluded tokens", recheck_excluded_tokens
				endif
			endPause: "Continue", 1
		endif

		##Construct tokens_to_code#
		tokens_to_code# = zero#(0)
		if code_uncoded_tokens
			tokens_to_code# = combine#(tokens_to_code#, blank#)
		endif
		if recode_coded_tokens
			tokens_to_code# = combine#(tokens_to_code#, coded#)
		endif
		if recode_unsure_tokens
			tokens_to_code# = combine#(tokens_to_code#, unsure#)
		endif
		if recheck_excluded_tokens
			tokens_to_code# = combine#(tokens_to_code#, excluded#)
		endif
		if size(tokens_to_code#) = 0
			beginPause: "Bye!"
				comment: "No tokens selected. Bye!"
			endPause: "OK", 1, 1
			exitScript()
		endif
	endif
endif

##Nothing to do if the input csv has no rows (the coding loop below assumes at least one token)
if size(tokens_to_code#) = 0
	exitScript: "Input csv file " + in_csv$ + " doesn't contain any tokens"
endif

##Optionally shuffle
if shuffle
	##Fixed seed, so the shuffled order is the same on every run
	random_initializeWithSeedUnsafelyButPredictably(1234)
	tokens_to_code# = shuffle#(tokens_to_code#)
endif

##Loop over tokens and code
i = 0
exit_requested = 0
repeat
	##Iterate
	i = i + 1

	##Get data from table
	token = tokens_to_code#[i]
	selectObject: table
	transcript$ = Get value: token, transcript_column$
	line_start = Get value: token, utterance_start_time_column$
	line_end = Get value: token, utterance_end_time_column$
	word_start = Get value: token, word_start_time_column$
	word_end = Get value: token, word_end_time_column$

	##Open in editor
	##File stem is built as <transcript minus .eaf>__<start>-<end>, with times to 3 decimals
	file_stem$ = replace$(transcript$, ".eaf", "", 1) + "__" + fixed$(line_start, 3) + "-" + fixed$(line_end, 3)
	sound = Read from file: wav_dir$ + "/" + file_stem$ + ".wav"
	tg = Read from file: tg_dir$ + "/" + file_stem$ + ".TextGrid"
	selectObject: sound, tg
	View & Edit
	editor: tg
		##Word times are offset by the utterance start before selecting within the fragment
		Select: word_start - line_start, word_end - line_start
		Zoom to selection
		Zoom out
		for ap from 1 to autoplays
			Play window
			if ap < autoplays
				sleep(autoplay_buffer)
			endif
		endfor

		still_deciding = 1
		while still_deciding
			beginPause: "Code this token"
				choice: "Code", 1
				for v from 1 to size(variants$#)
					option: variants$#[v]
				endfor
			clicked = endPause: "Code", "Unsure", "Exclude", "Replay", if write then "Save & exit" else "Exit" fi, 1, 5

			##Button 1 ("Code") keeps code$ as set by the choice field above
			still_deciding = 0
			if clicked = 2
				code$ = unsure_code$
			elsif clicked = 3
				code$ = excluded_code$
			elsif clicked = 4
				still_deciding = 1
				Play window
			elsif clicked = 5
				##Stop after this token without touching its current code
				exit_requested = 1
				i = size(tokens_to_code#)
			endif
		endwhile
	endeditor

	##Write to table
	##Skipped on exit, so a token that already has a code isn't blanked out
	if not exit_requested
		selectObject: table
		Set string value: token, code_col$, code$
	endif

	##Clean up
	removeObject: sound, tg
until i = size(tokens_to_code#)

##Optionally write to output csv file
if write
	selectObject: table
	Save as comma-separated file: out_csv$
	beginPause: "All done!"
		comment: "All done!"
		comment: ""
		comment: "Coding file saved as"
		comment: out_csv$
	endPause: "OK", 1, 1
else
	beginPause: "All done!"
		comment: "All done!"
		comment: ""
		comment: "To save your coding file..."
		comment: "1. In the next window, click 'Save'"
		comment: "2. Click 'Save as comma-separated file...'"
		comment: "3. Choose a file location and name"
	endPause: "OK", 1, 1
endif

##Convert search names to filenames
##https://nzilbb.github.io/ag/apidocs/nzilbb/util/IO.html#SafeFileNameUrl(java.lang.String)
##Argument: .search_name$, the search name to convert
##Result: search_name_to_filename.search_name$, the converted name
procedure search_name_to_filename: .search_name$
	##Characters that are dropped: \ ? * + $
	.search_name$ = replace_regex$(.search_name$, "[\\?*+$]", "", 0)
	##Two-character comparison operators must be replaced before their one-character prefixes
	.search_name$ = replace$(.search_name$, "<=", "-le-", 0)
	.search_name$ = replace$(.search_name$, "<", "-lt-", 0)
	.search_name$ = replace$(.search_name$, ">=", "-ge-", 0)
	.search_name$ = replace$(.search_name$, ">", "-gt-", 0)
	.search_name$ = replace_regex$(.search_name$, "[|:!=^]", "_", 0)
	.search_name$ = replace$(.search_name$, ",", "-", 0)
	.search_name$ = replace$(.search_name$, "@", "-at-", 0)
	.search_name$ = replace$(.search_name$, "&", "-amp-", 0)
	##Pad a leading or trailing dot with an underscore
	.search_name$ = replace_regex$(.search_name$, "^\.", "_.", 0)
	.search_name$ = replace_regex$(.search_name$, "\.$", "._", 0)
	.search_name$ = replace$(.search_name$, newline$, "", 0)
endproc

#### End of main script

##Plugin: ParseValidateVersion.praat
##https://github.com/djvill/hwttiwtot/blob/abd33fa/Praat/ParseValidateVersion.praat

##Parse a version string of the form major.minor or major.minor.patch
##Argument: .v$, the version string
##Results: parseVersion.major, parseVersion.minor, parseVersion.patch
##  (patch is 0 when the string has only two components)
##Exits the script if .v$ isn't in one of those two formats
procedure parseVersion: .v$
	##Ensure version string is correctly formatted
	.versionRE$ = "^\d+\.\d+(\.\d+)?$"
	if not index_regex(.v$, .versionRE$)
		exitScript: "Error in @parseVersion: Incorrect format for version string (", .v$, ")."
	endif

	##Parse version string
	.major = number(replace_regex$(.v$, "\..+", "", 0))
	.dot1 = index(.v$, ".")
	.dot2 = rindex(.v$, ".")
	if .dot1 = .dot2
		##Only one dot: major.minor
		.minor = number(replace_regex$(.v$, ".+\.", "", 0))
		.patch = 0
	else
		##Two dots: major.minor.patch
		.noMajor$ = replace_regex$(.v$, "^.+?\.", "", 0)
		.minor = number(replace_regex$(.noMajor$, "\..+", "", 0))
		.patch = number(replace_regex$(.v$, ".+\.", "", 0))
	endif
endproc

##Exit the script unless the running Praat version is at least .minVersion$
##Argument: .minVersion$, minimum version as major.minor or major.minor.patch
procedure validateVersion: .minVersion$
	##Parse current and minimum versions
	@parseVersion: praatVersion$
	.currMajor = parseVersion.major
	.currMinor = parseVersion.minor
	.currPatch = parseVersion.patch
	@parseVersion: .minVersion$
	.minMajor = parseVersion.major
	.minMinor = parseVersion.minor
	.minPatch = parseVersion.patch

	##Construct exit message
	##Continuation lines are indented by the width of "Error: "
	.newlineIndent$ = newline$ + replace_regex$("Error: ", ".", " ", 0)
	.exitMsg$ = "This script requires Praat to be at least version " + .minVersion$ + .newlineIndent$ + "You have version " + praatVersion$ + .newlineIndent$ + "Please download a more recent version of Praat:" + .newlineIndent$ + "https://www.fon.hum.uva.nl/praat/"

	##Compare current to minimum
	##Component-wise: a lower component only matters if all higher ones are equal
	if .currMajor < .minMajor
		exitScript: .exitMsg$
	elsif .currMajor = .minMajor
		if .currMinor < .minMinor
			exitScript: .exitMsg$
		elsif .currMinor = .minMinor
			if .currPatch < .minPatch
				exitScript: .exitMsg$
			endif
		endif
	endif
endproc