[Onsei: Japanese pitch accent practice tool](https://github.com/itsupera/onsei)
================================================================================

<details>
<summary>Click here for instructions !</summary>
<ul>
<li>Select a sample to mimic and listen to it</li>
<li>Record yourself mimicking using the record button below "Your recording:"</li>
<li>Click on the "Compare" button</li>
<li>Check out the Pitch comparison graph below to see where your mistakes are</li>
<li>Try again to match the teacher's pitch !</li>
<li>If this is too easy for you, disable the Autoplay in the Options and try reading the sentence without listening first :)</li>
</ul>
Any feedback or suggestions? Please tell me in <a href="https://gitter.im/itsupera-onsei/community">this Gitter chat</a>.
</details>

In [None]:
import os
import re

import tempfile;

import ipywidgets as widgets;
from ipywebrtc import AudioRecorder, CameraStream, AudioStream;

from onsei.speech_record import SpeechRecord, AlignmentMethod;
from onsei.utils import segment_speech;
from onsei.figures import ViewRecordFigure, CompareFigure;
from onsei.sentence import Sentence;
from onsei.widgets import SampleSelector, UploadSample;

# Globals


def get_jsut_samples():
    """Load the JSUT sample collection from its transcript file.

    Reads ``transcript_utf8.txt`` in the JSUT sample directory, where each
    non-blank line has the form ``<basename>:<sentence>``, and associates
    every sentence with ``<basename>.wav`` in the same directory.

    Returns:
        dict: ``{sentence: {"filename": ..., "sentence": ...}}`` in
        transcript order.

    Raises:
        ValueError: If a non-blank line contains no ``:`` separator.
    """
    samples = {}
    basepath = "data/jsut_basic5000_sample"
    transcript_path = os.path.join(basepath, "transcript_utf8.txt")
    # The transcript is UTF-8 text; do not depend on the locale's default
    # encoding, which is not UTF-8 on every platform.
    with open(transcript_path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing one at end of file)
                continue
            # Split on the first colon only, so a sentence that itself
            # contains ":" is kept intact instead of raising on unpacking.
            basename, sentence = line.split(':', 1)
            samples[sentence] = {
                "filename": os.path.join(basepath, f"{basename}.wav"),
                "sentence": sentence,
            }
    return samples

def get_forvo_samples():
    """Collect the Forvo samples found in the greetings/apologies directory.

    Every file named ``pronunciation_ja_<sentence>.wav`` becomes a sample
    whose text is the ``<sentence>`` part of the file name. Other files in
    the directory are ignored.

    Returns:
        dict: ``{sentence: {"filename": ..., "sentence": ...}}``, inserted
        in sorted file-name order.
    """
    samples = {}
    basepath = "data/forvo/everyday_phrases/greetings_and_apologies"
    for fname in sorted(os.listdir(basepath)):
        # The dot before "wav" is escaped so only a real ".wav" extension
        # matches; unescaped, it would accept any character there
        # (e.g. "pronunciation_ja_fooxwav").
        m = re.fullmatch(r"pronunciation_ja_([^.]+)\.wav", fname)
        if m:
            sentence = m.group(1)
            samples[sentence] = {
                "filename": os.path.join(basepath, fname),
                "sentence": sentence,
            }
    return samples

# Sample collections shown in the selector, keyed by collection name.
samples = {}
samples["Forvo"] = get_forvo_samples()
samples["JSUT Basic 5000 corpus"] = get_jsut_samples()
samples["My samples"] = {}  # Special collection for user's samples

# Current session state, filled in by the callbacks below.
teacher_rec = student_rec = sentence = None

# Initial values of the entries in the "Options" panel.
default_autoplay = True
show_spaces_between_segments = False
default_crop_vad = True

# Temporary directory handed to the upload widget for the user's samples.
my_samples_dir = tempfile.TemporaryDirectory()


# Create widgets

# Sample browsing and uploading
w_sample_selector = SampleSelector(samples)
w_upload_sample = UploadSample(samples, my_samples_dir.name)


def _make_option_checkbox(value, description):
    """Build an enabled, non-indented checkbox for the options panel."""
    return widgets.Checkbox(
        value=value,
        description=description,
        disabled=False,
        indent=False,
    )


# Entries of the "Options" panel
w_autoplay_tick = _make_option_checkbox(default_autoplay, 'Autoplay')
w_show_spaces_tick = _make_option_checkbox(
    show_spaces_between_segments,
    'Show spaces between sentence segments',
)
w_crop_vad_tick = _make_option_checkbox(
    default_crop_vad,
    'Crop graphs to detected speech',
)
w_alignment_method_dropdown = widgets.Dropdown(
    options=[method.value for method in AlignmentMethod],
    description='Alignment method:',
    disabled=False,
)

# The accordion starts collapsed (no section selected)
w_options_box = widgets.VBox([
    w_autoplay_tick,
    w_show_spaces_tick,
    w_crop_vad_tick,
    w_alignment_method_dropdown,
])
w_options_accordion = widgets.Accordion(children=[w_options_box], selected_index=None)
w_options_accordion.set_title(0, "Options")

# Player for the selected sample and the sentence shown above it
w_audio = widgets.Audio(value=b'', format='wav', autoplay=default_autoplay, loop=False)
w_sentence = widgets.HTML(value='')

# Audio-only media stream feeding the recorder
camera = CameraStream(constraints={'audio': True, 'video': False})
w_recorder = AudioRecorder(stream=camera)

# Comparison trigger and its textual result
w_compare_btn = widgets.Button(description="Compare")
w_cmp_result = widgets.Label(value='')

# Figures
fig_teacher = ViewRecordFigure(title="Teacher's recording")
fig_student = ViewRecordFigure(title="Your recording")
fig_cmp = CompareFigure()

# Callbacks


def add_uploaded_sample(change):
    """Register a freshly uploaded sample and make it the current selection.

    Args:
        change: Change notification from ``w_upload_sample``; its ``"new"``
            entry is the uploaded sample, which has a ``'sentence'`` key.
    """
    uploaded = change["new"]
    text = uploaded['sentence']
    # ``samples`` is only mutated in place, never rebound.
    samples['My samples'][text] = uploaded

    # Switch to this sample
    w_sample_selector.set_selection('My samples', text)


w_upload_sample.observe(add_uploaded_sample, 'value')


def update_autoplay(change):
    """Copy the new state of the Autoplay checkbox onto the audio player.

    Args:
        change: Change notification whose ``'new'`` entry is the checkbox value.
    """
    autoplay_enabled = change['new']
    w_audio.autoplay = autoplay_enabled


w_autoplay_tick.observe(update_autoplay, 'value')
    
def update_show_spaces(change):
    """Store the "show spaces" option and re-render the sentence.

    Args:
        change: Change notification whose ``'new'`` entry is the checkbox value.
    """
    global show_spaces_between_segments

    show_spaces = change['new']
    show_spaces_between_segments = show_spaces
    # Redraw right away so the option takes effect on the displayed sentence
    update_sentence()


w_show_spaces_tick.observe(update_show_spaces, 'value')


# Link the "crop" checkbox to the `crop_vad` attribute of both record figures
for _crop_target in (fig_teacher, fig_student):
    widgets.jslink((w_crop_vad_tick, 'value'), (_crop_target, 'crop_vad'))


def get_sample_audio_data(sample):
    """Return the raw bytes of a sample's audio file.

    Args:
        sample: Mapping with a ``'filename'`` entry giving the path of the
            audio file.

    Returns:
        bytes: The complete content of the file.
    """
    # A context manager closes the handle deterministically instead of
    # leaving it open until garbage collection.
    with open(sample['filename'], 'rb') as audio_file:
        return audio_file.read()


def update_sentence():
    """Render the current sentence into the ``w_sentence`` HTML widget.

    With no current sentence the widget is emptied. Otherwise the sentence's
    HTML is shown in a large font, with spaces removed unless the
    "show spaces between segments" option is on.
    """
    if not sentence:
        w_sentence.value = ''
        return

    markup = sentence.to_html()
    if not show_spaces_between_segments:
        # NOTE(review): this removes every space in the markup, including any
        # inside HTML tags — confirm to_html() output has none there.
        markup = markup.replace(" ", "")
    w_sentence.value = f'<p style="font-size: xx-large">{markup}</p>'


def update_sample(sample):
    """Make ``sample`` the current teacher sample and reset the student's views.

    Rebinds the global ``sentence`` and ``teacher_rec``, loads the sample's
    audio into the player, redraws the teacher figure and clears the student
    figure, the comparison figure and the comparison result text.

    Args:
        sample: Mapping with ``"sentence"`` and ``'filename'`` entries.
    """
    global teacher_rec, sentence

    sentence = Sentence(sample["sentence"])

    # Hold syncing on every affected widget while the updates are applied
    with w_sentence.hold_sync(), \
            w_audio.hold_sync(), \
            fig_teacher.hold_sync(), \
            fig_student.hold_sync(), \
            fig_cmp.hold_sync(), \
            w_cmp_result.hold_sync():
        update_sentence()

        teacher_rec = SpeechRecord(sample['filename'], sentence, name="Teacher")

        audio_bytes = get_sample_audio_data(sample)
        w_audio.value = audio_bytes

        fig_teacher.update_data(teacher_rec)
        # Drop whatever was displayed for the previous sample
        fig_student.clear()
        fig_cmp.clear()

        w_cmp_result.value = ""


# Show the sample that is selected when the notebook starts
update_sample(w_sample_selector.selected_sample())


def sample_changed(change):
    """Switch to the sample newly chosen in the sample selector.

    Args:
        change: Change notification whose ``"new"`` entry is the selected
            sample; it is copied into a plain dict before use.
    """
    new_sample = dict(change["new"])
    update_sample(new_sample)


w_sample_selector.observe(sample_changed, 'value')


def get_student_wav_filename():
    """Save the student's recording and convert it to a WAV file.

    The recorder's data is written to ``test.webm`` and then converted by
    ffmpeg to ``test.wav`` (16000 Hz sample rate, one channel), both in the
    current working directory.

    Returns:
        str: ``'test.wav'``.

    Raises:
        ValueError: Propagated from ``w_recorder.save``. When its message
            starts with ``'No data'``, a hint is first shown in
            ``w_cmp_result``.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    import subprocess  # local import: only needed for the conversion below

    try:
        w_recorder.save('test.webm')
    except ValueError as exc:
        if str(exc).startswith('No data'):
            w_cmp_result.value = "Record something first !"
        raise

    # check=True makes a failed conversion raise, instead of silently
    # returning a stale or missing test.wav as the former `!ffmpeg` shell
    # escape did. The argument list avoids going through a shell.
    subprocess.run(
        [
            "ffmpeg", "-hide_banner", "-loglevel", "error", "-y",
            "-i", "test.webm", "-ar", "16000", "-ac", "1", "test.wav",
        ],
        check=True,
    )
    return 'test.wav'


def run_compare(_):
    """Compare the student's latest recording with the teacher's.

    Converts the current recording to WAV, rebinds the global ``student_rec``
    and redraws the student figure. It then aligns the student record with
    ``teacher_rec`` using the method chosen in the dropdown and shows the
    mean pitch distance. Any failure while aligning or comparing is reported
    as "FAILED !" and re-raised, and the comparison figure is left untouched.

    Args:
        _: Ignored; the callback is registered both as a button click handler
            and as a dropdown observer.
    """
    global student_rec

    wav_path = get_student_wav_filename()
    # For debugging, a bundled sample can be used instead:
    #wav_path = "data/itsu_ga_ii_ka_wakarimasen.wav"

    student_rec = SpeechRecord(wav_path, sentence, name="Student")
    fig_student.update_data(student_rec)

    method = w_alignment_method_dropdown.value

    try:
        student_rec.align_with(teacher_rec, method=method)
        distance = student_rec.compare_pitch()
        w_cmp_result.value = f"Success !\nMean distance = {distance:.2f}"
    except Exception:
        w_cmp_result.value = "FAILED !"
        raise

    fig_cmp.update_data(teacher_rec, student_rec)


w_compare_btn.on_click(run_compare)

# Update the comparison if we change the alignment method
w_alignment_method_dropdown.observe(run_compare, 'value')


# Layout

# Tabs: pick an existing sample, or upload a new one
w_tab = widgets.Tab()
w_tab.children = [w_sample_selector, w_upload_sample]
for _tab_index, _tab_title in enumerate(("Samples", "Upload new samples")):
    w_tab.set_title(_tab_index, _tab_title)


def _third_width_column(children):
    """Stack `children` vertically in a column one third of the row wide."""
    return widgets.VBox(children, layout=widgets.Layout(width='33%'))


# Top row: sample tabs next to the options panel
selection_row = widgets.Box([
    w_tab,
    w_options_accordion,
])

# Middle row: teacher's audio, student's recorder, compare button and result
audio_row = widgets.Box([
    _third_width_column([widgets.Label(value="Teacher's recording:"), w_audio]),
    _third_width_column([widgets.Label(value="Your recording:"), w_recorder]),
    _third_width_column([w_compare_btn, w_cmp_result]),
])

# Whole page, laid out as a single flex column
box = widgets.Box(
    [
        selection_row,
        w_sentence,
        audio_row,
        fig_cmp,
        fig_student,
        fig_teacher,
    ],
    layout=widgets.Layout(
        display="flex",
        flex_flow="column",
        align_items="stretch",
        align_content="center",
    ),
)

display(box)