using System;
using UnityEngine;

namespace Whisper.Utils
{
    /// <summary>
    /// Stateless helpers for preparing raw float sample buffers:
    /// channel down-mixing, resampling, and a simple energy based voice activity check.
    /// </summary>
    public static class AudioUtils
    {
        /// <summary>
        /// Transform audio clip samples to mono with desired sample rate.
        /// </summary>
        /// <param name="src">Source samples; interleaved by channel when <paramref name="srcChannelsCount"/> is greater than 1.</param>
        /// <param name="srcSampleRate">Sample rate of <paramref name="src"/>.</param>
        /// <param name="srcChannelsCount">Number of interleaved channels in <paramref name="src"/>.</param>
        /// <param name="dstSampleRate">Desired sample rate of the result.</param>
        /// <returns>
        /// The converted samples. When no conversion is needed (single channel and equal sample rates)
        /// the <paramref name="src"/> array itself is returned, not a copy.
        /// </returns>
        public static float[] Preprocess(float[] src, int srcSampleRate, int srcChannelsCount, int dstSampleRate)
        {
            // TODO: this probably can be done in one loop (mono + resample)
            var ret = src;
            if (srcChannelsCount > 1)
            {
                ret = ConvertToMono(src, srcChannelsCount);
            }

            if (srcSampleRate != dstSampleRate)
            {
                ret = ChangeSampleRate(ret, srcSampleRate, dstSampleRate);
            }

            return ret;
        }

        /// <summary>
        /// Convert audio buffer to mono by averaging the channels of every frame.
        /// </summary>
        /// <param name="src">Interleaved source samples.</param>
        /// <param name="channelsCount">Number of interleaved channels; must be positive.</param>
        /// <returns>
        /// A new array with one sample per complete frame. Trailing samples that do not
        /// form a complete frame are ignored.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="channelsCount"/> is not positive.</exception>
        public static float[] ConvertToMono(float[] src, int channelsCount)
        {
            if (src == null)
            {
                throw new ArgumentNullException(nameof(src));
            }

            if (channelsCount <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(channelsCount), "Channels count must be positive.");
            }

            // Integer division drops an incomplete trailing frame.
            var monoLength = src.Length / channelsCount;
            var mono = new float[monoLength];
            for (var frame = 0; frame < monoLength; frame++)
            {
                var offset = frame * channelsCount;
                var sum = 0f;
                for (var channel = 0; channel < channelsCount; channel++)
                {
                    sum += src[offset + channel];
                }

                mono[frame] = sum / channelsCount;
            }

            return mono;
        }

        /// <summary>
        /// Resample audio buffer to new sample rate using linear interpolation.
        /// </summary>
        /// <param name="src">Source samples (single channel).</param>
        /// <param name="srcSampleRate">Sample rate of <paramref name="src"/>; must be positive.</param>
        /// <param name="dstSampleRate">Sample rate of the result; must be positive.</param>
        /// <returns>A new array holding the resampled signal.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">A sample rate is not positive.</exception>
        public static float[] ChangeSampleRate(float[] src, int srcSampleRate, int dstSampleRate)
        {
            if (src == null)
            {
                throw new ArgumentNullException(nameof(src));
            }

            if (srcSampleRate <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(srcSampleRate), "Sample rate must be positive.");
            }

            if (dstSampleRate <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(dstSampleRate), "Sample rate must be positive.");
            }

            var srcLen = src.Length;

            // Length and position math is done in double: single precision cannot hold
            // fractional sample positions accurately once buffers reach ~10^6 samples.
            var dstLen = (int) Math.Round((double) srcLen * dstSampleRate / srcSampleRate);
            var dst = new float[dstLen];
            if (dstLen == 0)
            {
                return dst;
            }

            // Distance, in source samples, between two consecutive destination samples.
            var step = (double) srcLen / dstLen;
            for (var i = 0; i < dstLen; i++)
            {
                var position = i * step;
                var low = (int) position; // position is never negative, so truncation equals floor
                if (low + 1 >= srcLen)
                {
                    // No right-hand neighbour to interpolate with - hold the last sample.
                    dst[i] = src[srcLen - 1];
                }
                else
                {
                    var fraction = (float) (position - low);
                    dst[i] = Mathf.Lerp(src[low], src[low + 1], fraction);
                }
            }

            return dst;
        }

        /// <summary>
        /// Naive energy based Voice Activity Detection (VAD). Returns true if lastSec contains speech.
        /// </summary>
        /// <remarks>
        /// When <paramref name="freqThd"/> is positive, <paramref name="data"/> is high-pass filtered
        /// IN PLACE before the energies are measured, so the caller's buffer is modified.
        /// </remarks>
        /// <param name="data">Samples to analyse (single channel).</param>
        /// <param name="sampleRate">Sample rate of <paramref name="data"/>.</param>
        /// <param name="lastSec">Length, in seconds, of the tail window that is compared against the whole buffer.</param>
        /// <param name="vadThd">Energy ratio threshold.</param>
        /// <param name="freqThd">High-pass cutoff frequency; values less than or equal to 0 disable filtering.</param>
        /// <returns>
        /// True when the mean absolute amplitude of the tail window exceeds
        /// <paramref name="vadThd"/> times the mean absolute amplitude of the whole buffer.
        /// False when the tail window is empty or not shorter than the whole buffer.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public static bool SimpleVad(float[] data, int sampleRate, float lastSec, float vadThd, float freqThd)
        {
            // https://github.com/ggerganov/whisper.cpp/blob/a792c4079ce61358134da4c9bc589c15a03b04ad/examples/common.cpp#L697
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            var nSamples = data.Length;
            var nSamplesLast = (int) (sampleRate * lastSec);

            if (nSamplesLast <= 0)
            {
                // empty tail window - nothing to measure (also avoids dividing by zero below)
                return false;
            }

            if (nSamplesLast >= nSamples)
            {
                // not enough samples - assume no speech
                return false;
            }

            if (freqThd > 0.0f)
            {
                HighPassFilter(data, freqThd, sampleRate);
            }

            var lastStart = nSamples - nSamplesLast;
            var energyAll = 0.0f;
            var energyLast = 0.0f;
            for (var i = 0; i < nSamples; i++)
            {
                var magnitude = Mathf.Abs(data[i]);
                energyAll += magnitude;
                if (i >= lastStart)
                {
                    energyLast += magnitude;
                }
            }

            energyAll /= nSamples;
            energyLast /= nSamplesLast;

            return energyLast > vadThd * energyAll;
        }

        /// <summary>
        /// Apply a first-order high pass filter to the buffer in place.
        /// </summary>
        /// <remarks>
        /// No copy is made: every element of <paramref name="data"/> except the first is overwritten
        /// with the filtered value. The first element is left as is and seeds the filter state.
        /// </remarks>
        /// <param name="data">Samples to filter; modified in place.</param>
        /// <param name="cutoff">Cutoff frequency, in the same units as <paramref name="sampleRate"/>.</param>
        /// <param name="sampleRate">Sample rate of <paramref name="data"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public static void HighPassFilter(float[] data, float cutoff, int sampleRate)
        {
            // https://github.com/ggerganov/whisper.cpp/blob/a792c4079ce61358134da4c9bc589c15a03b04ad/examples/common.cpp#L684
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (data.Length == 0)
            {
                return;
            }

            var rc = 1.0f / (2.0f * Mathf.PI * cutoff);
            var dt = 1.0f / sampleRate;
            var alpha = dt / (rc + dt);

            // The previous raw input must be remembered separately because data[i - 1]
            // has already been overwritten with filtered output... except that the reference
            // implementation deliberately reads the overwritten value; keep that behaviour.
            var y = data[0];
            for (var i = 1; i < data.Length; i++)
            {
                y = alpha * (y + data[i] - data[i - 1]);
                data[i] = y;
            }
        }
    }
}