# -*- coding: utf-8 -*-
"""AudioToText CLI

Original file is located at
https://colab.research.google.com/github/Carleslc/AudioToText/blob/master/AudioToText.ipynb

# 🗣️ [**AudioToText**](https://github.com/Carleslc/AudioToText)

### 🛠 [Whisper by OpenAI (GitHub)](https://github.com/openai/whisper)
"""

print("AudioToText CLI\n")

import argparse

parser = argparse.ArgumentParser(
    description="Transcribe and translate audio to text using Whisper and DeepL.",
    epilog='web: https://carleslc.me/AudioToText/'
)

parser.add_argument("audio_file", nargs='+', help="source file to transcribe")
parser.add_argument("--task", help="transcribe (default) or translate (to English)", default="transcribe", choices=["transcribe", "translate"])
parser.add_argument("--model", help="model to use (default: small)", default="small", choices=["tiny", "base", "small", "medium", "large-v1", "large-v2"])
parser.add_argument("--language", help="source file language (default: Auto-Detect)", default="Auto-Detect", choices=["Auto-Detect", "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Assamese", "Azerbaijani", "Bashkir", "Basque", "Belarusian", "Bengali", "Bosnian", "Breton", "Bulgarian", "Burmese", "Castilian", "Catalan", "Chinese", "Croatian", "Czech", "Danish", "Dutch", "English", "Estonian", "Faroese", "Finnish", "Flemish", "French", "Galician", "Georgian", "German", "Greek", "Gujarati", "Haitian", "Haitian Creole", "Hausa", "Hawaiian", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian", "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean", "Lao", "Latin", "Latvian", "Letzeburgesch", "Lingala", "Lithuanian", "Luxembourgish", "Macedonian", "Malagasy", "Malay", "Malayalam", "Maltese", "Maori", "Marathi", "Moldavian", "Moldovan", "Mongolian", "Myanmar", "Nepali", "Norwegian", "Nynorsk", "Occitan", "Panjabi", "Pashto", "Persian", "Polish", "Portuguese", "Punjabi", "Pushto", "Romanian", "Russian", "Sanskrit", "Serbian", "Shona", "Sindhi", "Sinhala", "Sinhalese", "Slovak", "Slovenian", "Somali", "Spanish", "Sundanese", "Swahili", "Swedish", "Tagalog", "Tajik", "Tamil", "Tatar", "Telugu", "Thai", "Tibetan", "Turkish", "Turkmen", "Ukrainian", "Urdu", "Uzbek", "Valencian", "Vietnamese", "Welsh", "Yiddish", "Yoruba"])
parser.add_argument("--prompt", help="provide context about the audio or encourage a specific writing style, see https://platform.openai.com/docs/guides/speech-to-text/prompting")
parser.add_argument("--coherence_preference", help="True (default): More coherence, but may repeat text. False: Fewer repetitions, but may be less coherent", default='True', choices=[True, False], type=lambda b: b.lower() != 'false')
parser.add_argument("--api_key", help="if set with your OpenAI API Key (https://platform.openai.com/account/api-keys), the OpenAI API is used, which can improve the inference speed substantially, but it has an associated cost, see API pricing: https://openai.com/pricing#audio-models. The API model is large-v2 (ignores --model)")
parser.add_argument("--output_formats", "--output_format", help="desired result formats (default: txt,vtt,srt,tsv,json)", default="txt,vtt,srt,tsv,json")
parser.add_argument("--output_dir", help="folder to save results (default: audio_transcription)", default="audio_transcription")
parser.add_argument("--deepl_api_key", help="DeepL API key, if you want to translate results using DeepL.\nGet a DeepL Developer Account API Key: https://www.deepl.com/pro-api")
parser.add_argument("--deepl_target_language", help="results target language if you want to translate results using DeepL (--deepl_api_key required)", choices=["Bulgarian", "Chinese", "Chinese (simplified)", "Czech", "Danish", "Dutch", "English", "English (American)", "English (British)", "Estonian", "Finnish", "French", "German", "Greek", "Hungarian", "Indonesian", "Italian", "Japanese", "Korean", "Latvian", "Lithuanian", "Norwegian", "Polish", "Portuguese", "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Slovak", "Slovenian", "Spanish", "Swedish", "Turkish", "Ukrainian"])
parser.add_argument("--deepl_coherence_preference", help="True (default): Share context between lines while translating. False: Translate each line independently", default='True', choices=[True, False], type=lambda b: b.lower() != 'false')
parser.add_argument("--deepl_formality", help="whether the translated text should lean towards formal or informal language (languages with formality support: German, French, Italian, Spanish, Dutch, Polish, Portuguese, Russian)", default="default", choices=["default", "formal", "informal"])
parser.add_argument("--skip-install", help="skip pip dependencies installation", action="store_true", default=False)

args = parser.parse_args()

"""## [Step 1] ⚙️ Install the required libraries"""

import os, subprocess
from sys import platform as sys_platform

status, ffmpeg_version = subprocess.getstatusoutput("ffmpeg -version")

if status != 0:
    from platform import platform
    if sys_platform == 'linux' and 'ubuntu' in platform().lower():
        os.system("apt install ffmpeg")
    else:
        print("Install ffmpeg: https://ffmpeg.org/download.html")
        exit(status)
elif not args.skip_install:
    print(ffmpeg_version.split('\n')[0])

if not args.skip_install:
    os.system("pip install --user --upgrade pip")
    os.system("pip install git+https://github.com/openai/whisper.git@v20231117 openai==1.9.0 numpy scipy deepl pydub cohere ffmpeg-python torch==2.1.0 tensorflow-probability==0.23.0 typing-extensions==4.9.0")
    print()

"""## [Step 2] 📁 Upload your audio files to this folder

Almost any audio or video file format is [supported](https://gist.github.com/Carleslc/1d6b922c8bf4a7e9627a6970d178b3a6).

## [Step 3] 👂 Transcribe or Translate

3.1. Choose a --task:
- `transcribe` speech to text in the same language as the source audio file.
- `translate` speech to text in English.

Translation to other languages is not supported with _Whisper_ by default. You may try to choose the _Transcribe_ task and set your desired --language, but translation is not guaranteed. However, you can use **_DeepL_** later in Step 5 to translate the transcription to another language.

3.2. Set the audio_file to match your uploaded file name to transcribe.
- If you want to transcribe multiple files with the same parameters, pass their file names as additional `audio_file` arguments.

3.3. Run this cell and wait for the transcription to complete. You can try other parameters if the result with the default parameters does not suit your needs.

[Available models and languages](https://github.com/openai/whisper#available-models-and-languages)

Setting the --language to the language of the source audio file may provide better results than Auto-Detect.

You can add an optional initial --prompt to provide context about the audio or encourage a specific writing style, see the [prompting guide](https://platform.openai.com/docs/guides/speech-to-text/prompting).
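For example, assuming this script is saved as `audiototext.py` (the audio file name below is illustrative), a typical run could be:

```sh
python audiototext.py audio.mp3 --task transcribe --model medium --language English --output_formats txt,srt
```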
If the execution takes too long to complete, you can choose a smaller model in --model, with an accuracy tradeoff, or use the OpenAI API.

By default the open-source models are used, but you can also use the OpenAI API if the --api_key parameter is set with your [OpenAI API Key](https://platform.openai.com/account/api-keys), which can improve the inference speed substantially, but it has an associated cost, see [API pricing](https://openai.com/pricing#audio-models). When using the API some options are fixed: --model is ignored (uses large-v2) and --coherence_preference is ignored (uses More coherence).

More parameters are available in the code `options` object.
"""

import whisper
from whisper.utils import format_timestamp, get_writer, WriteTXT

import numpy as np
import torch
import math

from openai import OpenAI

# select task
task = args.task

# set audio file paths
audio_files = [audio_path.strip() for audio_path in args.audio_file]

for audio_path in audio_files:
    if not os.path.isfile(audio_path):
        raise FileNotFoundError(audio_path)

# set model
use_model = args.model

# detect device
if args.api_key:
    print("Using API")
    from pydub import AudioSegment
    from pydub.silence import split_on_silence
else:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"Using {'GPU' if DEVICE == 'cuda' else 'CPU ⚠️'}")

    # https://medium.com/analytics-vidhya/the-google-colab-system-specification-check-69d159597417
    if DEVICE == "cuda":
        os.system("nvidia-smi -L")
    else:
        if sys_platform == 'linux':
            os.system("lscpu | grep \"Model name\" | awk '{$1=$1};1'")
        if use_model not in ['tiny', 'base', 'small']:
            print("Not using GPU can result in a very slow execution")
            print("You may want to try a smaller model (tiny, base, small)")

# display language
WHISPER_LANGUAGES = [k.title() for k in whisper.tokenizer.TO_LANGUAGE_CODE.keys()]

language = args.language

if language == "Auto-Detect":
    language = "detect"

if language and language != "detect" and language not in WHISPER_LANGUAGES:
    print(f"\nLanguage '{language}' is invalid")
    language = "detect"

if language and language != "detect":
    print(f"\nLanguage: {language}")

# load model
if args.api_key:
    print()
else:
    MODELS_WITH_ENGLISH_VERSION = ["tiny", "base", "small", "medium"]

    if language == "English" and use_model in MODELS_WITH_ENGLISH_VERSION:
        use_model += ".en"

    print(f"\nLoading {use_model} model... {os.path.expanduser(f'~/.cache/whisper/{use_model}.pt')}")

    model = whisper.load_model(use_model, device=DEVICE)

    print(
        f"Model {use_model} is {'multilingual' if model.is_multilingual else 'English-only'} "
        f"and has {sum(np.prod(p.shape) for p in model.parameters()):,d} parameters.\n"
    )

# set options
## https://github.com/openai/whisper/blob/v20231117/whisper/transcribe.py#L37
## https://github.com/openai/whisper/blob/v20231117/whisper/decoding.py#L81
options = {
    'task': task,
    'verbose': True,
    'fp16': True,
    'best_of': 5,
    'beam_size': 5,
    'patience': None,
    'length_penalty': None,
    'suppress_tokens': '-1',
    'temperature': (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), # float or tuple
    'condition_on_previous_text': args.coherence_preference,
    'initial_prompt': args.prompt,
    'word_timestamps': False,
}
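# Note: when 'temperature' is a tuple, whisper retries decoding at the next higher
# temperature whenever a candidate fails its internal compression-ratio or average
# log-probability checks, trading determinism for robustness against repetition loops.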
if args.api_key:
    api_client = OpenAI(api_key=args.api_key)

    api_supported_formats = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm']
    api_max_bytes = 25 * 1024 * 1024 # 25 MB

    api_transcribe = api_client.audio.transcriptions if task == 'transcribe' else api_client.audio.translations
    api_transcribe = api_transcribe.create

    api_model = 'whisper-1' # large-v2

    # https://platform.openai.com/docs/api-reference/audio?lang=python
    api_options = {
        'response_format': 'verbose_json',
    }

    if args.prompt:
        api_options['prompt'] = args.prompt

    api_temperature = options['temperature'][0] if isinstance(options['temperature'], (tuple, list)) else options['temperature']

    if isinstance(api_temperature, (float, int)):
        api_options['temperature'] = api_temperature
    else:
        raise ValueError("Invalid temperature type, it must be a float or a tuple of floats")
elif DEVICE == 'cpu':
    options['fp16'] = False
    torch.set_num_threads(os.cpu_count())

# execute task
# !whisper "{audio_file}" --task {task} --model {use_model} --output_dir {output_dir} --device {DEVICE} --verbose {options['verbose']}
if task == "translate":
    print("-- TRANSLATE TO ENGLISH --")
else:
    print("-- TRANSCRIPTION --")

results = {} # audio_path to result

for audio_path in audio_files:
    print(f"\nProcessing: {audio_path}\n")

    # detect language
    detect_language = not language or language == "detect"

    if not detect_language:
        options['language'] = language
        source_language_code = whisper.tokenizer.TO_LANGUAGE_CODE.get(language.lower())
    elif not args.api_key:
        # load audio and pad/trim it to fit 30 seconds
        audio = whisper.load_audio(audio_path)
        audio = whisper.pad_or_trim(audio)

        # make log-Mel spectrogram and move to the same device as the model
        mel = whisper.log_mel_spectrogram(audio).to(model.device)

        # detect the spoken language
        _, probs = model.detect_language(mel)
        source_language_code = max(probs, key=probs.get)

        options['language'] = whisper.tokenizer.LANGUAGES[source_language_code].title()
        print(f"Detected language: {options['language']}\n")

    # transcribe
    if args.api_key:
        # API
        if task == "transcribe" and not detect_language:
            api_options['language'] = source_language_code

        source_audio_name_path, source_audio_ext = os.path.splitext(audio_path)
        source_audio_ext = source_audio_ext[1:]

        if source_audio_ext in api_supported_formats:
            api_audio_path = audio_path
            api_audio_ext = source_audio_ext
        else:
            ## convert audio file to a supported format
            if options['verbose']:
                print(f"API supported formats: {','.join(api_supported_formats)}")
                print(f"Converting {source_audio_ext} audio to a supported format...")

            api_audio_ext = 'mp3'
            api_audio_path = f'{source_audio_name_path}.{api_audio_ext}'

            subprocess.run(['ffmpeg', '-i', audio_path, api_audio_path], check=True, capture_output=True)

            if options['verbose']:
                print(api_audio_path, end='\n\n')
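        # The API rejects uploads over 25 MB, so oversized audio is split below:
        # pauses (at least 1.25 s below -25 dB) are located with pydub's split_on_silence,
        # adjacent chunks are merged while they fit the size budget, and any chunk that is
        # still too long is hard-split; silence is kept so chunk durations add up and the
        # segment timestamps of later chunks can be offset correctly.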
        ## split audio file in chunks
        api_audio_chunks = []

        audio_bytes = os.path.getsize(api_audio_path)

        if audio_bytes >= api_max_bytes:
            if options['verbose']:
                print("Audio exceeds API maximum allowed file size.\nSplitting audio in chunks...")

            audio_segment_file = AudioSegment.from_file(api_audio_path, api_audio_ext)

            min_chunks = math.ceil(audio_bytes / (api_max_bytes / 2))
            # print(f"Min chunks: {min_chunks}")

            max_chunk_milliseconds = int(len(audio_segment_file) // min_chunks)
            # print(f"Max chunk milliseconds: {max_chunk_milliseconds}")

            def add_chunk(api_audio_chunk):
                api_audio_chunk_path = f"{source_audio_name_path}_{len(api_audio_chunks) + 1}.{api_audio_ext}"
                api_audio_chunk.export(api_audio_chunk_path, format=api_audio_ext)
                api_audio_chunks.append(api_audio_chunk_path)

            def raw_split(big_chunk):
                subchunks = math.ceil(len(big_chunk) / max_chunk_milliseconds)
                for subchunk_i in range(subchunks):
                    chunk_start = max_chunk_milliseconds * subchunk_i
                    chunk_end = min(max_chunk_milliseconds * (subchunk_i + 1), len(big_chunk))
                    add_chunk(big_chunk[chunk_start:chunk_end])

            non_silent_chunks = split_on_silence(audio_segment_file,
                                                 seek_step=5, # ms
                                                 min_silence_len=1250, # ms
                                                 silence_thresh=-25, # dB
                                                 keep_silence=True) # needed to aggregate timestamps
            # print(f"Non silent chunks: {len(non_silent_chunks)}")

            current_chunk = non_silent_chunks[0] if non_silent_chunks else audio_segment_file

            for next_chunk in non_silent_chunks[1:]:
                if len(current_chunk) > max_chunk_milliseconds:
                    raw_split(current_chunk)
                    current_chunk = next_chunk
                elif len(current_chunk) + len(next_chunk) <= max_chunk_milliseconds:
                    current_chunk += next_chunk
                else:
                    add_chunk(current_chunk)
                    current_chunk = next_chunk

            if len(current_chunk) > max_chunk_milliseconds:
                raw_split(current_chunk)
            else:
                add_chunk(current_chunk)

            if options['verbose']:
                print(f'Total chunks: {len(api_audio_chunks)}\n')
        else:
            api_audio_chunks.append(api_audio_path)

        ## process chunks
        result = None

        for api_audio_chunk_path in api_audio_chunks:
            ## API request
            with open(api_audio_chunk_path, 'rb') as api_audio_file:
                api_result = api_transcribe(model=api_model, file=api_audio_file, **api_options)
                api_result = api_result.model_dump() # to dict

            api_segments = api_result['segments']

            if result:
                ## update timestamps
                last_segment_timestamp = result['segments'][-1]['end'] if result['segments'] else 0

                for segment in api_segments:
                    segment['start'] += last_segment_timestamp
                    segment['end'] += last_segment_timestamp

                ## append new segments
                result['segments'].extend(api_segments)

                if 'duration' in result:
                    result['duration'] += api_result.get('duration', 0)
            else:
                ## first request
                result = api_result

                if detect_language:
                    print(f"Detected language: {result['language'].title()}\n")

            ## display segments
            if options['verbose']:
                for segment in api_segments:
                    print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {segment['text']}")
    else:
        # Open-Source
        result = whisper.transcribe(model, audio_path, **options)

    # fix results formatting
    for segment in result['segments']:
        segment['text'] = segment['text'].strip()

    result['text'] = '\n'.join(map(lambda segment: segment['text'], result['segments']))

    # set results for this audio file
    results[audio_path] = result

"""## [Step 4] 💾 **Save results**

Run this cell to write the transcription to output files.

Results will be available in the **audio_transcription** folder in the formats selected in `output_formats`.

If you don't see that folder, you may need to refresh 🔄 the Files folder.
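For instance, with the default settings and an uploaded file named `audio.mp3` (an illustrative name), you would get:

```
audio_transcription/
├── audio.txt
├── audio.vtt
├── audio.srt
├── audio.tsv
└── audio.json
```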
Available formats: `txt,vtt,srt,tsv,json`
"""

# set output folder
output_dir = args.output_dir

# set output formats: https://github.com/openai/whisper/blob/v20231117/whisper/utils.py#L283
output_formats = args.output_formats
output_formats = output_formats.split(',')

from typing import TextIO

class WriteText(WriteTXT):
    def write_result(self, result: dict, file: TextIO, **kwargs):
        print(result['text'], file=file, flush=True)

def write_result(result, output_format, output_file_name):
    output_format = output_format.strip()

    # start captions at a non-zero timestamp (some media players do not detect the first caption otherwise)
    fix_vtt = output_format == 'vtt' and result['segments'] and result['segments'][0].get('start') == 0

    if fix_vtt:
        result['segments'][0]['start'] += 1/1000 # +1ms

    # write result in the desired format
    writer = WriteText(output_dir) if output_format == 'txt' else get_writer(output_format, output_dir)
    writer(result, output_file_name)

    if fix_vtt:
        result['segments'][0]['start'] = 0 # reset change

    output_file_path = os.path.join(output_dir, f"{output_file_name}.{output_format}")
    print(output_file_path)

# save results
print("\nWriting results...")

os.makedirs(output_dir, exist_ok=True)

for audio_path, result in results.items():
    print(end='\n')

    output_file_name = os.path.splitext(os.path.basename(audio_path))[0]

    for output_format in output_formats:
        write_result(result, output_format, output_file_name)

"""## [Step 5] 💬 Translate results with DeepL (API key needed)

This is an **optional** step to translate the transcription to another language using the **DeepL** API.

[Get a DeepL Developer Account API Key](https://www.deepl.com/pro-api)

Set the `deepl_api_key` to translate the transcription to a supported language in `deepl_target_language`.
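For example, to transcribe and translate the results to Spanish in one run (illustrative file name, assuming the script is saved as `audiototext.py`):

```sh
python audiototext.py audio.mp3 --deepl_api_key YOUR_DEEPL_API_KEY --deepl_target_language Spanish
```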
"""

import deepl

# translation service options (DeepL Developer Account)
deepl_api_key = args.deepl_api_key
deepl_target_language = args.deepl_target_language
deepl_coherence_preference = args.deepl_coherence_preference
deepl_formality = "default"

if deepl_api_key and not deepl_target_language:
    deepl_target_language = 'English'

if deepl_target_language:
    if not deepl_api_key:
        print("\nRequired: --deepl_api_key")
        print("Get a DeepL Developer Account API Key: https://www.deepl.com/pro-api")
    elif deepl_target_language == 'English':
        deepl_target_language = "English (British)"
    elif deepl_target_language == 'Chinese':
        deepl_target_language = "Chinese (simplified)"
    elif deepl_target_language == 'Portuguese':
        deepl_target_language = "Portuguese (European)"

use_deepl_translation = deepl_api_key and deepl_target_language

if use_deepl_translation:
    print(end='\n')

    if args.deepl_formality != 'default':
        deepl_formality = 'prefer_more' if args.deepl_formality == 'formal' else 'prefer_less'

translated_results = {} # audio_path to translated results

if use_deepl_translation:
    try:
        deepl_translator = deepl.Translator(deepl_api_key)

        deepl_source_languages = [lang.code.upper() for lang in deepl_translator.get_source_languages()]

        deepl_available_target_languages = deepl_translator.get_target_languages()
        deepl_target_languages = [lang.name for lang in deepl_available_target_languages]
        deepl_target_language_code = next(lang.code for lang in deepl_available_target_languages if lang.name == deepl_target_language).upper()
        target_language_code = deepl_target_language_code.split('-')[0]

        for audio_path, result in results.items():
            deepl_usage = deepl_translator.get_usage()

            if deepl_usage.any_limit_reached:
                print(audio_path)
                raise deepl.DeepLException("Quota for this billing period has been exceeded, message: Quota Exceeded")
            else:
                print(audio_path + '\n')

            # translate results (DeepL)
            source_language_code = whisper.tokenizer.TO_LANGUAGE_CODE.get(result['language'].lower()).upper()

            if (task == 'translate' and target_language_code != 'EN') or (task == 'transcribe' and source_language_code in deepl_source_languages and source_language_code != target_language_code):
                source_lang = source_language_code if task == 'transcribe' else None
                translate_from = f"from {result['language'].title()} [{source_language_code}] " if source_lang else ''

                print(f"DeepL: Translate results {translate_from}to {deepl_target_language} [{deepl_target_language_code}]\n")

                segments = result['segments']

                translated_results[audio_path] = {
                    'text': '',
                    'segments': [],
                    'language': deepl_target_language
                }

                # segments per request (max 128 KiB per request, so deepl_batch_requests_size is limited to around 1000)
                deepl_batch_requests_size = 200 # 200 segments * ~100 bytes/segment = ~20 KB per request (~15 minutes of speech)

                for batch_segments in [segments[i:i + deepl_batch_requests_size] for i in range(0, len(segments), deepl_batch_requests_size)]:
                    batch_segments_text = [segment['text'] for segment in batch_segments]

                    if deepl_coherence_preference:
                        batch_segments_text = '<br>'.join(batch_segments_text)
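                        # '<br>' is declared as an ignored XML tag below, so DeepL keeps the
                        # original line boundaries intact while still translating each batch
                        # as one coherent text; the result is split back on '<br>' afterwards.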
                    # DeepL request
                    deepl_results = deepl_translator.translate_text(
                        text=batch_segments_text,
                        source_lang=source_lang,
                        target_lang=deepl_target_language_code,
                        formality=deepl_formality,
                        split_sentences='nonewlines',
                        tag_handling='xml' if deepl_coherence_preference else None,
                        ignore_tags='br' if deepl_coherence_preference else None, # used to synchronize sentences with whisper lines without splitting sentences in DeepL
                        outline_detection=False if deepl_coherence_preference else None
                    )

                    deepl_results_segments = deepl_results.text.split('<br>') if deepl_coherence_preference else [deepl_result_segment.text for deepl_result_segment in deepl_results]
                    for j, translated_text in enumerate(deepl_results_segments):
                        segment = batch_segments[j]

                        # fix sentence formatting
                        translated_text = translated_text.lstrip(',.。 ').rstrip()

                        if not deepl_coherence_preference and translated_text and translated_text[-1] in '.。' and segment['text'][-1] not in '.。':
                            translated_text = translated_text[:-1]

                        # add translated segments
                        translated_results[audio_path]['segments'].append(dict(id=segment['id'], start=segment['start'], end=segment['end'], text=translated_text))

                        if options['verbose']:
                            print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {translated_text}")

                deepl_usage = deepl_translator.get_usage()

                if deepl_usage.character.valid:
                    print(f"\nDeepL: Character usage: {deepl_usage.character.count} / {deepl_usage.character.limit} ({100*(deepl_usage.character.count/deepl_usage.character.limit):.2f}%)\n")
            elif source_language_code == target_language_code:
                print(f"Nothing to translate. Results are already in {result['language']}.")
            elif task == 'transcribe' and source_language_code not in deepl_source_languages:
                print(f"DeepL: {result['language']} is not yet supported")
    except deepl.DeepLException as e:
        if isinstance(e, deepl.AuthorizationException) and str(e) == "Authorization failure, check auth_key":
            e = "Authorization failure, check deepl_api_key"
        print(f"\nDeepL: [Error] {e}\n")

# save translated results (if any)
if translated_results:
    print("Writing translated results...")

    for audio_path, translated_result in translated_results.items():
        print(end='\n')

        translated_result['text'] = '\n'.join(map(lambda translated_segment: translated_segment['text'], translated_result['segments']))

        output_file_name = os.path.splitext(os.path.basename(audio_path))[0]
        translated_output_file_name = f"{output_file_name}_{deepl_target_language}"

        for output_format in output_formats:
            write_result(translated_result, output_format, translated_output_file_name)
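# Note: translated results reuse the same writers as Step 4, so with
# --deepl_target_language Spanish an input audio.mp3 (illustrative name) would yield
# files like audio_transcription/audio_Spanish.srt next to the original results.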