"""Convert XML (TTML-style) and WebVTT subtitle files to SubRip (.srt).

Run as a script, every supported file in the input directory is converted
and written as ``<name>.srt`` into the output directory; all timestamps can
optionally be shifted by a fixed number of milliseconds.
"""
import argparse
import codecs
import html
import math
import os
import re

SUPPORTED_EXTENSIONS = [".xml", ".vtt", ".srt"]

# NOTE(review): the markup literals inside the patterns and format strings of
# this module (<region>, <style>, <styling>, <span>, <br/>, <p>, <i>, <c.x>,
# <font>) were reconstructed from context -- confirm them against real input
# files before relying on exact output.
_REGION_TAG_RE = re.compile(r'<region\b[^>]*>')
_STYLE_TAG_RE = re.compile(r'<style\b[^>]*>')
_XML_ID_RE = re.compile(r'xml:id="([^"]*)"')
_SRT_TIMESTAMP_RE = re.compile(
    r"(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})")


def leading_zeros(value, digits=2):
    """Return ``value`` as a string left-padded with zeros to ``digits``.

    Values that are already wider than ``digits`` are returned unchanged
    (never truncated), so e.g. an hour count of 100 stays "100".
    """
    return str(value).zfill(digits)


def convert_time(raw_time):
    """Convert a tick-count string to an ``HH:MM:SS,mmm`` SRT timestamp.

    ``raw_time`` is a string of digits in which the last 7 digits are the
    sub-second part (so ``"10000000"`` is one second); digits 5-7 from the
    right give the milliseconds.
    """
    if int(raw_time) == 0:
        return "00:00:00,000"

    ms = '000'
    if len(raw_time) > 4:
        # Dropping the last 4 digits leaves a millisecond count.
        ms = leading_zeros(int(raw_time[:-4]) % 1000, 3)
    time_in_seconds = int(raw_time[:-7]) if len(raw_time) > 7 else 0
    second = leading_zeros(time_in_seconds % 60)
    minute = leading_zeros(int(math.floor(time_in_seconds / 60)) % 60)
    hour = leading_zeros(int(math.floor(time_in_seconds / 3600)))
    return "{}:{}:{},{}".format(hour, minute, second, ms)


def xml_id_display_align_before(text):
    r"""Return the xml:id of the region declared with displayAlign="before".

    displayAlign="before" means the subtitle is displayed at the top of the
    screen rather than at the bottom. The id found here is used to add an
    {\an8} position tag to the matching subtitles in the output file.
    Returns an empty string when no such region exists.
    """
    for tag in _REGION_TAG_RE.findall(text):
        if 'tts:displayAlign="before"' not in tag:
            continue
        region_id = _XML_ID_RE.search(tag)
        if region_id:
            return region_id.group(1)
    return u""


def xml_get_cursive_style_ids(text):
    """Return the xml:id of every italic style in the styling section.

    Returns an empty list when the document has no styling section.
    """
    style_section = re.search("<styling>(.*)</styling>", text, flags=re.DOTALL)
    if not style_section:
        return []
    style_ids = []
    for tag in _STYLE_TAG_RE.findall(style_section.group()):
        if 'tts:fontStyle="italic"' not in tag:
            continue
        style_id = _XML_ID_RE.search(tag)
        if style_id:
            style_ids.append(style_id.group(1))
    return style_ids


def xml_cleanup_spans_start(span_id_re, cursive_ids, text):
    """Replace opening span tags by ``<i>`` (italic styles) or nothing.

    ``span_id_re`` must yield ``(whole_tag, style_id)`` tuples from
    ``re.findall``. Returns the cleaned text and, for each span in order,
    the replacement that was used, so that the matching closing tags can be
    handled by ``xml_cleanup_spans_end``.
    """
    has_cursive = []
    for tag, style_id in re.findall(span_id_re, text):
        replacement = u"<i>" if style_id in cursive_ids else u""
        has_cursive.append(replacement)
        # Replace only the first remaining occurrence, keeping span order.
        text = replacement.join(text.split(tag, 1))
    return text, has_cursive


def xml_cleanup_spans_end(span_end_re, text, has_cursive):
    """Replace closing span tags by ``</i>`` (italic spans) or nothing.

    ``has_cursive`` is the list returned by ``xml_cleanup_spans_start``;
    closing tags are paired with it positionally.
    """
    span_end_tags = re.findall(span_end_re, text)
    for tag, cursive in zip(span_end_tags, has_cursive):
        replacement = u"</i>" if cursive else u""
        text = replacement.join(text.split(tag, 1))
    return text


def to_srt(text, extension, delay_ms=0):
    """Convert ``text`` to SRT according to its file ``extension``.

    ``.xml`` and ``.vtt`` (case-insensitive) are converted; anything else is
    assumed to be SRT already. The result is shifted by ``delay_ms``.
    """
    extension = extension.lower()
    if extension == ".xml":
        text = xml_to_srt(text)
    elif extension == ".vtt":
        text = vtt_to_srt(text)
    return shift_srt_timestamp(text, delay_ms)


def _shift_time(time_str, shift_ms):
    """Shift one ``HH:MM:SS,mmm`` timestamp by ``shift_ms`` milliseconds."""
    hours, minutes, rest = time_str.split(":")
    seconds, millis = rest.split(",")
    total_ms = (int(hours) * 3600000 + int(minutes) * 60000
                + int(seconds) * 1000 + int(millis))
    # A timestamp cannot be negative: clamp instead of emitting "-1:59:..."
    total_ms = max(0, total_ms + shift_ms)
    hours, total_ms = divmod(total_ms, 3600000)
    minutes, total_ms = divmod(total_ms, 60000)
    seconds, millis = divmod(total_ms, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def shift_srt_timestamp(text, delay_ms=0):
    """Shift every ``start --> end`` line of an SRT text by ``delay_ms``.

    A zero delay returns ``text`` untouched. Negative delays move subtitles
    earlier; timestamps that would become negative are clamped to zero.
    """
    if not delay_ms:
        return text

    def replace_timestamp(match):
        start = _shift_time(match[1], delay_ms)
        end = _shift_time(match[2], delay_ms)
        return f"{start} --> {end}"

    return _SRT_TIMESTAMP_RE.sub(replace_timestamp, text)


def convert_vtt_time(line):
    """Convert a WebVTT cue timing line to an SRT timing line.

    Dots become commas, ``MM:SS.mmm`` times get a ``00:`` hour prefix, and
    anything following the end time (cue settings) is dropped.
    """
    times = line.replace(".", ",").split(" --> ")
    if len(times[0]) == 9:  # "MM:SS,mmm" -- the hours are missing
        times = ["00:" + t for t in times]
    return "{} --> {}".format(times[0], times[1].split(" ")[0])


def vtt_to_srt(text):
    """Convert a WebVTT document to SRT text.

    Raises:
        ValueError: if ``text`` does not start with ``WEBVTT`` (optionally
            preceded by a byte-order mark).
    """
    if not text.startswith((u"\ufeffWEBVTT", u"WEBVTT")):
        raise ValueError(".vtt format must start with WEBVTT, wrong file?")

    styles = get_vtt_styles(text)
    style_tag_re = re.compile(r'<c.(.*)>(.*)</c>')

    subs = []
    current_sub = []
    for line in text.split("\n"):
        # With CRLF input the separator line is "\r", which must count as
        # blank or no cue would ever be terminated.
        line = line.rstrip("\r")
        if current_sub:
            if line:
                style_tag = style_tag_re.search(line)
                if style_tag:
                    line = style_tag.group(2)  # line is just the text part
                    color = styles.get(style_tag.group(1).split(".")[0])
                    if color:
                        line = r'<font color="{}">{}</font>'.format(
                            color, line)
                current_sub.append(line)
            else:
                subs.append("\n".join(current_sub) + "\n\n")
                current_sub = []
        elif " --> " in line:
            current_sub = [convert_vtt_time(line)]
    if current_sub:
        subs.append("\n".join(current_sub))

    return "".join(u"{}\n{}".format(i, sub) for i, sub in enumerate(subs, 1))


def get_vtt_styles(text):
    """Return a ``{class_name: color}`` dict from ``::cue(.name)`` rules.

    Only the color is extracted, and it is looked for on the line that
    immediately follows the ``::cue(.name)`` selector.
    """
    styles = {}
    lines = text.split("\n")
    style_name_re = re.compile(r'::cue\(\.(.*)\).*')
    color_re = re.compile(r'.*color: (\#.*);')
    # Pairing each line with its successor means a selector on the very last
    # line simply has no follower, instead of raising IndexError.
    for line, next_line in zip(lines, lines[1:]):
        style_name = style_name_re.search(line)
        if not style_name:
            continue
        color = color_re.search(next_line)
        if color:
            styles[style_name.group(1)] = color.group(1)
    return styles


def xml_to_srt(text):
    r"""Convert an XML subtitle document to SRT text.

    Each ``<p begin=... end=...>`` element becomes a subtitle. Consecutive
    elements sharing the same begin and end times are merged into a single
    multi-line subtitle. Italic spans become ``<i>``/``</i>``, line-break
    tags become newlines, HTML entities are unescaped, and subtitles placed
    in the displayAlign="before" region are prefixed with ``{\an8}``.
    Returns an empty string when the document contains no subtitles.
    """
    subs = []

    def append_subs(start, end, content_lines, format_time):
        subs.append({
            "start_time": convert_time(start) if format_time else start,
            "end_time": convert_time(end) if format_time else end,
            "content": u"\n".join(content_lines),
        })

    p_tag_re = re.compile(r'(<p\b[^>]*begin=[^>]*>.*?</p>)', re.DOTALL)
    start_re = re.compile(r'begin\="([0-9:\.]*)')
    end_re = re.compile(r'end\="([0-9:\.]*)')
    content_re = re.compile(r'\">(.*)</p>', re.DOTALL)
    # some span tags are used for italics, we'll replace them by <i> and </i>,
    # which is the standard for .srt files. We ignore all other uses.
    span_id_re = re.compile(r'(<span style=\"([a-zA-Z0-9_.]+)\">)+')
    span_end_re = re.compile(r'(</span>)+')
    br_re = re.compile(r'(<br\s*\/?>)+')
    whitespace_re = re.compile(r'[\r\n\t]+')

    cursive_ids = xml_get_cursive_style_ids(text)
    display_align_before = xml_id_display_align_before(text)
    top_region_re = None
    if display_align_before:
        # Escaped so that an id containing regex metacharacters is matched
        # literally; compiled once instead of once per subtitle.
        top_region_re = re.compile(
            r'<p([^>]*\bregion="' + re.escape(display_align_before)
            + r'"[^>]*)>(.*)</p>')

    sub_lines = p_tag_re.findall(text)
    if not sub_lines:
        return u""

    # Stays True while times are tick counts; switches (for good) to False
    # as soon as a clock-style "HH:MM:SS.mmm" time is seen.
    fmt_t = True
    prev_start = prev_end = None
    prev_content = []
    for s in sub_lines:
        s = whitespace_re.sub(' ', s)
        s, has_cursive = xml_cleanup_spans_start(span_id_re, cursive_ids, s)
        if top_region_re is not None:
            s = top_region_re.sub(r'<p\1>{\\an8}\2</p>', s)

        content = content_re.search(s).group(1)
        br_tags = br_re.search(content)
        if br_tags:
            content = u"\n".join(content.split(br_tags.group()))
        content = xml_cleanup_spans_end(span_end_re, content, has_cursive)
        content = html.unescape(content)

        start = start_re.search(s).group(1)
        end = end_re.search(s).group(1)
        if len(start.split(":")) > 1:
            fmt_t = False
            start = start.replace(".", ",")
            end = end.replace(".", ",")

        # Lines sharing the same start and end time belong to one subtitle;
        # anything else closes the pending subtitle first.
        if prev_content and (prev_start, prev_end) != (start, end):
            append_subs(prev_start, prev_end, prev_content, fmt_t)
            prev_content = []
        prev_start, prev_end = start, end
        prev_content.append(content)

    append_subs(prev_start, prev_end, prev_content, fmt_t)

    return u"\n".join(
        u"{}\n{} --> {}\n{}\n".format(
            index, sub["start_time"], sub["end_time"], sub["content"])
        for index, sub in enumerate(subs, 1))


def main():
    """Convert every supported file of the input directory to ``.srt``."""
    directory = "."
    help_text = u"path to the {} directory (defaults to current directory)"
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, default=directory,
                        help=help_text.format("input", directory))
    parser.add_argument("-o", "--output", type=str, default=directory,
                        help=help_text.format("output", directory))
    parser.add_argument("-d", "--delay", type=int, default=0,
                        help="delay all subtitles by the given number of "
                             "milliseconds")
    args = parser.parse_args()

    for fn in os.listdir(args.input):
        stem, extension = os.path.splitext(fn)
        if extension.lower() not in SUPPORTED_EXTENSIONS:
            continue
        with codecs.open(os.path.join(args.input, fn), 'rb', "utf-8") as f:
            text = f.read()
        # The input is fully read and closed before the output is opened, so
        # shifting an .srt file in place is safe.
        out_path = os.path.join(args.output, stem + ".srt")
        with codecs.open(out_path, 'wb', "utf-8") as f:
            f.write(to_srt(text, extension, args.delay))


if __name__ == '__main__':
    main()