import argparse
import codecs
import math
import os
import re
import html
# File extensions that main() picks up from the input directory. main()
# compares each entry against the lower-cased last four characters of a
# file name, so every extension here is a dot plus three characters.
SUPPORTED_EXTENSIONS = [".xml", ".vtt", ".srt"]
def leading_zeros(value, digits=2):
    """Return ``value`` as a string left-padded with zeros.

    Args:
        value: Anything accepted by ``str()`` (typically a non-negative int).
        digits: Minimum width of the result.

    Returns:
        The zero-padded string. Values that are already wider than
        ``digits`` are returned in full instead of being cut down to their
        last ``digits`` characters, and widths above six are padded
        correctly.
    """
    return str(value).zfill(digits)
def convert_time(raw_time):
    """Convert a tick-count string into an SRT ``HH:MM:SS,mmm`` timestamp.

    ``raw_time`` is read as a decimal integer whose last four digits are
    sub-millisecond and whose last seven digits are sub-second, i.e. one
    unit is 1/10,000,000 of a second. Sub-millisecond precision is dropped.

    Args:
        raw_time: String of decimal digits.

    Returns:
        The formatted timestamp. The hour field is at least two digits wide
        and grows as needed, so durations of 100 hours or more keep their
        leading digits.

    Raises:
        ValueError: If ``raw_time`` is not a valid integer literal.
    """
    ticks = int(raw_time)
    # Pure integer arithmetic: no float rounding, and no separate zero case.
    total_ms = ticks // 10_000
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_id_display_align_before(text):
    r"""
    Return the xml:id associated with displayAlign="before", or u"" if the
    pattern does not match anywhere in `text`.

    displayAlign="before" means the current sub will be displayed at the
    top rather than at the bottom. The xml:id found here is meant to be used
    to put an {\an8} position tag in the output file.
    """
    # NOTE(review): the pattern literal below is empty as it stands. An empty
    # pattern matches every string and defines no capture group, so the
    # group(1) call below cannot succeed. The original pattern text appears
    # to be missing -- restore it before relying on this function.
    align_before_re = re.compile(r'')
    has_align_before = re.search(align_before_re, text)
    if has_align_before:
        return has_align_before.group(1)
    return u""
def xml_get_cursive_style_ids(text):
    # xml_to_srt stores this function's result as `cursive_ids` and hands it
    # to xml_cleanup_spans_start.
    # NOTE(review): the section pattern below is just "(.*)" with DOTALL,
    # which matches the whole of `text`; it looks like surrounding delimiter
    # text has been lost from the literal -- confirm against the original.
    style_section = re.search("(.*)", text, flags=re.DOTALL)
    if not style_section:
        # No style section found: no style ids to report.
        return []
    style_ids_re = re.compile(
        # NOTE(review): this function is cut off here. The re.compile( call is
        # never closed, and the three lines that follow use names that are
        # not defined in this function (`s`, `cursive_ids`, `has_cursive`).
        # They look like the tail of xml_cleanup_spans_start, which
        # xml_to_srt calls but which has no `def` line in this file as it
        # stands. Both the end of this function and the head of that one
        # need to be restored.
        '" if s[1] in cursive_ids else u"")
        text = has_cursive[-1].join(text.split(s[0], 1))
    return text, has_cursive
def xml_cleanup_spans_end(span_end_re, text, has_cursive):
    """Replace closing span tags with ``</i>`` or remove them.

    Closing tags found by ``span_end_re`` are paired, in order, with the
    entries of ``has_cursive``. A truthy entry means the matching opening
    span was turned into an italic start tag, so the closing tag becomes
    ``</i>``; otherwise the closing tag is simply dropped.

    Args:
        span_end_re: Pattern (compiled or string) whose ``re.findall``
            results are the closing-tag strings to replace.
        text: Subtitle content to clean up.
        has_cursive: One entry per opening span, truthy for italic spans.

    Returns:
        ``text`` with one occurrence replaced per (tag, flag) pair; closing
        tags beyond the length of ``has_cursive`` are left untouched.
    """
    span_end_tags = re.findall(span_end_re, text)
    for tag, cursive in zip(span_end_tags, has_cursive):
        # Both branches used to be the empty string, so italics were never
        # closed in the output.
        closing = u"</i>" if cursive else u""
        text = text.replace(tag, closing, 1)
    return text
def to_srt(text, extension, delay_ms):
    """Convert subtitle ``text`` to SRT and apply the requested delay.

    ``extension`` is compared case-insensitively: ".xml" goes through
    xml_to_srt, ".vtt" through vtt_to_srt, and anything else is passed on
    unchanged. The result is then shifted by ``delay_ms`` milliseconds via
    shift_srt_timestamp.
    """
    kind = extension.lower()
    if kind == ".xml":
        converted = xml_to_srt(text)
    elif kind == ".vtt":
        converted = vtt_to_srt(text)
    else:
        converted = text
    return shift_srt_timestamp(converted, delay_ms)
def shift_srt_timestamp(text, delay_ms=0):
    """Shift every ``start --> end`` timestamp pair in SRT text.

    Args:
        text: SRT-formatted subtitle text.
        delay_ms: Milliseconds to add to each timestamp; may be negative.
            A falsy value returns ``text`` untouched.

    Returns:
        The text with all timestamp pairs shifted. Timestamps that a
        negative delay would push before zero are clamped to
        ``00:00:00,000`` rather than rendered with negative fields.
    """
    if not delay_ms:
        return text

    def shift_time(time_str):
        hours, minutes, rest = time_str.split(":")
        seconds, millis = rest.split(",")
        total_ms = (int(hours) * 3600000 + int(minutes) * 60000
                    + int(seconds) * 1000 + int(millis))
        # Clamp: floor division on a negative total would otherwise yield
        # output such as "-1:59:59,500".
        total_ms = max(0, total_ms + delay_ms)
        hours, total_ms = divmod(total_ms, 3600000)
        minutes, total_ms = divmod(total_ms, 60000)
        seconds, millis = divmod(total_ms, 1000)
        return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"

    def replace_timestamp(match):
        return f"{shift_time(match[1])} --> {shift_time(match[2])}"

    # Hours may be wider than two digits for very long streams.
    timestamp_regex = (r"(\d{2,}:\d{2}:\d{2},\d{3}) --> "
                       r"(\d{2,}:\d{2}:\d{2},\d{3})")
    return re.sub(timestamp_regex, replace_timestamp, text)
def convert_vtt_time(line):
    """Convert a WebVTT cue timing line into an SRT timing line.

    Each timestamp is normalised on its own: "." becomes ",", and a
    timestamp without an hour field (``mm:ss.ttt``) gets a ``00:`` prefix.
    Anything after the end timestamp (cue settings, a trailing carriage
    return) is discarded.

    Args:
        line: A line containing ``start --> end`` with optional settings.

    Returns:
        ``"HH:MM:SS,mmm --> HH:MM:SS,mmm"``.

    Raises:
        IndexError: If no end timestamp follows the arrow.
    """
    def normalise(stamp):
        stamp = stamp.strip().replace(".", ",")
        # Decide per timestamp: the start may omit hours while the end has
        # them, and prefixing both would corrupt the end time.
        if stamp.count(":") == 1:
            stamp = "00:" + stamp
        return stamp

    start, _, rest = line.partition("-->")
    # split() with no argument also drops tabs and "\r" before any settings.
    end = rest.split()[0]
    return "{} --> {}".format(normalise(start), normalise(end))
def vtt_to_srt(text):
    """
    Convert WebVTT subtitle text into SRT text.

    A cue starts at a line containing " --> " (converted with
    convert_vtt_time) and ends at the next empty line. Cues are numbered
    from 1 in the order they appear.

    Raises Exception if `text` does not start with "WEBVTT" (optionally
    preceded by a byte-order mark).
    """
    if not text.startswith(u"\ufeffWEBVTT") and not text.startswith(u"WEBVTT"):
        raise Exception(".vtt format must start with WEBVTT, wrong file?")
    # Maps a style name to its colour, see get_vtt_styles.
    styles = get_vtt_styles(text)
    # NOTE(review): this pattern has a single group and matches any line, yet
    # the loop below reads group(2), which does not exist for this pattern.
    # Part of the pattern literal appears to be missing -- restore it from
    # the original source.
    style_tag_re = re.compile(r'(.*)')
    lines = []  # finished cues, each "timing\ntext...\n\n"
    current_sub_line = []  # timing line + text lines of the cue being read
    for line in text.split("\n"):
        if current_sub_line:
            if line:
                style_tag = re.search(style_tag_re, line)
                if style_tag:
                    line = style_tag.group(2)  # line is just the text part
                    # Only the part of group(1) before the first "." is used
                    # as the style name.
                    color = styles.get(style_tag.group(1).split(".")[0])
                    if color:
                        # NOTE(review): the template has one placeholder but
                        # receives two arguments, so as written the line is
                        # replaced by the colour alone. The template text
                        # looks truncated -- confirm.
                        line = r'{}'.format(
                            color, line)
                current_sub_line.append(line)
            else:
                # An empty line closes the current cue.
                lines.append("\n".join(current_sub_line) + "\n\n")
                current_sub_line = []
        elif " --> " in line:
            # Outside a cue, everything except a timing line is ignored.
            current_sub_line = [convert_vtt_time(line)]
    if current_sub_line:
        # Input ended without a trailing empty line: flush the last cue.
        lines.append("\n".join(current_sub_line))
    return "".join((u"{}\n{}".format(i, l) for i, l in enumerate(lines, 1)))
def get_vtt_styles(text):  # just using it for color ATM
    """Collect colours declared for ``::cue(.name)`` selectors.

    A style is recorded only when the line right after the selector line
    carries a ``color: #...;`` declaration.

    Args:
        text: Full WebVTT document text.

    Returns:
        Dict mapping the style name (without the leading dot) to its colour
        string, e.g. ``{"yellow": "#FFFF00"}``.
    """
    style_name_re = re.compile(r'::cue\(\.(.*)\).*')
    color_re = re.compile(r'.*color: (\#.*);')
    styles = {}
    lines = text.split("\n")
    # Walk adjacent line pairs; a selector on the very last line has no
    # following line and is skipped instead of indexing past the end.
    for selector_line, next_line in zip(lines, lines[1:]):
        style_name = style_name_re.search(selector_line)
        if not style_name:
            continue
        color = color_re.search(next_line)
        if color:
            styles[style_name.group(1)] = color.group(1)
    return styles
def xml_to_srt(text):
    """
    Convert an XML subtitle document into SRT text.

    Every element found by `p_tag_re` becomes one subtitle line. Consecutive
    elements that share the same begin and end values are merged into a
    single cue. The result is the cues formatted as
    "index\\nstart --> end\\ncontent\\n", joined with newlines and numbered
    from 1.

    NOTE(review): several pattern literals in this function are empty or
    split across physical lines, which is not valid inside a single-quoted
    raw string. They appear to have lost part of their text and must be
    restored from the original source before this function can run.
    """
    def append_subs(start, end, prev_content, format_time):
        # Store one finished cue in `subs`. When `format_time` is true the
        # times are passed through convert_time, otherwise they are stored
        # as given.
        subs.append({
            "start_time": convert_time(start) if format_time else start,
            "end_time": convert_time(end) if format_time else end,
            "content": u"\n".join(prev_content),
        })
    # NOTE(review): this value is not referenced again in the visible code;
    # presumably it belonged in string_region_re below -- confirm.
    display_align_before = xml_id_display_align_before(text)
    # NOTE(review): literal is split across two lines (see docstring).
    p_tag_re = re.compile(r'(]*begin=[^>]*>.*?
)', re.DOTALL)
    sub_lines = p_tag_re.findall(text)
    subs = []
    # Timing of the cue currently being accumulated; 0 means "none yet".
    prev_time = {"start": 0, "end": 0}
    prev_content = []
    start = end = ''
    start_re = re.compile(r'begin\="([0-9:\.]*)')
    end_re = re.compile(r'end\="([0-9:\.]*)')
    # NOTE(review): literal is split across two lines (see docstring).
    content_re = re.compile(r'\">(.*)
', re.DOTALL)
    # Some span tags are used for italics; they are replaced by the italic
    # markup used in .srt files (presumably <i> and </i> -- the tag names are
    # missing from the original comment). All other uses are ignored.
    cursive_ids = xml_get_cursive_style_ids(text)
    # NOTE(review): both span patterns contain only an empty group; their
    # content appears to be missing.
    span_id_re = re.compile(r'()+')
    span_end_re = re.compile(r'()+')
    # NOTE(review): literal is split across two lines (see docstring).
    br_re = re.compile(r'(
)+')
    # One flag for the whole document: True means times are raw tick counts
    # that still need convert_time. It is never reset to True once cleared.
    fmt_t = True
    whitespace_re = re.compile(r'[\r\n\t]+')
    for s in sub_lines:
        # Collapse runs of CR/LF/TAB inside the element into one space.
        s = re.sub(whitespace_re, ' ', s)
        s, has_cursive = xml_cleanup_spans_start(
            span_id_re, cursive_ids, s)
        # NOTE(review): pattern and replacement are split across lines; the
        # replacement inserts a literal {\an8} in front of group 2.
        string_region_re = r'(.*)
'
        s = re.sub(string_region_re, r'{\\an8}\2
', s)
        content = re.search(content_re, s).group(1)
        br_tags = re.search(br_re, content)
        if br_tags:
            # Only the first matched break string is used as the separator
            # for the whole content.
            content = u"\n".join(content.split(br_tags.group()))
        content = xml_cleanup_spans_end(
            span_end_re, content, has_cursive)
        content = html.unescape(content)
        prev_start = prev_time["start"]
        start = re.search(start_re, s).group(1)
        end = re.search(end_re, s).group(1)
        if len(start.split(":")) > 1:
            # A begin value containing ":" is treated as already being a
            # clock time: skip convert_time and only swap "." for ",".
            fmt_t = False
            start = start.replace(".", ",")
            end = end.replace(".", ",")
        if (prev_start == start and prev_time["end"] == end) or not prev_start:
            # Same timing as the pending cue, or no pending cue yet (initial
            # start is 0): add this line to the pending cue instead of
            # emitting a new one.
            prev_time = {"start": start, "end": end}
            prev_content.append(content)
            continue
        # Timing changed: emit the pending cue and start a new one.
        append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t)
        prev_time = {"start": start, "end": end}
        prev_content = [content]
    # Emit the last pending cue. NOTE(review): if no element matched, start
    # and end are still '' and fmt_t is True, so convert_time('') is called
    # here -- confirm how empty documents should be handled.
    append_subs(start, end, prev_content, fmt_t)
    lines = (u"{}\n{} --> {}\n{}\n".format(
        s + 1, subs[s]["start_time"], subs[s]["end_time"], subs[s]["content"])
        for s in range(len(subs)))
    return u"\n".join(lines)
def main():
    """Command-line entry point.

    Reads every file in the input directory whose last four characters
    (lower-cased) are in SUPPORTED_EXTENSIONS, converts it with to_srt and
    writes the result as "<name without extension>.srt" in the output
    directory. Files are read and written as UTF-8.
    """
    directory = "."
    help_text = u"path to the {} directory (defaults to current directory)"

    parser = argparse.ArgumentParser()
    for short_flag, long_flag, label in (("-i", "--input", "input"),
                                         ("-o", "--output", "output")):
        parser.add_argument(short_flag, long_flag, type=str, default=directory,
                            help=help_text.format(label, directory))
    parser.add_argument("-d", "--delay", type=int, default=0,
                        help="delay all subtitles by the given number of milliseconds")
    args = parser.parse_args()

    for name in os.listdir(args.input):
        extension = name[-4:]
        if extension.lower() not in SUPPORTED_EXTENSIONS:
            continue
        source_path = u"{}/{}".format(args.input, name)
        target_path = u"{}/{}.srt".format(args.output, name[:name.rfind('.')])
        # Read the whole input before opening the output: both paths can be
        # the same file when an .srt is converted in place.
        with codecs.open(source_path, 'rb', "utf-8") as source_file:
            text = source_file.read()
        with codecs.open(target_path, 'wb', "utf-8") as target_file:
            target_file.write(to_srt(text, extension, args.delay))


if __name__ == '__main__':
    main()