/* * Subtitle reader with format autodetection * * Copyright (c) 2001 laaz * Some code cleanup & realloc() by A'rpi/ESP-team * * This file is part of MPlayer. * * MPlayer is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * MPlayer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along * with MPlayer; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include #include #include #include #ifndef _MSC_VER #include #include #endif #ifdef HAVE_GLOB #include #else #include "osdep/glob.h" #endif #include "ass_mp.h" #include "config.h" #include "mp_msg.h" #include "mpcommon.h" #include "path.h" #include "subreader.h" #include "subassconvert.h" #include "sub.h" #include "vobsub.h" #include "stream/stream.h" #include "libavutil/common.h" #include "libavutil/avstring.h" #include "osdep/osdep.h" #ifdef CONFIG_ENCA #include #endif #define ERR ((void *) -1) #ifdef CONFIG_ICONV #include #endif char *sub_cp=NULL; char *enca_sub_cp=NULL; #ifdef CONFIG_FRIBIDI #include char *fribidi_charset = NULL; ///character set that will be passed to FriBiDi int flip_hebrew = 1; ///flip subtitles using fribidi int fribidi_flip_commas = 0; ///flip comma when fribidi is used #endif /* Maximal length of line of a subtitle */ #define LINE_LEN 1000 static float mpsub_position=0; static float mpsub_multiplier=1.; static int sub_slacktime = 20000; //20 sec int sub_no_text_pp=0; // 1 => do not apply text post-processing // like {\...} elimination in SSA format. int sub_match_fuzziness=0; // level of sub name matching fuzziness /* Use the SUB_* constant defined in the header file */ int sub_format=SUB_INVALID; #ifdef CONFIG_SORTSUB /* Some subtitling formats, namely AQT and Subrip09, define the end of a subtitle as the beginning of the following. Since currently we read one subtitle at time, for these format we keep two global *subtitle, previous_aqt_sub and previous_subrip09_sub, pointing to previous subtitle, so we can change its end when we read current subtitle starting time. When CONFIG_SORTSUB is defined, we use a single global unsigned long, previous_sub_end, for both (and even future) formats, to store the end of the previous sub: it is initialized to 0 in sub_read_file and eventually modified by sub_read_aqt_line or sub_read_subrip09_line. */ unsigned long previous_sub_end; #endif #ifdef __MINGW32__ #include static FILE *win32_fopen_sub(const char *path) { int cnt; wchar_t path_w[MAX_PATH]; cnt = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, path, -1, path_w, sizeof(path_w) / sizeof(*path_w)); if (cnt <= 0) return fopen(path, "rt"); else return _wfopen(path_w, L"rt"); } #endif static int eol(unsigned char p) { return p=='\r' || p=='\n' || p=='\0'; } /* Remove leading and trailing space */ static void trail_space(unsigned char *s) { int i = 0; while (isspace(s[i])) ++i; if (i) strcpy(s, s + i); i = strlen(s) - 1; while (i > 0 && isspace(s[i])) s[i--] = '\0'; } static char *stristr(const char *haystack, const char *needle) { int len = 0; const char *p = haystack; if (!(haystack && needle)) return NULL; len=strlen(needle); while (*p != '\0') { if (av_strncasecmp(p, needle, len) == 0) return (char*)p; p++; } return NULL; } static void sami_add_line(subtitle *current, char *buffer, char **pos) { char *p = *pos; *p = 0; trail_space(buffer); if (*buffer && current->lines < SUB_MAX_TEXT) current->text[current->lines++] = strdup(buffer); *pos = buffer; } static subtitle *sub_read_line_sami(stream_t* st, subtitle *current, int utf16) { static char line[LINE_LEN+1]; static char *s = NULL, *slacktime_s; char text[LINE_LEN+1], *p=NULL, *q; int state; current->lines = current->start = current->end = 0; current->alignment = SUB_ALIGNMENT_BOTTOMCENTER; state = 0; /* read the first line */ if (!s) if (!(s = stream_read_line(st, line, LINE_LEN, utf16))) return 0; do { switch (state) { case 0: /* find "START=" or "Slacktime:" */ slacktime_s = stristr (s, "Slacktime:"); if (slacktime_s) sub_slacktime = strtol (slacktime_s+10, NULL, 0) / 10; s = stristr (s, "Start="); if (s) { current->start = strtol (s + 6, &s, 0) / 10; /* eat '>' */ for (; *s != '>' && *s != '\0'; s++); s++; state = 1; continue; } break; case 1: /* find (optional) " TAG */ if (*s == '\0') break; s++; continue; case 2: /* find ">" */ if ((s = strchr (s, '>'))) { s++; state = 3; p = text; continue; } break; case 3: /* get all text until '<' appears */ if (p - text >= LINE_LEN) sami_add_line(current, text, &p); if (*s == '\0') break; else if (!av_strncasecmp (s, "
", 4)) { sami_add_line(current, text, &p); s += 4; } else if ((*s == '{') && !sub_no_text_pp) { state = 5; ++s; continue; } else if (*s == '<') { state = 4; } else if (!av_strncasecmp (s, " ", 6)) { *p++ = ' '; s += 6; } else if (*s == '\t') { *p++ = ' '; s++; } else if (*s == '\r' || *s == '\n') { s++; } else *p++ = *s++; /* skip duplicated space */ if (p > text + 2) if (*(p-1) == ' ' && *(p-2) == ' ') p--; continue; case 4: /* get current->end or skip */ q = stristr (s, "Start="); if (q) { current->end = strtol (q + 6, &q, 0) / 10 - 1; *p = '\0'; trail_space (text); if (text[0] != '\0') current->text[current->lines++] = strdup (text); if (current->lines > 0) { state = 99; break; } state = 0; continue; } s = strchr (s, '>'); if (s) { s++; state = 3; continue; } break; case 5: /* get rid of {...} text, but read the alignment code */ if ((*s == '\\') && (*(s + 1) == 'a') && !sub_no_text_pp) { if (stristr(s, "\\a1") != NULL) { current->alignment = SUB_ALIGNMENT_BOTTOMLEFT; s = s + 3; } if (stristr(s, "\\a2") != NULL) { current->alignment = SUB_ALIGNMENT_BOTTOMCENTER; s = s + 3; } else if (stristr(s, "\\a3") != NULL) { current->alignment = SUB_ALIGNMENT_BOTTOMRIGHT; s = s + 3; } else if ((stristr(s, "\\a4") != NULL) || (stristr(s, "\\a5") != NULL) || (stristr(s, "\\a8") != NULL)) { current->alignment = SUB_ALIGNMENT_TOPLEFT; s = s + 3; } else if (stristr(s, "\\a6") != NULL) { current->alignment = SUB_ALIGNMENT_TOPCENTER; s = s + 3; } else if (stristr(s, "\\a7") != NULL) { current->alignment = SUB_ALIGNMENT_TOPRIGHT; s = s + 3; } else if (stristr(s, "\\a9") != NULL) { current->alignment = SUB_ALIGNMENT_MIDDLELEFT; s = s + 3; } else if (stristr(s, "\\a10") != NULL) { current->alignment = SUB_ALIGNMENT_MIDDLECENTER; s = s + 4; } else if (stristr(s, "\\a11") != NULL) { current->alignment = SUB_ALIGNMENT_MIDDLERIGHT; s = s + 4; } } if (*s == '}') state = 3; ++s; continue; } /* read next line */ if (state != 99 && !(s = stream_read_line (st, line, LINE_LEN, utf16))) { if (current->start > 0) { break; // if it is the last subtitle } else { return 0; } } } while (state != 99); // For the last subtitle if (current->end <= 0) { current->end = current->start + sub_slacktime; sami_add_line(current, text, &p); } return current; } static const char *sub_readtext(const char *source, char **dest) { int len=0; const char *p=source; // printf("src=%p dest=%p \n",source,dest); while ( !eol(*p) && *p!= '|' ) { p++,len++; } *dest= malloc (len+1); if (!*dest) {return ERR;} strncpy(*dest, source, len); (*dest)[len]=0; while (*p=='\r' || *p=='\n' || *p=='|') p++; if (*p) return p; // not-last text field else return NULL; // last text field } static subtitle *set_multiline_text(subtitle *current, const char *text, int start) { int i = start; while ((text = sub_readtext(text, current->text + i))) { if (current->text[i] == ERR) return ERR; i++; if (i >= SUB_MAX_TEXT) { mp_msg(MSGT_SUBREADER, MSGL_WARN, "Too many lines in a subtitle\n"); current->lines = i; return current; } } current->lines = i + 1; return current; } static subtitle *sub_read_line_microdvd(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; char line2[LINE_LEN+1]; char *p; do { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; } while ((sscanf (line, "{%ld}{}%[^\r\n]", &(current->start), line2) < 2) && (sscanf (line, "{%ld}{%ld}%[^\r\n]", &(current->start), &(current->end), line2) < 3)); #ifdef CONFIG_ASS if (ass_enabled) { subassconvert_microdvd(line2, line, LINE_LEN + 1); p = line; } else #endif p = line2; return set_multiline_text(current, p, 0); } static subtitle *sub_read_line_mpl2(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; char line2[LINE_LEN+1]; do { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; } while ((sscanf (line, "[%ld][%ld]%[^\r\n]", &(current->start), &(current->end), line2) < 3)); current->start *= 10; current->end *= 10; return set_multiline_text(current, line2, 0); } static subtitle *sub_read_line_subrip(stream_t* st, subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3,a4,b1,b2,b3,b4; char *p=NULL, *q=NULL; int len; while (1) { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if (sscanf (line, "%d:%d:%d.%d,%d:%d:%d.%d",&a1,&a2,&a3,&a4,&b1,&b2,&b3,&b4) < 8) continue; current->start = a1*360000+a2*6000+a3*100+a4; current->end = b1*360000+b2*6000+b3*100+b4; if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; p=q=line; for (current->lines=1; current->lines < SUB_MAX_TEXT; current->lines++) { for (q=p,len=0; *p && *p!='\r' && *p!='\n' && *p!='|' && strncmp(p,"[br]",4); p++,len++); current->text[current->lines-1]=malloc (len+1); if (!current->text[current->lines-1]) return ERR; strncpy (current->text[current->lines-1], q, len); current->text[current->lines-1][len]='\0'; if (!*p || *p=='\r' || *p=='\n') break; if (*p=='|') p++; else while (*p++!=']'); } break; } return current; } #ifdef CONFIG_ASS static subtitle *sub_ass_read_line_subviewer(stream_t *st, subtitle *current, int utf16) { int h1, m1, s1, ms1, h2, m2, s2, ms2, j = 0; while (!current->text[0]) { char line[LINE_LEN + 1], full_line[LINE_LEN + 1]; int i; /* Parse SubRip header */ if (!stream_read_line(st, line, LINE_LEN, utf16)) return NULL; if (sscanf(line, "%d:%d:%d%*[,.:]%d --> %d:%d:%d%*[,.:]%d", &h1, &m1, &s1, &ms1, &h2, &m2, &s2, &ms2) < 8) continue; current->start = h1 * 360000 + m1 * 6000 + s1 * 100 + ms1 / 10; current->end = h2 * 360000 + m2 * 6000 + s2 * 100 + ms2 / 10; /* Concat lines */ full_line[0] = 0; for (i = 0; i < SUB_MAX_TEXT; i++) { int blank = 1, len = 0; char *p; if (!stream_read_line(st, line, LINE_LEN, utf16)) break; for (p = line; *p != '\n' && *p != '\r' && *p; p++, len++) if (*p != ' ' && *p != '\t') blank = 0; if (blank) break; *p = 0; if (len >= sizeof(full_line) - j - 2) break; if (j != 0) full_line[j++] = '\n'; strcpy(&full_line[j], line); j += len; } /* Use the ASS/SSA converter to transform the whole lines */ if (full_line[0]) { char converted_line[LINE_LEN + 1]; subassconvert_subrip(full_line, converted_line, LINE_LEN + 1); current->text[0] = strdup(converted_line); current->lines = 1; } } return current; } #endif static subtitle *sub_read_line_subviewer(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3,a4,b1,b2,b3,b4; char *p=NULL; int i,len; #ifdef CONFIG_ASS if (ass_enabled) return sub_ass_read_line_subviewer(st, current, utf16); #endif while (!current->text[0]) { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if ((len=sscanf (line, "%d:%d:%d%*[,.:]%d --> %d:%d:%d%*[,.:]%d",&a1,&a2,&a3,&a4,&b1,&b2,&b3,&b4)) < 8) continue; current->start = a1*360000+a2*6000+a3*100+a4/10; current->end = b1*360000+b2*6000+b3*100+b4/10; for (i=0; itext[i]=malloc (len+1); if (!current->text[i]) return ERR; //strncpy (current->text[i], line, len); current->text[i][len]='\0'; for(; j') { skip=0; continue; } if(line[j]=='<') { skip=1; continue; } if(skip) { continue; } *curptr=line[j]; curptr++; } *curptr='\0'; i++; } else { break; } } current->lines=i; } return current; } static subtitle *sub_read_line_subviewer2(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3,a4; char *p=NULL; int i,len; while (!current->text[0]) { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if (line[0]!='{') continue; if ((len=sscanf (line, "{T %d:%d:%d:%d",&a1,&a2,&a3,&a4)) < 4) continue; current->start = a1*360000+a2*6000+a3*100+a4/10; for (i=0; itext[i]=malloc (len+1); if (!current->text[i]) return ERR; strncpy (current->text[i], line, len); current->text[i][len]='\0'; ++i; } else { break; } } current->lines=i; } return current; } static subtitle *sub_read_line_vplayer(stream_t *st,subtitle *current, int utf16) { char line[LINE_LEN+1]; int a1,a2,a3; char *p=NULL, separator; int len,plen; while (!current->text[0]) { if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; if ((len=sscanf (line, "%d:%d:%d%c%n",&a1,&a2,&a3,&separator,&plen)) < 4) continue; if (!(current->start = a1*360000+a2*6000+a3*100)) continue; /* removed by wodzu p=line; // finds the body of the subtitle for (i=0; i<3; i++){ p=strchr(p,':'); if (p==NULL) break; ++p; } if (p==NULL) { printf("SUB: Skipping incorrect subtitle line!\n"); continue; } */ // by wodzu: hey! this time we know what length it has! what is // that magic for? it can't deal with space instead of third // colon! look, what simple it can be: p = &line[ plen ]; if (*p!='|') { // return set_multiline_text(current, p, 0); } } return current; } static subtitle *sub_read_line_google(stream_t *st, subtitle *current, int utf16) { uint8_t part[LINE_LEN+1]; uint8_t *p; double start, duration; do { if (!stream_read_until(st, part, LINE_LEN, '>', utf16)) return NULL; } while (sscanf(part, "start = start * 100; current->end = current->start + duration * 100; // find start of end tag if (!stream_read_until(st, part, LINE_LEN, '<', utf16)) return NULL; // discard end tag opening p = strchr(part, '<'); if (p) *p = 0; // This is the actual text. if (set_multiline_text(current, part, 0) == ERR) return ERR; // discard rest of closing tag if (!stream_read_until(st, part, LINE_LEN, '>', utf16)) return NULL; return current; } static subtitle *sub_read_line_rt(stream_t *st,subtitle *current, int utf16) { //TODO: This format uses quite rich (sub/super)set of xhtml // I couldn't check it since DTD is not included. // WARNING: full XML parses can be required for proper parsing char line[LINE_LEN+1]; int a1,a2,a3,a4,b1,b2,b3,b4; char *p=NULL,*next=NULL; int plen; while (!current->text[0]) { int match = 0; if (!stream_read_line (st, line, LINE_LEN, utf16)) return NULL; //TODO: it seems that format of time is not easily determined, it may be 1:12, 1:12.0 or 0:1:12.0 //to describe the same moment in time. Maybe there are even more formats in use. //This probably should be changed to do something nicer than //"brute-forcing" a long list of format strings. //if ((len=sscanf (line, "