# Libraries
import re;

# Generator function
def gen(lex):
	# Language descriptor
	descriptor = lex.Descriptor([
		u"IGNORE",
		u"TAG",
		u"DECLARATION_TAG",
		u"PERCENT_TAG",
		u"QUESTION_TAG",
		u"SUB_DECLARATION",
		u"RAW_HTML_DATA",
	], u"IGNORE");
	flags = descriptor.flags;

	descriptor.define_types({
		u"COMMENT": 0,
		u"CDATA": 0,
		u"TEXT": 0,
		u"RAW_DATA": 0,
		u"TAG_OPEN": 0,
		u"TAG_CLOSE": 0,
		u"TAG_NAME": 0,
		u"ATTRIBUTE": 0,
		u"ATTRIBUTE_WHITESPACE": 0,
		u"ATTRIBUTE_OPERATOR": 0,
		u"ATTRIBUTE_STRING": 0,
	});

	# Tokenizer states; assumed reconstruction whose indices match the
	# "# state N" comments on the define_state calls below
	class states(object):
		DEFAULT = 0;
		TAG_NAME = 1;
		TAG_NAME_QUESTION = 2;
		TAG_NAME_DECLARATION = 3;
		ATTRIBUTES = 4;
		ATTRIBUTES_QUESTION = 5;
		ATTRIBUTES_DECLARATION = 6;
		RAW_DATA = 7;

	# Matching logic
	re_comment = re.compile(u"-->");
	re_cdata = re.compile(u"\\]\\]>");
	re_newlines_search = re.compile(u"[\\r\\n\u2028\u2029]");
	re_newlines_split = re.compile(u"[\\n\u2028\u2029]|\\r\\n?");
	re_bracket_open = re.compile(u"<");
	re_bracket_open_or_sub_close = re.compile(u"[<\\]]");
	re_bracket_open_char = re.compile(u"/|[\\w\\-\\?%!]");

	def match_string(self, p):
		# Match to end of string
		escaped = False;
		p_max = len(self.text);
		quote = self.text[self.pos];
		while (p < p_max):
			c = self.text[p];
			if (escaped):
				escaped = False;
				if (c == u"\r" and p + 1 < p_max and self.text[p + 1] == u"\n"):
					p += 1;
			else:
				if (c == quote):
					p += 1;
					break;
				elif (c == u"\\"):
					escaped = True;
				elif (string_contains_newline(c)):
					break;
			p += 1;
		return p;

	def match_generic(self, p, match_regex):
		# Match up to (and including) the terminating sequence
		m = match_regex.search(self.text, p);
		if (m is None): return len(self.text);
		return m.end();

	# Raw data terminator state
	class RawDataTerminator(object):
		def __init__(self, match_string, match_regex, match_token_type, match_token_flags, resume_state, resume_extra_flags, resume_tag):
			self.match_string = match_string;
			self.match_regex = match_regex;
			self.match_token_type = match_token_type;
			self.match_token_flags = match_token_flags;
			self.resume_state = resume_state;
			self.resume_extra_flags = resume_extra_flags;
			self.resume_tag = resume_tag;

	# Constructor
	def on_new(self):
		self.tags = [];
		self.raw_data_terminator = None;
	# Assumed hook-up: how on_new was originally registered is not recoverable;
	# it is attached here the same way the descriptor helpers are attached below
	setattr(descriptor, u"on_new", on_new);

	# Checking functions
	# (the body of check_is_text was damaged in the source; this version is
	# reconstructed from its surviving fragments)
	def check_is_text(self, p):
		start = p;
		p_max = len(self.text);
		# Inside a sub-declaration, "]" also terminates the text run
		regex = re_bracket_open_or_sub_close if ((self.extra_flags & flags.SUB_DECLARATION) != 0) else re_bracket_open;
		while (True):
			# <
			m = regex.search(self.text, p);
			if (m is None): break; # end of string
			p = m.end();
			if (p >= p_max): break; # end of string
			# Check for match
			if (m.group(0) == u"]" or re_bracket_open_char.match(self.text, p) is not None):
				# Match
				p_max = m.start();
				break;
		# End of string
		if (p_max > start): return [ 0 , p_max ];
		return None;

	def check_is_raw_data(self, p):
		start = p;
		m = self.raw_data_terminator.match_regex.match(self.text, p);
		if (m is None):
			p = len(self.text);
		else:
			p = m.end();
		if (p > start): return [ 0 , p ];
		return None;

	def check_is_raw_data_terminator(self, p):
		return [ 0 , p + len(self.raw_data_terminator.match_string) ];

	def create_token_comment(self, flags, p):
		p = match_generic(self, p, re_comment);
		return self.create_token(descriptor.COMMENT, flags, p);

	def create_token_cdata(self, flags, p):
		p = match_generic(self, p, re_cdata);
		return self.create_token(descriptor.CDATA, flags, p);

	def create_token_string(self, flags, p):
		p = match_string(self, p);
		return self.create_token(descriptor.ATTRIBUTE_STRING, flags, p);

	def create_token_raw_data_terminator(self, flags, p):
		rdt = self.raw_data_terminator;
		self.raw_data_terminator = None;
		self.state = rdt.resume_state;
		self.extra_flags = rdt.resume_extra_flags;
		if (rdt.resume_tag is not None):
			self.tags.append(rdt.resume_tag);
		return self.create_token(rdt.match_token_type, flags | rdt.match_token_flags, p);

	# The following factories are assumed reconstructions: their original
	# definitions did not survive, and these bodies are inferred from the
	# call sites in the state tables below
	def create_token_open_tag_fn(token_type, next_state, tag_flags):
		# Emit the opening punctuation, save the current state so the matching
		# closer can restore it, then switch to next_state with tag_flags active
		def fn(self, token_flags, p):
			self.tags.append(( self.state , self.extra_flags ));
			self.state = next_state;
			self.extra_flags |= tag_flags;
			return self.create_token(token_type, token_flags | tag_flags, p);
		return fn;

	def create_token_close_tag_fn(token_type, has_contents):
		# Emit the closing punctuation and restore the state saved when the tag
		# was opened; has_contents marks ">" (element content follows) as
		# opposed to self-terminating closers ("/>", "?>", "]")
		def fn(self, token_flags, p):
			if (len(self.tags) > 0):
				( self.state , self.extra_flags ) = self.tags.pop();
			else:
				# Unbalanced closer; recover in the default state
				self.state = states.DEFAULT;
				self.extra_flags = 0;
			return self.create_token(token_type, token_flags, p);
		return fn;

	def create_token_tag_name_fn(token_type, next_state):
		# Emit the tag name and advance to the attribute state for this tag kind
		def fn(self, token_flags, p):
			self.state = next_state;
			return self.create_token(token_type, token_flags, p);
		return fn;

	def create_token_raw_data_opener_fn(terminator, token_type, tag_flags):
		# Open a raw data region: the matcher regex consumes everything up to
		# (but not including) the terminator string
		match_regex = re.compile(u"(?:(?!" + re.escape(terminator) + u").)*", re.DOTALL);
		def fn(self, token_flags, p):
			self.raw_data_terminator = RawDataTerminator(terminator, match_regex, descriptor.TAG_CLOSE, tag_flags, self.state, self.extra_flags, None);
			self.state = states.RAW_DATA;
			self.extra_flags |= tag_flags;
			return self.create_token(token_type, token_flags | tag_flags, p);
		return fn;

	check_string = [
		lex.check_regex(u"['\"]"), # string
		create_token_string,
	];

	# state 0's opener entries for comments, CDATA, declaration, question, and
	# normal tags were damaged in the source; they are reconstructed here
	descriptor.define_state([ # state 0: outside of tags
		[
			check_is_text, # text
			lex.create_token(descriptor.TEXT),
		],
		[
			lex.check_string(u"<!--"), # comment
			create_token_comment,
		],
		[
			lex.check_string(u"<![CDATA["), # cdata
			create_token_cdata,
		],
		[
			lex.check_string(u"<!"), # declaration tag
			create_token_open_tag_fn(descriptor.TAG_OPEN, states.TAG_NAME_DECLARATION, flags.DECLARATION_TAG),
		],
		[
			lex.check_string(u"<?"), # question tag
			create_token_open_tag_fn(descriptor.TAG_OPEN, states.TAG_NAME_QUESTION, flags.QUESTION_TAG),
		],
		[
			lex.check_string(u"<%"),
			create_token_raw_data_opener_fn(u"%>", descriptor.TAG_OPEN, flags.PERCENT_TAG),
		],
		[
			lex.check_regex(u"</?"), # tag
			create_token_open_tag_fn(descriptor.TAG_OPEN, states.TAG_NAME, flags.TAG),
		],
		[
			lex.check_string(u"]"), # sub-declaration closer
			create_token_close_tag_fn(descriptor.TAG_CLOSE, False),
		],
		None, # self state should never be reached; will throw an exception if it happens
	], 0);

	descriptor.define_state([ # state 1: tag name
		[
			lex.check_regex(u"[a-zA-Z_]+"),
			create_token_tag_name_fn(descriptor.TAG_NAME, states.ATTRIBUTES),
		],
	], 0);

	descriptor.define_state([ # state 2: question tag name
		[
			lex.check_regex(u"[a-zA-Z_]+"),
			create_token_tag_name_fn(descriptor.TAG_NAME, states.ATTRIBUTES_QUESTION),
		],
	], 0);

	descriptor.define_state([ # state 3: declaration tag name
		[
			lex.check_regex(u"[a-zA-Z_]+"),
			create_token_tag_name_fn(descriptor.TAG_NAME, states.ATTRIBUTES_DECLARATION),
		],
	], 0);

	descriptor.define_state([ # state 4: attributes
		[
			lex.check_string(u">"), # closing tag
			create_token_close_tag_fn(descriptor.TAG_CLOSE, True),
		],
		[
			lex.check_string(u"/>"), # closing tag
			create_token_close_tag_fn(descriptor.TAG_CLOSE, False),
		],
		check_string,
		[
			lex.check_regex(u"[\\w\\-]+?(?=/?>|[^\\w\\-]|$)"), # word
			lex.create_token(descriptor.ATTRIBUTE),
		],
		[
			lex.check_regex(u"[^'\"\\w\\s]+?(?=/?>|['\"\\w\\s]|$)"), # operator
			lex.create_token(descriptor.ATTRIBUTE_OPERATOR),
		],
		[
			lex.check_regex(u"[\\s]+?(?=/?>|[^\\s]|$)"), # whitespace
			lex.create_token(descriptor.ATTRIBUTE_WHITESPACE),
		],
	], 0);

	descriptor.define_state([ # state 5: question attributes
		[
			lex.check_string(u"?>"), # closing tag
			create_token_close_tag_fn(descriptor.TAG_CLOSE, False),
		],
		check_string,
		[
			lex.check_regex(u"[\\w\\-]+?(?=\\?>|[^\\w\\-]|$)"), # word
			lex.create_token(descriptor.ATTRIBUTE),
		],
		[
			lex.check_regex(u"[^'\"\\w\\s]+?(?=\\?>|['\"\\w\\s]|$)"), # operator
			lex.create_token(descriptor.ATTRIBUTE_OPERATOR),
		],
		[
			lex.check_regex(u"[\\s]+?(?=\\?>|[^\\s]|$)"), # whitespace
			lex.create_token(descriptor.ATTRIBUTE_WHITESPACE),
		],
	], 0);

	descriptor.define_state([ # state 6: declaration attributes
		[
			lex.check_string(u">"), # closing tag
			create_token_close_tag_fn(descriptor.TAG_CLOSE, False),
		],
		[
			lex.check_string(u"["), # sub-declarations tag
			create_token_open_tag_fn(descriptor.TAG_OPEN, states.DEFAULT, flags.SUB_DECLARATION),
		],
		check_string,
		[
			lex.check_regex(u"[^'\"\\s]+?(?=[>\\[]|['\"\\s]|$)"), # word
			lex.create_token(descriptor.ATTRIBUTE),
		],
		[
			lex.check_regex(u"[\\s]+?(?=[>\\[]|[^\\s]|$)"), # whitespace
			lex.create_token(descriptor.ATTRIBUTE_WHITESPACE),
		],
	], 0);

	descriptor.define_state([ # state 7: raw data
		[
			check_is_raw_data,
			lex.create_token(descriptor.RAW_DATA),
		],
		[
			check_is_raw_data_terminator,
			create_token_raw_data_terminator,
		],
		None, # invalid
	], 0);

	# Additional functions
	def string_contains_newline(text):
		return re_newlines_search.search(text) is not None;

	def string_splitlines(text):
		return re_newlines_split.split(text);

	setattr(descriptor, u"string_contains_newline", string_contains_newline);
	setattr(descriptor, u"string_splitlines", string_splitlines);

	# Complete
	return descriptor;
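
# Usage sketch (illustrative only): `gen` builds a language descriptor but does
# not tokenize by itself; it assumes a host lexing framework module `lex` that
# provides Descriptor, check_string/check_regex/create_token, and a tokenizer
# driver. The module name `xml_lexer` and the `lex.tokenize` entry point named
# below are hypothetical and shown only to indicate the intended call pattern.
#
#     import lex;
#     import xml_lexer;
#
#     descriptor = xml_lexer.gen(lex);
#     source = u"<!DOCTYPE html><p class=\"intro\">Hi</p>";
#     # for token in lex.tokenize(descriptor, source):
#     #     handle(token); # token types: descriptor.TAG_OPEN, .TAG_NAME,
#     #                    # .ATTRIBUTE, .ATTRIBUTE_STRING, .TEXT, etc.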