import re _SEPARATOR = r'@' _RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) _AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE) _AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE) _UNDO_AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)' + _SEPARATOR + r'(\w)', re.UNICODE) _UNDO_AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)' + _SEPARATOR + r'(\w)', re.UNICODE) def _replace_with_separator(text, separator, regexs): replacement = r"\1" + separator + r"\2" result = text for regex in regexs: result = regex.sub(replacement, result) return result def split_sentence(text, best=True): text = re.sub(r'([。!??])([^”’])', r"\1\n\2", text) text = re.sub(r'(\.{6})([^”’])', r"\1\n\2", text) text = re.sub(r'(…{2})([^”’])', r"\1\n\2", text) text = re.sub(r'([。!??][”’])([^,。!??])', r'\1\n\2', text) for chunk in text.split("\n"): chunk = chunk.strip() if not chunk: continue if not best: yield chunk continue processed = _replace_with_separator(chunk, _SEPARATOR, [_AB_SENIOR, _AB_ACRONYM]) sents = list(_RE_SENTENCE.finditer(processed)) if not sents: yield chunk continue for sentence in sents: sentence = _replace_with_separator(sentence.group(), r" ", [_UNDO_AB_SENIOR, _UNDO_AB_ACRONYM]) yield sentence