diff --git a/src/unicode.cpp b/src/unicode.cpp index 6b3b2dbe7d..cc7030d1dd 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -499,6 +499,7 @@ return bpe_offsets; } +#if 0 // use std::wregex to split the text static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { std::wregex expr(regex_expr); @@ -528,6 +529,7 @@ return bpe_offsets; } +#endif // use std::regex to split the text static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { @@ -818,20 +820,22 @@ //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str()); bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); } else { - // no unicode category used, we can use std::wregex directly - const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + fprintf(stderr, "Only use utf-8"); + std::abort(); + // // no unicode category used, we can use std::wregex directly + // const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); - // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback - std::wstring wtext(cpts.begin(), cpts.end()); - for (size_t i = 0; i < wtext.size(); ++i) { - if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) { - wtext[i] = 0x0B; - } - } + // // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback + // std::wstring wtext(cpts.begin(), cpts.end()); + // for (size_t i = 0; i < wtext.size(); ++i) { + // if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) { + // wtext[i] = 0x0B; + // } + // } - //printf("text: %s\n", text.c_str()); - //printf("regex_expr: %s\n", regex_expr.c_str()); - bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); + // //printf("text: %s\n", text.c_str()); + // //printf("regex_expr: %s\n", regex_expr.c_str()); + // bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); } } catch (std::regex_error & e) { fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());