/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ /* Utilities for UTF-16 and surrogate handling. */ #ifndef mozilla_Utf16_h #define mozilla_Utf16_h #include "mozilla/Assertions.h" #include "mozilla/Likely.h" namespace mozilla { /** * Code points U+10000 and greater lie outside the Basic Multilingual Plane and * must be encoded in UTF-16 as a high/low surrogate pair. */ constexpr inline char32_t kPlane1Base = 0x00010000; /** The Unicode replacement character, substituted for malformed input. */ constexpr inline char16_t kReplacementChar = 0xFFFD; /** The largest valid Unicode code point. */ constexpr inline char32_t kUnicodeMax = 0x0010FFFF; /** Whether a code point lies in the Basic Multilingual Plane. */ constexpr bool IsInBMP(char32_t aCodePoint) { return aCodePoint < kPlane1Base; } /** Whether a code unit is a high surrogate: U+D800 - U+DBFF. */ constexpr bool IsHighSurrogate(char32_t aChar) { return (aChar & 0xFFFFFC00) == 0xD800; } /** Whether a code unit is a low surrogate: U+DC00 - U+DFFF. */ constexpr bool IsLowSurrogate(char32_t aChar) { return (aChar & 0xFFFFFC00) == 0xDC00; } /** Whether a code unit is either kind of surrogate: U+D800 - U+DFFF. */ constexpr bool IsSurrogate(char32_t aChar) { return (aChar & 0xFFFFF800) == 0xD800; } /** Whether |aHigh| and |aLow| form a high/low surrogate pair. */ constexpr bool IsSurrogatePair(char32_t aHigh, char32_t aLow) { return IsHighSurrogate(aHigh) && IsLowSurrogate(aLow); } /** Whether a value is a valid Unicode code point (in range and not a * surrogate). */ constexpr bool IsValidCodePoint(char32_t aCodePoint) { return aCodePoint <= kUnicodeMax && !IsSurrogate(aCodePoint); } /** The high surrogate code unit for a non-BMP code point. */ constexpr char16_t HighSurrogate(char32_t aCodePoint) { MOZ_ASSERT(!IsInBMP(aCodePoint)); // Since (c - 0x10000) >> 10 == (c >> 10) - 0x80 and 0xD7C0 == 0xD800 - 0x80, // ((c - 0x10000) >> 10) + 0xD800 simplifies to the following. return char16_t((aCodePoint >> 10) + 0xD7C0); } /** The low surrogate code unit for a non-BMP code point. */ constexpr char16_t LowSurrogate(char32_t aCodePoint) { MOZ_ASSERT(!IsInBMP(aCodePoint)); // Since 0x10000 & 0x3FF == 0, (c - 0x10000) & 0x3FF == c & 0x3FF. return char16_t((aCodePoint & 0x3FF) | 0xDC00); } /** The code point encoded by a high/low surrogate pair. */ constexpr char32_t SurrogateToUCS4(char16_t aHigh, char16_t aLow) { MOZ_ASSERT(IsHighSurrogate(aHigh)); MOZ_ASSERT(IsLowSurrogate(aLow)); return ((char32_t(aHigh) & 0x3FF) << 10) + (char32_t(aLow) & 0x3FF) + kPlane1Base; } /** * Extract the next Unicode scalar value from a UTF-16 buffer and return it. * |*aBuffer| is advanced to the start of the next character. Upon encountering * an unpaired surrogate the return value is U+FFFD, |*aBuffer| is advanced over * the unpaired surrogate, and |*aErr| is set to true (if |aErr| is non-null). * * Note: This function never sets |*aErr| to false, to allow error accumulation * across multiple calls. * * Precondition: |*aBuffer < aEnd|. */ inline char32_t DecodeOneUtf16CodePoint(const char16_t** aBuffer, const char16_t* aEnd, bool* aErr = nullptr) { MOZ_ASSERT(aBuffer, "null buffer pointer pointer"); MOZ_ASSERT(aEnd, "null end pointer"); const char16_t* p = *aBuffer; MOZ_ASSERT(p, "null buffer"); MOZ_ASSERT(p < aEnd, "Bogus range"); char16_t c = *p++; if (MOZ_LIKELY(!IsSurrogate(c))) { *aBuffer = p; return c; } if (MOZ_LIKELY(IsHighSurrogate(c)) && MOZ_LIKELY(p != aEnd) && IsLowSurrogate(*p)) { char16_t low = *p; *aBuffer = ++p; return SurrogateToUCS4(c, low); } // Unpaired surrogate. *aBuffer = p; if (aErr) { *aErr = true; } return kReplacementChar; } } // namespace mozilla #endif /* mozilla_Utf16_h */