19 #include <glog/logging.h> 28 #if !FOLLY_SSE_PREREQ(4, 2) 43 #include <emmintrin.h> 44 #include <nmmintrin.h> 45 #include <smmintrin.h> 55 static constexpr
size_t kMinPageSize = 4096;
58 "kMinPageSize must be at least SSE register size");
61 static inline uintptr_t page_for(
T*
addr) {
62 return reinterpret_cast<uintptr_t
>(
addr) / kMinPageSize;
65 static inline size_t nextAlignedIndex(
const char* arr) {
66 auto firstPossible =
reinterpret_cast<uintptr_t
>(arr) + 1;
68 ((firstPossible + 15) & ~0xF)
73 size_t qfind_first_byte_of_needles16(
76 DCHECK_GT(haystack.
size(), 0u);
77 DCHECK_GT(needles.
size(), 0u);
78 DCHECK_LE(needles.
size(), 16u);
79 if ((needles.
size() <= 2 && haystack.
size() >= 256) ||
81 (haystack.
size() < 16 &&
82 page_for(haystack.
end() - 1) != page_for(haystack.
data() + 15)) ||
84 page_for(needles.
end() - 1) != page_for(needles.
data() + 15)) {
88 auto arr2 = _mm_loadu_si128_unchecked(
89 reinterpret_cast<const __m128i*>(needles.
data()));
91 auto arr1 = _mm_loadu_si128_unchecked(
92 reinterpret_cast<const __m128i*>(haystack.
data()));
94 _mm_cmpestri(arr2,
int(needles.
size()), arr1,
int(haystack.
size()), 0);
100 size_t i = nextAlignedIndex(haystack.
data());
101 for (; i < haystack.
size(); i += 16) {
102 arr1 = _mm_load_si128_unchecked(
103 reinterpret_cast<const __m128i*>(haystack.
data() +
i));
104 index = _mm_cmpestri(
105 arr2,
int(needles.
size()), arr1,
int(haystack.
size() -
i), 0);
110 return std::string::npos;
117 template <
bool HAYSTACK_ALIGNED>
118 size_t scanHaystackBlock(
122 DCHECK_GT(needles.
size(), 16u);
124 blockStartIdx + 16 <= haystack.
size() ||
125 (page_for(haystack.
data() + blockStartIdx) ==
126 page_for(haystack.
data() + blockStartIdx + 15)));
129 if (HAYSTACK_ALIGNED) {
130 arr1 = _mm_load_si128_unchecked(
131 reinterpret_cast<const __m128i*>(haystack.
data() + blockStartIdx));
133 arr1 = _mm_loadu_si128_unchecked(
134 reinterpret_cast<const __m128i*>(haystack.
data() + blockStartIdx));
138 auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.
data()));
140 _mm_cmpestri(arr2, 16, arr1,
int(haystack.
size() - blockStartIdx), 0);
142 size_t j = nextAlignedIndex(needles.
data());
143 for (; j < needles.
size(); j += 16) {
144 arr2 = _mm_load_si128_unchecked(
145 reinterpret_cast<const __m128i*>(needles.
data() + j));
147 auto index = _mm_cmpestri(
149 int(needles.
size() - j),
151 int(haystack.
size() - blockStartIdx),
157 return blockStartIdx +
b;
159 return std::string::npos;
170 return std::string::npos;
171 }
else if (needles.
size() <= 16) {
174 return qfind_first_byte_of_needles16(haystack, needles);
177 if (haystack.
size() < 16 &&
178 page_for(haystack.
end() - 1) != page_for(haystack.
data() + 16)) {
180 if (haystack.
size() <= 2) {
186 auto ret = scanHaystackBlock<false>(haystack, needles, 0);
187 if (ret != std::string::npos) {
191 size_t i = nextAlignedIndex(haystack.data());
192 for (; i < haystack.size(); i += 16) {
193 ret = scanHaystackBlock<true>(haystack, needles,
i);
194 if (ret != std::string::npos) {
199 return std::string::npos;
size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, const StringPieceLite needles)
size_t qfind_first_byte_of_std(const StringPieceLite haystack, const StringPieceLite needles)
size_t qfind_first_byte_of_byteset(const StringPieceLite haystack, const StringPieceLite needles)
—— Concurrent Priority Queue Implementation ——
const char * data() const
size_t qfind_first_byte_of_nosse(const StringPieceLite haystack, const StringPieceLite needles)
ThreadPoolListHook * addr