proxygen
RangeSse42.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2015-present Facebook, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
19 #include <glog/logging.h>
20 
21 #include <folly/Portability.h>
22 
23 // Essentially, two versions of this file: one with an SSE42 implementation
24 // and one with a fallback implementation. We determine which version to use by
25 // testing for the presence of the required headers.
26 //
27 // TODO: Maybe this should be done by the build system....
28 #if !FOLLY_SSE_PREREQ(4, 2)
29 namespace folly {
30 namespace detail {
32  const StringPieceLite haystack,
33  const StringPieceLite needles) {
34  return qfind_first_byte_of_nosse(haystack, needles);
35 }
36 } // namespace detail
37 } // namespace folly
38 #else
39 #include <cstdint>
40 #include <limits>
41 #include <string>
42 
43 #include <emmintrin.h>
44 #include <nmmintrin.h>
45 #include <smmintrin.h>
46 
47 #include <folly/Likely.h>
48 #include <folly/detail/Sse.h>
49 
50 namespace folly {
51 namespace detail {
52 
// It's okay if pages are bigger than this (as powers of two), but they should
// not be smaller.
static constexpr size_t kMinPageSize = 4096;
static_assert(
    kMinPageSize >= 16,
    "kMinPageSize must be at least SSE register size");

// Maps an address onto the index of the kMinPageSize-sized page containing
// it. Two addresses agree under page_for() iff a 16-byte load touching both
// cannot fault on a page it shouldn't (pages are at least kMinPageSize).
template <typename T>
static inline uintptr_t page_for(T* addr) {
  const auto asInteger = reinterpret_cast<uintptr_t>(addr);
  return asInteger / kMinPageSize;
}
64 
// Returns the smallest index i >= 1 such that arr + i is 16-byte aligned.
// The result is always in [1, 16].
static inline size_t nextAlignedIndex(const char* arr) {
  // First candidate position is one past 'arr'; round it up to the next
  // 16-byte boundary, then convert back to an index relative to 'arr'.
  const auto candidate = reinterpret_cast<uintptr_t>(arr) + 1;
  const auto roundedUp = (candidate + 15) & ~uintptr_t(0xF);
  return 1 + (roundedUp - candidate);
}
71 
// Helper for the common case where needles.size() <= 16: the whole needle
// set fits in one SSE register, so each 16-byte haystack chunk is tested
// with a single PCMPESTRI. Returns the index of the first haystack byte
// that appears in needles, or std::string::npos if none does.
size_t qfind_first_byte_of_needles16(
    const StringPieceLite haystack,
    const StringPieceLite needles) {
  DCHECK_GT(haystack.size(), 0u);
  DCHECK_GT(needles.size(), 0u);
  DCHECK_LE(needles.size(), 16u);
  // Fall back to the non-SSE path when:
  //  - needles.size() <= 2 and the haystack is long (>= 256): presumably the
  //    scalar/byteset path wins there — TODO confirm against benchmarks;
  if ((needles.size() <= 2 && haystack.size() >= 256) ||
      // must bail if we can't even SSE-load a single segment of haystack:
      // a 16-byte load from data() touches data()..data()+15, which may
      // cross into an unmapped page when size() < 16
      (haystack.size() < 16 &&
       page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) ||
      // can't load needles into SSE register if it could cross page boundary
      page_for(needles.end() - 1) != page_for(needles.data() + 15)) {
    return detail::qfind_first_byte_of_nosse(haystack, needles);
  }

  // Page checks above guarantee this over-read of needles cannot fault.
  auto arr2 = _mm_loadu_si128_unchecked(
      reinterpret_cast<const __m128i*>(needles.data()));
  // do an unaligned load for first block of haystack
  auto arr1 = _mm_loadu_si128_unchecked(
      reinterpret_cast<const __m128i*>(haystack.data()));
  // Mode 0 = unsigned bytes, "equal any": index of the first haystack byte
  // (within the valid lengths given) matching any needle byte; 16 if none.
  auto index =
      _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0);
  if (index < 16) {
    return size_t(index);
  }

  // Now, we can do aligned loads hereafter: nextAlignedIndex returns the
  // first index (in [1, 16]) at which haystack.data() + i is 16-byte
  // aligned, so these loads never cross a page boundary.
  size_t i = nextAlignedIndex(haystack.data());
  for (; i < haystack.size(); i += 16) {
    arr1 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + i));
    // Explicit lengths make PCMPESTRI ignore bytes past the logical end,
    // so the trailing partial chunk needs no special casing.
    index = _mm_cmpestri(
        arr2, int(needles.size()), arr1, int(haystack.size() - i), 0);
    if (index < 16) {
      return i + index;
    }
  }
  return std::string::npos;
}
112 
// Scans a 16-byte block of haystack (starting at blockStartIdx) to find first
// needle. If HAYSTACK_ALIGNED, then haystack must be 16byte aligned.
// If !HAYSTACK_ALIGNED, then caller must ensure that it is safe to load the
// block. Returns the absolute haystack index of the first matching byte in
// this block, or std::string::npos if the block contains no needle byte.
template <bool HAYSTACK_ALIGNED>
size_t scanHaystackBlock(
    const StringPieceLite haystack,
    const StringPieceLite needles,
    uint64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16u); // <= 16 is handled by *needles16() method
  // Either the full 16-byte block is inside haystack, or the over-read
  // stays on the same kMinPageSize page and so cannot fault.
  DCHECK(
      blockStartIdx + 16 <= haystack.size() ||
      (page_for(haystack.data() + blockStartIdx) ==
       page_for(haystack.data() + blockStartIdx + 15)));

  __m128i arr1;
  if (HAYSTACK_ALIGNED) {
    arr1 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  } else {
    arr1 = _mm_loadu_si128_unchecked(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  }

  // This load is safe because needles.size() >= 16
  auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
  // Compare against the first 16 needles (mode 0: unsigned bytes,
  // "equal any" — index of first match, 16 if none).
  auto b =
      _mm_cmpestri(arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);

  // Cover the remaining needles with aligned loads. nextAlignedIndex
  // returns a value in [1, 16], so together with the first compare above
  // every needle byte is examined (overlap is harmless for "equal any").
  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    arr2 = _mm_load_si128_unchecked(
        reinterpret_cast<const __m128i*>(needles.data() + j));

    auto index = _mm_cmpestri(
        arr2,
        int(needles.size() - j),
        arr1,
        int(haystack.size() - blockStartIdx),
        0);
    // Keep the earliest match position across all needle chunks.
    b = std::min(index, b);
  }

  if (b < 16) {
    return blockStartIdx + b;
  }
  return std::string::npos;
}
161 
163  const StringPieceLite haystack,
164  const StringPieceLite needles);
165 
167  const StringPieceLite haystack,
168  const StringPieceLite needles) {
169  if (UNLIKELY(needles.empty() || haystack.empty())) {
170  return std::string::npos;
171  } else if (needles.size() <= 16) {
172  // we can save some unnecessary load instructions by optimizing for
173  // the common case of needles.size() <= 16
174  return qfind_first_byte_of_needles16(haystack, needles);
175  }
176 
177  if (haystack.size() < 16 &&
178  page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
179  // We can't safely SSE-load haystack. Use a different approach.
180  if (haystack.size() <= 2) {
181  return qfind_first_byte_of_std(haystack, needles);
182  }
183  return qfind_first_byte_of_byteset(haystack, needles);
184  }
185 
186  auto ret = scanHaystackBlock<false>(haystack, needles, 0);
187  if (ret != std::string::npos) {
188  return ret;
189  }
190 
191  size_t i = nextAlignedIndex(haystack.data());
192  for (; i < haystack.size(); i += 16) {
193  ret = scanHaystackBlock<true>(haystack, needles, i);
194  if (ret != std::string::npos) {
195  return ret;
196  }
197  }
198 
199  return std::string::npos;
200 }
201 } // namespace detail
202 } // namespace folly
203 #endif
size_t qfind_first_byte_of_sse42(const StringPieceLite haystack, const StringPieceLite needles)
Definition: RangeSse42.cpp:31
char b
size_t qfind_first_byte_of_std(const StringPieceLite haystack, const StringPieceLite needles)
Definition: RangeCommon.h:72
size_t qfind_first_byte_of_byteset(const StringPieceLite haystack, const StringPieceLite needles)
Definition: RangeCommon.cpp:42
folly::std T
—— Concurrent Priority Queue Implementation ——
Definition: AtomicBitSet.h:29
LogLevel min
Definition: LogLevel.cpp:30
const char * data() const
Definition: RangeCommon.h:43
size_t qfind_first_byte_of_nosse(const StringPieceLite haystack, const StringPieceLite needles)
Definition: RangeCommon.h:92
#define UNLIKELY(x)
Definition: Likely.h:48
const char * end() const
Definition: RangeCommon.h:49
ThreadPoolListHook * addr