// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "fuzzed_data_provider.h" #include #include "util.h" namespace atheris { namespace { uint16_t FastByteToHex(unsigned char byte) { const char hex[] = "0123456789ABCDEF"; return (hex[byte & 0xf] << 8) + static_cast(hex[(byte >> 4) & 0xf]); } } // namespace py::object FuzzedDataProvider::ConsumeUnicodeImpl(size_t count, bool filter_surrogates) { if (count == 0) return UnicodeFromKindAndData(1, nullptr, 0); if (remaining_bytes_ == 0) return UnicodeFromKindAndData(1, nullptr, 0); if (remaining_bytes_ == 1) { Advance(1); return UnicodeFromKindAndData(1, nullptr, 0); } uint8_t string_spec = *data_ptr_; Advance(1); // 50% of the time, make a pure ASCII string. If we didn't do this, it would // be unlikely for libFuzzer to produce an ASCII string, so any API that // expected an ASCII string might be difficult to fuzz. if (string_spec & 1) { size_t bytes = std::min(count, remaining_bytes_); std::string buf(reinterpret_cast(data_ptr_), bytes); for (char& c : buf) { c &= 0b01111111; // Unset the first bit, clamp to ASCII } py::object ret = UnicodeFromKindAndData(1, buf.data(), buf.size()); Advance(bytes); return ret; } if (string_spec & 2) { // Otherwise, 50% chance of utf-16-compatible string size_t bytes = std::min(count * 2, remaining_bytes_); size_t even_bytes = bytes & ~1ULL; std::vector buf(even_bytes / 2); if (!buf.empty()) { memcpy(&buf[0], data_ptr_, even_bytes); } if (filter_surrogates) { for (uint16_t& codepoint : buf) { if (codepoint >= 0xd800 && codepoint < 0xe000) codepoint -= 0xd800; } } py::object ret = UnicodeFromKindAndData(2, buf.data(), buf.size()); Advance(bytes); return ret; } else { // Otherwise, full 21-bitish Unicode characters, encoded into 32-bit chunks. size_t bytes = std::min(count * 4, remaining_bytes_); size_t group_bytes = bytes & ~3ULL; std::vector buf(group_bytes / 4); if (!buf.empty()) { memcpy(buf.data(), data_ptr_, group_bytes); } for (uint32_t& codepoint : buf) { codepoint &= 0x1fffffU; // 21 bits // The actual maximum is 0x10ffff, so if bit 0x100000 is set, zero out // nibble 0x0f0000. if (codepoint & 0x100000U) { codepoint &= ~0x0f0000U; } } if (filter_surrogates) { for (uint32_t& codepoint : buf) { if (codepoint >= 0xd800U && codepoint < 0xe000U) codepoint -= 0xd800U; } } auto ret = UnicodeFromKindAndData(4, buf.data(), buf.size()); Advance(bytes); return ret; } } py::bytes FuzzedDataProvider::ConsumeBytes(size_t count) { size_t num_bytes = std::min(count, remaining_bytes_); if (!num_bytes) return py::bytes("", 0); py::bytes ret(reinterpret_cast(data_ptr_), num_bytes); Advance(num_bytes); return ret; } py::object FuzzedDataProvider::ConsumeString(size_t count) { #if PY_MAJOR_VERSION >= 3 return ConsumeUnicode(count); #else return ConsumeBytes(count); #endif // PY_MAJOR_VERSION >= 3 } #define INT_TO_PYINT_FUNC PyLong_FromLong #define UINT_TO_PYINT_FUNC PyLong_FromUnsignedLong py::int_ FuzzedDataProvider::ConsumeInt(size_t bytes) { bytes = std::min(bytes, remaining_bytes_); PyObject* ret = nullptr; if (bytes == 0) { ret = INT_TO_PYINT_FUNC(0); } else if (bytes == 1) { ret = INT_TO_PYINT_FUNC(*reinterpret_cast(data_ptr_)); } else if (bytes == 2) { ret = INT_TO_PYINT_FUNC(*reinterpret_cast(data_ptr_)); } else if (bytes == 4) { ret = INT_TO_PYINT_FUNC(*reinterpret_cast(data_ptr_)); } else if (bytes == 8) { ret = PyLong_FromLongLong(*reinterpret_cast(data_ptr_)); } else if (bytes < 8) { int64_t tmp = 0; // We need to copy a number of bytes that fits into a 64-bit integer, so we // can just use that without having to go via a string. However, // we want to make sure sign extension works, so we get both positive and // negative values. To do that, we copy the integer into the high order // bytes of a 64-bit integer, then shift it down to the low bytes. memcpy(reinterpret_cast(&tmp) + 8 - bytes, data_ptr_, bytes); tmp >>= (64 - 8 * bytes); ret = PyLong_FromLongLong(tmp); } else { // The API for constructing a Big integer requires going through a string, // unfortunately. Emit a hex string, then convert it to a Big integer. // If the high-order bit is negative, do some Math (tm) to effect the // two's complement. char* buf = reinterpret_cast(alloca(2 * bytes + 2)); buf[2 * bytes + 1] = '\0'; uint16_t* number = reinterpret_cast(buf + 1); for (int i = 0; i < bytes; ++i) { number[bytes - i - 1] = FastByteToHex(data_ptr_[i]); } py::int_ obj(py::handle(PyLong_FromString(buf + 1, nullptr, 16)), false); if (data_ptr_[bytes - 1] & 0x80) { // number is negative py::int_ magnitude(1); magnitude = magnitude.attr("__lshift__")(bytes * 8); obj = obj - magnitude; } Advance(bytes); return obj; } Advance(bytes); return py::int_(py::handle(ret), false); } py::int_ FuzzedDataProvider::ConsumeUInt(size_t bytes) { bytes = std::min(bytes, remaining_bytes_); PyObject* ret = nullptr; if (bytes == 0) { ret = UINT_TO_PYINT_FUNC(0); } else if (bytes == 1) { uint8_t byte = *reinterpret_cast(data_ptr_); ret = UINT_TO_PYINT_FUNC(byte); } else if (bytes == 2) { uint16_t bytes = *reinterpret_cast(data_ptr_); ret = UINT_TO_PYINT_FUNC(bytes); } else if (bytes == 4) { uint32_t bytes = *reinterpret_cast(data_ptr_); ret = UINT_TO_PYINT_FUNC(bytes); } else if (bytes == 8) { uint64_t bytes = *reinterpret_cast(data_ptr_); ret = PyLong_FromUnsignedLongLong(bytes); } else if (bytes < 8) { uint64_t tmp = 0; // We need to copy a number of bytes that fits into a 64-bit integer, so we // can just use that without having to go via a string. memcpy(reinterpret_cast(&tmp), data_ptr_, bytes); ret = PyLong_FromUnsignedLongLong(tmp); } else { // The API for constructing a Big integer requires going through a string, // unfortunately. Emit a hex string, then convert it to a Big integer. char* buf = reinterpret_cast(alloca(2 * bytes + 2)); buf[2 * bytes + 1] = '\0'; uint16_t* number = reinterpret_cast(buf + 1); for (int i = 0; i < bytes; ++i) { number[bytes - i - 1] = FastByteToHex(data_ptr_[i]); } py::int_ obj(py::handle(PyLong_FromString(buf + 1, nullptr, 16)), false); Advance(bytes); return obj; } Advance(bytes); py::int_ tmp(py::handle(ret), false); return tmp; } py::int_ FuzzedDataProvider::ConsumeIntInRange(py::int_ min, py::int_ max) { py::int_ delta = max - min; if (delta < py::int_(0)) { std::cerr << Colorize(STDERR_FILENO, "ConsumeIntInRange: min must be <= max") << " (got min=" << min << ", max=" << max << std::endl; exit(1); } int size = py::int_(delta.attr("bit_length")()); if (size <= 64) { uint64_t native_delta = delta; uint64_t small_int = ConsumeSmallIntInRange(size, native_delta); py::int_ off = py::int_(small_int); py::int_ ret(min + off); return ret; } py::int_ ret = min + ConsumeInt(size / 8).attr("__mod__")(delta + py::int_(1)); return ret; } int64_t FuzzedDataProvider::ConsumeSmallIntInRange(size_t n, uint64_t range) { uint64_t result = 0; size_t offset = 0; while (offset < n && (range >> offset) > 0 && remaining_bytes_ != 0) { --remaining_bytes_; result = (result << 8) | data_ptr_[remaining_bytes_]; offset += 8; } if (range != std::numeric_limits::max()) result = result % (range + 1); return result; } py::list FuzzedDataProvider::ConsumeIntList(size_t count, size_t bytes) { py::list ret(count); for (size_t i = 0; i < count; ++i) { ret[i] = ConsumeInt(bytes); } return ret; } py::list FuzzedDataProvider::ConsumeIntListInRange(size_t len, py::int_ min, py::int_ max) { py::list ret(len); for (size_t i = 0; i < len; ++i) { ret[i] = ConsumeIntInRange(min, max); } return ret; } const double kUInt64ToProbabilityDivisor = std::numeric_limits::max(); double FuzzedDataProvider::ConsumeProbability() { uint64_t integral = 0; size_t bytes = std::min(8, remaining_bytes_); memcpy(&integral, data_ptr_, bytes); Advance(bytes); return static_cast(integral) / kUInt64ToProbabilityDivisor; } double FuzzedDataProvider::ConsumeFloat() { if (!remaining_bytes_) return 0.0; uint8_t type_val = *data_ptr_; Advance(1); if (type_val <= 10) { // Consume the same amount of bytes as for a regular float Advance(std::min(9, remaining_bytes_)); if (type_val == 0) return 0.0; if (type_val == 1) return -0.0; if (type_val == 2) return std::numeric_limits::infinity(); if (type_val == 3) return -std::numeric_limits::infinity(); if (type_val == 4) return std::numeric_limits::quiet_NaN(); // An sNaN is not valid in Python, so we don't create one here. if (type_val == 5) return std::numeric_limits::denorm_min(); if (type_val == 6) return -std::numeric_limits::denorm_min(); if (type_val == 7) return std::numeric_limits::min(); if (type_val == 8) return -std::numeric_limits::min(); if (type_val == 9) return std::numeric_limits::max(); if (type_val == 10) return -std::numeric_limits::max(); } double regular = ConsumeRegularFloat(); return regular; } double FuzzedDataProvider::ConsumeRegularFloat() { return ConsumeFloatInRange(-std::numeric_limits::max(), std::numeric_limits::max()); } double FuzzedDataProvider::ConsumeFloatInRange(double min, double max) { if (min > max) { std::cerr << Colorize(STDERR_FILENO, "ConsumeFloatInRange: min must be <= max") << " (got min=" << min << ", max=" << max << ")" << std::endl; exit(1); } double range = 0.0; double result = min; // Deal with overflow, in the event min and max are very far apart if (min < 0 && max > 0 && min + std::numeric_limits::max() < max) { range = (max / 2) - (min / 2); if (ConsumeBool()) { result += range; } } else { range = max - min; } double probability = ConsumeProbability(); return result + range * probability; } py::list FuzzedDataProvider::ConsumeFloatList(size_t count) { py::list ret(count); for (size_t i = 0; i < count; ++i) { ret[i] = py::float_(ConsumeFloat()); } return ret; } py::list FuzzedDataProvider::ConsumeRegularFloatList(size_t count) { py::list ret(count); for (size_t i = 0; i < count; ++i) { ret[i] = py::float_(ConsumeRegularFloat()); } return ret; } py::list FuzzedDataProvider::ConsumeProbabilityList(size_t count) { py::list ret(count); for (int i = 0; i < count; ++i) { ret[i] = py::float_(ConsumeProbability()); } return ret; } py::list FuzzedDataProvider::ConsumeFloatListInRange(size_t count, double min, double max) { py::list ret(count); for (size_t i = 0; i < count; ++i) { ret[i] = py::float_(ConsumeFloatInRange(min, max)); } return ret; } bool FuzzedDataProvider::ConsumeBool() { if (!remaining_bytes_) return false; bool ret = *data_ptr_ & 1; Advance(1); return ret; } py::object FuzzedDataProvider::PickValueInList(py::list list) { if (list.size() <= std::numeric_limits::max()) { return list[ConsumeSmallIntInRange(8, list.size() - 1)]; } else if (list.size() <= std::numeric_limits::max()) { return list[ConsumeSmallIntInRange(16, list.size() - 1)]; } else if (list.size() <= std::numeric_limits::max()) { return list[ConsumeSmallIntInRange(32, list.size() - 1)]; } else { return list[ConsumeSmallIntInRange(64, list.size() - 1)]; } } void FuzzedDataProvider::Advance(size_t bytes) { if (bytes > remaining_bytes_) { remaining_bytes_ = 0; } else { data_ptr_ += bytes; remaining_bytes_ -= bytes; } } } // namespace atheris