diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 1a90a1eb88..6f5f3d2868 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -5,7 +5,6 @@
 #include <array>
 #include <cinttypes>
 #include <cstring>
-#include <future>
 
 #include "moz-overrides.h"
 
@@ -926,7 +925,7 @@
     GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
     std::vector<no_init<uint8_t>> read_buf;
-    std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+    std::vector<std::pair<ggml_tensor *, bool>> validation_result;
 
     // 4 staging buffers for async uploads, each sized 1MB seems to be a good default for single NVMe drives.
     // NVMe raid configurations might require more / larger buffers.
@@ -1041,9 +1040,7 @@
             uint8_t * data = (uint8_t *) mapping->addr() + weight->offs;
 
             if (check_tensors) {
-                validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
-                    return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
-                }));
+                validation_result.push_back(std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)));
             }
 
             GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
@@ -1066,9 +1063,7 @@
                 file->seek(weight->offs, SEEK_SET);
                 file->read_raw(cur->data, n_size);
                 if (check_tensors) {
-                    validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
-                        return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
-                    }));
+                    validation_result.push_back(std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)));
                 }
             } else {
                 // If upload_backend is valid load the tensor in chunks to pinned memory and upload the buffers asynchronously to the GPU.
@@ -1116,8 +1111,7 @@
 
     // check validation results
     bool validation_failed = false;
-    for (auto & future : validation_result) {
-        auto result = future.get();
+    for (const auto & result : validation_result) {
         if (!result.second) {
             LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
             validation_failed = true;
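For reference, a minimal standalone sketch of the pattern change this patch applies: per-tensor validation moves from `std::async`/`std::future` worker tasks to inline calls whose results are collected as plain `(tensor, bool)` pairs, which is why the `<future>` include is dropped. `Tensor` and `validate_rows` below are hypothetical stand-ins for `ggml_tensor` and `ggml_validate_row_data`; this is not the loader code itself, just the before/after shape of it.

```cpp
// Sketch only: contrasts the async validation pattern being removed with the
// synchronous one being introduced. Tensor and validate_rows are hypothetical
// stand-ins for ggml_tensor and ggml_validate_row_data.
#include <cstdio>
#include <future>
#include <utility>
#include <vector>

struct Tensor { const char * name; };

static bool validate_rows(const Tensor & t) {
    (void) t;
    return true; // stand-in for ggml_validate_row_data(cur->type, data, n_size)
}

int main() {
    std::vector<Tensor> tensors = { {"tok_embd.weight"}, {"output.weight"} };

    // Before: each validation runs on a background thread, results are futures.
    std::vector<std::future<std::pair<const Tensor *, bool>>> async_results;
    for (const auto & t : tensors) {
        const Tensor * p = &t;
        async_results.emplace_back(std::async(std::launch::async, [p] {
            return std::make_pair(p, validate_rows(*p));
        }));
    }
    for (auto & fut : async_results) {
        auto result = fut.get(); // blocks until the worker finishes
        if (!result.second) {
            std::fprintf(stderr, "tensor '%s' has invalid data\n", result.first->name);
        }
    }

    // After: validation runs inline on the loading thread; no <future> needed.
    std::vector<std::pair<const Tensor *, bool>> sync_results;
    for (const auto & t : tensors) {
        sync_results.push_back(std::make_pair(&t, validate_rows(t)));
    }
    for (const auto & result : sync_results) {
        if (!result.second) {
            std::fprintf(stderr, "tensor '%s' has invalid data\n", result.first->name);
        }
    }
    return 0;
}
```

The apparent trade-off is that with this change validation runs serially on the loading thread instead of overlapping with tensor I/O, in exchange for simpler code and no extra threads.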