diff --git a/ggml/include/gguf.h b/ggml/include/gguf.h index 79ee202062..63be8d26dc 100644 --- a/ggml/include/gguf.h +++ b/ggml/include/gguf.h @@ -78,7 +78,8 @@ GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); + GGML_API struct gguf_context * gguf_init_from_buffer(const void * buffer, size_t buffer_size, struct gguf_init_params params); + GGML_API struct gguf_context * gguf_init_from_file_handle(FILE * file, struct gguf_init_params params); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index a00c1b6369..ed5fd9fe8e 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -128,6 +128,8 @@ std::vector data; std::vector data_string; + gguf_kv() : is_array(false), type(GGUF_TYPE_COUNT) {} + template gguf_kv(const std::string & key, const T value) : key(key), is_array(false), type(type_to_gguf_type::value) { @@ -288,12 +290,112 @@ } }; +struct gguf_buffer_reader { + const uint8_t * buffer; + size_t buffer_size; + size_t offset; + + gguf_buffer_reader(const void * buffer, size_t buffer_size) + : buffer(static_cast(buffer)), buffer_size(buffer_size), offset(0) {} + + template + bool read(T & dst) const { + if (offset + sizeof(T) > buffer_size) { + return false; + } + memcpy(&dst, buffer + offset, sizeof(T)); + const_cast(this)->offset += sizeof(T); + return true; + } + + template + bool read(std::vector & dst, const size_t n) const { + dst.resize(n); + for (size_t i = 0; i < dst.size(); ++i) { + if constexpr (std::is_same::value) { + bool tmp; + if (!read(tmp)) { + return false; + } + dst[i] = tmp; + } else { + if (!read(dst[i])) { + return false; + } + } + } + return true; + } + + bool read(bool & dst) const { + int8_t tmp = -1; + if (!read(tmp)) { + return false; + } + dst = tmp != 0; + return true; + } + + bool read(enum ggml_type & dst) const { + int32_t tmp = -1; + if (!read(tmp)) { + return false; + } + dst = ggml_type(tmp); + return true; + } + + bool read(enum gguf_type & dst) const { + int32_t tmp = -1; + if (!read(tmp)) { + return false; + } + dst = gguf_type(tmp); + return true; + } + + bool read(std::string & dst) const { + uint64_t size = -1; + if (!read(size)) { + return false; + } + if (offset + size > buffer_size) { + return false; + } + dst.resize(size); + memcpy(dst.data(), buffer + offset, size); + const_cast(this)->offset += size; + return true; + } + + bool read(void * dst, const size_t size) const { + if (offset + size > buffer_size) { + return false; + } + memcpy(dst, buffer + offset, size); + const_cast(this)->offset += size; + return true; + } + + bool seek(size_t position) { + if (position > buffer_size) { + return false; + } + offset = position; + return true; + } + + size_t tell() const { + return offset; + } +}; + struct gguf_context * gguf_init_empty(void) { return new gguf_context; } -template -bool gguf_read_emplace_helper(const struct gguf_reader & gr, std::vector & kv, const std::string & key, const bool is_array, const size_t n) { +template +bool gguf_read_emplace_helper_template(const Reader & gr, std::vector & kv, const std::string & key, const bool is_array, const size_t n) { if (is_array) { std::vector value; try { @@ -318,8 +420,57 @@ return true; } -struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { - const struct gguf_reader gr(file); +template +bool gguf_read_emplace_helper(const struct 
gguf_reader & gr, std::vector & kv, const std::string & key, const bool is_array, const size_t n) { + return gguf_read_emplace_helper_template(gr, kv, key, is_array, n); +} + +template +bool gguf_read_emplace_helper(const struct gguf_buffer_reader & gr, std::vector & kv, const std::string & key, const bool is_array, const size_t n) { + return gguf_read_emplace_helper_template(gr, kv, key, is_array, n); +} + +template +bool gguf_read_tensor_shape(const Reader & gr, gguf_tensor_info & info, bool & ok) { + uint32_t n_dims = -1; + ok = ok && gr.read(n_dims); + if (n_dims > GGML_MAX_DIMS) { + GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n", + __func__, info.t.name, n_dims, GGML_MAX_DIMS); + ok = false; + return false; + } + for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) { + info.t.ne[j] = 1; + if (j < n_dims) { + ok = ok && gr.read(info.t.ne[j]); + } + + // check that all ne are non-negative + if (info.t.ne[j] < 0) { + GGML_LOG_ERROR("%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n", + __func__, info.t.name, j, info.t.ne[j]); + ok = false; + return false; + } + } + + // check that the total number of elements is representable + if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) || + (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) || + (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) { + + GGML_LOG_ERROR("%s: total number of elements in tensor '%s' with shape " + "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n", + __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX); + ok = false; + return false; + } + return true; +} + +template +struct gguf_context * gguf_init_impl(Reader & gr, struct gguf_init_params params) { struct gguf_context * ctx = new gguf_context; bool ok = true; @@ -428,12 +579,15 @@ GGML_LOG_ERROR("%s: encountered bad_alloc error while reading key %" PRIi64 "\n", __func__, i); ok = false; } + + // Check for duplicate keys for (size_t j = 0; ok && j < ctx->kv.size(); ++j) { if (key == ctx->kv[j].key) { GGML_LOG_ERROR("%s: duplicate key '%s' for tensors %zu and %" PRIi64 " \n", __func__, key.c_str(), j, i); ok = false; } } + if (!ok) { break; } @@ -488,120 +642,91 @@ } // read the tensor info - for (int64_t i = 0; ok && i < n_tensors; ++i) { - struct gguf_tensor_info info; - - // tensor name - { - std::string name; - try { - ok = ok && gr.read(name); - } catch (std::length_error &) { - GGML_LOG_ERROR("%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i); - ok = false; - } catch (std::bad_alloc &) { - GGML_LOG_ERROR("%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i); - ok = false; - } - if (name.length() >= GGML_MAX_NAME) { - GGML_LOG_ERROR("%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME); - ok = false; - break; - } - ggml_set_name(&info.t, name.c_str()); - - // make sure there are no duplicate tensor names - for (int64_t j = 0; ok && j < i; ++j) { - if (strcmp(info.t.name, ctx->info[j].t.name) == 0) { - GGML_LOG_ERROR("%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i); - ok = false; - break; - } - } - } - if (!ok) { - break; - } - - // tensor shape - { - uint32_t n_dims = -1; - ok = ok && gr.read(n_dims); - if (n_dims > GGML_MAX_DIMS) { - GGML_LOG_ERROR("%s: tensor '%s' has invalid number of dimensions: %" PRIu32 " > %" PRIu32 "\n", - 
__func__, info.t.name, n_dims, GGML_MAX_DIMS); - ok = false; - break; - } - for (uint32_t j = 0; ok && j < GGML_MAX_DIMS; ++j) { - info.t.ne[j] = 1; - if (j < n_dims) { - ok = ok && gr.read(info.t.ne[j]); - } - - // check that all ne are non-negative - if (info.t.ne[j] < 0) { - GGML_LOG_ERROR("%s: tensor '%s' dimension %" PRIu32 " has invalid number of elements: %" PRIi64 " < 0\n", - __func__, info.t.name, j, info.t.ne[j]); - ok = false; - break; - } - } - - // check that the total number of elements is representable - if (ok && ((INT64_MAX/info.t.ne[1] <= info.t.ne[0]) || - (INT64_MAX/info.t.ne[2] <= info.t.ne[0]*info.t.ne[1]) || - (INT64_MAX/info.t.ne[3] <= info.t.ne[0]*info.t.ne[1]*info.t.ne[2]))) { - - GGML_LOG_ERROR("%s: total number of elements in tensor '%s' with shape " - "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n", - __func__, info.t.name, info.t.ne[0], info.t.ne[1], info.t.ne[2], info.t.ne[3], INT64_MAX); - ok = false; - break; - } - } - if (!ok) { - break; - } - - // tensor type - { - ok = ok && gr.read(info.t.type); - - // check that tensor type is within defined range - if (info.t.type < 0 || info.t.type >= GGML_TYPE_COUNT) { - GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n", - __func__, info.t.name, info.t.type, ggml_type_name(info.t.type)); - ok = false; - break; - } - const size_t type_size = ggml_type_size(info.t.type); - const int64_t blck_size = ggml_blck_size(info.t.type); - - // check that row size is divisible by block size - if (blck_size == 0 || info.t.ne[0] % blck_size != 0) { - GGML_LOG_ERROR("%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, " - "not a multiple of block size (%" PRId64 ")\n", - __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size); - ok = false; - break; - } - - // calculate byte offsets given the tensor shape and type - info.t.nb[0] = type_size; - info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size); - for (int j = 2; j < GGML_MAX_DIMS; ++j) { - info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1]; - } - } - if (!ok) { - break; - } - - // tensor data offset within buffer - ok = ok && gr.read(info.offset); - - ctx->info.push_back(info); + if (n_tensors > 0) { + ctx->info.resize(n_tensors); + + for (int64_t i = 0; ok && i < n_tensors; ++i) { + gguf_tensor_info & info = ctx->info[i]; + + // tensor name + { + std::string name; + try { + ok = ok && gr.read(name); + } catch (std::length_error &) { + GGML_LOG_ERROR("%s: encountered length_error while reading tensor name %" PRIi64 "\n", __func__, i); + ok = false; + } catch (std::bad_alloc &) { + GGML_LOG_ERROR("%s: encountered bad_alloc error while reading tensor name %" PRIi64 "\n", __func__, i); + ok = false; + } + if (name.length() >= GGML_MAX_NAME) { + GGML_LOG_ERROR("%s: tensor name %" PRIi64 " is too long: %zu >= %d\n", __func__, i, name.length(), GGML_MAX_NAME); + ok = false; + break; + } + ggml_set_name(&info.t, name.c_str()); + + // make sure there are no duplicate tensor names + for (int64_t j = 0; ok && j < i; ++j) { + if (strcmp(info.t.name, ctx->info[j].t.name) == 0) { + GGML_LOG_ERROR("%s: duplicate tensor name '%s' for tensors %" PRIi64 " and %" PRIi64 "\n", __func__, info.t.name, j, i); + ok = false; + break; + } + } + } + if (!ok) { + break; + } + + // tensor shape + if (!gguf_read_tensor_shape(gr, info, ok)) { + break; + } + if (!ok) { + break; + } + + // tensor type + { + ok = ok && gr.read(info.t.type); + + // check that tensor type is within defined range + if (info.t.type < 0 || 
info.t.type >= GGML_TYPE_COUNT) { + GGML_LOG_ERROR("%s: tensor '%s' has invalid ggml type %d (%s)\n", + __func__, info.t.name, info.t.type, ggml_type_name(info.t.type)); + ok = false; + break; + } + + // Validation logic for both file and buffer readers + const size_t type_size = ggml_type_size(info.t.type); + const int64_t blck_size = ggml_blck_size(info.t.type); + + // check that row size is divisible by block size + if (blck_size == 0 || info.t.ne[0] % blck_size != 0) { + GGML_LOG_ERROR("%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, " + "not a multiple of block size (%" PRId64 ")\n", + __func__, info.t.name, (int) info.t.type, ggml_type_name(info.t.type), info.t.ne[0], blck_size); + ok = false; + break; + } + + // calculate byte offsets given the tensor shape and type + info.t.nb[0] = type_size; + info.t.nb[1] = info.t.nb[0]*(info.t.ne[0]/blck_size); + for (int j = 2; j < GGML_MAX_DIMS; ++j) { + info.t.nb[j] = info.t.nb[j - 1]*info.t.ne[j - 1]; + } + } + if (!ok) { + break; + } + + // tensor data offset within buffer + ok = ok && gr.read(info.offset); + } } if (!ok) { @@ -611,16 +736,35 @@ } GGML_ASSERT(int64_t(ctx->info.size()) == n_tensors); - // we require the data section to be aligned, so take into account any padding - if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) { - GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); - gguf_free(ctx); - return nullptr; + // Handle alignment and data section positioning + if constexpr (std::is_same_v) { + // File reader: use fseek and ftell + FILE* file = gr.file; + if (fseek(file, GGML_PAD(ftell(file), ctx->alignment), SEEK_SET) != 0) { + GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); + gguf_free(ctx); + return nullptr; + } + ctx->offset = ftell(file); + } else { + // Buffer reader: use seek and tell + const size_t current_offset = gr.tell(); + const size_t aligned_offset = GGML_PAD(current_offset, ctx->alignment); + + // For vocab-only files or when there's no tensor data, the aligned offset might be beyond buffer size + if (n_tensors == 0 || aligned_offset >= gr.buffer_size) { + // No tensor data section - use current offset as the data offset + ctx->offset = current_offset; + } else { + if (!gr.seek(aligned_offset)) { + GGML_LOG_ERROR("%s: failed to seek to beginning of data section\n", __func__); + gguf_free(ctx); + return nullptr; + } + ctx->offset = gr.tell(); + } } - // store the current file offset - this is where the data section starts - ctx->offset = ftell(file); - // compute the total size of the data section, taking into account the alignment { ctx->size = 0; @@ -726,12 +870,17 @@ return nullptr; } - ggml_set_no_alloc(ctx_data, params.no_alloc); + ggml_set_no_alloc(ctx_data, false); } return ctx; } +struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { + struct gguf_reader gr(file); + return gguf_init_impl(gr, params); +} + struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { FILE * file = ggml_fopen(fname, "rb"); @@ -745,6 +894,26 @@ return result; } + +struct gguf_context * gguf_init_from_buffer(const void * buffer, size_t buffer_size, struct gguf_init_params params) { + if (buffer == nullptr || buffer_size == 0) { + GGML_LOG_ERROR("%s: invalid buffer parameters\n", __func__); + return nullptr; + } + + struct gguf_buffer_reader gr(buffer, buffer_size); + return gguf_init_impl(gr, params); +} + +struct gguf_context * gguf_init_from_file_handle(FILE 
* file, struct gguf_init_params params) { + if (file == nullptr) { + GGML_LOG_ERROR("%s: invalid file handle\n", __func__); + return nullptr; + } + // Note: The caller is responsible for closing the file handle + return gguf_init_from_file_impl(file, params); +} + void gguf_free(struct gguf_context * ctx) { if (ctx == nullptr) { return; diff --git a/include/llama.h b/include/llama.h index 135eaf1b65..fa3dd307f1 100644 --- a/include/llama.h +++ b/include/llama.h @@ -422,6 +422,20 @@ size_t n_paths, struct llama_model_params params); + // Load the model from a buffer + // The buffer must contain a complete GGUF file + LLAMA_API struct llama_model * llama_model_load_from_buffer( + const void * buffer, + size_t buffer_size, + struct llama_model_params params); + + // Load the model from a file handle + // The file handle must be positioned at the beginning of a complete GGUF file + // The caller is responsible for closing the file handle + LLAMA_API struct llama_model * llama_model_load_from_file_handle( + FILE * file, + struct llama_model_params params); + LLAMA_API void llama_model_save_to_file( const struct llama_model * model, const char * path_model); diff --git a/load-from-buffer-or-fd.patch b/load-from-buffer-or-fd.patch new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 510bf00ad6..a49de9850c 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -717,6 +717,149 @@ this->check_tensors = check_tensors; } +llama_model_loader::llama_model_loader( + const void * buffer, + size_t buffer_size, + bool check_tensors, + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { + // Tracing not implemented for buffer-based loading + + if (param_overrides_p != nullptr) { + for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { + kv_overrides.insert({std::string(p->key), *p}); + } + } + + tensor_buft_overrides = param_tensor_buft_overrides_p; + + // Store buffer information + this->buffer_data = buffer; + this->buffer_size = buffer_size; + + // Load the GGUF from buffer + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta.reset(gguf_init_from_buffer(buffer, buffer_size, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from buffer", __func__)); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + contexts.emplace_back(ctx); + + // Build tensors index for weights + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there are no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(buffer_size, 0, meta.get(), cur)); + } + + // Buffer-based loading doesn't support splits - set defaults + ftype = LLAMA_FTYPE_GUESSED; + fver = GGUF_FILE_VERSION_V3; + + // Validate file version + if (fver != GGUF_FILE_VERSION_V1 && fver != GGUF_FILE_VERSION_V2 && fver != GGUF_FILE_VERSION_V3) { + throw std::runtime_error(format("invalid GGUF version: %d", fver)); + } + + 
n_tensors = weights_map.size(); + + LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from buffer (%zu MB)\n", + __func__, n_kv, n_tensors, buffer_size / (1024 * 1024)); + + // Buffer-based loading uses no mmap and stores tensors in buffer + this->use_mmap = false; + this->check_tensors = check_tensors; +} + +llama_model_loader::llama_model_loader( + FILE * file, + bool check_tensors, + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { + // Tracing not implemented for file handle-based loading + + if (param_overrides_p != nullptr) { + for (const struct llama_model_kv_override * p = param_overrides_p; p->key[0] != 0; p++) { + kv_overrides.insert({std::string(p->key), *p}); + } + } + + tensor_buft_overrides = param_tensor_buft_overrides_p; + + // Store file handle information + this->file_handle = file; + this->owns_file_handle = false; // Caller owns the file handle + + // Get file size + long current_pos = ftell(file); + fseek(file, 0, SEEK_END); + size_t file_size = ftell(file); + fseek(file, current_pos, SEEK_SET); + + // Load the GGUF from file handle + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta.reset(gguf_init_from_file_handle(file, params)); + if (!meta) { + throw std::runtime_error(format("%s: failed to load model from file handle", __func__)); + } + + get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); + llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + contexts.emplace_back(ctx); + + // Build tensors index for weights + // Since we're using a file handle directly, we won't populate the files vector + // Instead, we'll handle file I/O through the file_handle member + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + std::string tensor_name = std::string(cur->name); + // make sure there are no duplicated tensor names + if (weights_map.find(tensor_name) != weights_map.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", ggml_get_name(cur))); + } + n_elements += ggml_nelements(cur); + n_bytes += ggml_nbytes(cur); + weights_map.emplace(tensor_name, llama_tensor_weight(file_size, 0, meta.get(), cur)); + } + + // File handle-based loading doesn't support splits - set defaults + ftype = LLAMA_FTYPE_GUESSED; + fver = GGUF_FILE_VERSION_V3; + + // Validate file version + if (fver != GGUF_FILE_VERSION_V1 && fver != GGUF_FILE_VERSION_V2 && fver != GGUF_FILE_VERSION_V3) { + throw std::runtime_error(format("invalid GGUF version: %d", fver)); + } + + n_tensors = weights_map.size(); + + LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from file handle (%zu MB)\n", + __func__, n_kv, n_tensors, file_size / (1024 * 1024)); + + // File handle-based loading uses no mmap + this->use_mmap = false; + this->check_tensors = check_tensors; +} + std::string llama_model_loader::get_arch_name() const { return arch_name; } @@ -904,7 +1047,21 @@ } else { memcpy(cur->data, (uint8_t *)mapping->addr() + w.offs, ggml_nbytes(cur)); } + } else if (buffer_data != nullptr) { + // Buffer-based loading + GGML_ASSERT(cur->data != nullptr); + GGML_ASSERT(w.offs + ggml_nbytes(cur) <= buffer_size); + memcpy(cur->data, (const uint8_t *)buffer_data + w.offs, ggml_nbytes(cur)); + } else if (file_handle != nullptr) { + // File handle-based loading + GGML_ASSERT(cur->data != nullptr); + fseek(file_handle, w.offs, 
SEEK_SET); + size_t bytes_read = fread(cur->data, 1, ggml_nbytes(cur), file_handle); + if (bytes_read != ggml_nbytes(cur)) { + throw std::runtime_error(format("failed to read tensor '%s' data", ggml_get_name(cur))); + } } else { + // File-based loading GGML_ASSERT(cur->data != nullptr); GGML_ASSERT(w.idx < files.size()); const auto & file = files.at(w.idx); @@ -1058,6 +1215,51 @@ } else { ggml_backend_tensor_set(cur, data, 0, n_size); } + } else if (buffer_data != nullptr) { + // Buffer-based loading + if (weight->offs + n_size > this->buffer_size) { + LLAMA_LOG_ERROR("Buffer bounds check failed: tensor='%s', offs=%zu, size=%zu, total=%zu, buffer_size=%zu\n", + ggml_get_name(cur), weight->offs, n_size, weight->offs + n_size, this->buffer_size); + } + GGML_ASSERT(weight->offs + n_size <= this->buffer_size); + const uint8_t * src_data = (const uint8_t *)buffer_data + weight->offs; + + if (ggml_backend_buffer_is_host(cur->buffer)) { + memcpy(cur->data, src_data, n_size); + if (check_tensors) { + validation_result.push_back(std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size))); + } + } else { + // For GPU buffers, copy data directly + ggml_backend_tensor_set(cur, src_data, 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, src_data, n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } + } + } else if (file_handle != nullptr) { + // File handle-based loading + if (ggml_backend_buffer_is_host(cur->buffer)) { + fseek(file_handle, weight->offs, SEEK_SET); + size_t bytes_read = fread(cur->data, 1, n_size, file_handle); + if (bytes_read != n_size) { + throw std::runtime_error(format("failed to read tensor '%s' data", ggml_get_name(cur))); + } + if (check_tensors) { + validation_result.push_back(std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size))); + } + } else { + // For GPU buffers, read to temporary buffer then copy + read_buf.resize(n_size); + fseek(file_handle, weight->offs, SEEK_SET); + size_t bytes_read = fread(read_buf.data(), 1, n_size, file_handle); + if (bytes_read != n_size) { + throw std::runtime_error(format("failed to read tensor '%s' data", ggml_get_name(cur))); + } + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } + } } else { const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index 9ede44378d..6469f586c7 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -44,6 +44,20 @@ std::abort(); } } + + llama_tensor_weight(size_t buffer_size, uint16_t idx, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, ggml_get_name(tensor)); + if (tensor_idx < 0) { + // throw std::runtime_error(format("tensor '%s' not found in the model", ggml_get_name(tensor))); + std::abort(); + } + + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > buffer_size) { + // throw std::runtime_error(format("tensor '%s' data is not within the buffer bounds, model is corrupted or incomplete", ggml_get_name(tensor))); + std::abort(); + } + } }; // custom comparator to sort weights more nicely by layer @@ -74,6 +88,14 
@@ bool use_mmap = false; bool check_tensors; + // Buffer-based loading members + const void * buffer_data = nullptr; + size_t buffer_size = 0; + + // File handle-based loading members + FILE * file_handle = nullptr; + bool owns_file_handle = false; + llama_files files; llama_ftype ftype; llama_fver fver; @@ -102,6 +124,19 @@ const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); + llama_model_loader( + const void * buffer, + size_t buffer_size, + bool check_tensors, + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); + + llama_model_loader( + FILE * file, + bool check_tensors, + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); + template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, bool required = true); diff --git a/src/llama.cpp b/src/llama.cpp index 0adb16598e..2da539f982 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -86,7 +86,8 @@ } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { +template +static int llama_model_load_impl(llama_model & model, llama_model_params & params, LoaderFactory && create_loader) { // loading time will be recalculated after the first eval, so // we take page faults deferred by mmap() into consideration model.t_load_us = 0; @@ -95,7 +96,7 @@ model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); + auto ml = create_loader(); ml.print_info(); @@ -136,6 +137,18 @@ return 0; } +static int llama_model_load(const std::string & fname, std::vector & splits, llama_model & model, llama_model_params & params) { + return llama_model_load_impl(model, params, [&]() { + return llama_model_loader(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); + }); +} + +static int llama_model_load_from_buffer(const void * buffer, size_t buffer_size, llama_model & model, llama_model_params & params) { + return llama_model_load_impl(model, params, [&]() { + return llama_model_loader(buffer, buffer_size, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); + }); +} + static struct llama_model * llama_model_load_from_file_impl( const std::string & path_model, std::vector & splits, @@ -182,7 +195,7 @@ // skip CPU backends since they are handled separately break; - case GGML_BACKEND_DEVICE_TYPE_GPU: + case GGML_BACKEND_DEVICE_TYPE_GPU: { ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); if (ggml_backend_reg_name(reg) == std::string("RPC")) { rpc_servers.push_back(dev); @@ -190,6 +203,7 @@ model->devices.push_back(dev); } break; + } } } // add RPC servers at the front of the list @@ -236,6 +250,118 @@ return model; } +static struct llama_model * llama_model_load_from_buffer_impl( + const void * buffer, + size_t buffer_size, + struct llama_model_params params) { + ggml_time_init(); + + if (!params.vocab_only && ggml_backend_reg_count() == 0) { + LLAMA_LOG_ERROR("%s: no backends are loaded. 
hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); + return nullptr; + } + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_CONT("."); + if (percentage >= 100) { + LLAMA_LOG_CONT("\n"); + } + } + return true; + }; + } + + llama_model * model = new llama_model(params); + + // create list of devices to use with this model + if (params.devices) { + for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { + model->devices.push_back(*dev); + } + } else { + std::vector rpc_servers; + // use all available devices + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + switch (ggml_backend_dev_type(dev)) { + case GGML_BACKEND_DEVICE_TYPE_CPU: + case GGML_BACKEND_DEVICE_TYPE_ACCEL: + // skip CPU backends since they are handled separately + break; + + case GGML_BACKEND_DEVICE_TYPE_GPU: { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (ggml_backend_reg_name(reg) == std::string("RPC")) { + rpc_servers.push_back(dev); + } else { + model->devices.push_back(dev); + } + break; + } + + default: + break; + } + } + + // add the RPC servers at the end since they are usually slower + model->devices.insert(model->devices.end(), rpc_servers.begin(), rpc_servers.end()); + + // if no GPU device is found, we use the CPU device to avoid errors + if (model->devices.empty()) { + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + model->devices.push_back(dev); + break; + } + } + } + + if (params.main_gpu >= 0 && params.main_gpu < (int) model->devices.size()) { + auto main_gpu = model->devices[params.main_gpu]; + model->devices.erase(model->devices.begin() + params.main_gpu); + model->devices.insert(model->devices.begin(), main_gpu); + } else if (params.main_gpu >= (int) model->devices.size()) { + LLAMA_LOG_WARN("%s: main_gpu is out of range: %d, using device 0\n", __func__, params.main_gpu); + } else if (params.main_gpu < 0 && !model->devices.empty()) { + auto main_gpu = model->devices[0]; + model->devices.erase(model->devices.begin()); + model->devices.push_back(main_gpu); + model->devices.clear(); + model->devices.push_back(main_gpu); + } + } + + for (auto * dev : model->devices) { + size_t free, total; // NOLINT + ggml_backend_dev_memory(dev, &free, &total); + LLAMA_LOG_INFO("%s: using device %s (%s) - %zu MiB free\n", __func__, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free/1024/1024); + } + + const int status = llama_model_load_from_buffer(buffer, buffer_size, *model, params); + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } + + llama_model_free(model); + return nullptr; + } + + return model; +} + // deprecated struct llama_model * llama_load_model_from_file( const char * path_model, @@ -265,6 +391,92 @@ return llama_model_load_from_file_impl(splits.front(), splits, params); } +struct llama_model * 
llama_model_load_from_buffer( + const void * buffer, + size_t buffer_size, + struct llama_model_params params) { + return llama_model_load_from_buffer_impl(buffer, buffer_size, params); +} + +struct llama_model * llama_model_load_from_file_handle( + FILE * file, + struct llama_model_params params) { + ggml_time_init(); + + if (!params.vocab_only && ggml_backend_reg_count() == 0) { + LLAMA_LOG_ERROR("%s: no backends are loaded. hint: use ggml_backend_load() or ggml_backend_load_all() to load a backend before calling this function\n", __func__); + return nullptr; + } + + unsigned cur_percentage = 0; + if (params.progress_callback == NULL) { + params.progress_callback_user_data = &cur_percentage; + params.progress_callback = [](float progress, void * ctx) { + unsigned * cur_percentage_p = (unsigned *) ctx; + unsigned percentage = (unsigned) (100 * progress); + while (percentage > *cur_percentage_p) { + *cur_percentage_p = percentage; + LLAMA_LOG_CONT("."); + if (percentage >= 100) { + LLAMA_LOG_CONT("\n"); + } + } + return true; + }; + } + + llama_model * model = new llama_model(params); + + // create list of devices to use with this model + if (params.devices) { + for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) { + model->devices.push_back(*dev); + } + } else { + std::vector rpc_servers; + // use all available devices + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + switch (ggml_backend_dev_type(dev)) { + case GGML_BACKEND_DEVICE_TYPE_CPU: + case GGML_BACKEND_DEVICE_TYPE_ACCEL: + // skip CPU backends since they are handled separately + break; + + case GGML_BACKEND_DEVICE_TYPE_GPU: { + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + if (ggml_backend_reg_name(reg) == std::string("RPC")) { + rpc_servers.push_back(dev); + } else { + model->devices.push_back(dev); + } + break; + } + } + } + // add RPC servers at the front of the list + model->devices.insert(model->devices.begin(), rpc_servers.begin(), rpc_servers.end()); + } + + const int status = llama_model_load_impl(*model, params, [&]() { + return llama_model_loader(file, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); + }); + + GGML_ASSERT(status <= 0); + if (status < 0) { + if (status == -1) { + LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); + } else if (status == -2) { + LLAMA_LOG_INFO("%s: cancelled model load\n", __func__); + } + + llama_model_free(model); + return nullptr; + } + + return model; +} + void llama_model_save_to_file(const struct llama_model * model, const char * path_model) { llama_model_saver ms(*model); ms.add_kv_from_model();
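
Usage sketch (not part of the patch): a minimal caller exercising the new public entry points this diff declares, gguf_init_from_buffer() in ggml/include/gguf.h and llama_model_load_from_buffer() / llama_model_load_from_file_handle() in include/llama.h. Everything else here (the file-reading boilerplate, error handling, and the assumption that the in-memory buffer can be released once the load call returns, since the loader copies tensor data during the call) is illustrative only and not guaranteed by the headers.

#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "gguf.h"
#include "llama.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    // backends have to be registered before loading a model
    // (with dynamically loaded backends this is ggml_backend_load_all(), as the error hint above suggests)
    llama_backend_init();

    // read the whole GGUF file into memory
    std::vector<uint8_t> buf;
    {
        FILE * f = std::fopen(argv[1], "rb");
        if (!f) {
            return 1;
        }
        std::fseek(f, 0, SEEK_END);
        buf.resize((size_t) std::ftell(f));
        std::fseek(f, 0, SEEK_SET);
        const size_t n_read = std::fread(buf.data(), 1, buf.size(), f);
        std::fclose(f);
        if (n_read != buf.size()) {
            return 1;
        }
    }

    // inspect metadata only, via the new ggml-level entry point
    {
        struct gguf_init_params gparams = { /*.no_alloc =*/ true, /*.ctx =*/ nullptr };
        struct gguf_context * gctx = gguf_init_from_buffer(buf.data(), buf.size(), gparams);
        if (gctx) {
            std::printf("kv pairs: %" PRId64 ", tensors: %" PRId64 "\n",
                gguf_get_n_kv(gctx), gguf_get_n_tensors(gctx));
            gguf_free(gctx);
        }
    }

    // load a full model from the in-memory buffer; tensor data appears to be copied
    // during the call (no mmap), so the buffer should not be needed afterwards (assumption)
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_buffer(buf.data(), buf.size(), mparams);
    if (model) {
        llama_model_free(model);
    }

    // or: load from an already-open FILE *, positioned at the start of a complete GGUF file;
    // per the new header comments the caller keeps ownership of the handle and closes it
    FILE * f = std::fopen(argv[1], "rb");
    if (f) {
        llama_model * model_fh = llama_model_load_from_file_handle(f, mparams);
        std::fclose(f);
        if (model_fh) {
            llama_model_free(model_fh);
        }
    }

    llama_backend_free();
    return 0;
}

Both new loaders force use_mmap off and copy (or fread) tensor data into the backend buffers while the load call runs, so ownership of the source buffer or FILE * stays with the caller throughout, mirroring the note on gguf_init_from_file_handle().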