#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import os
import jinja2
import json
import shutil
import tempfile
from pathlib import Path

def add_common_arguments(parser):
    parser.add_argument('--model_repository_path', required=False, default='models', help='Where the model should be exported to', dest='model_repository_path')
    parser.add_argument('--source_model', required=True, help='HF model name or path to the local folder with PyTorch or OpenVINO model', dest='source_model')
    parser.add_argument('--model_name', required=False, default=None, help='Model name that should be used in the deployment. Equal to source_model if HF model name is used', dest='model_name')
    parser.add_argument('--weight-format', default='int8', help='precision of the exported model', dest='precision')
    parser.add_argument('--config_file_path', default='config.json', help='path to the config file', dest='config_file_path')
    parser.add_argument('--overwrite_models', default=False, action='store_true', help='Overwrite the model if it already exists in the models repository', dest='overwrite_models')
    parser.add_argument('--target_device', default="CPU", help='CPU, GPU, NPU or HETERO, default is CPU', dest='target_device')
    parser.add_argument('--ov_cache_dir', default=None, help='Folder path for compilation cache to speedup initialization time', dest='ov_cache_dir')
    parser.add_argument('--extra_quantization_params', required=False, help='Add advanced quantization parameters. Check optimum-intel documentation. Example: "--sym --group-size -1 --ratio 1.0 --awq --scale-estimation --dataset wikitext2"', dest='extra_quantization_params')

parser = argparse.ArgumentParser(description='Export Hugging face models to OVMS models repository including all configuration for deployments')

subparsers = parser.add_subparsers(help='subcommand help', required=True, dest='task')
parser_text = subparsers.add_parser('text_generation', help='export model for chat and completion endpoints')
add_common_arguments(parser_text)
parser_text.add_argument('--pipeline_type', default=None, choices=["LM", "LM_CB", "VLM", "VLM_CB", "AUTO"], help='Type of the pipeline to be used. AUTO is used by default', dest='pipeline_type')
parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
parser_text.add_argument('--enable_prefix_caching', type=lambda x: (str(x).lower() == 'true'), default=True, help='This algorithm is used to cache the prompt tokens.', dest='enable_prefix_caching')
parser_text.add_argument('--disable_dynamic_split_fuse', action='store_false', help='The maximum number of tokens that can be batched together.', dest='dynamic_split_fuse')
parser_text.add_argument('--max_num_batched_tokens', default=None, help='empty or integer. The maximum number of tokens that can be batched together.', dest='max_num_batched_tokens')
parser_text.add_argument('--max_num_seqs', default=None, help='256 by default. The maximum number of sequences that can be processed together.', dest='max_num_seqs')
parser_text.add_argument('--cache_size', default=0, type=int, help='KV cache size in GB. If not set, cache is allocated dynamically.', dest='cache_size')
parser_text.add_argument('--draft_source_model', required=False, default=None, help='HF model name or path to the local folder with PyTorch or OpenVINO draft model. '
                         'Using this option will create configuration for speculative decoding', dest='draft_source_model')
parser_text.add_argument('--draft_model_name', required=False, default=None, help='Draft model name that should be used in the deployment. '
                         'Equal to draft_source_model if HF model name is used. Available only in draft_source_model has been specified.', dest='draft_model_name')
parser_text.add_argument('--draft_eagle3_mode', action='store_true', help='Set this flag if you use EAGLE3 draft model for speculative decoding', dest='draft_eagle3_mode')
parser_text.add_argument('--max_prompt_len', required=False, type=int, default=None, help='Sets NPU specific property for maximum number of tokens in the prompt. '
                         'Not effective if target device is not NPU', dest='max_prompt_len')
parser_text.add_argument('--prompt_lookup_decoding', action='store_true', help='Set pipeline to use prompt lookup decoding', dest='prompt_lookup_decoding')
parser_text.add_argument('--reasoning_parser', choices=["qwen3", "gptoss"], help='Set the type of the reasoning parser for reasoning content extraction', dest='reasoning_parser')
parser_text.add_argument('--tool_parser', choices=["llama3", "phi4", "hermes3", "mistral", "qwen3coder", "gptoss", "devstral"], help='Set the type of the tool parser for tool calls extraction', dest='tool_parser')
parser_text.add_argument('--enable_tool_guided_generation', action='store_true', help='Enables enforcing tool schema during generation. Requires setting tool_parser', dest='enable_tool_guided_generation')

parser_embeddings_ov = subparsers.add_parser('embeddings_ov', help='export model for embeddings endpoint with directory structure aligned with OpenVINO tools')
add_common_arguments(parser_embeddings_ov)
parser_embeddings_ov.add_argument('--skip_normalize', default=True, action='store_false', help='Skip normalize the embeddings.', dest='normalize')
parser_embeddings_ov.add_argument('--pooling', default="CLS", choices=["CLS", "LAST", "MEAN"], help='Embeddings pooling mode', dest='pooling')
parser_embeddings_ov.add_argument('--truncate', default=False, action='store_true', help='Truncate the prompts to fit to the embeddings model', dest='truncate')
parser_embeddings_ov.add_argument('--num_streams', default=1,type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams')

parser_rerank = subparsers.add_parser('rerank', help='[deprecated] export model for rerank endpoint with models split into separate, versioned directories')
add_common_arguments(parser_rerank)
parser_rerank.add_argument('--num_streams', default=1, type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams')
parser_rerank.add_argument('--max_doc_length', default=16000, type=int, help='Maximum length of input documents in tokens', dest='max_doc_length')
parser_rerank.add_argument('--version', default="1", help='version of the model', dest='version')

parser_rerank_ov = subparsers.add_parser('rerank_ov', help='export model for rerank endpoint with directory structure aligned with OpenVINO tools')
add_common_arguments(parser_rerank_ov)
parser_rerank_ov.add_argument('--num_streams', default=1, type=int, help='The number of parallel execution streams to use for the model. Use at least 2 on 2 socket CPU systems.', dest='num_streams')
parser_rerank_ov.add_argument('--max_doc_length', default=16000, type=int, help='Maximum length of input documents in tokens', dest='max_doc_length')

parser_image_generation = subparsers.add_parser('image_generation', help='export model for image generation endpoint')
add_common_arguments(parser_image_generation)
parser_image_generation.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
parser_image_generation.add_argument('--resolution', default="", help='Selection of allowed resolutions in a format of WxH; W=width H=height, space separated. If only one is selected, the pipeline will be reshaped to static.', dest='resolution')
parser_image_generation.add_argument('--guidance_scale', default="", help='Static guidance scale for the image generation requests. If not specified, default 7.5f is used.', dest='guidance_scale')
parser_image_generation.add_argument('--num_images_per_prompt', default="", help='Static number of images to be generated per the image generation request. If not specified, default 1 is used.', dest='num_images_per_prompt')
parser_image_generation.add_argument('--max_resolution', default="", help='Max allowed resolution in a format of WxH; W=width H=height', dest='max_resolution')
parser_image_generation.add_argument('--default_resolution', default="", help='Default resolution when not specified by client', dest='default_resolution')
parser_image_generation.add_argument('--max_num_images_per_prompt', type=int, default=0, help='Max allowed number of images client is allowed to request for a given prompt', dest='max_num_images_per_prompt')
parser_image_generation.add_argument('--default_num_inference_steps', type=int, default=0, help='Default number of inference steps when not specified by client', dest='default_num_inference_steps')
parser_image_generation.add_argument('--max_num_inference_steps', type=int, default=0, help='Max allowed number of inference steps client is allowed to request for a given prompt', dest='max_num_inference_steps')

parser_text2speech = subparsers.add_parser('text2speech', help='export model for text2speech endpoint')
add_common_arguments(parser_text2speech)
parser_text2speech.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
parser_text2speech.add_argument('--vocoder', type=str, help='The vocoder model to use for text2speech. For example microsoft/speecht5_hifigan', dest='vocoder')

parser_speech2text = subparsers.add_parser('speech2text', help='export model for speech2text endpoint')
add_common_arguments(parser_speech2text)
parser_speech2text.add_argument('--num_streams', default=0, type=int, help='The number of parallel execution streams to use for the models in the pipeline.', dest='num_streams')
args = vars(parser.parse_args())

t2s_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "T2sExecutor"
  input_side_packet: "TTS_NODE_RESOURCES:t2s_servable"
  calculator: "T2sCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.T2sCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      target_device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

s2t_graph_template = """
input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"
node {
  name: "S2tExecutor"
  input_side_packet: "STT_NODE_RESOURCES:s2t_servable"
  calculator: "S2tCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.S2tCalculatorOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{ "NUM_STREAMS": "{{num_streams|default(1, true)}}" }',
      target_device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

embedding_graph_ov_template = """
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node {
  name: "EmbeddingsExecutor"
  input_side_packet: "EMBEDDINGS_NODE_RESOURCES:embeddings_servable"
  calculator: "EmbeddingsCalculatorOV"
  input_stream: "REQUEST_PAYLOAD:input"
  output_stream: "RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.EmbeddingsCalculatorOVOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{"NUM_STREAMS": "{{num_streams}}" }',
      normalize_embeddings: {% if not normalize %}false{% else %}true{% endif%},
      {%- if pooling %}
      pooling: {{pooling}},{% endif %}
      {%- if truncate %}
      truncate: true,{% endif %}
      target_device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

rerank_graph_ov_template = """
input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node {
  name: "RerankExecutor"
  input_side_packet: "RERANK_NODE_RESOURCES:rerank_servable"
  calculator: "RerankCalculatorOV"
  input_stream: "REQUEST_PAYLOAD:input"
  output_stream: "RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.RerankCalculatorOVOptions]: {
      models_path: "{{model_path}}",
      plugin_config: '{"NUM_STREAMS": "{{num_streams}}" }',
      target_device: "{{target_device|default("CPU", true)}}"
    }
  }
}
"""

rerank_graph_template = """input_stream: "REQUEST_PAYLOAD:input"
output_stream: "RESPONSE_PAYLOAD:output"
node {
  calculator: "OpenVINOModelServerSessionCalculator"
  output_side_packet: "SESSION:tokenizer"
  node_options: {
    [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: {
      servable_name: "{{model_name}}_tokenizer_model"
    }
  }
}
node {
  calculator: "OpenVINOModelServerSessionCalculator"
  output_side_packet: "SESSION:rerank"
  node_options: {
    [type.googleapis.com / mediapipe.OpenVINOModelServerSessionCalculatorOptions]: {
      servable_name: "{{model_name}}_rerank_model"
    }
  }
}
node {
    input_side_packet: "TOKENIZER_SESSION:tokenizer"
    input_side_packet: "RERANK_SESSION:rerank"
    calculator: "RerankCalculator"
    input_stream: "REQUEST_PAYLOAD:input"
    output_stream: "RESPONSE_PAYLOAD:output"
}
"""

text_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"

node: {
  name: "LLMExecutor"
  calculator: "HttpLLMCalculator"
  input_stream: "LOOPBACK:loopback"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  input_side_packet: "LLM_NODE_RESOURCES:llm"
  output_stream: "LOOPBACK:loopback"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  input_stream_info: {
    tag_index: 'LOOPBACK:0',
    back_edge: true
  }
  node_options: {
      [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
          {%- if pipeline_type %}
          pipeline_type: {{pipeline_type}},{% endif %}
          models_path: "{{model_path}}",
          plugin_config: '{{plugin_config}}',
          enable_prefix_caching: {% if not enable_prefix_caching %}false{% else %} true{% endif%},
          cache_size: {{cache_size|default("0", true)}},
          {%- if max_num_batched_tokens %}
          max_num_batched_tokens: {{max_num_batched_tokens}},{% endif %}
          {%- if not dynamic_split_fuse %}
          dynamic_split_fuse: false, {% endif %}
          max_num_seqs: {% if draft_eagle3_mode %}1{% else %}{{max_num_seqs|default("256", true)}}{% endif %},
          device: "{{target_device|default("CPU", true)}}",
          {%- if draft_model_dir_name %}
          # Speculative decoding configuration
          draft_models_path: "./{{draft_model_dir_name}}",
          draft_device: "{{target_device|default("CPU", true)}}",
          draft_eagle3_mode: {{draft_eagle3_mode|default(false)}},{% endif %}
          {%- if reasoning_parser %}
          reasoning_parser: "{{reasoning_parser}}",{% endif %}
          {%- if tool_parser %}
          tool_parser: "{{tool_parser}}",{% endif %}
          {%- if enable_tool_guided_generation %}
          enable_tool_guided_generation: {% if not enable_tool_guided_generation %}false{% else %} true{% endif%},{% endif %}
      }
  }
  input_stream_handler {
    input_stream_handler: "SyncSetInputStreamHandler",
    options {
      [mediapipe.SyncSetInputStreamHandlerOptions.ext] {
        sync_set {
          tag_index: "LOOPBACK:0"
        }
      }
    }
  }
}"""

rerank_subconfig_template = """{
    "model_config_list": [
    { "config": 
	    {
                "name": "{{model_name}}_tokenizer_model",
                "base_path": "tokenizer"
            }
	},
    { "config": 
	    {
                "name": "{{model_name}}_rerank_model",
                "base_path": "rerank",
                "target_device": "{{target_device|default("CPU", true)}}",
                "plugin_config": { "NUM_STREAMS": "{{num_streams|default(1, true)}}" }
            }
	}
   ]
}"""

image_generation_graph_template = """input_stream: "HTTP_REQUEST_PAYLOAD:input"
output_stream: "HTTP_RESPONSE_PAYLOAD:output"

node: {
  name: "ImageGenExecutor"
  calculator: "ImageGenCalculator"
  input_stream: "HTTP_REQUEST_PAYLOAD:input"
  input_side_packet: "IMAGE_GEN_NODE_RESOURCES:pipes"
  output_stream: "HTTP_RESPONSE_PAYLOAD:output"
  node_options: {
    [type.googleapis.com / mediapipe.ImageGenCalculatorOptions]: {
      models_path: "{{model_path}}",
      {%- if plugin_config_str %}
      plugin_config: '{{plugin_config_str}}',{% endif %}
      device: "{{target_device|default("CPU", true)}}",
      {%- if resolution %}
      resolution: "{{resolution}}",{% endif %}
      {%- if num_images_per_prompt %}
      num_images_per_prompt: {{num_images_per_prompt}},{% endif %}
      {%- if guidance_scale %}
      guidance_scale: {{guidance_scale}},{% endif %}
      {%- if max_resolution %}
      max_resolution: '{{max_resolution}}',{% endif %}
      {%- if default_resolution %}
      default_resolution: '{{default_resolution}}',{% endif %}
      {%- if max_num_images_per_prompt > 0 %}
      max_num_images_per_prompt: {{max_num_images_per_prompt}},{% endif %}
      {%- if default_num_inference_steps > 0 %}
      default_num_inference_steps: {{default_num_inference_steps}},{% endif %}
      {%- if max_num_inference_steps > 0 %}
      max_num_inference_steps: {{max_num_inference_steps}},{% endif %}
    }
  }
}"""

def export_rerank_tokenizer(source_model, destination_path, max_length):
    import openvino as ov
    from openvino_tokenizers import convert_tokenizer
    from transformers import AutoTokenizer
    hf_tokenizer = AutoTokenizer.from_pretrained(source_model)
    hf_tokenizer.model_max_length = max_length
    hf_tokenizer.save_pretrained(destination_path)
    ov_tokenizer = convert_tokenizer(hf_tokenizer, add_special_tokens=False)
    ov.save_model(ov_tokenizer, os.path.join(destination_path, "openvino_tokenizer.xml"))

def set_rt_info(model_folder_path, model_filename, config_filename):
    import openvino as ov
    model = ov.Core().read_model(os.path.join(model_folder_path, model_filename))
    with open(os.path.join(model_folder_path, config_filename), 'r') as config_file:
        config_data = json.load(config_file)
        for key, value in config_data.items():
          try:
              model.set_rt_info(value, ['model_info', key])
          except Exception as e:
              model.set_rt_info(str(value), ['model_info', key])
    temp_model_name = model_filename.replace('.xml', '_temp.xml')
    ov.save_model(model, os.path.join(model_folder_path, temp_model_name))
    del model
    shutil.move(os.path.join(model_folder_path, temp_model_name), os.path.join(model_folder_path, model_filename))
    shutil.move(os.path.join(model_folder_path, temp_model_name.replace('.xml','.bin')), os.path.join(model_folder_path, model_filename.replace('.xml','.bin')))

def get_models_max_context(tmpdirname, config_filename):
    with open(os.path.join(tmpdirname, config_filename), 'r') as config_file:
        config_data = json.load(config_file)
        if config_data['max_position_embeddings'] is not None:
            return config_data['max_position_embeddings']
        if config_data['n_positions'] is not None:
            return config_data['n_positions']
        return None

def add_servable_to_config(config_path, model_name, base_path):
    base_path = Path(base_path).as_posix()
    print(config_path, model_name, base_path)
    if not os.path.isfile(config_path):
        print("Creating new config file")
        with open(config_path, 'w') as config_file:
            json.dump({'mediapipe_config_list': [], "model_config_list": []}, config_file, indent=4)
    with open(config_path, 'r') as config_file:
        config_data = json.load(config_file)
        if 'model_config_list' not in config_data:
            config_data['model_config_list'] = []
        ## read legacy mediapipe_config_list to model_config_list
        if 'mediapipe_config_list' in config_data:
            for mp_config in config_data['mediapipe_config_list']:
                if 'name' in mp_config and 'base_path' in mp_config:
                    if not any(d['config']['name'] == mp_config['name'] + "_model" for d in config_data['model_config_list']):
                        config_data['model_config_list'].append({'config': {'name': mp_config['name'] + "_model", 'base_path': mp_config['base_path']}})
            del config_data['mediapipe_config_list']
        model_list = config_data['model_config_list']
        updated = False
        for model_config in model_list:
            if model_config['config']['name'] == model_name:
                model_config['config']['base_path'] = base_path
                updated = True
        if not updated:
            model_list.append({'config': {'name': model_name, 'base_path': base_path}})
    with open(config_path, 'w') as config_file:
        json.dump(config_data, config_file, indent=4)
    print("Added servable to config file", config_path)

def export_text_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    model_path = "./"
    # validation for tool parsing
    if (task_parameters.get('tool_parser', None) == 'gptoss' or task_parameters.get('reasoning_parser', None) == 'gptoss'):
        if (task_parameters.get('tool_parser', None) != task_parameters.get('reasoning_parser', None)):
            raise ValueError("Both tool_parser and reasoning_parser need to be set to gptoss when one of them is set to gptoss")
    ### Export model
    if os.path.isfile(os.path.join(source_model, 'openvino_model.xml')) or os.path.isfile(os.path.join(source_model, 'openvino_language_model.xml')):
        print("OV model is source folder. Skipping conversion.")
        model_path = source_model
    elif source_model.startswith("OpenVINO/"):
        if precision:
            print("Precision change is not supported for OpenVINO models. Parameter --weight-format {} will be ignored.".format(precision))
        hugging_face_cmd = "huggingface-cli download {} --local-dir {} ".format(source_model, os.path.join(model_repository_path, model_name))
        if os.system(hugging_face_cmd):
            raise ValueError("Failed to download llm model", source_model)
    else: # assume HF model name or local pytorch model folder
        llm_model_path = os.path.join(model_repository_path, model_name)
        print("Exporting LLM model to ", llm_model_path)
        if not os.path.isdir(llm_model_path) or args['overwrite_models']:
            if task_parameters['target_device'] == 'NPU':
                if precision != 'int4' and precision != 'nf4':
                    print("NPU target device requires int4 or nf4 precision. Changing to int4")
                    precision = 'int4'
                if task_parameters['extra_quantization_params'] == "":
                    print("Using default quantization parameters for NPU: --sym --ratio 1.0 --group-size -1")
                    task_parameters['extra_quantization_params'] = "--sym --ratio 1.0 --group-size -1"
            optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], llm_model_path)
            if os.system(optimum_command):
                raise ValueError("Failed to export llm model", source_model)
            if not (os.path.isfile(os.path.join(llm_model_path, 'openvino_detokenizer.xml'))):
                print("Tokenizer and detokenizer not found in the exported model. Exporting tokenizer and detokenizer from HF model")
                convert_tokenizer_command = "convert_tokenizer --with-detokenizer -o {} {}".format(llm_model_path, source_model)
                if os.system(convert_tokenizer_command):
                    raise ValueError("Failed to export tokenizer and detokenizer", source_model)
    ### Export draft model for speculative decoding 
    draft_source_model = task_parameters.get("draft_source_model", None)
    draft_model_dir_name = None   
    if draft_source_model:
        draft_model_dir_name = draft_source_model.replace("/", "-") # flatten the name so we don't create nested directory structure
        draft_llm_model_path = os.path.join(model_repository_path, model_name, draft_model_dir_name)
        if os.path.isfile(os.path.join(draft_llm_model_path, 'openvino_model.xml')):
            print("OV model is source folder. Skipping conversion.")
        elif source_model.startswith("OpenVINO/"):
            if precision:
                print("Precision change is not supported for OpenVINO models. Parameter --weight-format {} will be ignored.".format(precision))
            hugging_face_cmd = "huggingface-cli download {} --local-dir {} ".format(source_model, os.path.join(draft_llm_model_path, draft_source_model))
            if os.system(hugging_face_cmd):
                raise ValueError("Failed to download llm model", source_model)    
        else: # assume HF model name or local pytorch model folder
            print("Exporting draft LLM model to ", draft_llm_model_path)
            if not os.path.isdir(draft_llm_model_path) or args['overwrite_models']:
                additional_options = ""
                if args["draft_eagle3_mode"]:
                    print("Using eagle3 option for the draft model export")
                    additional_options += " --eagle3  --task text-generation-with-past"
                optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {} {}".format(draft_source_model, precision, additional_options, draft_llm_model_path)
                if os.system(optimum_command):
                    raise ValueError("Failed to export llm model", source_model)

    ### Prepare plugin config string for jinja rendering
    plugin_config = {}
    if task_parameters['kv_cache_precision'] is not None:
        plugin_config['KV_CACHE_PRECISION'] = task_parameters['kv_cache_precision']
    if task_parameters['max_prompt_len'] is not None:
        if task_parameters['target_device'] != 'NPU':
            raise ValueError("max_prompt_len is only supported for NPU target device")
        if task_parameters['max_prompt_len'] <= 0:
            raise ValueError("max_prompt_len should be a positive integer")
    if task_parameters['ov_cache_dir'] is not None:
        plugin_config['CACHE_DIR'] = task_parameters['ov_cache_dir']

    if task_parameters['prompt_lookup_decoding']:
        plugin_config['prompt_lookup'] = True
    
    # Additional plugin properties for HETERO
    if "HETERO" in task_parameters['target_device']:
        plugin_config['MODEL_DISTRIBUTION_POLICY'] = 'PIPELINE_PARALLEL'

    if task_parameters['target_device'] == 'NPU':
        max_prompt_len = task_parameters['max_prompt_len']
        npu_properties = {}
        if max_prompt_len is not None:
            npu_properties['MAX_PROMPT_LEN'] = max_prompt_len
        if task_parameters['enable_prefix_caching']:
            npu_properties['NPUW_LLM_ENABLE_PREFIX_CACHING'] = True
        device_properties = { "NPU": npu_properties }
        plugin_config['DEVICE_PROPERTIES'] = device_properties

    plugin_config_str = json.dumps(plugin_config)
    task_parameters['plugin_config'] = plugin_config_str
    
    os.makedirs(os.path.join(model_repository_path, model_name), exist_ok=True)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(text_generation_graph_template)
    print("task_parameters", task_parameters)
    graph_content = gtemplate.render(model_path=model_path, draft_model_dir_name=draft_model_dir_name, **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_embeddings_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, truncate=True):
    set_max_context_length = ""
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting embeddings model to ",destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task feature-extraction --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
        print('Running command:', optimum_command)  # for debug purposes
        if os.system(optimum_command):
            raise ValueError("Failed to export embeddings model", source_model)
        print("Exporting tokenizer to ", destination_path)
        convert_tokenizer_command = "convert_tokenizer -o {} {} {}".format(destination_path, source_model, set_max_context_length) 
        print('Running command:', convert_tokenizer_command)  # for debug purposes
        if (os.system(convert_tokenizer_command)):
            raise ValueError("Failed to export tokenizer model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(embedding_graph_ov_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_text2speech_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting text2speech model to ",destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code --model-kwargs \"{{\\\"vocoder\\\": \\\"{}\\\"}}\" {}".format(source_model, precision, task_parameters['vocoder'], destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export text2speech model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(t2s_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_speech2text_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting speech2text model to ",destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} --trust-remote-code {}".format(source_model, precision, destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export speech2text model", source_model)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(s2t_graph_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_rerank_model_ov(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, max_doc_length):
    destination_path = os.path.join(model_repository_path, model_name)
    print("Exporting rerank model to ",destination_path)
    if not os.path.isdir(destination_path) or args['overwrite_models']:
        optimum_command = "optimum-cli export openvino --model {} --disable-convert-tokenizer --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], destination_path)
        if os.system(optimum_command):
            raise ValueError("Failed to export rerank model", source_model)
        print("Exporting tokenizer to ", destination_path)
        export_rerank_tokenizer(source_model, destination_path, max_doc_length)
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_ov_template)
    graph_content = gtemplate.render(model_path="./", **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))

def export_rerank_model(model_repository_path, source_model, model_name, precision, task_parameters, version, config_file_path, max_doc_length):
    if os.path.isfile(os.path.join(model_name, 'openvino_model.xml')):
        print("OV model is source folder. Skipping conversion.")
        os.makedirs(os.path.join(model_repository_path, model_name, 'rerank', version), exist_ok=True)
        os.makedirs(os.path.join(model_repository_path, model_name, 'tokenizer', version), exist_ok=True)
        shutil.move(os.path.join(model_repository_path, model_name, 'openvino_tokenizer.xml'), os.path.join(model_repository_path, model_name, 'tokenizer', version, 'model.xml'))
        shutil.move(os.path.join(model_repository_path, model_name, 'openvino_tokenizer.bin'), os.path.join(model_repository_path, model_name, 'tokenizer', version, 'model.bin'))
        shutil.move(os.path.join(model_repository_path, model_name, 'openvino_model.xml'), os.path.join(model_repository_path, model_name, 'rerank', version, 'model.xml'))
        shutil.move(os.path.join(model_repository_path, model_name, 'openvino_model.bin'), os.path.join(model_repository_path, model_name, 'rerank', version, 'model.bin'))
    else: # assume HF model name
        with tempfile.TemporaryDirectory() as tmpdirname:
            embeddings_path = os.path.join(model_repository_path, model_name, 'rerank', version)
            print("Exporting rerank model to ",embeddings_path)
            if not os.path.isdir(embeddings_path) or args['overwrite_models']:
                optimum_command = "optimum-cli export openvino --disable-convert-tokenizer --model {} --task text-classification --weight-format {} {} --trust-remote-code {}".format(source_model, precision, task_parameters['extra_quantization_params'], tmpdirname)
                if os.system(optimum_command):
                    raise ValueError("Failed to export rerank model", source_model)
                set_rt_info(tmpdirname, 'openvino_model.xml', 'config.json')
                os.makedirs(embeddings_path, exist_ok=True)
                shutil.move(os.path.join(tmpdirname, 'openvino_model.xml'), os.path.join(embeddings_path, 'model.xml'))
                shutil.move(os.path.join(tmpdirname, 'openvino_model.bin'), os.path.join(embeddings_path, 'model.bin'))
            tokenizer_path = os.path.join(model_repository_path, model_name,'tokenizer', version)
            print("Exporting tokenizer to ",tokenizer_path)
            if not os.path.isdir(tokenizer_path) or args['overwrite_models']:
                export_rerank_tokenizer(source_model, tmpdirname, max_doc_length)
                set_rt_info(tmpdirname, 'openvino_tokenizer.xml', 'tokenizer_config.json')
                os.makedirs(tokenizer_path, exist_ok=True)
                shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.xml'), os.path.join(tokenizer_path, 'model.xml'))
                shutil.move(os.path.join(tmpdirname, 'openvino_tokenizer.bin'), os.path.join(tokenizer_path, 'model.bin'))
    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_graph_template)
    graph_content = gtemplate.render(model_name=model_name, **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
        f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    stemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(rerank_subconfig_template)
    subconfig_content = stemplate.render(model_name=model_name, **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'subconfig.json'), 'w') as f:
        f.write(subconfig_content)
    print("Created subconfig {}".format(os.path.join(model_repository_path, model_name, 'subconfig.json')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))


def export_image_generation_model(model_repository_path, source_model, model_name, precision, task_parameters, config_file_path, num_streams):
    model_path = "./"
    target_path = os.path.join(model_repository_path, model_name)
    model_index_path = os.path.join(target_path, 'model_index.json')

    if os.path.isfile(model_index_path):
        print("Model index file already exists. Skipping conversion, re-generating graph only.")
    else:
        optimum_command = "optimum-cli export openvino --model {} --weight-format {} {} {}".format(source_model, precision, task_parameters['extra_quantization_params'], target_path)
        print(f'optimum cli command: {optimum_command}')
        if os.system(optimum_command):
            raise ValueError("Failed to export image generation model", source_model)

    plugin_config = {}
    assert num_streams >= 0, "num_streams should be a non-negative integer"
    if num_streams > 0:
        plugin_config['NUM_STREAMS'] = num_streams
    if 'ov_cache_dir' in task_parameters and task_parameters['ov_cache_dir'] is not None:
        plugin_config['CACHE_DIR'] = task_parameters['ov_cache_dir']
    
    if len(plugin_config) > 0:
        task_parameters['plugin_config_str'] = json.dumps(plugin_config)

    # assert that max_resolution if exists, is in WxH format
    for param in ['max_resolution', 'default_resolution']:
        if task_parameters[param]:
            if 'x' not in task_parameters[param]:
                raise ValueError(param + " should be in WxH format, e.g. 1024x768")
            width, height = task_parameters[param].split('x')
            if not (width.isdigit() and height.isdigit()):
                raise ValueError(param + " should be in WxH format with positive integers, e.g. 1024x768")
            task_parameters[param] = '{}x{}'.format(int(width), int(height))

    gtemplate = jinja2.Environment(loader=jinja2.BaseLoader).from_string(image_generation_graph_template)
    graph_content = gtemplate.render(model_path=model_path, **task_parameters)
    with open(os.path.join(model_repository_path, model_name, 'graph.pbtxt'), 'w') as f:
         f.write(graph_content)
    print("Created graph {}".format(os.path.join(model_repository_path, model_name, 'graph.pbtxt')))
    add_servable_to_config(config_file_path, model_name, os.path.relpath(os.path.join(model_repository_path, model_name), os.path.dirname(config_file_path)))


if not os.path.isdir(args['model_repository_path']):
    raise ValueError(f"The model repository path '{args['model_repository_path']}' is not a valid directory.")
if args['source_model'] is None:
    args['source_model'] = args['model_name']
if args['model_name'] is None:
    args['model_name'] = args['source_model']
if args['model_name'] is None and args['source_model'] is None:
    raise ValueError("Either model_name or source_model should be provided")

### Speculative decoding specific
if args['task'] == 'text_generation':
    if args['draft_source_model'] is None:
        args['draft_source_model'] = args['draft_model_name']
    if args['draft_model_name'] is None:
        args['draft_model_name'] = args['draft_source_model']
###

if args['extra_quantization_params'] is None:
    args['extra_quantization_params'] = ""

template_parameters = {k: v for k, v in args.items() if k not in ['model_repository_path', 'source_model', 'model_name', 'precision', 'version', 'config_file_path', 'overwrite_models']}
print("template params:", template_parameters)

if args['task'] == 'text_generation':
    export_text_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'embeddings_ov':
    export_embeddings_model_ov(args['model_repository_path'], args['source_model'], args['model_name'],  args['precision'], template_parameters, args['config_file_path'], args['truncate'])

elif args['task'] == 'rerank':
    export_rerank_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, str(args['version']), args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'rerank_ov':
    export_rerank_model_ov(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'], args['max_doc_length'])

elif args['task'] == 'text2speech':
    export_text2speech_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'])

elif args['task'] == 'speech2text':
    export_speech2text_model(args['model_repository_path'], args['source_model'], args['model_name'] ,args['precision'], template_parameters, args['config_file_path'])
elif args['task'] == 'image_generation':
    template_parameters = {k: v for k, v in args.items() if k in [
        'ov_cache_dir',
        'target_device',
        'resolution',
        'num_images_per_prompt',
        'guidance_scale',
        'max_resolution',
        'default_resolution',
        'max_num_images_per_prompt',
        'default_num_inference_steps',
        'max_num_inference_steps',
        'extra_quantization_params'
    ]}
    export_image_generation_model(args['model_repository_path'], args['source_model'], args['model_name'], args['precision'], template_parameters, args['config_file_path'], args['num_streams'])