#
# Copyright (c) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from transformers import AutoTokenizer, AutoModel
from optimum.intel import OVModelForFeatureExtraction, OVSentenceTransformer
import torch
import datetime
import numpy as np
import argparse

# Command-line interface. Each option's destination name is derived by
# argparse from its long flag, so the resulting keys in ``args`` are
# service_url, model_name, hf_model_name, input and pooling.
parser = argparse.ArgumentParser(
    description='Compare embeddings responses from HF transformers, OVSentenceTransformer and OVMS',
)
parser.add_argument(
    '--service_url',
    default='http://localhost:8000/v3/embeddings',
    help='Specify url to embeddings endpoint. default:http://localhost:8000/v3/embeddings',
)
parser.add_argument(
    '--model_name',
    default='Alibaba-NLP/gte-large-en-v1.5',
    help='Model name to query. default: Alibaba-NLP/gte-large-en-v1.5',
)
parser.add_argument(
    '--hf_model_name',
    default='',
    help='HuggingFaces model name. default: equal to --model_name',
)
# May be given several times; every occurrence appends one string to the list.
parser.add_argument(
    '--input',
    action='append',
    default=[],
    help='List of strings to query. default: []',
)
parser.add_argument(
    '--pooling',
    choices=["CLS", "LAST", "MEAN"],
    default="CLS",
    help='Embeddings pooling mode',
)

# Expose the parsed namespace as a plain dict for key-based access below.
args = vars(parser.parse_args())

# ``model_id`` is the name sent to the served endpoint; the Hugging Face
# checkpoint defaults to the same name unless --hf_model_name overrides it.
model_id = args['model_name']
hf_model_name = args['hf_model_name'] or model_id

# Reference tokenizer and PyTorch model used to produce the baseline embeddings.
tokenizer = AutoTokenizer.from_pretrained(hf_model_name, trust_remote_code=True)
model_pt = AutoModel.from_pretrained(hf_model_name, trust_remote_code=True)
# Uncomment to enable run_OV(), which reads the module-level ``model_ov``:
#model_ov = OVSentenceTransformer.from_pretrained(model_id, trust_remote_code=True)

# All --input strings collected from the command line, processed as one batch.
text = args['input']
print("input", text)

def run_model():
    """Compute reference embeddings with the Hugging Face PyTorch model.

    Tokenizes the module-level ``text`` batch, runs ``model_pt`` and pools
    ``last_hidden_state`` according to ``args['pooling']``:

    * ``"CLS"``  - hidden state of the first token of every sequence.
    * ``"LAST"`` - hidden state of the last attended token of every sequence.
    * ``"MEAN"`` - attention-mask-weighted mean over the tokens of a sequence.

    The pooled vectors are L2-normalized per row. The elapsed time (which
    includes tokenization) is printed in milliseconds.

    Returns:
        numpy.ndarray: one normalized embedding per input string.
    """
    with torch.no_grad():
        start_time = datetime.datetime.now()
        encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        model_output = model_pt(**encoded)
        hidden = model_output.last_hidden_state
        attention_mask = encoded['attention_mask']
        pooling = args['pooling']

        if pooling == "LAST":
            if bool((attention_mask[:, -1] != 0).all()):
                # Every row attends to its final position: the batch is
                # left-padded (or not padded), so the last token is at -1.
                embeddings = hidden[:, -1]
            else:
                # Right-padded batch: the last real token sits at (length - 1).
                sequence_lengths = attention_mask.sum(dim=1) - 1
                batch_indices = torch.arange(hidden.shape[0], device=hidden.device)
                embeddings = hidden[batch_indices, sequence_lengths]
        elif pooling == "MEAN":
            # Zero out padded positions so they do not contribute to the mean.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            embeddings = (hidden * mask).sum(dim=1) / token_counts
        else:
            # "CLS": first token of every sequence.
            embeddings = hidden[:, 0]

        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        end_time = datetime.datetime.now()
        duration = (end_time - start_time).total_seconds() * 1000
        print("HF Duration:", duration, "ms", type(model_pt).__name__)
        return np.array(embeddings)

def run_OV():
    """Compute embeddings with the module-level ``model_ov`` encoder.

    Requires ``model_ov`` to be defined at module level before the call;
    otherwise the lookup raises ``NameError``.

    Encodes the module-level ``text`` batch and L2-normalizes every embedding
    individually, so each row is scaled by its own norm rather than by the
    norm of the whole batch. The elapsed time is printed in milliseconds.

    Returns:
        numpy.ndarray: the normalized embeddings.
    """
    with torch.no_grad():
        start_time = datetime.datetime.now()
        # assumes encode() returns an array with the embedding dimension last
        # -- TODO confirm against the encoder in use
        embeddings = np.asarray(model_ov.encode(text))
        # Per-vector norm along the embedding axis; the lower bound avoids a
        # division by zero for an all-zero vector.
        norms = np.linalg.norm(embeddings, axis=-1, keepdims=True)
        embeddings = embeddings / np.maximum(norms, 1e-12)
        end_time = datetime.datetime.now()
        duration = (end_time - start_time).total_seconds() * 1000
        print("OV Duration:", duration, "ms", type(model_ov).__name__)
        return embeddings

def run_ovms():
    """Request embeddings for ``text`` from the configured service endpoint.

    Builds an OpenAI client pointed at ``args['service_url']``, sends the
    module-level ``text`` batch for model ``model_id`` and prints how long the
    request took in milliseconds.

    Returns:
        The ``data`` attribute of the embeddings response.
    """
    # Imported here so the dependency is only needed when this path is used.
    from openai import OpenAI

    ovms_client = OpenAI(base_url=args['service_url'], api_key="unused")

    # Only the request itself is timed; client construction is excluded.
    request_started = datetime.datetime.now()
    reply = ovms_client.embeddings.create(input=text, model=model_id)
    elapsed = datetime.datetime.now() - request_started

    elapsed_ms = elapsed.total_seconds() * 1000
    print("OVMS Duration:", elapsed_ms, "ms")
    return reply.data

# Produce the reference (HF) and served (OVMS) embeddings for the same inputs.
HF_embeddings = run_model()
#OV_embeddings = run_OV()
OVMS_embeddings = run_ovms()

# Number of failed checks; each item can contribute up to two failures
# (element-wise tolerance and L2 distance).
failed = 0

# A differing number of results means some inputs would never be compared,
# so treat it as a failure instead of silently checking only a subset.
if len(OVMS_embeddings) != len(HF_embeddings):
    failed += 1
    print("[FAIL] Number of embeddings differs: OVMS", len(OVMS_embeddings),
          "vs HF AutoModel", len(HF_embeddings))

for i, (res, hf_embedding) in enumerate(zip(OVMS_embeddings, HF_embeddings)):
    print("Batch number:", i)
    ovmsresult = np.array(res.embedding)
    with np.printoptions(precision=4, suppress=True):
        print("OVMS embeddings: shape:",ovmsresult.shape, "emb[:20]:\n", ovmsresult[:20])
        #print("OVSentenceTransformer: shape:",OV_embeddings[i].shape, "emb[:20]:\n", OV_embeddings[i][:20])
        print("HF AutoModel: shape:",hf_embedding.shape, "emb[:20]:\n", hf_embedding[:20])

    # Vectors of different shape cannot be subtracted; report it as a failure
    # rather than aborting the remaining comparisons with an exception.
    if ovmsresult.shape != hf_embedding.shape:
        failed += 1
        print("[FAIL] Shape mismatch:", ovmsresult.shape, "vs", hf_embedding.shape)
        continue

    diff = ovmsresult - hf_embedding
    distance = np.linalg.norm(diff)
    print("Difference score with HF AutoModel:", distance)

    # Check 1: every element within an absolute tolerance.
    if np.allclose(ovmsresult, hf_embedding, atol=1e-2):
        print("[PASS] Arrays are within tolerance (atol=1e-2)")
    else:
        failed += 1
        print("[FAIL] Arrays are NOT within tolerance (atol=1e-2)")
        # Print the differences for debugging
        abs_diff = np.abs(diff)
        print(f"Max difference: {abs_diff.max():.6f}")
        print(f"Mean difference: {abs_diff.mean():.6f}")

    # Check 2: overall L2 distance between the two vectors.
    if distance < 0.06:
        print("[PASS] Np linalg.norm")
    else:
        print("[FAIL] Np linalg.norm")
        failed += 1

if failed:
    print("[FAILED]")
    # Exit with a non-zero status explicitly: an ``assert`` is removed when
    # Python runs with -O, which would let a failed comparison exit with 0.
    raise SystemExit(1)
else:
    print("[SUCCESS]")