""" OpenVINO LLM chat sample without chat template. This sample is meant to test models that do not have a chat template. For better results, use a chat model (usually named -instruct or -chat) and use the llm_chat.py sample instead. This chat will not have history, it is purely meant to test model outputs. Prerequisites: - pip install openvino-genai - an OpenVINO LLM. See https://github.com/helena-intel/readmes/blob/main/genai-best-practices.md Usage: python llm_test.py /path/to/ov_model DEVICE Modified from https://github.com/openvinotoolkit/openvino.genai/tree/master/samples/python/chat_sample """ import argparse import time import openvino_genai def streamer(subword): print(subword, end="", flush=True) # Return flag corresponds whether generation should be stopped. # False means continue generation. return False parser = argparse.ArgumentParser() parser.add_argument("model_dir") parser.add_argument("device") args = parser.parse_args() pipeline_config = {"CACHE_DIR": "model_cache"} pipe = openvino_genai.LLMPipeline(args.model_dir, args.device, **pipeline_config) config = pipe.get_generation_config() config.max_new_tokens = 100 config.do_sample = False config.apply_chat_template = False # From 2025.1, chat templates are automatically enabled # warmup inference pipe.generate("hello", max_new_tokens=1, do_sample=False, apply_chat_template=False) while True: try: prompt = input("prompt:\n") except EOFError: break start = time.perf_counter() pipe.generate(prompt, config, streamer) end = time.perf_counter() print() print(f"Inference duration: {end-start:.2f} seconds") print("\n----------")