"""Benchmark an LLM's token-generation speed on CPU vs. iGPU via llama-cpp-python.

Runs ten unique prompts per device, streams the responses, and reports
per-question and average tokens/sec, then prints a comparison table.
"""

import gc
import sys
import time

from llama_cpp import Llama

# Configuration
MODEL_PATH = "path/to/etc/Models/DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf"

# 10 Unique Questions for the CPU
CPU_QUESTIONS = [
    "Logic: If all A are B and some B are C, is it necessarily true that some A are C? Explain your reasoning step-by-step.",
    "Health: What are the primary lifestyle factors that contribute to improving heart health, and how do they interact?",
    "History: Summarize the significance of the Magna Carta in the context of modern constitutional law.",
    "Coding: Write a Python function using recursion to solve the Tower of Hanoi problem.",
    "Creativity: Write a 10-line poem about the intersection of ancient nature and cyberpunk technology.",
    "Fact: Provide a detailed biography of Marie Curie, focusing on her two Nobel Prizes.",
    "Math: Explain the concept of the 'Golden Ratio' and find three examples of it in nature.",
    "Tech: Explain how a 'Virtual Private Network' (VPN) works using an analogy involving a physical envelope.",
    "Ethics: Discuss the ethical dilemmas of self-driving cars in 'trolley problem' scenarios.",
    "Food: Explain the science of 'Maillard Reaction' and why it is crucial for searing a steak.",
]

# 10 Different Unique Questions for the iGPU
GPU_QUESTIONS = [
    "Logic: Brothers and sisters I have none, but that man's father is my father's son. Who is in the photo? Walk me through the logic.",
    "Health: Explain the physiological differences between Type 1 and Type 2 diabetes for a medical student.",
    "History: Analyze the socio-economic causes that led directly to the storming of the Bastille.",
    "Coding: Write a Python script that implements a basic Blockchain with blocks, hashes, and a simple proof-of-work.",
    "Creativity: Describe a futuristic library where books are stored as biological DNA memories in 100 words.",
    "Fact: Explain the 'Tallest Mountain' debate: Everest vs. Mauna Kea vs. Chimborazo.",
    # Fixed: this prompt was previously a string literal broken across two
    # physical lines (a SyntaxError); joined into one line.
    "Math: What is the 'Monty Hall Problem'? Explain why the math suggests you should always switch doors.",
    "Tech: Compare HDD, SSD, and NVMe technologies in terms of latency, IOPS, and physical longevity.",
    "Ethics: Should AI models be allowed to have 'copyright' over the art or code they generate?",
    "Food: Detail the chemical process of fermentation in sourdough bread and why it differs from commercial yeast.",
]


def run_streaming_bench(device_label, questions, n_gpu_layers):
    """Load the model, stream an answer to every question, and time generation.

    Args:
        device_label: Human-readable name of the device under test (for logs).
        questions: List of prompt strings, run sequentially.
        n_gpu_layers: Layers to offload to the GPU (0 = pure CPU).

    Returns:
        Average tokens/sec across all questions, or 0 on failure.
    """
    print(f"\n{'='*70}\n>>> STARTING BATTLE: {device_label}\n{'='*70}")
    llm = None
    try:
        # Load model fresh for each device so runs don't share state.
        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=4096,  # Comfortable context for 16GB dual-channel RAM
            n_threads=4,  # Optimized for i3-8th Gen
            n_gpu_layers=n_gpu_layers,
            verbose=False,
        )

        total_tokens = 0
        total_gen_time = 0.0

        for i, q in enumerate(questions):
            # len(questions) instead of a hard-coded 10, so the list can change.
            print(f"\n[Q{i+1}/{len(questions)}] {q[:65]}...")
            print("Response: ", end="", flush=True)

            start_trigger = time.perf_counter()
            q_token_count = 0
            # stream=True with NO max_tokens limit (None); each streamed chunk
            # is one generated token in llama-cpp-python.
            stream = llm(q, max_tokens=None, stream=True, stop=["<|EOT|>"])
            for output in stream:
                print(output["choices"][0]["text"], end="", flush=True)
                q_token_count += 1
            duration = time.perf_counter() - start_trigger

            # Guard against a (near-)zero duration, e.g. an immediate stop.
            tps = q_token_count / duration if duration > 0 else 0.0
            print(f"\n\n--- Stats: {q_token_count} tokens | {tps:.2f} t/s ---")

            total_tokens += q_token_count
            total_gen_time += duration

        # Guard against division by zero for an empty question list.
        avg_tps = total_tokens / total_gen_time if total_gen_time > 0 else 0.0
        print(f"\n[DEVICE FINAL] {device_label}: {avg_tps:.2f} tokens/sec (Average)")
        return avg_tps
    except Exception as e:
        print(f"\nError on {device_label}: {e}")
        return 0
    finally:
        # Explicit unload to prevent OOM on 16GB — now runs on the error path
        # too, so a failed run doesn't leave the model resident.
        del llm
        gc.collect()
        time.sleep(5)  # Let the hardware cool down


def main():
    """Run both device benchmarks and print the comparison report."""
    # Test 1: CPU
    cpu_score = run_streaming_bench("CPU ", CPU_QUESTIONS, 0)
    # Test 2: iGPU
    gpu_score = run_streaming_bench("iGPU ", GPU_QUESTIONS, 1)

    # Final Comparison Report
    print(f"\n{'#'*60}")
    print(f"{'DEVICE':<25} | {'AVG SPEED (TPS)':<20}")
    print(f"{'-'*55}")
    print(f"{'CPU ':<25} | {cpu_score:<20.2f}")
    print(f"{'iGPU ':<25} | {gpu_score:<20.2f}")
    print(f"{'#'*60}")

    if gpu_score > cpu_score:
        # Guard both branches against a zero denominator (failed run returns 0).
        diff = ((gpu_score / cpu_score) - 1) * 100 if cpu_score > 0 else 0
        print(f"WINNER: iGPU is {diff:.1f}% faster over 10 unique tasks.")
    else:
        diff = ((cpu_score / gpu_score) - 1) * 100 if gpu_score > 0 else 0
        print(f"WINNER: CPU is {diff:.1f}% faster over 10 unique tasks.")


if __name__ == "__main__":
    main()