{ "model_id": "gpt-5.4", "provider_id": "openai", "quality": { "accuracy": 86.8, "correct": 210, "total": 242, "model_id": "gpt-5.4", "provider_id": "openai", "sample_id": "conv-43" }, "reflect": { "accuracy": 76.9, "correct": 186, "total": 242, "avg_latency_s": 9.769, "model_id": "gpt-5.4", "provider_id": "openai", "sample_id": "conv-43" }, "retain": { "timestamp": "2026-03-10T15:55:18.437780+00:00", "model_id": "gpt-5.4", "model_name": "gpt-5.4", "provider_id": "openai", "size_gb": 0.0, "dataset": "locomo_3k_50", "concurrency": 4, "wall_s": 449.54648303985596, "summary": { "success": 50, "total": 50, "wall_s": 449.546, "avg_latency_s": 34.839, "throughput_rps": 0.111, "completion_toks_s": 259.9, "total_toks_s": 705.5, "out_in_ratio": 0.583, "tokens_per_fact": 91.9 }, "tests": [ { "test_index": 1, "latency_s": 35.23434519767761, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3948, "completion_tokens": 2320, "error": "" }, { "test_index": 2, "latency_s": 45.20960807800293, "num_facts": 27, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4063, "completion_tokens": 2459, "error": "" }, { "test_index": 3, "latency_s": 29.600133895874023, "num_facts": 20, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3913, "completion_tokens": 1996, "error": "" }, { "test_index": 4, "latency_s": 43.21213698387146, "num_facts": 28, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4123, "completion_tokens": 2401, "error": "" }, { "test_index": 5, "latency_s": 23.99639916419983, "num_facts": 20, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3991, "completion_tokens": 1836, "error": "" }, { "test_index": 6, "latency_s": 25.42018985748291, "num_facts": 23, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4038, "completion_tokens": 1900, "error": "" }, { "test_index": 7, "latency_s": 39.847088098526, "num_facts": 27, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3948, "completion_tokens": 2518, "error": "" }, { "test_index": 8, "latency_s": 32.30544424057007, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4114, "completion_tokens": 2484, "error": "" }, { "test_index": 9, "latency_s": 26.528470039367676, "num_facts": 20, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3829, "completion_tokens": 1615, "error": "" }, { "test_index": 10, "latency_s": 24.324678897857666, "num_facts": 18, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3805, "completion_tokens": 1693, "error": "" }, { "test_index": 11, "latency_s": 40.38941788673401, "num_facts": 30, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3951, "completion_tokens": 2702, "error": "" }, { "test_index": 12, "latency_s": 34.30115985870361, "num_facts": 24, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4080, "completion_tokens": 2235, "error": "" }, { "test_index": 13, "latency_s": 33.41103792190552, "num_facts": 24, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3972, "completion_tokens": 2332, "error": "" }, { "test_index": 14, "latency_s": 34.30152606964111, "num_facts": 24, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4063, "completion_tokens": 2172, "error": "" }, { "test_index": 15, "latency_s": 29.670932054519653, "num_facts": 21, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4125, "completion_tokens": 2115, "error": "" }, { "test_index": 16, "latency_s": 45.76071500778198, "num_facts": 28, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4000, "completion_tokens": 2533, "error": "" }, { "test_index": 17, "latency_s": 30.819142818450928, "num_facts": 22, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4080, "completion_tokens": 2155, "error": "" }, { "test_index": 18, "latency_s": 33.90704894065857, "num_facts": 23, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4062, "completion_tokens": 2141, "error": "" }, { "test_index": 19, "latency_s": 37.16212463378906, "num_facts": 29, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3888, "completion_tokens": 2724, "error": "" }, { "test_index": 20, "latency_s": 39.18040180206299, "num_facts": 28, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4086, "completion_tokens": 2895, "error": "" }, { "test_index": 21, "latency_s": 27.45640778541565, "num_facts": 19, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4080, "completion_tokens": 2031, "error": "" }, { "test_index": 22, "latency_s": 37.90520095825195, "num_facts": 25, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4000, "completion_tokens": 2507, "error": "" }, { "test_index": 23, "latency_s": 32.972245931625366, "num_facts": 31, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4063, "completion_tokens": 2005, "error": "" }, { "test_index": 24, "latency_s": 30.2575900554657, "num_facts": 23, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4038, "completion_tokens": 2098, "error": "" }, { "test_index": 25, "latency_s": 47.74693822860718, "num_facts": 32, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4086, "completion_tokens": 3242, "error": "" }, { "test_index": 26, "latency_s": 25.396655082702637, "num_facts": 20, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4062, "completion_tokens": 1745, "error": "" }, { "test_index": 27, "latency_s": 27.614996910095215, "num_facts": 21, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3805, "completion_tokens": 1980, "error": "" }, { "test_index": 28, "latency_s": 39.27312207221985, "num_facts": 25, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4000, "completion_tokens": 2634, "error": "" }, { "test_index": 29, "latency_s": 40.50503396987915, "num_facts": 29, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4123, "completion_tokens": 2438, "error": "" }, { "test_index": 30, "latency_s": 39.61660194396973, "num_facts": 32, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3923, "completion_tokens": 2765, "error": "" }, { "test_index": 31, "latency_s": 35.213732957839966, "num_facts": 25, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3991, "completion_tokens": 2289, "error": "" }, { "test_index": 32, "latency_s": 44.60033178329468, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4063, "completion_tokens": 2550, "error": "" }, { "test_index": 33, "latency_s": 43.36257600784302, "num_facts": 30, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4109, "completion_tokens": 2981, "error": "" }, { "test_index": 34, "latency_s": 46.94837522506714, "num_facts": 40, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3951, "completion_tokens": 3087, "error": "" }, { "test_index": 35, "latency_s": 33.482800006866455, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4125, "completion_tokens": 2385, "error": "" }, { "test_index": 36, "latency_s": 33.02161693572998, "num_facts": 24, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4063, "completion_tokens": 2265, "error": "" }, { "test_index": 37, "latency_s": 41.26958608627319, "num_facts": 32, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3923, "completion_tokens": 2782, "error": "" }, { "test_index": 38, "latency_s": 25.73211407661438, "num_facts": 23, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3829, "completion_tokens": 1948, "error": "" }, { "test_index": 39, "latency_s": 32.329679012298584, "num_facts": 27, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3972, "completion_tokens": 2440, "error": "" }, { "test_index": 40, "latency_s": 38.21934795379639, "num_facts": 29, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3888, "completion_tokens": 2448, "error": "" }, { "test_index": 41, "latency_s": 42.29778981208801, "num_facts": 27, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3972, "completion_tokens": 2732, "error": "" }, { "test_index": 42, "latency_s": 30.55418300628662, "num_facts": 20, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3805, "completion_tokens": 1816, "error": "" }, { "test_index": 43, "latency_s": 35.04198694229126, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4123, "completion_tokens": 2273, "error": "" }, { "test_index": 44, "latency_s": 33.57872009277344, "num_facts": 25, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4114, "completion_tokens": 2434, "error": "" }, { "test_index": 45, "latency_s": 31.52805995941162, "num_facts": 25, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3923, "completion_tokens": 2308, "error": "" }, { "test_index": 46, "latency_s": 33.049763202667236, "num_facts": 26, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3991, "completion_tokens": 2172, "error": "" }, { "test_index": 47, "latency_s": 30.60388708114624, "num_facts": 24, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4114, "completion_tokens": 2134, "error": "" }, { "test_index": 48, "latency_s": 26.884108066558838, "num_facts": 22, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4109, "completion_tokens": 2058, "error": "" }, { "test_index": 49, "latency_s": 40.443859338760376, "num_facts": 27, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 4086, "completion_tokens": 2874, "error": "" }, { "test_index": 50, "latency_s": 30.443174123764038, "num_facts": 23, "valid_json": true, "success": true, "retries": 0, "prompt_tokens": 3913, "completion_tokens": 2192, "error": "" } ] } }