### Python

```python
from autoevals.llm import *
import asyncio

# Create a new LLM-based evaluator
evaluator = Factuality()

# Synchronous evaluation
input = "Which country has the highest population?"
output = "People's Republic of China"
expected = "China"

# Using the synchronous API
result = evaluator(output, expected, input=input)
print(f"Factuality score (sync): {result.score}")
print(f"Factuality metadata (sync): {result.metadata['rationale']}")

# Using the asynchronous API
async def main():
    result = await evaluator.eval_async(output, expected, input=input)
    print(f"Factuality score (async): {result.score}")
    print(f"Factuality metadata (async): {result.metadata['rationale']}")

# Run the async example
asyncio.run(main())
```

### TypeScript

```typescript
import { Factuality } from "autoevals";

(async () => {
  const input = "Which country has the highest population?";
  const output = "People's Republic of China";
  const expected = "China";

  const result = await Factuality({ output, expected, input });
  console.log(`Factuality score: ${result.score}`);
  console.log(`Factuality metadata: ${result.metadata?.rationale}`);
})();
```

### Python

```python
# NOTE: ensure BRAINTRUST_API_KEY is set in your environment
from autoevals.llm import *

# Create an LLM-based evaluator using the Claude 3.5 Sonnet model from Anthropic
evaluator = Factuality(model="claude-3-5-sonnet-latest")

# Evaluate an example LLM completion
input = "Which country has the highest population?"
output = "People's Republic of China"
expected = "China"

result = evaluator(output, expected, input=input)

# The evaluator returns a score from [0,1] and includes the raw outputs from the evaluator
print(f"Factuality score: {result.score}")
print(f"Factuality metadata: {result.metadata['rationale']}")
```

### TypeScript

```typescript
// NOTE: ensure BRAINTRUST_API_KEY is set in your environment
import { Factuality } from "autoevals";

(async () => {
  const input = "Which country has the highest population?";
  const output = "People's Republic of China";
  const expected = "China";

  // Run an LLM-based evaluator using the Claude 3.5 Sonnet model from Anthropic
  const result = await Factuality({
    model: "claude-3-5-sonnet-latest",
    output,
    expected,
    input,
  });

  // The evaluator returns a score from [0,1] and includes the raw outputs from the evaluator
  console.log(`Factuality score: ${result.score}`);
  console.log(`Factuality metadata: ${result.metadata?.rationale}`);
})();
```

#### Python

```python
import openai
import asyncio
from autoevals import init
from autoevals.llm import Factuality

client = init(openai.AsyncOpenAI(base_url="https://api.openai.com/v1/"))

async def main():
    evaluator = Factuality()
    result = await evaluator.eval_async(
        input="What is the speed of light in a vacuum?",
        output="The speed of light in a vacuum is 299,792,458 meters per second.",
        expected="The speed of light in a vacuum is approximately 300,000 kilometers per second."
    )
    print(f"Factuality score: {result.score}")

asyncio.run(main())
```

#### TypeScript

```typescript
import OpenAI from "openai";
import { init, Factuality } from "autoevals";

const client = new OpenAI({
  baseURL: "https://api.openai.com/v1/",
});

init({ client });

(async () => {
  const result = await Factuality({
    input: "What is the speed of light in a vacuum?",
    output: "The speed of light in a vacuum is 299,792,458 meters per second.",
    expected:
      "The speed of light in a vacuum is approximately 300,000 kilometers per second (or precisely 299,792,458 meters per second).",
  });

  console.log("Factuality Score:", result.score);
})();
```

#### Python

```python
import openai
from autoevals.llm import Factuality

custom_client = openai.OpenAI(base_url="https://custom-api.example.com/v1/")
evaluator = Factuality(client=custom_client)
```

#### TypeScript

```typescript
import OpenAI from "openai";
import { Factuality } from "autoevals";

(async () => {
  const customClient = new OpenAI({
    baseURL: "https://custom-api.example.com/v1/",
  });

  const result = await Factuality({
    client: customClient,
    output: "Paris is the capital of France",
    expected:
      "Paris is the capital of France and has a population of over 2 million",
    input: "Tell me about Paris",
  });
  console.log(result);
})();
```

### TypeScript

Create a file named `example.eval.js` (it must take the form `*.eval.[ts|tsx|js|jsx]`):

```typescript
import { Eval } from "braintrust";
import { Factuality } from "autoevals";

Eval("Autoevals", {
  data: () => [
    {
      input: "Which country has the highest population?",
      expected: "China",
    },
  ],
  task: () => "People's Republic of China",
  scores: [Factuality],
});
```

Then, run

```bash
npx braintrust run example.eval.js
```

### Python

Create a file named `eval_example.py` (it must take the form `eval_*.py`):

```python
from braintrust import Eval
from autoevals.llm import Factuality

Eval(
    "Autoevals",
    data=lambda: [
        dict(
            input="Which country has the highest population?",
            expected="China",
        ),
    ],
    task=lambda *args: "People's Republic of China",
    scores=[Factuality],
)
```

### Python

```python
from autoevals import LLMClassifier

# Define a prompt prefix for an LLMClassifier (returns just one answer)
prompt_prefix = """
You are a technical project manager who helps software engineers generate better titles for their GitHub issues.
You will look at the issue description, and pick which of two titles better describes it.

I'm going to provide you with the issue description, and two possible titles.

Issue Description: {{input}}

1: {{output}}
2: {{expected}}
"""

# Define the scoring mechanism
# 1 if the generated answer is better than the expected answer
# 0 otherwise
output_scores = {"1": 1, "2": 0}

evaluator = LLMClassifier(
    name="TitleQuality",
    prompt_template=prompt_prefix,
    choice_scores=output_scores,
    use_cot=True,
)

# Evaluate an example LLM completion
page_content = """
As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client,
We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy?
Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification"""
output = "Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX"
expected = "Standardize Error Responses across APIs"

response = evaluator(output, expected, input=page_content)

print(f"Score: {response.score}")
print(f"Metadata: {response.metadata}")
```

### TypeScript

```typescript
import { LLMClassifierFromTemplate } from "autoevals";

(async () => {
  const promptTemplate = `You are a technical project manager who helps software engineers generate better titles for their GitHub issues.
You will look at the issue description, and pick which of two titles better describes it.

I'm going to provide you with the issue description, and two possible titles.

Issue Description: {{input}}

1: {{output}}
2: {{expected}}`;

  const choiceScores = { 1: 1, 2: 0 };

  const evaluator = LLMClassifierFromTemplate<{ input: string }>({
    name: "TitleQuality",
    promptTemplate,
    choiceScores,
    useCoT: true,
  });

  const input = `As suggested by Nicolo, we should standardize the error responses coming from GoTrue, postgres, and realtime (and any other/future APIs) so that it's better DX when writing a client,
We can make this change on the servers themselves, but since postgrest and gotrue are fully/partially external may be harder to change, it might be an option to transform the errors within the client libraries/supabase-js, could be messy?
Nicolo also dropped this as a reference: http://spec.openapis.org/oas/v3.0.3#openapi-specification`;
  const output = `Standardize error responses from GoTrue, Postgres, and Realtime APIs for better DX`;
  const expected = `Standardize Error Responses across APIs`;

  const response = await evaluator({ input, output, expected });

  console.log("Score", response.score);
  console.log("Metadata", response.metadata);
})();
```

### Python

```python
from autoevals import Score

def banana_scorer(output, expected, input):
    return Score(name="banana_scorer", score=1 if "banana" in output else 0)

input = "What is 1 banana + 2 bananas?"
output = "3"
expected = "3 bananas"

result = banana_scorer(output, expected, input)

print(f"Banana score: {result.score}")
```

### TypeScript

```typescript
import { Score } from "autoevals";

const bananaScorer = ({
  output,
  expected,
  input,
}: {
  output: string;
  expected: string;
  input: string;
}): Score => {
  return { name: "banana_scorer", score: output.includes("banana") ? 1 : 0 };
};

(async () => {
  const input = "What is 1 banana + 2 bananas?";
  const output = "3";
  const expected = "3 bananas";

  const result = bananaScorer({ output, expected, input });
  console.log(`Banana score: ${result.score}`);
})();
```