/** * Incident Postmortem DAG with Parallel Fan-Out * * Demonstrates DAG task orchestration with three independent root tasks * fanning out from t=0, then synthesizing into a final postmortem document. * * log-pattern-extractor | * correlate-deploys |--> root-cause-hypothesizer --> postmortem-writer * analyze-blast-radius | * departures from that template: * - No FORCE_FAIL / retry-demonstration machinery -- this example's * theme is parallel fan-out + multi-source synthesis, not retry. * - Three parallel root tasks (instead of two), so the parallelism * check asserts that all three start within a 500ms window. * - Final postmortem is also written to disk via os.tmpdir(). * - Token cost on claude-sonnet-4-6 is printed alongside token totals. * * Run: * npx tsx examples/cookbook/incident-postmortem-dag.ts * * Prerequisites: * ANTHROPIC_API_KEY env var must be set. * * Local validation hatch: * EXAMPLE_PROVIDER=openrouter switches every agent + the orchestrator * defaults to a free OpenRouter model (openrouter/free) * via the OpenAI-compatible adapter. The framework reads OPENAI_API_KEY * and OPENAI_BASE_URL from env. */ import { OpenMultiAgent } from '../../src/index.js' import type { AgentConfig, OrchestratorEvent } from '../../src/types.js' import { readFileSync, writeFileSync } from 'fs' import { fileURLToPath } from 'url' import { dirname, join } from 'path' import { tmpdir } from 'os' const __filename = fileURLToPath(import.meta.url) const __dirname = dirname(__filename) // --------------------------------------------------------------------------- // Step 1: Agent configurations (5 agents) // --------------------------------------------------------------------------- const logPatternExtractorConfig: AgentConfig = { name: 'log-pattern-extractor', model: 'claude-sonnet-4-6', provider: 'anthropic', systemPrompt: `You are a log analysis specialist. Parse the provided raw backend logs and identify: - Error clusters (group by error message / endpoint / service) - The timestamp of the first regression (when error rate clearly departs from baseline) - Noisy hosts/services/endpoints Output a JSON object with this shape: { "first_regression_at": "ISO 8601 timestamp", "error_clusters": [ { "service": string, "endpoint": string, "error": string, "count": number, "first_seen": string, "last_seen": string } ], "noisy_endpoints": [ { "endpoint": string, "error_count": number } ], "notes": string } Output ONLY valid JSON, no other text.`, maxTurns: 1, temperature: 0.1, } const deployCorrelatorConfig: AgentConfig = { name: 'deploy-correlator', model: 'claude-sonnet-4-6', provider: 'anthropic', systemPrompt: `You are a deploy correlation analyst. Given a list of recent deploys and a window of incident symptoms, identify which deploy(s) most plausibly caused the symptoms based on timing and changed paths. Output a JSON object: { "ranked_candidates": [ { "sha": string, "deployed_at": string, "summary": string, "confidence": "high" | "medium" | "low", "rationale": string } ], "ruled_out": [ { "sha": string, "reason": string } ] } Output ONLY valid JSON, no other text.`, maxTurns: 1, temperature: 0.1, } const blastRadiusAnalystConfig: AgentConfig = { name: 'blast-radius-analyst', model: 'claude-sonnet-4-6', provider: 'anthropic', systemPrompt: `You are a blast radius analyst. From the provided logs, estimate user-facing impact. Output a JSON object: { "affected_endpoints": [ { "endpoint": string, "method": string } ], "approximate_failed_requests": number, "approximate_total_requests_in_window": number, "error_rate_pct": number, "impact_window": { "start": string, "end": string }, "user_facing_severity": "low" | "medium" | "high" | "critical", "notes": string } Output ONLY valid JSON, no other text.`, maxTurns: 1, temperature: 0.1, } const rootCauseHypothesizerConfig: AgentConfig = { name: 'root-cause-hypothesizer', model: 'claude-sonnet-4-6', provider: 'anthropic', systemPrompt: `You are a root cause analyst. Using the log patterns and deploy correlation from the previous tasks (available via shared memory), propose 2-3 ranked root cause hypotheses. For each hypothesis, include: - A short title - Confidence (high / medium / low) - Supporting evidence (cite specific log clusters and deploy SHAs) - What you'd check next to confirm or refute it Output in Markdown format, with hypotheses as numbered ## headings ranked from most to least likely.`, maxTurns: 1, temperature: 0.2, } const postmortemWriterConfig: AgentConfig = { name: 'postmortem-writer', model: 'claude-sonnet-4-6', provider: 'anthropic', systemPrompt: `You are a postmortem author. Synthesize a final incident postmortem in Markdown using the log patterns, deploy correlation, blast radius, and root cause hypotheses from the previous tasks (available via shared memory). Use these top-level sections, in order: - # Incident Postmortem - ## Summary - ## Timeline - ## Impact - ## Contributing Factors - ## Action Items Keep it tight and factual. Cite specific timestamps, endpoints, and deploy SHAs where they support a claim. Action items should be concrete and assignable.`, maxTurns: 1, temperature: 0.2, } // --------------------------------------------------------------------------- // Step 2: Task configurations (5 tasks) // --------------------------------------------------------------------------- interface TaskConfig { title: string description: string assignee?: string dependsOn?: readonly string[] maxRetries?: number retryDelayMs?: number retryBackoff?: number } // Task configs will be created after reading fixture files (see Step 6b) // --------------------------------------------------------------------------- // Step 3: Progress tracking // --------------------------------------------------------------------------- interface TaskTiming { startTime: number endTime?: number } const taskTimings = new Map() const taskStartTimes = new Map() function handleProgress(event: OrchestratorEvent): void { const ts = new Date().toISOString() switch (event.type) { case 'task_start': { const task = event.data as { id: string; title: string } | undefined if (task) { const now = Date.now() taskTimings.set(task.id, { startTime: now }) taskStartTimes.set(task.title, now) // Use title as key for easier verification console.log(`[${ts}] task_start: ${task.title}`) } break } case 'task_complete': { const task = event.data as { id: string; title: string } | undefined if (task) { const timing = taskTimings.get(task.id) if (timing) { timing.endTime = Date.now() const duration = timing.endTime - timing.startTime console.log(`[${ts}] task_complete: ${task.title} (${duration}ms)`) } } break } case 'task_retry': console.log(`[${ts}] task_retry:`, JSON.stringify(event.data)) break case 'task_skipped': console.log(`[${ts}] task_skipped: ${event.task}`) break case 'agent_start': if (event.agent) { console.log(`[${ts}] agent_start: ${event.agent}`) } break case 'agent_complete': if (event.agent) { console.log(`[${ts}] agent_complete: ${event.agent}`) } break case 'message': if (typeof event.data === 'string') { console.log(`[${ts}] message: ${event.data}`) } break case 'error': console.log(`[${ts}] error:`, JSON.stringify(event.data)) break default: break } } // --------------------------------------------------------------------------- // Step 4: Parallelism verification // --------------------------------------------------------------------------- function verifyParallelism(): void { const t1 = taskStartTimes.get('extract-log-patterns') const t2 = taskStartTimes.get('correlate-deploys') const t3 = taskStartTimes.get('analyze-blast-radius') if (t1 !== undefined && t2 !== undefined && t3 !== undefined) { const spread = Math.max(t1, t2, t3) - Math.min(t1, t2, t3) console.log('\n=== Parallelism Check ===') console.log(`Task 1 (extract-log-patterns) start: ${t1}`) console.log(`Task 2 (correlate-deploys) start: ${t2}`) console.log(`Task 3 (analyze-blast-radius) start: ${t3}`) console.log(`Spread (max - min): ${spread}ms`) if (spread >= 500) { console.error( `ASSERTION FAILED: Tasks 1, 2, 3 start times spread by ${spread}ms (>= 500ms). ` + `Expected parallel execution from t=0.`, ) process.exit(1) } console.log(`Parallel execution (< 500ms): YES`) console.log('========================\n') } else { console.log('\n=== Parallelism Check: Unable to verify (missing timing data) ===\n') } } // --------------------------------------------------------------------------- // Step 5: Orchestrator and Team configuration // --------------------------------------------------------------------------- // Local validation hatch -- flip every agent's provider/model to a free // OpenRouter model when EXAMPLE_PROVIDER=openrouter. The committed // AgentConfig values above remain anthropic / claude-sonnet-4-6. const useOpenRouter = process.env.EXAMPLE_PROVIDER === 'openrouter' const OPENROUTER_MODEL = 'openrouter/free' const agentConfigs: AgentConfig[] = [ logPatternExtractorConfig, deployCorrelatorConfig, blastRadiusAnalystConfig, rootCauseHypothesizerConfig, postmortemWriterConfig, ] const effectiveAgents: AgentConfig[] = useOpenRouter ? agentConfigs.map((c) => ({ ...c, provider: 'openai', model: OPENROUTER_MODEL })) : agentConfigs const orchestrator = new OpenMultiAgent({ defaultModel: useOpenRouter ? OPENROUTER_MODEL : 'claude-sonnet-4-6', defaultProvider: useOpenRouter ? 'openai' : 'anthropic', onProgress: handleProgress, }) const team = orchestrator.createTeam('incident-postmortem-team', { name: 'incident-postmortem-team', agents: effectiveAgents, sharedMemory: true, }) console.log( `[mode] provider=${useOpenRouter ? 'openai' : 'anthropic'} ` + `model=${useOpenRouter ? OPENROUTER_MODEL : 'claude-sonnet-4-6'}`, ) // --------------------------------------------------------------------------- // Step 6: Read fixture files // --------------------------------------------------------------------------- const logsText = readFileSync( join(__dirname, '..', 'fixtures', 'incident-logs.txt'), 'utf-8', ) const deploysText = readFileSync( join(__dirname, '..', 'fixtures', 'incident-deploys.json'), 'utf-8', ) // --------------------------------------------------------------------------- // Step 6b: Create task configs with fixture content embedded // --------------------------------------------------------------------------- const taskConfigs: TaskConfig[] = [ { title: 'extract-log-patterns', description: `Parse the following backend logs and extract structured error patterns.\n\n=== LOGS ===\n${logsText}\n=== END LOGS ===\n\nOutput only valid JSON.`, assignee: 'log-pattern-extractor', }, { title: 'correlate-deploys', description: `Given the recent deploys below, identify which deploy(s) most likely caused the incident symptoms based on timing and changed paths. Symptoms: a sharp regression starting around 14:12 UTC on 2026-04-22, manifesting as 5xx errors. Use the timing alone for now; later tasks will cross-check against logs.\n\n=== DEPLOYS ===\n${deploysText}\n=== END DEPLOYS ===\n\nOutput only valid JSON.`, assignee: 'deploy-correlator', }, { title: 'analyze-blast-radius', description: `Estimate user-facing impact (affected endpoints, approximate failed request count, error rate, severity) from the following logs.\n\n=== LOGS ===\n${logsText}\n=== END LOGS ===\n\nOutput only valid JSON.`, assignee: 'blast-radius-analyst', }, { title: 'hypothesize-cause', description: 'Propose 2-3 ranked root cause hypotheses for the incident, using the log patterns and deploy correlation from the previous tasks. Cite specific log clusters and deploy SHAs.', assignee: 'root-cause-hypothesizer', dependsOn: ['extract-log-patterns', 'correlate-deploys'], maxRetries: 2, retryDelayMs: 500, retryBackoff: 2, }, { title: 'write-postmortem', description: 'Write the final incident postmortem in Markdown, synthesizing the log patterns, deploy correlation, blast radius, and root cause hypotheses from the previous tasks. Use sections: Summary, Timeline, Impact, Contributing Factors, Action Items.', assignee: 'postmortem-writer', dependsOn: ['extract-log-patterns', 'correlate-deploys', 'analyze-blast-radius', 'hypothesize-cause'], maxRetries: 2, retryDelayMs: 500, retryBackoff: 2, }, ] console.log('Incident Postmortem DAG with Parallel Fan-Out') console.log('='.repeat(60)) console.log(`\nLogs: ${logsText.split('\n').length} lines`) console.log(`Deploys: ${(JSON.parse(deploysText) as unknown[]).length} entries\n`) // --------------------------------------------------------------------------- // Step 7: Run the DAG // --------------------------------------------------------------------------- async function main(): Promise { try { console.log('[Orchestration] Starting DAG execution...\n') const result = await orchestrator.runTasks(team, taskConfigs) // Verify parallelism verifyParallelism() console.log('\n' + '='.repeat(60)) console.log('FINAL RESULT') console.log('='.repeat(60)) console.log(`Success: ${result.success}`) console.log(`Total tasks: ${result.tasks?.length ?? taskConfigs.length}`) console.log(`Total token usage: ${result.totalTokenUsage.input_tokens} input, ${result.totalTokenUsage.output_tokens} output`) // Cost estimate on claude-sonnet-4-6 -- even when running on OpenRouter, // this represents what the same token volume would cost on Anthropic. const inputCost = (result.totalTokenUsage.input_tokens / 1_000_000) * 3 const outputCost = (result.totalTokenUsage.output_tokens / 1_000_000) * 15 console.log( `Estimated cost on claude-sonnet-4-6: ~$${(inputCost + outputCost).toFixed(4)} USD ` + `(input @ $3/M, output @ $15/M)`, ) // Output each agent's result console.log('\n' + '='.repeat(60)) console.log('AGENT RESULTS') console.log('='.repeat(60)) for (const [agentName, agentResult] of result.agentResults) { console.log(`\n--- ${agentName} ---`) console.log(agentResult.success ? agentResult.output : `[FAILED] ${agentResult.output}`) } if (result.success) { console.log('\n--- Final Postmortem ---') const writerResult = result.agentResults.get('postmortem-writer') const writerOutput = writerResult?.output ?? '' if (writerOutput) { console.log(writerOutput) const outPath = join(tmpdir(), 'incident-postmortem.md') writeFileSync(outPath, writerOutput, 'utf-8') console.log(`\nPostmortem written to: ${outPath}`) } } else { console.log('\nWorkflow failed.') for (const task of result.tasks ?? []) { if (task.status === 'failed') { console.log(` - ${task.title}: failed`) } } } console.log('\n' + '='.repeat(60)) console.log('Done.') } catch (error) { console.error('Fatal error:', error) process.exit(1) } } main()