# Example 1: State of the Union Thematic Analysis
#
# This workflow downloads the State of the Union address, chunks it, and
# performs thematic analysis.
#
# Data flow (mirrors the `connections` section at the bottom):
#   document_loader -> text_chunker -> thematic_analyzer -> theme_cleaner
#   theme_cleaner   -> chunk_themes_export                   (per-chunk CSV)
#   theme_cleaner   -> sotu_aggregator -> sotu_analysis_export (summary JSON)
#
# NOTE: prompts use `|` literal block scalars, so line breaks inside them are
# part of the prompt text. Lines starting with `#` inside a prompt are
# Markdown headings sent to the model, not YAML comments.
---
nodes:
  # Download the State of the Union document from GitHub
  document_loader:
    type: LoadWebDocument
    config:
      url: "https://raw.githubusercontent.com/amaiya/onprem/refs/heads/master/nbs/tests/sample_data/sotu/state_of_the_union.txt"

  # Chunk the document into 1000-character pieces for detailed analysis
  text_chunker:
    type: SplitByCharacterCount
    config:
      chunk_size: 1000  # 1000 characters per chunk
      chunk_overlap: 100  # 100 character overlap between chunks

  # Extract thematic topics from each chunk
  thematic_analyzer:
    type: PromptProcessor
    config:
      # `{content}` is a template placeholder; presumably it is replaced with
      # the text of each chunk (confirm against the PromptProcessor docs).
      prompt: |
        Analyze this excerpt from the State of the Union address and identify the main theme or policy area discussed.
        Provide only a single keyword or short phrase (2-3 words max) that captures the primary theme:

        {content}

        Theme:
      llm:
        model_url: "openai://gpt-4o-mini"  # Good balance of quality and speed
        temperature: 0.3  # Some creativity for theme identification
        mute_stream: true
      batch_size: 3  # Process 3 chunks at a time

  # Clean up theme responses for consistency
  theme_cleaner:
    type: ResponseCleaner
    config:
      # `{original_response}` is a template placeholder; presumably it is
      # replaced with the raw answer from the upstream node (confirm).
      cleanup_prompt: |
        Clean up this State of the Union theme response to ensure consistent formatting:

        {original_response}

        Requirements:
        - Remove any extraneous text like "Theme:" or "The theme is"
        - Ensure the response is only a single keyword or short phrase (2-3 words max)
        - Capitalize properly (e.g., "Economic Policy" not "economic policy")
        - Remove quotes, periods, or other punctuation
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0  # Deterministic cleanup
        mute_stream: true

  # Aggregate themes into comprehensive State of the Union analysis
  sotu_aggregator:
    type: AggregatorNode
    config:
      # `{num_results}` and `{responses}` are template placeholders for the
      # count and the list of cleaned themes, judging by how the prompt uses
      # them (confirm against the AggregatorNode docs).
      prompt: |
        Analyze these {num_results} thematic keywords extracted from chunks of a State of the Union address and provide a comprehensive analysis:

        Themes identified:
        {responses}

        Please provide your analysis in this structure:

        ## STATE OF THE UNION THEMATIC ANALYSIS

        ### 1. PRIMARY POLICY AREAS
        List the 5-7 most prominent policy themes with their frequency/emphasis

        ### 2. POLICY CATEGORIES
        Group related themes into major areas (e.g., Domestic Policy, Foreign Policy, Economic Policy, Social Issues)

        ### 3. SPEECH FOCUS
        What percentage of the speech is devoted to each major policy category?

        ### 4. THEMATIC INSIGHTS
        - What are the administration's key priorities based on theme frequency?
        - Are there any surprising or underemphasized themes?
        - What narrative does the thematic structure tell?

        ### 5. EXECUTIVE SUMMARY
        One paragraph summarizing the overall thematic priorities and focus of this State of the Union address.

        Use clear formatting with bullet points and percentages where appropriate.
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0.4  # Allow for analytical creativity
        max_tokens: 1500  # More space for comprehensive analysis
        mute_stream: true

  # Export individual chunk themes to CSV for detailed analysis
  chunk_themes_export:
    type: CSVExporter
    config:
      output_path: "sotu_chunk_themes.csv"

  # Export comprehensive SOTU analysis to JSON
  sotu_analysis_export:
    type: JSONExporter
    config:
      output_path: "sotu_thematic_analysis.json"
      pretty_print: true

connections:
  # Pipeline: Download → Chunk → Analyze Themes → Clean → Export Individual + Aggregate
  - from: document_loader
    from_port: documents
    to: text_chunker
    to_port: documents

  - from: text_chunker
    from_port: documents
    to: thematic_analyzer
    to_port: documents

  - from: thematic_analyzer
    from_port: results
    to: theme_cleaner
    to_port: results

  # Export individual chunk themes to CSV
  # (theme_cleaner's `results` port fans out to two consumers: this exporter
  # and the aggregator below.)
  - from: theme_cleaner
    from_port: results
    to: chunk_themes_export
    to_port: results

  # Aggregate themes for comprehensive SOTU analysis
  - from: theme_cleaner
    from_port: results
    to: sotu_aggregator
    to_port: results

  # Export comprehensive analysis to JSON
  # (the aggregator is wired through the singular `result` port, unlike the
  # plural `results` port used by the per-chunk nodes above.)
  - from: sotu_aggregator
    from_port: result
    to: sotu_analysis_export
    to_port: result

# Prerequisites:
# - Set your OPENAI_API_KEY environment variable
# - Internet connection to download the State of the Union text
#
# Usage:
#   python -m onprem.workflow 1_direct_analysis.yaml
#
# Expected output:
# - sotu_chunk_themes.csv: Individual themes from each 1000-character chunk
# - sotu_thematic_analysis.json: Comprehensive analysis of State of the Union themes
#
# What this workflow does:
# 1. Downloads the State of the Union address from GitHub
# 2. Chunks it into 1000-character segments with 100-character overlap
# 3. Extracts thematic keywords from each chunk using AI analysis
# 4. Cleans and standardizes the theme responses
# 5. Aggregates all themes into a comprehensive policy analysis
# 6. Exports both granular (CSV) and summary (JSON) results
#
# Benefits of this approach:
# - No local files required - downloads document automatically
# - Demonstrates chunking strategy for long documents
# - Shows thematic analysis of political speech
# - Provides both detailed chunk-level themes and high-level insights
# - Perfect example of an aggregation workflow for content analysis