# Example 3: Direct Document Analysis (No Vector Store)
# This workflow processes documents directly from files, applies AI analysis, and exports results
# Based on the FAR legal analysis example but generalized for any document type
---
nodes:
  # Load documents directly from folder
  document_loader:
    type: LoadFromFolder
    config:
      source_directory: "../sample_data"  # Directory containing documents
      include_patterns: ["*.pdf", "*.txt", "*.html"]  # Multiple file types (globs must stay quoted)
      verbose: true
      pdf_markdown: true  # Convert PDFs for better analysis
      store_file_dates: true  # Include timestamps

  # Keep documents whole for comprehensive analysis
  full_document_processor:
    type: KeepFullDocument
    config:
      concatenate_pages: true  # Combine multi-page documents

  # Apply AI analysis to extract key information
  document_analyzer:
    type: PromptProcessor
    config:
      # {source} and {content} are filled in per document by the processor
      prompt: |
        "Provide a single short keyword or keyphrase that captures the topic of the following text from {source} (only output the keyphrase without quotes and nothing else): {content}"
      llm:
        model_url: "openai://gpt-4o-mini"  # Good balance of quality and speed
        temperature: 0.2  # Focused, consistent analysis
        mute_stream: true
      batch_size: 2  # Process 2 documents at a time

  # Clean up the responses to ensure consistent formatting
  response_cleaner:
    type: ResponseCleaner
    config:
      # {original_response} is the raw output from document_analyzer
      cleanup_prompt: |
        Clean up this analysis response to ensure consistent formatting:

        {original_response}

        Requirements:
        - Ensure the response is a single short keyword or keyphrase that captures the topic and nothing else.
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0  # Deterministic cleanup
        mute_stream: true

  # Aggregate all topics into a comprehensive analysis
  topic_aggregator:
    type: AggregatorNode
    config:
      # {num_results} and {responses} are filled in from the cleaned per-document results
      prompt: |
        Analyze these {num_results} topic keywords from different documents and provide a comprehensive topic analysis:

        {responses}

        Please provide your analysis in the following structure:

        1. TOP TOPICS: List the 5 most frequently mentioned topics with their frequency counts
        2. TOPIC CATEGORIES: Group related topics into broader categories (e.g., Technology, Business, etc.)
        3. COVERAGE ANALYSIS: What percentage of documents cover each major theme?
        4. INSIGHTS: What patterns or trends do you observe across the document collection?
        5. EXECUTIVE SUMMARY: One paragraph summarizing the overall thematic landscape

        Format your response with clear headings and bullet points for easy reading.
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0.3
        max_tokens: 1000
        mute_stream: true

  # Export individual document topics to CSV for detailed analysis
  individual_topics_export:
    type: CSVExporter
    config:
      output_path: "document_analysis_summary.csv"

  # Export aggregated analysis to JSON for programmatic access
  aggregated_export:
    type: JSONExporter
    config:
      output_path: "document_topic_analysis.json"
      pretty_print: true

connections:
  # Pipeline: Load → Keep Full → Analyze → Clean → Export Individual + Aggregate
  - from: document_loader
    from_port: documents
    to: full_document_processor
    to_port: documents

  - from: full_document_processor
    from_port: documents
    to: document_analyzer
    to_port: documents

  - from: document_analyzer
    from_port: results
    to: response_cleaner
    to_port: results

  # Export individual document topics to CSV
  - from: response_cleaner
    from_port: results
    to: individual_topics_export
    to_port: results

  # Aggregate topics for comprehensive analysis
  - from: response_cleaner
    from_port: results
    to: topic_aggregator
    to_port: results

  # Export aggregated analysis to JSON
  - from: topic_aggregator
    from_port: result
    to: aggregated_export
    to_port: result

# Prerequisites:
# 1. Documents in ../sample_data/ directory
# 2. Set your OPENAI_API_KEY environment variable
#
# Usage:
#   python -m onprem.workflow 3_direct_analysis.yaml
#
# Expected output:
# - document_analysis_summary.csv with individual document topic keywords
# - document_topic_analysis.json with aggregated topic analysis across all documents
# - Two-stage processing: initial extraction + response cleanup + aggregation
# - Suitable for any document collection that needs both detailed and summary analysis
#
# Benefits of this approach:
# - No vector store setup required
# - Processes complete documents for full context
# - Provides both individual document insights AND collection-wide topic analysis
# - JSON output enables programmatic access to aggregated insights
# - CSV output allows detailed filtering and analysis of individual documents