# Example 3: Direct Document Analysis (No Vector Store)
# This workflow processes documents directly from files, applies AI analysis, and exports results
# Based on the FAR legal analysis example but generalized for any document type
---
nodes:
  # Load documents directly from folder
  document_loader:
    type: LoadFromFolder
    config:
      source_directory: "../sample_data"  # Directory containing documents
      include_patterns: ["*.pdf", "*.txt", "*.html"]  # Multiple file types (globs must stay quoted)
      verbose: true
      pdf_markdown: true  # Convert PDFs for better analysis
      store_file_dates: true  # Include timestamps

  # Keep documents whole for comprehensive analysis
  full_document_processor:
    type: KeepFullDocument
    config:
      concatenate_pages: true  # Combine multi-page documents

  # Apply AI analysis to extract key information
  document_analyzer:
    type: PromptProcessor
    config:
      # {source} and {content} are filled in per document by the processor
      prompt: |
        "Provide a single short keyword or keyphrase that captures the topic of the following text from {source} (only output the keyphrase without quotes and nothing else): {content}"
      llm:
        model_url: "openai://gpt-4o-mini"  # Good balance of quality and speed
        temperature: 0.2  # Focused, consistent analysis
        mute_stream: true
      batch_size: 2  # Process 2 documents at a time

  # Clean up the responses to ensure consistent formatting
  response_cleaner:
    type: ResponseCleaner
    config:
      # {original_response} is the raw output from document_analyzer
      cleanup_prompt: |
        Clean up this analysis response to ensure consistent formatting:

        {original_response}

        Requirements:
        - Ensure the response is a single short keyword or keyphrase that captures the topic and nothing else.
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0  # Deterministic cleanup
        mute_stream: true

  # Aggregate all topics into a comprehensive analysis
  topic_aggregator:
    type: AggregatorNode
    config:
      # {num_results} and {responses} are filled in from the cleaned per-document results
      prompt: |
        Analyze these {num_results} topic keywords from different documents and provide a comprehensive topic analysis:

        {responses}

        Please provide your analysis in the following structure:

        1. TOP TOPICS: List the 5 most frequently mentioned topics with their frequency counts
        2. TOPIC CATEGORIES: Group related topics into broader categories (e.g., Technology, Business, etc.)
        3. COVERAGE ANALYSIS: What percentage of documents cover each major theme?
        4. INSIGHTS: What patterns or trends do you observe across the document collection?
        5. EXECUTIVE SUMMARY: One paragraph summarizing the overall thematic landscape

        Format your response with clear headings and bullet points for easy reading.
      llm:
        model_url: "openai://gpt-4o-mini"
        temperature: 0.3
        max_tokens: 1000
        mute_stream: true

  # Export individual document topics to CSV for detailed analysis
  individual_topics_export:
    type: CSVExporter
    config:
      output_path: "document_analysis_summary.csv"

  # Export aggregated analysis to JSON for programmatic access
  aggregated_export:
    type: JSONExporter
    config:
      output_path: "document_topic_analysis.json"
      pretty_print: true

connections:
  # Pipeline: Load → Keep Full → Analyze → Clean → Export Individual + Aggregate
  - from: document_loader
    from_port: documents
    to: full_document_processor
    to_port: documents

  - from: full_document_processor
    from_port: documents
    to: document_analyzer
    to_port: documents

  - from: document_analyzer
    from_port: results
    to: response_cleaner
    to_port: results

  # Export individual document topics to CSV
  - from: response_cleaner
    from_port: results
    to: individual_topics_export
    to_port: results

  # Aggregate topics for comprehensive analysis
  - from: response_cleaner
    from_port: results
    to: topic_aggregator
    to_port: results

  # Export aggregated analysis to JSON
  - from: topic_aggregator
    from_port: result
    to: aggregated_export
    to_port: result

# Prerequisites:
# 1. Documents in ../sample_data/ directory
# 2. Set your OPENAI_API_KEY environment variable
#
# Usage:
#   python -m onprem.workflow 3_direct_analysis.yaml
#
# Expected output:
# - document_analysis_summary.csv with individual document topic keywords
# - document_topic_analysis.json with aggregated topic analysis across all documents
# - Two-stage processing: initial extraction + response cleanup + aggregation
# - Suitable for any document collection that needs both detailed and summary analysis
#
# Benefits of this approach:
# - No vector store setup required
# - Processes complete documents for full context
# - Provides both individual document insights AND collection-wide topic analysis
# - JSON output enables programmatic access to aggregated insights
# - CSV output allows detailed filtering and analysis of individual documents