# Example 1: Ingest PDF Documents into Vector Store
# This workflow loads PDF files, chunks them, and stores them in a vector database for later retrieval

nodes:
  # Load PDF documents from a directory
  pdf_loader:
    type: LoadFromFolder
    config:
      source_directory: "../sample_data"  # Directory containing PDF files
      include_patterns: ["*.pdf"]         # Only process PDF files
      pdf_markdown: true                  # Convert PDFs to markdown for better chunking
      verbose: true                       # Show progress
      store_md5: true                     # Store document hashes for deduplication
      store_file_dates: true              # Include file dates in metadata

  # Chunk documents for optimal retrieval
  document_chunker:
    type: SplitByCharacterCount
    config:
      chunk_size: 800            # ~800 chars per chunk (good for most use cases)
      chunk_overlap: 80          # 10% overlap to maintain context
      preserve_paragraphs: true  # Try to keep paragraphs intact

  # Store in ChromaDB vector database for semantic search
  vector_store:
    type: ChromaStore
    config:
      persist_location: "document_vectors"  # Directory to store the database
      collection_name: "pdf_documents"      # Collection name in ChromaDB

connections:
  # Pipeline: Load PDFs → Chunk → Store in Vector DB
  - from: pdf_loader
    from_port: documents
    to: document_chunker
    to_port: documents
  - from: document_chunker
    from_port: documents
    to: vector_store
    to_port: documents

# Usage:
#   python -m onprem.workflow 1_ingest_pdfs.yaml
#
# This creates a "document_vectors/" directory containing your vector database.
# You can then use Example 2 to query and analyze these documents.
#
# Expected output:
#   - Creates document_vectors/chroma.sqlite3 and related files
#   - Processes all PDFs in ../sample_data/
#   - Status message showing the number of documents stored
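
# Verification (optional): a minimal Python sketch for inspecting the ingested
# data after the workflow runs. This assumes ChromaStore writes a standard
# ChromaDB directory at persist_location, so the stock chromadb client can
# open it directly; the collection name "pdf_documents" comes from the
# vector_store config above.
#
#   import chromadb
#
#   # Open the persisted database created by this workflow (assumed layout)
#   client = chromadb.PersistentClient(path="document_vectors")
#   collection = client.get_collection("pdf_documents")
#
#   # Count stored chunks and peek at one record's text and metadata
#   print(f"{collection.count()} chunks stored")
#   sample = collection.peek(limit=1)
#   print(sample["documents"][0][:200])  # first 200 characters of a chunk
#   print(sample["metadatas"][0])        # md5 and file dates, if stored above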