--- name: document-inventory description: Scan and catalog document collections with metadata extraction, categorization, and statistics. Use for auditing document libraries, preparing for knowledge base creation, or understanding large file collections. version: 1.1.0 last_updated: 2026-01-02 category: document-handling related_skills: - knowledge-base-builder - pdf-text-extractor - semantic-search-setup --- # Document Inventory Skill ## Overview This skill scans document collections (PDFs, Word docs, text files) and creates a structured inventory with metadata, automatic categorization, and collection statistics. Essential first step before building knowledge bases. ## Quick Start ```python from pathlib import Path import sqlite3 # Scan directory documents = [] for filepath in Path("/path/to/docs").rglob("*.pdf"): documents.append({ 'filename': filepath.name, 'size': filepath.stat().st_size, 'path': str(filepath) }) # Store in database conn = sqlite3.connect("inventory.db") cursor = conn.cursor() cursor.execute("CREATE TABLE IF NOT EXISTS docs (name TEXT, size INTEGER, path TEXT)") for doc in documents: cursor.execute("INSERT INTO docs VALUES (?, ?, ?)", (doc['filename'], doc['size'], doc['path'])) conn.commit() print(f"Inventoried {len(documents)} documents") ``` ## When to Use - Auditing large document libraries before processing - Understanding the scope of a document collection - Categorizing documents by type, source, or content - Preparing inventories for knowledge base creation - Generating reports on document collections - Identifying duplicates or organizing files ## Features - **Recursive scanning** - Process nested directories - **Metadata extraction** - Size, dates, page counts - **Auto-categorization** - Pattern-based classification - **Statistics generation** - Collection summaries - **SQLite storage** - Queryable inventory database - **Multiple formats** - PDF, DOCX, TXT, and more ## Implementation ### Core Inventory Builder ```python #!/usr/bin/env python3 """Document inventory builder.""" import sqlite3 import os from pathlib import Path from datetime import datetime import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class DocumentInventory: """Build and manage document inventories.""" SUPPORTED_EXTENSIONS = { '.pdf': 'PDF', '.docx': 'Word', '.doc': 'Word', '.txt': 'Text', '.md': 'Markdown', '.xlsx': 'Excel', '.xls': 'Excel', '.pptx': 'PowerPoint', '.ppt': 'PowerPoint', } def __init__(self, db_path): self.db_path = db_path self.conn = sqlite3.connect(db_path, timeout=30) self._setup_tables() def _setup_tables(self): cursor = self.conn.cursor() cursor.execute(''' CREATE TABLE IF NOT EXISTS documents ( id INTEGER PRIMARY KEY, filename TEXT NOT NULL, filepath TEXT UNIQUE NOT NULL, extension TEXT, file_type TEXT, category TEXT, file_size INTEGER, created_date TEXT, modified_date TEXT, parent_dir TEXT, depth INTEGER, scanned_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) ''') cursor.execute(''' CREATE INDEX IF NOT EXISTS idx_category ON documents(category) ''') cursor.execute(''' CREATE INDEX IF NOT EXISTS idx_extension ON documents(extension) ''') self.conn.commit() def scan_directory(self, root_path): """Scan directory and build inventory.""" root = Path(root_path).resolve() logger.info(f"Scanning: {root}") count = 0 for filepath in root.rglob('*'): if filepath.is_file(): ext = filepath.suffix.lower() if ext in self.SUPPORTED_EXTENSIONS: self._add_document(filepath, root) count += 1 if count % 500 == 0: logger.info(f"Scanned {count} documents...") self.conn.commit() self.conn.commit() logger.info(f"Scan complete: {count} documents found") return count def _add_document(self, filepath, root): """Add document to inventory.""" cursor = self.conn.cursor() try: stat = filepath.stat() ext = filepath.suffix.lower() cursor.execute(''' INSERT OR REPLACE INTO documents (filename, filepath, extension, file_type, category, file_size, created_date, modified_date, parent_dir, depth) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ''', ( filepath.name, str(filepath), ext, self.SUPPORTED_EXTENSIONS.get(ext, 'Unknown'), self._categorize(filepath), stat.st_size, datetime.fromtimestamp(stat.st_ctime).isoformat(), datetime.fromtimestamp(stat.st_mtime).isoformat(), str(filepath.parent), len(filepath.relative_to(root).parts) - 1 )) except Exception as e: logger.warning(f"Error adding {filepath}: {e}") def _categorize(self, filepath): """Auto-categorize document based on patterns.""" name = filepath.name.upper() path_str = str(filepath).upper() # Industry standard patterns patterns = { 'API': 'API', 'ISO': 'ISO', 'ASME': 'ASME', 'DNV': 'DNV', 'NORSOK': 'NORSOK', 'BSI': 'BSI', 'ASTM': 'ASTM', 'AWS': 'AWS', 'ABS': 'ABS', 'AISC': 'AISC', 'IEEE': 'IEEE', } for pattern, category in patterns.items(): if pattern in name or pattern in path_str: return category # Path-based categorization path_categories = { 'STANDARD': 'Standards', 'SPEC': 'Specifications', 'MANUAL': 'Manuals', 'GUIDE': 'Guides', 'REPORT': 'Reports', 'DRAWING': 'Drawings', 'PROCEDURE': 'Procedures', } for pattern, category in path_categories.items(): if pattern in path_str: return category return 'Unknown' def get_statistics(self): """Get inventory statistics.""" cursor = self.conn.cursor() stats = {} # Total count cursor.execute('SELECT COUNT(*) FROM documents') stats['total_documents'] = cursor.fetchone()[0] # Total size cursor.execute('SELECT SUM(file_size) FROM documents') total_bytes = cursor.fetchone()[0] or 0 stats['total_size_mb'] = round(total_bytes / (1024 * 1024), 2) # By file type cursor.execute(''' SELECT file_type, COUNT(*), SUM(file_size) FROM documents GROUP BY file_type ORDER BY COUNT(*) DESC ''') stats['by_type'] = { row[0]: {'count': row[1], 'size_mb': round((row[2] or 0) / 1024 / 1024, 2)} for row in cursor.fetchall() } # By category cursor.execute(''' SELECT category, COUNT(*) FROM documents GROUP BY category ORDER BY COUNT(*) DESC ''') stats['by_category'] = dict(cursor.fetchall()) # By extension cursor.execute(''' SELECT extension, COUNT(*) FROM documents GROUP BY extension ORDER BY COUNT(*) DESC ''') stats['by_extension'] = dict(cursor.fetchall()) return stats def search(self, query, category=None, file_type=None, limit=50): """Search inventory.""" cursor = self.conn.cursor() sql = 'SELECT filename, filepath, category, file_size FROM documents WHERE 1=1' params = [] if query: sql += ' AND filename LIKE ?' params.append(f'%{query}%') if category: sql += ' AND category = ?' params.append(category) if file_type: sql += ' AND file_type = ?' params.append(file_type) sql += ' ORDER BY filename LIMIT ?' params.append(limit) cursor.execute(sql, params) return cursor.fetchall() def export_csv(self, output_path): """Export inventory to CSV.""" import csv cursor = self.conn.cursor() cursor.execute('SELECT * FROM documents') columns = [desc[0] for desc in cursor.description] with open(output_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(columns) writer.writerows(cursor.fetchall()) logger.info(f"Exported to {output_path}") ``` ### CLI Interface ```python #!/usr/bin/env python3 """Document Inventory CLI.""" import argparse import json def main(): parser = argparse.ArgumentParser(description='Document Inventory Tool') subparsers = parser.add_subparsers(dest='command', help='Commands') # Scan command scan_parser = subparsers.add_parser('scan', help='Scan directory') scan_parser.add_argument('path', help='Directory to scan') scan_parser.add_argument('--db', default='inventory.db', help='Database path') # Stats command stats_parser = subparsers.add_parser('stats', help='Show statistics') stats_parser.add_argument('--db', default='inventory.db', help='Database path') stats_parser.add_argument('--json', action='store_true', help='Output as JSON') # Search command search_parser = subparsers.add_parser('search', help='Search inventory') search_parser.add_argument('query', help='Search query') search_parser.add_argument('--db', default='inventory.db', help='Database path') search_parser.add_argument('--category', help='Filter by category') search_parser.add_argument('--type', help='Filter by file type') # Export command export_parser = subparsers.add_parser('export', help='Export to CSV') export_parser.add_argument('output', help='Output CSV path') export_parser.add_argument('--db', default='inventory.db', help='Database path') args = parser.parse_args() if args.command == 'scan': inventory = DocumentInventory(args.db) count = inventory.scan_directory(args.path) print(f"\nScanned {count} documents") stats = inventory.get_statistics() print(f"Total size: {stats['total_size_mb']} MB") print(f"\nBy category:") for cat, count in list(stats['by_category'].items())[:10]: print(f" {cat}: {count}") elif args.command == 'stats': inventory = DocumentInventory(args.db) stats = inventory.get_statistics() if args.json: print(json.dumps(stats, indent=2)) else: print(f"Total Documents: {stats['total_documents']}") print(f"Total Size: {stats['total_size_mb']} MB") print(f"\nBy Type:") for t, data in stats['by_type'].items(): print(f" {t}: {data['count']} ({data['size_mb']} MB)") print(f"\nBy Category:") for cat, count in list(stats['by_category'].items())[:15]: print(f" {cat}: {count}") elif args.command == 'search': inventory = DocumentInventory(args.db) results = inventory.search( args.query, category=args.category, file_type=args.type ) print(f"Found {len(results)} results:\n") for filename, filepath, category, size in results: size_kb = size / 1024 print(f" [{category:10}] {filename} ({size_kb:.1f} KB)") elif args.command == 'export': inventory = DocumentInventory(args.db) inventory.export_csv(args.output) else: parser.print_help() if __name__ == '__main__': main() ``` ### Report Generator ```python def generate_report(db_path, output_path): """Generate HTML inventory report.""" inventory = DocumentInventory(db_path) stats = inventory.get_statistics() html = f"""
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
| Type | Count | Size (MB) |
|---|---|---|
| {t} | {d['count']} | {d['size_mb']} |
| Category | Count |
|---|---|
| {c} | {n} |