# pylint: disable=line-too-long,useless-suppression
# mypy: disable-error-code="attr-defined"
# coding=utf-8
# --------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------
"""
FILE: sample_analyze_invoice.py

DESCRIPTION:
    This sample demonstrates how to analyze an invoice from a URL using the prebuilt-invoice analyzer
    and extract structured fields from the result.

    ## About analyzing invoices

    Content Understanding provides a rich set of prebuilt analyzers that are ready to use without any
    configuration. These analyzers are powered by knowledge bases of thousands of real-world document
    examples, enabling them to understand document structure and adapt to variations in format and
    content.

    Prebuilt analyzers are ideal for:
    - Content ingestion in search and retrieval-augmented generation (RAG) workflows
    - Intelligent document processing (IDP) to extract structured data from common document types
    - Agentic flows as tools for extracting structured representations from input files

    ### The prebuilt-invoice analyzer

    The prebuilt-invoice analyzer is a domain-specific analyzer optimized for processing invoices,
    utility bills, sales orders, and purchase orders. It automatically extracts structured fields
    including:

    - Customer/Vendor information: Name, address, contact details
    - Invoice metadata: Invoice number, date, due date, purchase order number
    - Line items: Description, quantity, unit price, total for each item
    - Financial totals: Subtotal, tax amount, shipping charges, total amount
    - Payment information: Payment terms, payment method, remittance address

    The analyzer works out of the box with various invoice formats and requires no configuration.
    It's part of the financial documents category of prebuilt analyzers, which also includes:
    - prebuilt-receipt - Sales receipts from retail and dining establishments
    - prebuilt-creditCard - Credit card statements
    - prebuilt-bankStatement.us - US bank statements
    - prebuilt-check.us - US bank checks
    - prebuilt-creditMemo - Credit memos and refund documents

USAGE:
    python sample_analyze_invoice.py

    Set the environment variables with your own values before running the sample:
    1) CONTENTUNDERSTANDING_ENDPOINT - the endpoint to your Content Understanding resource.
    2) CONTENTUNDERSTANDING_KEY - your Content Understanding API key (optional if using DefaultAzureCredential).

    Before using prebuilt analyzers, you MUST configure model deployments for your Microsoft Foundry
    resource. See sample_update_defaults.py for setup instructions.
"""

import os
from typing import cast

from dotenv import load_dotenv
from azure.ai.contentunderstanding import ContentUnderstandingClient, to_llm_input
from azure.ai.contentunderstanding.models import (
    AnalysisInput,
    AnalysisResult,
    DocumentContent,
    ContentField,
    ArrayField,
    ObjectField,
)
from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential

# Load settings from a local .env file (if one exists) into the environment
# before main() reads the CONTENTUNDERSTANDING_* variables.
load_dotenv()


def main() -> None:
    """Analyze a sample invoice with the prebuilt-invoice analyzer and print the results.

    The sample submits an invoice URL for analysis, waits for the result, and then prints:
    document/page information, a few extracted fields (CustomerName, InvoiceDate,
    TotalAmount, LineItems) with their confidence and source, the usage details
    reported by the poller, and finally the LLM-ready text produced by ``to_llm_input``.

    Authentication uses ``AzureKeyCredential`` when CONTENTUNDERSTANDING_KEY is set,
    otherwise ``DefaultAzureCredential``.

    :raises KeyError: If the CONTENTUNDERSTANDING_ENDPOINT environment variable is not set.
    """
    endpoint = os.environ["CONTENTUNDERSTANDING_ENDPOINT"]
    key = os.getenv("CONTENTUNDERSTANDING_KEY")
    credential = AzureKeyCredential(key) if key else DefaultAzureCredential()

    client = ContentUnderstandingClient(endpoint=endpoint, credential=credential)

    # [START analyze_invoice]
    # You can replace this URL with your own invoice file URL
    invoice_url = "https://raw.githubusercontent.com/Azure-Samples/azure-ai-content-understanding-assets/main/document/invoice.pdf"

    print("Analyzing invoice with prebuilt-invoice analyzer...")
    print(f"  URL: {invoice_url}\n")

    poller = client.begin_analyze(
        analyzer_id="prebuilt-invoice",
        inputs=[AnalysisInput(url=invoice_url)],
    )
    result: AnalysisResult = poller.result()
    # [END analyze_invoice]

    # [START extract_invoice_fields]
    if not result.contents:
        print("No content found in the analysis result.")
        return

    # Get the document content (invoices are documents)
    document_content = cast(DocumentContent, result.contents[0])

    # Print document unit information
    # The unit indicates the measurement system used for coordinates in the source field
    print(f"Document unit: {document_content.unit or 'unknown'}")
    print(
        f"Pages: {document_content.start_page_number} to {document_content.end_page_number}"
    )

    # Print page dimensions if available
    if document_content.pages:
        page = document_content.pages[0]
        unit = document_content.unit or "units"
        print(f"Page dimensions: {page.width} x {page.height} {unit}")
    print()

    if not document_content.fields:
        print("No fields found in the analysis result.")
        return

    # Extract simple string fields
    customer_name_field = document_content.fields.get("CustomerName")
    print(
        f"Customer Name: {customer_name_field.value or '(None)' if customer_name_field else '(None)'}"
    )
    if customer_name_field:
        # Compare against None explicitly: a confidence of 0.0 is a real value,
        # not a missing one, and must not be reported as "N/A".
        customer_name_confidence = customer_name_field.confidence
        print(
            f"  Confidence: {customer_name_confidence:.2f}"
            if customer_name_confidence is not None
            else "  Confidence: N/A"
        )
        print(f"  Source: {customer_name_field.source or 'N/A'}")
        if customer_name_field.spans:
            span = customer_name_field.spans[0]
            print(f"  Position in markdown: offset={span.offset}, length={span.length}")

    # Extract simple date field
    invoice_date_field = document_content.fields.get("InvoiceDate")
    print(
        f"Invoice Date: {invoice_date_field.value or '(None)' if invoice_date_field else '(None)'}"
    )
    if invoice_date_field:
        invoice_date_confidence = invoice_date_field.confidence
        print(
            f"  Confidence: {invoice_date_confidence:.2f}"
            if invoice_date_confidence is not None
            else "  Confidence: N/A"
        )
        print(f"  Source: {invoice_date_field.source or 'N/A'}")
        if invoice_date_field.spans:
            span = invoice_date_field.spans[0]
            print(f"  Position in markdown: offset={span.offset}, length={span.length}")

    # Extract object fields (nested structures)
    total_amount_field = document_content.fields.get("TotalAmount")
    if isinstance(total_amount_field, ObjectField) and total_amount_field.value:
        amount_field = total_amount_field.value.get("Amount")
        currency_field = total_amount_field.value.get("CurrencyCode")
        amount = amount_field.value if amount_field else None
        # Use currency value if present, otherwise default to ""
        currency = (
            currency_field.value if currency_field and currency_field.value else ""
        )
        if isinstance(amount, (int, float)):
            print(f"\nTotal: {currency}{amount:.2f}")
        else:
            print(f"\nTotal: {currency}{amount or '(None)'}")
        amount_confidence = amount_field.confidence if amount_field else None
        print(
            f"  Amount Confidence: {amount_confidence:.2f}"
            if amount_confidence is not None
            else "  Amount Confidence: N/A"
        )
        # Keep the label identical whether or not the Amount sub-field exists.
        print(
            f"  Source for Amount: {amount_field.source or 'N/A'}"
            if amount_field
            else "  Source for Amount: N/A"
        )

    # Extract array fields (collections like line items)
    line_items_field = document_content.fields.get("LineItems")
    if isinstance(line_items_field, ArrayField) and line_items_field.value:
        print(f"\nLine Items ({len(line_items_field.value)}):")
        for i, item in enumerate(line_items_field.value, 1):
            if isinstance(item, ObjectField) and item.value:
                description_field = item.value.get("Description")
                quantity_field = item.value.get("Quantity")
                description = (
                    description_field.value
                    if description_field and description_field.value
                    else "N/A"
                )
                # A quantity of 0 is a valid extracted value; only a missing
                # field or a missing value is shown as "N/A".
                quantity = (
                    quantity_field.value
                    if quantity_field and quantity_field.value is not None
                    else "N/A"
                )
                quantity_confidence = (
                    quantity_field.confidence if quantity_field else None
                )
                print(f"  Item {i}: {description}")
                print(f"    Quantity: {quantity}")
                print(
                    f"    Quantity Confidence: {quantity_confidence:.2f}"
                    if quantity_confidence is not None
                    else "    Quantity Confidence: N/A"
                )
    # [END extract_invoice_fields]

    # [START get_usage]
    # Access usage details from the poller (available after result() completes).
    # Usage reports resource consumption for billing estimation:
    #
    # - document_pages_standard/basic/minimal: Pages processed at each extraction tier.
    #   Standard = layout + OCR (scanned docs), Basic = OCR only, Minimal = digital formats
    #   (DOCX, XLSX, HTML, TXT) that need no OCR. Charged per 1,000 pages.
    #
    # - contextualization_tokens: Fixed-rate tokens charged by Content Understanding for
    #   preparing context, generating confidence scores, source grounding, and formatting
    #   output. Typically 1,000 tokens per page. Charged separately from LLM tokens.
    #
    # - tokens: Dict of "{model}-input" / "{model}-output" token counts consumed by your
    #   Foundry model deployment (e.g. "gpt-4.1-input", "gpt-4.1-output"). These are
    #   billed on your Foundry deployment, not on Content Understanding.
    #
    # For full pricing details, see:
    # https://learn.microsoft.com/azure/ai-services/content-understanding/pricing-explainer
    usage = poller.usage
    if usage:
        print("\nUsage Details:")
        if usage.document_pages_standard is not None:
            print(f"  Document pages (standard): {usage.document_pages_standard}")
        if usage.contextualization_tokens is not None:
            print(f"  Contextualization tokens: {usage.contextualization_tokens}")
        if usage.tokens:
            print("  Model tokens:")
            for model, count in usage.tokens.items():
                print(f"    {model}: {count}")
    # [END get_usage]

    # [START invoice_to_llm_input]
    # The fields above can also be packaged into a single LLM-ready text block.
    # to_llm_input() renders all extracted fields as YAML front matter followed by
    # the markdown body, so an LLM can consume both structured data and document text
    # in one shot. For advanced options, see sample_to_llm_input.py.
    print("\n" + "=" * 60)
    print("LLM-READY OUTPUT (fields + markdown)")
    print("=" * 60)

    text = to_llm_input(result)
    print(text)
    # [END invoice_to_llm_input]


# Run the sample only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()