#!/usr/bin/env python3
"""
Receipt parser workbench for curlys-books.

Obtains the receipt text in one of two ways:
- `--file`: OCR/text extraction via `packages.parsers.ocr.extract_text_from_receipt`, or
- `--ocr-text`: deterministic parsing directly from an existing OCR text file (OCR is skipped),
then dispatches the text through `packages.invoice_parsers.vendor_dispatcher`.
"""

from __future__ import annotations

import argparse
import asyncio
import json
import sys
from pathlib import Path


def _find_repo_root(start: Path) -> Path | None:
    current = start.resolve()
    for parent in [current, *current.parents]:
        if (parent / "pyproject.toml").exists() and (parent / "packages").exists():
            return parent
    return None


def _parse_args() -> argparse.Namespace:
    """Build the workbench command-line parser and parse ``sys.argv``.

    Exactly one of ``--file`` or ``--ocr-text`` must be given; the remaining
    flags are optional.

    Returns:
        The parsed namespace with attributes ``file``, ``ocr_text``, ``entity``,
        ``force_parser``, ``print_ocr`` and ``json``.
    """
    cli = argparse.ArgumentParser(description="Run OCR + vendor parsing for a receipt/invoice.")

    # The two input sources are mutually exclusive and one of them is mandatory.
    input_group = cli.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--file", help="Path to a receipt file (PDF/image/html/txt).", type=Path)
    input_group.add_argument("--ocr-text", help="Path to an *_ocr.txt fixture (skip OCR).", type=Path)

    # Optional flags, registered in the order they should appear in --help.
    option_specs = (
        (
            "--entity",
            {"choices": ["corp", "soleprop"], "default": "corp", "help": "Entity type (default: corp)."},
        ),
        (
            "--force-parser",
            {
                "type": str,
                "default": None,
                "help": "Force a specific parser name (e.g., claude_vision, CostcoParser).",
            },
        ),
        (
            "--print-ocr",
            {"action": "store_true", "help": "Print extracted OCR text to stdout (after extraction/read)."},
        ),
        (
            "--json",
            {"action": "store_true", "help": "Emit JSON payload (OCR + parsed receipt)."},
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


# File suffixes (lower-cased) that are forwarded to the parser as `image_path`.
_IMAGE_SUFFIXES = frozenset({".jpg", ".jpeg", ".png", ".heic", ".heif", ".tiff", ".tif"})


def _print_summary(detected: object, ocr_meta: dict, receipt_dict: dict) -> None:
    """Print the human-readable workbench summary to stdout."""
    print("=== Workbench Summary ===")
    print(f"Detected parser: {detected}")
    print(f"OCR method:      {ocr_meta.get('method')}")
    if ocr_meta.get("confidence") is not None:
        print(f"OCR confidence:  {float(ocr_meta['confidence']):.2f}")
    print(f"Vendor guess:    {receipt_dict.get('vendor_guess')}")
    print(f"Purchase date:   {receipt_dict.get('purchase_date')}")
    print(f"Invoice #:       {receipt_dict.get('invoice_number')}")
    print(f"Subtotal:        {receipt_dict.get('subtotal')}")
    print(f"Tax total:       {receipt_dict.get('tax_total')}")
    print(f"Total:           {receipt_dict.get('total')}")
    print(f"Line count:      {len(receipt_dict.get('lines') or [])}")
    warnings = receipt_dict.get("validation_warnings") or []
    if warnings:
        print(f"Warnings:        {len(warnings)}")


def main() -> int:
    """Run the workbench CLI and return a process exit status.

    Parses the command line, locates the repository root by searching upward
    from the current working directory (so that ``packages.*`` is importable),
    obtains the receipt text either from an OCR text fixture or by running
    extraction on the input file, passes it to ``parse_receipt``, and prints
    either a JSON payload or a short text summary.

    Returns:
        ``0`` on success; ``2`` when the repository root cannot be located or
        the requested input file is missing or unreadable.
    """
    args = _parse_args()

    repo_root = _find_repo_root(Path.cwd())
    if not repo_root:
        print("Error: run from the repository root (missing pyproject.toml/packages).", file=sys.stderr)
        return 2

    # Project packages are imported lazily, only once the repo root is on sys.path.
    sys.path.insert(0, str(repo_root))

    from packages.common.schemas.invoice_schema import EntityType  # noqa: WPS433
    from packages.invoice_parsers.vendor_dispatcher import dispatcher, parse_receipt  # noqa: WPS433

    entity_type = EntityType.SOLEPROP if args.entity == "soleprop" else EntityType.CORP

    # Only PDFs and known image types are forwarded to the parser as paths;
    # any other input (or fixture text) is parsed from the OCR text alone.
    file_path: Path | None = args.file.resolve() if args.file else None
    suffix = file_path.suffix.lower() if file_path else ""
    pdf_path = str(file_path) if suffix == ".pdf" else None
    image_path = str(file_path) if suffix in _IMAGE_SUFFIXES else None

    if args.ocr_text:
        # Report an unreadable fixture the same way as a missing --file input
        # (stderr message + exit status 2) instead of crashing with a traceback.
        try:
            ocr_text = args.ocr_text.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print(f"Error: cannot read OCR text file {args.ocr_text}: {exc}", file=sys.stderr)
            return 2
        ocr_meta = {"method": "fixture_text", "confidence": None, "page_count": None}
    else:
        # is_file() rather than exists(): a directory is not a usable receipt input.
        if not file_path or not file_path.is_file():
            print(f"Error: file not found: {file_path}", file=sys.stderr)
            return 2

        from packages.parsers.ocr import extract_text_from_receipt  # noqa: WPS433

        ocr_result = asyncio.run(extract_text_from_receipt(file_path))
        ocr_text = ocr_result.text
        ocr_meta = {
            "method": ocr_result.method,
            "confidence": ocr_result.confidence,
            "page_count": ocr_result.page_count,
            "bounding_boxes": len(ocr_result.bounding_boxes or []),
        }

    # The raw OCR dump is suppressed in JSON mode — presumably so stdout stays
    # a single parseable JSON document.
    if args.print_ocr and not args.json:
        print("\n--- OCR TEXT START ---\n")
        print(ocr_text)
        print("\n--- OCR TEXT END ---\n")

    # Vendor detection is reported for information only; parse_receipt is
    # called with the same text and any --force-parser override regardless.
    detected = dispatcher.detect_vendor(ocr_text) if ocr_text else None

    receipt = parse_receipt(
        ocr_text=ocr_text,
        entity=entity_type,
        pdf_path=pdf_path,
        force_parser_name=args.force_parser,
        image_path=image_path,
    )

    # Prefer model_dump() when the receipt object provides it; otherwise fall
    # back to the instance attribute dict.
    receipt_dict = receipt.model_dump() if hasattr(receipt, "model_dump") else receipt.__dict__

    if args.json:
        payload = {
            "input": {
                "file": str(file_path) if file_path else None,
                "ocr_text": str(args.ocr_text) if args.ocr_text else None,
                "entity": args.entity,
                "force_parser": args.force_parser,
            },
            "ocr": ocr_meta,
            "detected_parser": detected,
            "receipt": receipt_dict,
        }
        # default=str stringifies any value json cannot serialize natively.
        print(json.dumps(payload, indent=2, default=str))
        return 0

    _print_summary(detected, ocr_meta, receipt_dict)
    return 0


if __name__ == "__main__":
    # Script entry point: hand main()'s integer status to the interpreter as
    # the process exit code.
    sys.exit(main())

