#!/usr/bin/env python3
"""Extract a specific page from PDF as image and run OCR to get text."""

import argparse
import subprocess
import sys

def _run_ocr(image_path, text_base, lang='eng', timeout=120):
    """Run tesseract on *image_path* and print a preview of the recognized text.

    tesseract is invoked as ``tesseract <image> <text_base> -l <lang>`` and the
    result is read back from ``<text_base>.txt``. OCR is best-effort: a missing
    tesseract binary, a timeout, a non-zero exit status or an unreadable output
    file is reported on stdout and the function returns without raising.

    Args:
        image_path: Path of the image file to recognize.
        text_base: Output path passed to tesseract, without the ``.txt`` suffix.
        lang: Language code passed to tesseract's ``-l`` option.
        timeout: Maximum number of seconds to wait for tesseract.
    """
    # Keep the try body to the subprocess call only, so that a missing output
    # file later on is not mistaken for a missing tesseract binary.
    try:
        result = subprocess.run(
            ['tesseract', image_path, text_base, '-l', lang],
            capture_output=True, text=True, timeout=timeout
        )
    except FileNotFoundError:
        print("Warning: tesseract not installed. Install with: brew install tesseract")
        print("Skipping OCR step.")
        return
    except subprocess.TimeoutExpired:
        print(f"OCR failed: tesseract timed out after {timeout} seconds")
        return

    if result.returncode != 0:
        print(f"OCR failed: {result.stderr}")
        return

    text_file = f"{text_base}.txt"
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # encoding; undecodable bytes are replaced rather than aborting the preview.
        with open(text_file, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()
    except OSError as exc:
        print(f"OCR failed: could not read {text_file}: {exc}")
        return

    print(f"OCR text saved: {text_file}")
    print(f"\n=== OCR Preview (first 500 chars) ===\n{text[:500]}")


def main():
    """Render one page of a PDF to a PNG file and run tesseract OCR on it.

    Command-line arguments:
        pdf_path: Path to the PDF file.
        page_number: 1-indexed page to extract.
        --output-dir / -o: Directory for the PNG and text output (created if missing).
        --resolution / -r: Rendering resolution in DPI (must be positive).

    Exits with status 2 (via argparse) when the resolution is not positive or
    the PDF path is not an existing file, and with status 1 when the requested
    page does not exist. OCR problems are reported but do not change the exit
    status.
    """
    import os

    parser = argparse.ArgumentParser(description='Extract PDF page as image and run OCR')
    parser.add_argument('pdf_path', help='Path to the PDF file')
    parser.add_argument('page_number', type=int, help='Page number to extract (1-indexed)')
    parser.add_argument('--output-dir', '-o', default='.', help='Output directory for images and text')
    parser.add_argument('--resolution', '-r', type=int, default=200, help='Image resolution (DPI)')
    args = parser.parse_args()

    # Reject unusable input before the dependency bootstrap below, so the user
    # gets a usage error instead of a traceback from deep inside a library.
    if args.resolution <= 0:
        parser.error(f"--resolution must be a positive integer, got {args.resolution}")
    if not os.path.isfile(args.pdf_path):
        parser.error(f"PDF file not found: {args.pdf_path}")

    # Third-party dependencies are installed on demand with pip when missing.
    try:
        import pdfplumber
    except ImportError:
        print("Installing pdfplumber...")
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'pdfplumber', '-q'], check=True)
        import pdfplumber

    # Image is not used directly here; the import presumably only verifies that
    # Pillow is available for page rendering -- TODO confirm.
    try:
        from PIL import Image  # noqa: F401
    except ImportError:
        print("Installing pillow...")
        subprocess.run([sys.executable, '-m', 'pip', 'install', 'pillow', '-q'], check=True)
        from PIL import Image  # noqa: F401

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    page_idx = args.page_number - 1  # Convert to 0-indexed

    with pdfplumber.open(args.pdf_path) as pdf:
        total_pages = len(pdf.pages)
        if not 0 <= page_idx < total_pages:
            print(f"Error: Page {args.page_number} does not exist. PDF has {total_pages} pages.")
            sys.exit(1)

        page = pdf.pages[page_idx]
        print(f"Page {args.page_number} dimensions: {page.width} x {page.height}")
        print(f"Total pages in PDF: {total_pages}")

        # Save page as image
        pil_image = page.to_image(resolution=args.resolution).original
        image_path = os.path.join(output_dir, f"page{args.page_number}.png")
        pil_image.save(image_path, "PNG")
        print(f"Image saved: {image_path}")

    # The PNG is already on disk, so OCR runs after the PDF has been closed.
    text_base = os.path.join(output_dir, f"page{args.page_number}_text")
    _run_ocr(image_path, text_base)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
