#!/usr/bin/env python
import argparse
import json
import os
from typing import Any

import cv2
import numpy as np
import pypdfium2 as pdfium
import pytesseract
from PIL import Image
from pillow_heif import register_heif_opener


def sanitize_text(value: str) -> str:
    """Return *value* as text that is safe to encode as UTF-8.

    The string is round-tripped through UTF-8 with ``errors="replace"``, so
    characters that cannot be encoded (such as lone surrogates) are
    substituted instead of raising.

    Args:
        value: Candidate text. Anything that is not a ``str`` is rejected.

    Returns:
        The cleaned string, or ``""`` when *value* is not a ``str``.
    """
    if isinstance(value, str):
        encoded = value.encode("utf-8", errors="replace")
        return encoded.decode("utf-8")
    return ""


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the OCR script.

    Returns:
        A namespace with ``input`` (required), ``lang`` (default ``"es"``)
        and ``tesseract`` (default ``"tesseract"``).
    """
    cli = argparse.ArgumentParser(description="Local OCR service for Laravel")

    # Flag name paired with the keyword arguments given to add_argument().
    option_specs = (
        ("--input", {"required": True, "help": "Absolute path to input file"}),
        ("--lang", {"default": "es", "help": "OCR language (es, en, etc.)"}),
        ("--tesseract", {"default": "tesseract", "help": "Path to tesseract binary"}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def load_images(file_path: str) -> list[Image.Image]:
    """Load the input file as a list of PIL images, one per page.

    A path ending in ``.pdf`` (case-insensitive) is rendered page by page
    with pypdfium2 at ``scale=2``. Any other file is opened with Pillow and
    returned as a single RGB image.

    Args:
        file_path: Path of the document or image to load.

    Returns:
        The page images in document order; a one-element list for non-PDF
        input.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        pdf = pdfium.PdfDocument(file_path)
        try:
            images: list[Image.Image] = []
            for index in range(len(pdf)):
                page = pdf[index]
                try:
                    bitmap = page.render(scale=2)
                    images.append(bitmap.to_pil())
                finally:
                    # Release each page before its parent document is closed.
                    page.close()
            return images
        finally:
            # Free the native document handle instead of waiting for GC.
            pdf.close()

    # convert() loads the pixel data and returns a separate image, so the
    # underlying file handle can be closed as soon as the block exits.
    with Image.open(file_path) as img:
        return [img.convert("RGB")]


def preprocess_image(image: Image.Image) -> np.ndarray:
    """Binarize and deskew a page image before OCR.

    The image is converted to grayscale, denoised, and thresholded with
    Otsu's method. The skew is then estimated from the minimum-area
    rectangle around all non-white pixels and the binary image is rotated
    to compensate.

    Args:
        image: Page image; it is converted to RGB internally.

    Returns:
        The thresholded, deskewed image as a 2-D array.
    """
    array_img = np.array(image.convert("RGB"))
    gray = cv2.cvtColor(array_img, cv2.COLOR_RGB2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, h=18)
    _, thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Coordinates of every pixel that is not pure white after thresholding.
    coords = np.column_stack(np.where(thresh < 255))
    if len(coords) > 0:
        rect_angle = cv2.minAreaRect(coords)[-1]
        # The angle range reported by minAreaRect depends on the OpenCV
        # version: [-90, 0) before 4.5.1, (0, 90] afterwards. A rectangle's
        # orientation is only meaningful modulo 90 degrees, so fold the value
        # into [-45, 45). This reproduces the legacy result on the old range
        # and keeps an upright page (reported as 90 by newer builds) from
        # being rotated onto its side.
        skew = rect_angle % 90.0
        if skew >= 45.0:
            skew -= 90.0
        angle = -skew

        (h, w) = thresh.shape[:2]
        matrix = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
        thresh = cv2.warpAffine(thresh, matrix, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    return thresh


def paddle_extract(processed_images: list[np.ndarray], lang: str) -> tuple[list[dict[str, Any]], float]:
    """Run PaddleOCR over the preprocessed pages.

    Args:
        processed_images: One image array per page, in page order.
        lang: Language code passed to ``PaddleOCR(lang=...)``.

    Returns:
        A ``(blocks, avg_confidence)`` tuple. Each block is a dict with
        ``page`` (1-based), ``text``, ``confidence`` (rounded to 4 decimals)
        and ``bbox`` (as returned by PaddleOCR). ``avg_confidence`` is the
        mean over all kept blocks, or ``0.0`` when there are none.

    Raises:
        RuntimeError: If the ``paddleocr`` package cannot be imported.
    """
    # Imported lazily so the script still works (via the Tesseract path)
    # when PaddleOCR is not installed.
    try:
        from paddleocr import PaddleOCR
    except ImportError as exc:
        raise RuntimeError(f'PaddleOCR no disponible: {exc}') from exc

    ocr = PaddleOCR(use_angle_cls=True, lang=lang, show_log=False)
    blocks: list[dict[str, Any]] = []
    confidences: list[float] = []

    for page_number, image in enumerate(processed_images, start=1):
        raw_result = ocr.ocr(image, cls=True)
        # A page with no detections may be reported as None (e.g. [None]);
        # treat it as an empty page so the remaining pages are not lost.
        page_lines = (raw_result[0] if raw_result else None) or []
        for line in page_lines:
            if not line or len(line) < 2:
                continue
            bbox, recognition = line[0], line[1]
            if not isinstance(recognition, (list, tuple)) or not recognition:
                continue

            # Strip before the emptiness check so whitespace-only detections
            # are dropped, matching tesseract_extract.
            text = sanitize_text(recognition[0]).strip()
            if not text:
                continue
            confidence = float(recognition[1]) if len(recognition) >= 2 else 0.0

            confidences.append(confidence)
            blocks.append(
                {
                    "page": page_number,
                    "text": text,
                    "confidence": round(confidence, 4),
                    "bbox": bbox,
                }
            )

    avg_confidence = float(np.mean(confidences)) if confidences else 0.0
    return blocks, avg_confidence


# Two-letter / PaddleOCR-style language names mapped to the traineddata codes
# Tesseract expects. Codes not listed here are passed through unchanged.
_TESSERACT_LANG_CODES: dict[str, str] = {
    "es": "spa",
    "en": "eng",
    "fr": "fra",
    "french": "fra",
    "de": "deu",
    "german": "deu",
    "it": "ita",
    "pt": "por",
    "ca": "cat",
    "nl": "nld",
    "ru": "rus",
    "ch": "chi_sim",
    "japan": "jpn",
    "korean": "kor",
}


def _to_tesseract_lang(lang: str) -> str:
    """Translate each "+"-separated language code to Tesseract's naming."""
    if not lang:
        return lang
    parts = [part.strip() for part in str(lang).split("+") if part.strip()]
    return "+".join(_TESSERACT_LANG_CODES.get(part.lower(), part) for part in parts)


def tesseract_extract(processed_images: list[np.ndarray], lang: str) -> tuple[list[dict[str, Any]], float]:
    """Run Tesseract over the preprocessed pages and collect word-level blocks.

    Args:
        processed_images: One image array per page, in page order.
        lang: Language code. Short codes such as ``"es"`` or ``"en"`` are
            translated to Tesseract names (``"spa"``, ``"eng"``); any other
            value is forwarded as given.

    Returns:
        A ``(blocks, avg_confidence)`` tuple. Each block is a dict with
        ``page`` (1-based), ``text``, ``confidence`` (0-1, rounded to 4
        decimals) and ``bbox`` as ``[left, top, width, height]``.
        ``avg_confidence`` is the mean over all kept blocks, or ``0.0`` when
        there are none.
    """
    tesseract_lang = _to_tesseract_lang(lang)
    blocks: list[dict[str, Any]] = []
    confidences: list[float] = []

    for page_number, image in enumerate(processed_images, start=1):
        data = pytesseract.image_to_data(image, lang=tesseract_lang, output_type=pytesseract.Output.DICT)
        count = len(data.get("text", []))

        for i in range(count):
            text = (data["text"][i] or "").strip()
            try:
                conf = float(data["conf"][i])
            except (TypeError, ValueError):
                # Unparseable confidence: treat the row like a rejected one.
                conf = -1.0

            # Skip empty rows and rows reported with a negative confidence.
            if not text or conf < 0:
                continue

            score = conf / 100  # Tesseract reports 0-100; normalise to 0-1.
            confidences.append(score)
            blocks.append(
                {
                    "page": page_number,
                    "text": sanitize_text(text),
                    "confidence": round(score, 4),
                    "bbox": [
                        int(data["left"][i]),
                        int(data["top"][i]),
                        int(data["width"][i]),
                        int(data["height"][i]),
                    ],
                }
            )

    avg_confidence = float(np.mean(confidences)) if confidences else 0.0
    return blocks, avg_confidence


def main() -> None:
    """Run the OCR pipeline on the ``--input`` file and print a JSON report.

    PaddleOCR is tried first. Tesseract is used when PaddleOCR raises or
    produces no blocks. Engine failures are collected in the ``errors``
    field of the report rather than aborting the run.

    Raises:
        FileNotFoundError: If the ``--input`` path does not exist.
    """
    cli = parse_args()
    register_heif_opener()
    pytesseract.pytesseract.tesseract_cmd = cli.tesseract

    source_path = cli.input
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Input file not found: {source_path}")

    pages = load_images(source_path)
    prepared_pages = [preprocess_image(page) for page in pages]

    problems: list[str] = []
    found_blocks: list[dict[str, Any]] = []
    score = 0.0
    used_engine = "paddleocr"

    # First choice: PaddleOCR. Any failure is recorded, not raised.
    try:
        found_blocks, score = paddle_extract(prepared_pages, cli.lang)
    except Exception as exc:  # noqa: BLE001
        problems.append(f"PaddleOCR error: {exc}")

    # Fallback: Tesseract, only when PaddleOCR produced nothing.
    if not found_blocks:
        try:
            fallback = tesseract_extract(prepared_pages, cli.lang)
        except Exception as exc:  # noqa: BLE001
            problems.append(f"Tesseract error: {exc}")
        else:
            found_blocks, score = fallback
            used_engine = "tesseract"

    report = {
        "engine": used_engine,
        "confidence": round(score, 4),
        "blocks": found_blocks,
        "errors": [sanitize_text(message) for message in problems],
        "page_count": len(pages),
    }
    print(json.dumps(report, ensure_ascii=True))


# Run the OCR pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
