# WealthySmart/backend/app/services/pension_pdf.py

"""Parse BAC San José Pensiones PDF statements into structured fund snapshots."""

import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from datetime import date


@dataclass
class FundSnapshot:
    """Summary figures of one pension fund for one statement period.

    The parsers in this module populate the monetary fields from the
    ¢-prefixed amounts found in the statement text; when a figure cannot be
    located they store 0.0 instead.
    """

    fund: str  # "ROP", "FCL", or "VOL"
    contract_number: str  # captured from the "N° Contrato" header; "" when not found
    period_start: date  # first date of the "DEL dd/mm/yyyy AL dd/mm/yyyy" range
    period_end: date  # second date of the same range
    saldo_anterior: float  # amount on the "Saldo Anterior" line
    aportes: float  # amount on the "Aportes" line
    rendimientos: float  # amount on the "Rendimientos" line
    retiros: float  # amount on the "Retiros" line
    traslados: float  # amount on the "Traslados" line
    comision: float  # amount on the "Comisión de Administración" line
    correccion: float  # amount on the "Corrección de Imputaciones" line
    bonificacion: float  # amount on the "Bonificación" line
    saldo_final: float  # amount on the unlabeled line that follows the summary rows


def _find_pdftotext() -> str:
    """Locate the ``pdftotext`` executable.

    The PATH is consulted first; failing that, a few well-known install
    locations are probed in order.

    Returns:
        Path of the binary to invoke.

    Raises:
        FileNotFoundError: If none of the lookups finds the binary.
    """
    import os

    on_path = shutil.which("pdftotext")
    if on_path:
        return on_path

    fallbacks = (
        "/opt/homebrew/bin/pdftotext",
        "/usr/bin/pdftotext",
        "/usr/local/bin/pdftotext",
    )
    located = next((candidate for candidate in fallbacks if os.path.isfile(candidate)), None)
    if located is None:
        raise FileNotFoundError("pdftotext not found — install poppler-utils")
    return located


def extract_text(pdf_bytes: bytes) -> str:
    """Convert a PDF to layout-preserving plain text using ``pdftotext``.

    The bytes are written to a temporary ``.pdf`` file, ``pdftotext -layout``
    is run on it and its standard output is returned.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        The extracted text, decoded as UTF-8.

    Raises:
        FileNotFoundError: If the ``pdftotext`` binary cannot be located.
        ValueError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If the conversion takes longer than 30 s.
    """
    pdftotext_bin = _find_pdftotext()
    with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
        f.write(pdf_bytes)
        # Make sure the bytes are on disk before the child process opens the file.
        f.flush()
        result = subprocess.run(
            # Request UTF-8 explicitly: the parsers rely on non-ASCII
            # characters (¢, °, accented letters) and some pdftotext builds
            # default to a different output encoding.
            [pdftotext_bin, "-layout", "-enc", "UTF-8", f.name, "-"],
            capture_output=True,
            # Decode with the encoding we asked for instead of the locale's
            # preferred one, which is not UTF-8 on every host.
            encoding="utf-8",
            timeout=30,
        )
    if result.returncode != 0:
        raise ValueError(f"pdftotext failed: {result.stderr.strip()}")
    return result.stdout


def detect_type(text: str) -> str:
    """Classify extracted statement text by the keywords it contains.

    Checks are made in priority order, so a text carrying a credit-card
    marker is reported as such even if it also mentions a pension fund.

    Returns:
        ``"CREDIT_CARD"``, ``"VOL"``, ``"ROP_FCL"`` or ``"UNKNOWN"``.
    """
    credit_card_markers = ("MARCA DE TARJETA", "ESTADO DE CUENTA", "PAGO MÍNIMO")
    for marker in credit_card_markers:
        if marker in text:
            return "CREDIT_CARD"

    if "FONDO C VOLUNTARIO" in text:
        return "VOL"

    mentions_both_funds = "ROP" in text and "FCL" in text
    if "RÉGIMEN OBLIGATORIO" in text or mentions_both_funds:
        return "ROP_FCL"

    return "UNKNOWN"


def _parse_amount(s: str) -> float:
    """Convert a comma-grouped decimal string into a float.

    For example ``'17,819,176.79'`` becomes ``17819176.79`` and
    ``'-12,693.13'`` becomes ``-12693.13``.
    """
    # Dropping the thousands separators leaves a literal float() understands.
    return float("".join(s.split(",")))


def _find_amounts(line: str) -> list[float]:
    """Return every ¢-prefixed amount found in ``line``, left to right.

    An amount is an optional minus sign, digits with optional comma grouping
    and exactly two decimals; whitespace may separate it from the ¢ sign.
    """
    amounts = []
    for hit in re.finditer(r"¢\s*(-?[\d,]+\.\d{2})", line):
        amounts.append(_parse_amount(hit.group(1)))
    return amounts


def _parse_period(text: str) -> tuple[date, date]:
    """Read the statement period from a ``DEL dd/mm/yyyy AL dd/mm/yyyy`` range.

    Returns:
        ``(start, end)`` built from the first such range found in ``text``.

    Raises:
        ValueError: If no range is present, or a matched date is not a valid
            calendar date.
    """

    def to_date(dmy: str) -> date:
        # The regex guarantees the dd/mm/yyyy shape, so a plain split is safe.
        day, month, year = (int(part) for part in dmy.split("/"))
        return date(year, month, day)

    found = re.search(r"DEL\s+(\d{2}/\d{2}/\d{4})\s+AL\s+(\d{2}/\d{2}/\d{4})", text)
    if found is None:
        raise ValueError("Could not find period dates (DEL ... AL ...)")
    first, last = found.groups()
    return to_date(first), to_date(last)


def _extract_summary_value(text: str, label: str) -> list[float]:
    """Return the ¢ amounts of the first line that carries ``label``.

    The label is matched literally and case-insensitively. Lines that
    contain the label but no amount are skipped.

    Returns:
        The amounts on the first qualifying line, or ``[]`` if there is none.
    """
    label_re = re.compile(re.escape(label) + r".*", re.IGNORECASE)
    # Lazily evaluated, so scanning stops at the first line yielding amounts.
    per_line_amounts = (
        _find_amounts(row) for row in text.split("\n") if label_re.search(row)
    )
    return next((found for found in per_line_amounts if found), [])


# Summary rows read from a statement, as (label, field) pairs: the label is
# the text searched for in the statement and the field is the FundSnapshot
# attribute that receives the amount found on that line.
_SUMMARY_FIELDS = [
    ("Saldo Anterior", "saldo_anterior"),
    ("Aportes", "aportes"),
    ("Rendimientos", "rendimientos"),
    ("Retiros", "retiros"),
    ("Traslados", "traslados"),
    ("Comisión de Administración", "comision"),
    ("Corrección de Imputaciones", "correccion"),
    ("Bonificación", "bonificacion"),
]


def _find_final_balance(text: str, after_label: str = "Bonificación") -> list[float]:
    """Return the amounts of the first amount-bearing line after ``after_label``.

    The final balance is printed without a label on a line below the last
    summary row, so it is located by position: everything up to and including
    the first line containing ``after_label`` (case-sensitive) is skipped,
    then the first following line with ¢ amounts wins. Later lines that
    repeat the label are ignored.

    Returns:
        The amounts of that line, or ``[]`` when the label or a following
        amount line is missing.
    """
    rows = iter(text.split("\n"))

    # Consume lines up to and including the first one carrying the label.
    for row in rows:
        if after_label in row:
            break
    else:
        return []

    for row in rows:
        if after_label in row:
            continue
        hits = _find_amounts(row)
        if hits:
            return hits
    return []


def parse_vol(text: str) -> list[FundSnapshot]:
    """Parse a voluntary-fund statement into a single ``"VOL"`` snapshot.

    Each summary field takes the first amount found on its line, or 0.0 when
    the line is missing. The contract number falls back to ``""``.

    Returns:
        A one-element list holding the snapshot.

    Raises:
        ValueError: If the statement period cannot be found.
    """
    start, end = _parse_period(text)

    contract_match = re.search(r"N°\s*Contrato:\s*(\S+)", text)
    contract_number = "" if contract_match is None else contract_match.group(1)

    figures: dict[str, float] = {
        field: next(iter(_extract_summary_value(text, label)), 0.0)
        for label, field in _SUMMARY_FIELDS
    }

    # The final balance normally follows Bonificación; if nothing is found
    # there, retry after Corrección de Imputaciones.
    closing = _find_final_balance(text, "Bonificación") or _find_final_balance(
        text, "Corrección de Imputaciones"
    )

    snapshot = FundSnapshot(
        fund="VOL",
        contract_number=contract_number,
        period_start=start,
        period_end=end,
        saldo_final=closing[0] if closing else 0.0,
        **figures,
    )
    return [snapshot]


def parse_rop_fcl(text: str) -> list[FundSnapshot]:
    """Parse a combined ROP + FCL statement into two snapshots.

    On every summary line the first amount is assigned to ROP and the second
    to FCL; an amount that is not present is recorded as 0.0. Contract
    numbers fall back to ``""``.

    Returns:
        ``[rop_snapshot, fcl_snapshot]``, in that order.

    Raises:
        ValueError: If the statement period cannot be found.
    """

    def first_two(values: list[float]) -> list[float]:
        # Pad with zeros so a short or empty list still yields a (ROP, FCL) pair.
        return (values + [0.0, 0.0])[:2]

    start, end = _parse_period(text)

    contracts = {}
    for fund in ("ROP", "FCL"):
        hit = re.search(r"N°\s*Contrato\s*" + fund + r":\s*(\S+)", text)
        contracts[fund] = hit.group(1) if hit else ""

    figures: dict[str, dict[str, float]] = {"ROP": {}, "FCL": {}}
    for label, field in _SUMMARY_FIELDS:
        rop_value, fcl_value = first_two(_extract_summary_value(text, label))
        figures["ROP"][field] = rop_value
        figures["FCL"][field] = fcl_value

    # This layout has no Bonificación row, so the unlabeled balance line is
    # looked up after Corrección de Imputaciones.
    closing = dict(
        zip(
            ("ROP", "FCL"),
            first_two(_find_final_balance(text, "Corrección de Imputaciones")),
        )
    )

    return [
        FundSnapshot(
            fund=fund,
            contract_number=contracts[fund],
            period_start=start,
            period_end=end,
            saldo_final=closing[fund],
            **figures[fund],
        )
        for fund in ("ROP", "FCL")
    ]


def parse_pension_pdf(pdf_bytes: bytes, filename: str = "") -> list[FundSnapshot]:
    """Extract the text of a pension PDF and parse it into fund snapshots.

    Args:
        pdf_bytes: Raw content of the PDF.
        filename: Name used only in error messages.

    Returns:
        One snapshot for a voluntary-fund statement, two (ROP then FCL) for a
        combined statement.

    Raises:
        ValueError: If the document is a credit card statement or is not a
            recognized BAC pension statement.
    """
    text = extract_text(pdf_bytes)
    kind = detect_type(text)

    if kind == "VOL":
        return parse_vol(text)
    if kind == "CREDIT_CARD":
        raise ValueError(f"'{filename}' is a credit card statement, not a pension extract")
    if kind == "UNKNOWN":
        raise ValueError(f"'{filename}' is not a recognized BAC pension statement")
    # Every remaining classification is handled as a combined ROP + FCL statement.
    return parse_rop_fcl(text)