Add pension PDF upload, parsing, and fund summary API

Backend: parse BAC pension statement PDFs (VOL, ROP, FCL) via pdftotext, store snapshots with duplicate detection, reject credit card statements. Endpoints: POST /upload, GET /snapshots, GET /fund-summary. Frontend: wire up drag-and-drop upload, load real balances and rendimientos from API, show upload results with error/duplicate feedback. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-07-17 11:28:47 +02:00 · 2026-03-28 22:24:42 -06:00
parent 1b90f0c70a
commit eccfd53e0b
8 changed files with 631 additions and 56 deletions
--- a/backend/app/services/pension_pdf.py
+++ b/backend/app/services/pension_pdf.py
@@ -0,0 +1,225 @@
+"""Parse BAC San José Pensiones PDF statements into structured fund snapshots."""
+
+import re
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from datetime import date
+
+
+@dataclass
+class FundSnapshot:
+    """Summary figures for one fund over one statement period.
+
+    Instances are produced by the parsers in this module. Those parsers
+    store 0.0 for any amount whose line they cannot find in the statement.
+    """
+    fund: str  # "ROP", "FCL", or "VOL"
+    contract_number: str  # token following the contract label; "" when the parsers find none
+    period_start: date  # first date of the "DEL dd/mm/yyyy AL dd/mm/yyyy" phrase
+    period_end: date  # second date of that phrase
+    saldo_anterior: float  # amount from the "Saldo Anterior" line
+    aportes: float  # amount from the "Aportes" line
+    rendimientos: float  # amount from the "Rendimientos" line
+    retiros: float  # amount from the "Retiros" line
+    traslados: float  # amount from the "Traslados" line
+    comision: float  # amount from the "Comisión de Administración" line
+    correccion: float  # amount from the "Corrección de Imputaciones" line
+    bonificacion: float  # amount from the "Bonificación" line
+    saldo_final: float  # amount from the unlabeled balance line after the last summary label
+
+
+def _find_pdftotext() -> str:
+    """Return the path of the ``pdftotext`` executable.
+
+    The ``PATH`` lookup is tried first; when it finds nothing, a few
+    well-known install locations are probed in order.
+
+    Raises:
+        FileNotFoundError: If none of the candidates exists.
+    """
+    import os
+
+    on_path = shutil.which("pdftotext")
+    if on_path:
+        return on_path
+
+    fallbacks = (
+        "/opt/homebrew/bin/pdftotext",
+        "/usr/bin/pdftotext",
+        "/usr/local/bin/pdftotext",
+    )
+    existing = next((p for p in fallbacks if os.path.isfile(p)), None)
+    if existing is None:
+        raise FileNotFoundError("pdftotext not found — install poppler-utils")
+    return existing
+
+
+def extract_text(pdf_bytes: bytes) -> str:
+    """Convert raw PDF bytes to layout-preserving text using ``pdftotext``.
+
+    Args:
+        pdf_bytes: Complete contents of the PDF file.
+
+    Returns:
+        The text pdftotext wrote to stdout, decoded as UTF-8.
+
+    Raises:
+        FileNotFoundError: If no pdftotext binary can be located.
+        ValueError: If pdftotext exits with a non-zero status.
+        subprocess.TimeoutExpired: If the conversion takes longer than 30 seconds.
+    """
+    pdftotext_bin = _find_pdftotext()
+    # pdftotext is given a file path, so the bytes are spooled to a temp file
+    # and flushed before the subprocess reads them.
+    with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
+        f.write(pdf_bytes)
+        f.flush()
+        result = subprocess.run(
+            # Request UTF-8 explicitly and decode it as UTF-8 rather than with
+            # the locale's encoding: the parsers match non-ASCII characters
+            # ("¢", "É", "Ó", "°"), and not every pdftotext build or host
+            # locale defaults to UTF-8.
+            [pdftotext_bin, "-layout", "-enc", "UTF-8", f.name, "-"],
+            capture_output=True,
+            encoding="utf-8",
+            errors="replace",
+            timeout=30,
+        )
+    if result.returncode != 0:
+        raise ValueError(f"pdftotext failed: {result.stderr.strip()}")
+    return result.stdout
+
+
+def detect_type(text: str) -> str:
+    """Classify extracted statement text.
+
+    Returns 'CREDIT_CARD', 'VOL', 'ROP_FCL', or 'UNKNOWN'. The checks run in
+    that order and the first one that matches decides the result.
+    """
+    credit_card_markers = ("MARCA DE TARJETA", "ESTADO DE CUENTA", "PAGO MÍNIMO")
+    for marker in credit_card_markers:
+        if marker in text:
+            return "CREDIT_CARD"
+
+    if "FONDO C VOLUNTARIO" in text:
+        return "VOL"
+
+    mentions_both_funds = "ROP" in text and "FCL" in text
+    if mentions_both_funds or "RÉGIMEN OBLIGATORIO" in text:
+        return "ROP_FCL"
+
+    return "UNKNOWN"
+
+
+def _parse_amount(s: str) -> float:
+    """Convert a comma-grouped figure ('17,819,176.79', '-12,693.13') to a float."""
+    return float("".join(s.split(",")))
+
+
+def _find_amounts(line: str) -> list[float]:
+    """Return every ¢-prefixed amount on ``line``, in left-to-right order.
+
+    An amount is an optional minus sign, digits with optional comma grouping,
+    and exactly two decimals; whitespace between ¢ and the figure is allowed.
+    """
+    amounts = []
+    for hit in re.finditer(r"¢\s*(-?[\d,]+\.\d{2})", line):
+        amounts.append(_parse_amount(hit.group(1)))
+    return amounts
+
+
+def _parse_period(text: str) -> tuple[date, date]:
+    """Read the statement period from a 'DEL dd/mm/yyyy AL dd/mm/yyyy' phrase.
+
+    Returns:
+        ``(start, end)`` built from the first such phrase in ``text``.
+
+    Raises:
+        ValueError: If the phrase is absent, or a date in it is not a valid
+            calendar date.
+    """
+    found = re.search(r"DEL\s+(\d{2}/\d{2}/\d{4})\s+AL\s+(\d{2}/\d{2}/\d{4})", text)
+    if found is None:
+        raise ValueError("Could not find period dates (DEL ... AL ...)")
+
+    def to_date(stamp: str) -> date:
+        # The regex guarantees the dd/mm/yyyy shape, so a plain split is enough.
+        day, month, year = (int(part) for part in stamp.split("/"))
+        return date(year, month, day)
+
+    return to_date(found.group(1)), to_date(found.group(2))
+
+
+def _extract_summary_value(text: str, label: str) -> list[float]:
+    """Return the ¢ amounts from the first line that mentions ``label`` and has any.
+
+    The label is matched literally and case-insensitively anywhere in a line.
+    Lines that mention it but carry no amounts are skipped. An empty list is
+    returned when no line qualifies.
+    """
+    matcher = re.compile(re.escape(label) + r".*", re.IGNORECASE)
+    labelled_rows = (row for row in text.split("\n") if matcher.search(row))
+    per_row_amounts = (_find_amounts(row) for row in labelled_rows)
+    return next((found for found in per_row_amounts if found), [])
+
+
+# (statement label, FundSnapshot field name) pairs used by the parsers below.
+# Each label is looked up with _extract_summary_value and the resulting amount
+# is stored under the paired field name.
+_SUMMARY_FIELDS = [
+    ("Saldo Anterior", "saldo_anterior"),
+    ("Aportes", "aportes"),
+    ("Rendimientos", "rendimientos"),
+    ("Retiros", "retiros"),
+    ("Traslados", "traslados"),
+    ("Comisión de Administración", "comision"),
+    ("Corrección de Imputaciones", "correccion"),
+    ("Bonificación", "bonificacion"),
+]
+
+
+def _find_final_balance(text: str, after_label: str = "Bonificación") -> list[float]:
+    """Return the amounts on the first amount-bearing line after ``after_label``.
+
+    The final balance is printed on its own unlabeled line after the last
+    summary field (Bonificación, or Corrección for ROP+FCL). Lines that
+    contain ``after_label`` (a case-sensitive substring test) are never used
+    as the balance line themselves. Returns an empty list when the label or
+    a following amount line is not found.
+    """
+    label_seen = False
+    for row in text.split("\n"):
+        if after_label in row:
+            label_seen = True
+        elif label_seen:
+            found = _find_amounts(row)
+            if found:
+                return found
+    return []
+
+
+def parse_vol(text: str) -> list[FundSnapshot]:
+    """Build the single VOL snapshot from voluntary-fund statement text.
+
+    Missing pieces do not raise: an absent contract number becomes "", and
+    any summary amount or final balance that cannot be found becomes 0.0.
+
+    Raises:
+        ValueError: If the statement period cannot be parsed.
+    """
+    period_start, period_end = _parse_period(text)
+
+    contract_match = re.search(r"N°\s*Contrato:\s*(\S+)", text)
+    contract = "" if contract_match is None else contract_match.group(1)
+
+    def first_or_zero(values: list[float]) -> float:
+        return values[0] if values else 0.0
+
+    summary = {
+        field: first_or_zero(_extract_summary_value(text, label))
+        for label, field in _SUMMARY_FIELDS
+    }
+
+    # The balance line normally follows Bonificación; fall back to the line
+    # after Corrección when nothing is found there.
+    closing = _find_final_balance(text, "Bonificación") or _find_final_balance(
+        text, "Corrección de Imputaciones"
+    )
+
+    snapshot = FundSnapshot(
+        fund="VOL",
+        contract_number=contract,
+        period_start=period_start,
+        period_end=period_end,
+        saldo_final=first_or_zero(closing),
+        **summary,
+    )
+    return [snapshot]
+
+
+def parse_rop_fcl(text: str) -> list[FundSnapshot]:
+    """Build the ROP and FCL snapshots from a combined statement.
+
+    On every summary line the first amount is taken as ROP and the second as
+    FCL; a missing amount becomes 0.0. Missing contract numbers become "".
+
+    Returns:
+        ``[rop_snapshot, fcl_snapshot]``, always in that order.
+
+    Raises:
+        ValueError: If the statement period cannot be parsed.
+    """
+    period_start, period_end = _parse_period(text)
+
+    rop_match = re.search(r"N°\s*Contrato\s*ROP:\s*(\S+)", text)
+    fcl_match = re.search(r"N°\s*Contrato\s*FCL:\s*(\S+)", text)
+    contract_rop = "" if rop_match is None else rop_match.group(1)
+    contract_fcl = "" if fcl_match is None else fcl_match.group(1)
+
+    def rop_fcl_pair(values: list[float]) -> tuple[float, float]:
+        # Pad with zeros so a line with one or no amounts still yields a pair.
+        rop_value, fcl_value = (values + [0.0, 0.0])[:2]
+        return rop_value, fcl_value
+
+    rop_data: dict[str, float] = {}
+    fcl_data: dict[str, float] = {}
+    for label, field in _SUMMARY_FIELDS:
+        rop_data[field], fcl_data[field] = rop_fcl_pair(_extract_summary_value(text, label))
+
+    # ROP+FCL statements have no Bonificación line, so the balance line is
+    # looked for after Corrección.
+    rop_final, fcl_final = rop_fcl_pair(
+        _find_final_balance(text, "Corrección de Imputaciones")
+    )
+
+    rop_snapshot = FundSnapshot(
+        fund="ROP",
+        contract_number=contract_rop,
+        period_start=period_start,
+        period_end=period_end,
+        saldo_final=rop_final,
+        **rop_data,
+    )
+    fcl_snapshot = FundSnapshot(
+        fund="FCL",
+        contract_number=contract_fcl,
+        period_start=period_start,
+        period_end=period_end,
+        saldo_final=fcl_final,
+        **fcl_data,
+    )
+    return [rop_snapshot, fcl_snapshot]
+
+
+def parse_pension_pdf(pdf_bytes: bytes, filename: str = "") -> list[FundSnapshot]:
+    """Extract text from a pension PDF and return its fund snapshots.
+
+    Args:
+        pdf_bytes: Raw contents of the uploaded PDF.
+        filename: Name quoted in error messages only.
+
+    Returns:
+        One snapshot for a VOL statement, two (ROP, FCL) otherwise.
+
+    Raises:
+        ValueError: For credit card statements or unrecognized formats.
+    """
+    text = extract_text(pdf_bytes)
+    doc_type = detect_type(text)
+
+    rejections = {
+        "CREDIT_CARD": f"'{filename}' is a credit card statement, not a pension extract",
+        "UNKNOWN": f"'{filename}' is not a recognized BAC pension statement",
+    }
+    if doc_type in rejections:
+        raise ValueError(rejections[doc_type])
+
+    parser = parse_vol if doc_type == "VOL" else parse_rop_fcl
+    return parser(text)