Add pension PDF upload, parsing, and fund summary API
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s

Backend: parse BAC pension statement PDFs (VOL, ROP, FCL) via
pdftotext, store snapshots with duplicate detection, reject
credit card statements. Endpoints: POST /upload, GET /snapshots,
GET /fund-summary.

Frontend: wire up drag-and-drop upload, load real balances and
rendimientos from API, show upload results with error/duplicate
feedback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Carlos Escalante
2026-03-28 22:24:42 -06:00
parent 1b90f0c70a
commit eccfd53e0b
8 changed files with 631 additions and 56 deletions

View File

@@ -0,0 +1,225 @@
"""Parse BAC San José Pensiones PDF statements into structured fund snapshots."""
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from datetime import date
@dataclass
class FundSnapshot:
    """Summary figures of one fund for one statement period.

    The parsers in this module create one instance per fund found in a
    statement. Field order is part of the constructor signature.
    """

    # Fund code: "ROP", "FCL", or "VOL".
    fund: str
    # Contract identifier as printed on the statement ("" when not found).
    contract_number: str

    # First and last day of the statement period.
    period_start: date
    period_end: date

    # Values read from the labelled summary lines of the statement.
    saldo_anterior: float
    aportes: float
    rendimientos: float
    retiros: float
    traslados: float
    comision: float
    correccion: float
    bonificacion: float

    # Balance read from the unlabelled amount line that follows the summary.
    saldo_final: float
def _find_pdftotext() -> str:
"""Find pdftotext binary, checking common install paths."""
import os
cmd = shutil.which("pdftotext")
if cmd:
return cmd
for path in ["/opt/homebrew/bin/pdftotext", "/usr/bin/pdftotext", "/usr/local/bin/pdftotext"]:
if os.path.isfile(path):
return path
raise FileNotFoundError("pdftotext not found — install poppler-utils")
def extract_text(pdf_bytes: bytes) -> str:
    """Convert PDF bytes to layout-preserving text with the pdftotext CLI.

    The bytes are written to a temporary ``.pdf`` file, which pdftotext
    reads; the extracted text is captured from its stdout.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        The text pdftotext produced, decoded as UTF-8.

    Raises:
        FileNotFoundError: If no pdftotext binary can be located.
        ValueError: If pdftotext exits with a non-zero status.
        subprocess.TimeoutExpired: If pdftotext runs longer than 30 seconds.
    """
    pdftotext_bin = _find_pdftotext()
    with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
        f.write(pdf_bytes)
        # Flush so the child process sees the complete file on disk.
        f.flush()
        result = subprocess.run(
            # Pin the output encoding on both sides: the parsers match
            # non-ASCII text ("¢", "Comisión", "RÉGIMEN"), so decoding must
            # not depend on the locale of the server process.
            [pdftotext_bin, "-layout", "-enc", "UTF-8", f.name, "-"],
            capture_output=True,
            encoding="utf-8",
            timeout=30,
        )
    if result.returncode != 0:
        raise ValueError(f"pdftotext failed: {result.stderr.strip()}")
    return result.stdout
def detect_type(text: str) -> str:
    """Classify statement text as 'CREDIT_CARD', 'VOL', 'ROP_FCL', or 'UNKNOWN'.

    The checks run in that order and the first one that matches wins, so
    text carrying a credit card marker is never reported as a pension type.
    """
    credit_card_markers = ("MARCA DE TARJETA", "ESTADO DE CUENTA", "PAGO MÍNIMO")
    for marker in credit_card_markers:
        if marker in text:
            return "CREDIT_CARD"

    if "FONDO C VOLUNTARIO" in text:
        return "VOL"

    if "RÉGIMEN OBLIGATORIO" in text:
        return "ROP_FCL"
    # Without the regime heading, both fund codes must be mentioned.
    if "ROP" in text and "FCL" in text:
        return "ROP_FCL"

    return "UNKNOWN"
def _parse_amount(s: str) -> float:
"""Parse '17,819,176.79' or '-12,693.13' into float."""
cleaned = s.replace(",", "")
return float(cleaned)
def _find_amounts(line: str) -> list[float]:
    """Return every amount in *line* that follows a ¢ sign, in order of appearance."""
    # A ¢ sign, optional whitespace, then an optionally negative,
    # comma-grouped number with exactly two decimals.
    matches = re.finditer(r"¢\s*(-?[\d,]+\.\d{2})", line)
    return [_parse_amount(hit.group(1)) for hit in matches]
def _parse_period(text: str) -> tuple[date, date]:
m = re.search(r"DEL\s+(\d{2}/\d{2}/\d{4})\s+AL\s+(\d{2}/\d{2}/\d{4})", text)
if not m:
raise ValueError("Could not find period dates (DEL ... AL ...)")
start = date(int(m.group(1)[6:]), int(m.group(1)[3:5]), int(m.group(1)[:2]))
end = date(int(m.group(2)[6:]), int(m.group(2)[3:5]), int(m.group(2)[:2]))
return start, end
def _extract_summary_value(text: str, label: str) -> list[float]:
    """Return the ¢ amounts from the first line that mentions *label* and has any.

    The label is matched literally and case-insensitively anywhere in the
    line. Lines that mention the label but carry no amounts are skipped.

    Returns:
        The amounts of the first qualifying line, or ``[]`` if there is none.
    """
    label_re = re.compile(re.escape(label), re.IGNORECASE)
    labelled_lines = (row for row in text.split("\n") if label_re.search(row))
    amounts_per_line = (_find_amounts(row) for row in labelled_lines)
    return next((found for found in amounts_per_line if found), [])
_SUMMARY_FIELDS = [
("Saldo Anterior", "saldo_anterior"),
("Aportes", "aportes"),
("Rendimientos", "rendimientos"),
("Retiros", "retiros"),
("Traslados", "traslados"),
("Comisión de Administración", "comision"),
("Corrección de Imputaciones", "correccion"),
("Bonificación", "bonificacion"),
]
def _find_final_balance(text: str, after_label: str = "Bonificación") -> list[float]:
    """Return the amounts of the first amount-bearing line after *after_label*.

    The statement is expected to print the final balance on an unlabelled
    line right after the last summary field (Bonificación, or Corrección
    for ROP+FCL statements). The label is matched case-sensitively as a
    substring; later lines that mention the label again are skipped.

    Returns:
        The amounts on that line, or ``[]`` when the label never appears or
        no later line carries an amount.
    """
    remaining = iter(text.split("\n"))

    # any() stops consuming right after the first line mentioning the
    # label, leaving `remaining` positioned on the line that follows it.
    if not any(after_label in row for row in remaining):
        return []

    for row in remaining:
        if after_label in row:
            continue
        balance = _find_amounts(row)
        if balance:
            return balance
    return []
def parse_vol(text: str) -> list[FundSnapshot]:
    """Build the VOL snapshot from the text of a voluntary-fund statement.

    Summary fields missing from the text default to 0.0, the contract
    number to "", and the final balance to 0.0.

    Returns:
        A one-element list holding the "VOL" snapshot.

    Raises:
        ValueError: If the statement period cannot be found.
    """
    start, end = _parse_period(text)

    contract_match = re.search(r"\s*Contrato:\s*(\S+)", text)
    contract = "" if contract_match is None else contract_match.group(1)

    def first_or_zero(values: list[float]) -> float:
        return values[0] if values else 0.0

    summary: dict[str, float] = {
        field: first_or_zero(_extract_summary_value(text, label))
        for label, field in _SUMMARY_FIELDS
    }

    # The balance line normally follows Bonificación; fall back to the line
    # after Corrección when nothing is found there.
    closing = _find_final_balance(text, "Bonificación") or _find_final_balance(
        text, "Corrección de Imputaciones"
    )

    snapshot = FundSnapshot(
        fund="VOL",
        contract_number=contract,
        period_start=start,
        period_end=end,
        saldo_final=first_or_zero(closing),
        **summary,
    )
    return [snapshot]
def parse_rop_fcl(text: str) -> list[FundSnapshot]:
    """Build the ROP and FCL snapshots from a combined statement's text.

    Each summary line is expected to carry two amounts: the first is taken
    as the ROP value, the second as the FCL value. A missing amount is
    recorded as 0.0; a missing contract number as "".

    Returns:
        ``[rop_snapshot, fcl_snapshot]``, always in that order.

    Raises:
        ValueError: If the statement period cannot be found.
    """
    start, end = _parse_period(text)

    rop_match = re.search(r"\s*Contrato\s*ROP:\s*(\S+)", text)
    fcl_match = re.search(r"\s*Contrato\s*FCL:\s*(\S+)", text)
    rop_contract = "" if rop_match is None else rop_match.group(1)
    fcl_contract = "" if fcl_match is None else fcl_match.group(1)

    def two_columns(values: list[float]) -> tuple[float, float]:
        # Pad with zeros so a short or empty line still yields both columns.
        rop_value, fcl_value = (values + [0.0, 0.0])[:2]
        return rop_value, fcl_value

    rop_fields: dict[str, float] = {}
    fcl_fields: dict[str, float] = {}
    for label, field in _SUMMARY_FIELDS:
        rop_fields[field], fcl_fields[field] = two_columns(
            _extract_summary_value(text, label)
        )

    # The balance line follows Corrección here (ROP+FCL has no Bonificación).
    rop_final, fcl_final = two_columns(
        _find_final_balance(text, "Corrección de Imputaciones")
    )

    per_fund = (
        ("ROP", rop_contract, rop_final, rop_fields),
        ("FCL", fcl_contract, fcl_final, fcl_fields),
    )
    return [
        FundSnapshot(
            fund=code,
            contract_number=contract,
            period_start=start,
            period_end=end,
            saldo_final=closing,
            **fields,
        )
        for code, contract, closing, fields in per_fund
    ]
def parse_pension_pdf(pdf_bytes: bytes, filename: str = "") -> list[FundSnapshot]:
    """Extract fund snapshots from a pension statement PDF.

    Args:
        pdf_bytes: Raw content of the PDF.
        filename: Name used only in error messages.

    Returns:
        One snapshot for a VOL statement, or the ROP and FCL snapshots for
        a combined statement.

    Raises:
        ValueError: If the document is a credit card statement or is not a
            recognized pension statement.
    """
    extracted = extract_text(pdf_bytes)
    kind = detect_type(extracted)

    # Reject everything that is not a pension statement up front.
    if kind == "CREDIT_CARD":
        raise ValueError(f"'{filename}' is a credit card statement, not a pension extract")
    if kind == "UNKNOWN":
        raise ValueError(f"'{filename}' is not a recognized BAC pension statement")

    parser = parse_vol if kind == "VOL" else parse_rop_fcl
    return parser(extracted)