Files
WealthySmart/backend/app/services/pension_pdf.py
Carlos Escalante eccfd53e0b
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s
Add pension PDF upload, parsing, and fund summary API
Backend: parse BAC pension statement PDFs (VOL, ROP, FCL) via
pdftotext, store snapshots with duplicate detection, reject
credit card statements. Endpoints: POST /upload, GET /snapshots,
GET /fund-summary.

Frontend: wire up drag-and-drop upload, load real balances and
rendimientos from API, show upload results with error/duplicate
feedback.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 22:24:42 -06:00

226 lines
6.8 KiB
Python

"""Parse BAC San José Pensiones PDF statements into structured fund snapshots."""
import re
import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from datetime import date
@dataclass
class FundSnapshot:
fund: str # "ROP", "FCL", or "VOL"
contract_number: str
period_start: date
period_end: date
saldo_anterior: float
aportes: float
rendimientos: float
retiros: float
traslados: float
comision: float
correccion: float
bonificacion: float
saldo_final: float
def _find_pdftotext() -> str:
"""Find pdftotext binary, checking common install paths."""
import os
cmd = shutil.which("pdftotext")
if cmd:
return cmd
for path in ["/opt/homebrew/bin/pdftotext", "/usr/bin/pdftotext", "/usr/local/bin/pdftotext"]:
if os.path.isfile(path):
return path
raise FileNotFoundError("pdftotext not found — install poppler-utils")
def extract_text(pdf_bytes: bytes) -> str:
pdftotext_bin = _find_pdftotext()
with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
f.write(pdf_bytes)
f.flush()
result = subprocess.run(
[pdftotext_bin, "-layout", f.name, "-"],
capture_output=True,
text=True,
timeout=30,
)
if result.returncode != 0:
raise ValueError(f"pdftotext failed: {result.stderr.strip()}")
return result.stdout
def detect_type(text: str) -> str:
"""Return 'VOL', 'ROP_FCL', or 'UNKNOWN'."""
if any(kw in text for kw in ("MARCA DE TARJETA", "ESTADO DE CUENTA", "PAGO MÍNIMO")):
return "CREDIT_CARD"
if "FONDO C VOLUNTARIO" in text:
return "VOL"
if "RÉGIMEN OBLIGATORIO" in text or ("ROP" in text and "FCL" in text):
return "ROP_FCL"
return "UNKNOWN"
def _parse_amount(s: str) -> float:
"""Parse '17,819,176.79' or '-12,693.13' into float."""
cleaned = s.replace(",", "")
return float(cleaned)
def _find_amounts(line: str) -> list[float]:
"""Extract all ¢-prefixed amounts from a line."""
return [_parse_amount(m) for m in re.findall(r"¢\s*(-?[\d,]+\.\d{2})", line)]
def _parse_period(text: str) -> tuple[date, date]:
m = re.search(r"DEL\s+(\d{2}/\d{2}/\d{4})\s+AL\s+(\d{2}/\d{2}/\d{4})", text)
if not m:
raise ValueError("Could not find period dates (DEL ... AL ...)")
start = date(int(m.group(1)[6:]), int(m.group(1)[3:5]), int(m.group(1)[:2]))
end = date(int(m.group(2)[6:]), int(m.group(2)[3:5]), int(m.group(2)[:2]))
return start, end
def _extract_summary_value(text: str, label: str) -> list[float]:
"""Find a summary line by label and return all ¢ amounts on that line."""
pattern = re.compile(re.escape(label) + r".*", re.IGNORECASE)
for line in text.split("\n"):
if pattern.search(line):
amounts = _find_amounts(line)
if amounts:
return amounts
return []
_SUMMARY_FIELDS = [
("Saldo Anterior", "saldo_anterior"),
("Aportes", "aportes"),
("Rendimientos", "rendimientos"),
("Retiros", "retiros"),
("Traslados", "traslados"),
("Comisión de Administración", "comision"),
("Corrección de Imputaciones", "correccion"),
("Bonificación", "bonificacion"),
]
def _find_final_balance(text: str, after_label: str = "Bonificación") -> list[float]:
"""Find the standalone balance line after the last summary field.
After Bonificación (or Corrección for ROP+FCL), there's a line with just
the final balance amount(s) and no label.
"""
lines = text.split("\n")
found_label = False
for line in lines:
if after_label in line:
found_label = True
continue
if found_label:
amounts = _find_amounts(line)
if amounts:
return amounts
return []
def parse_vol(text: str) -> list[FundSnapshot]:
period_start, period_end = _parse_period(text)
# Contract number
m = re.search(r"\s*Contrato:\s*(\S+)", text)
contract = m.group(1) if m else ""
data: dict[str, float] = {}
for label, field in _SUMMARY_FIELDS:
amounts = _extract_summary_value(text, label)
data[field] = amounts[0] if amounts else 0.0
finals = _find_final_balance(text, "Bonificación")
if not finals:
# Fallback: look after Corrección
finals = _find_final_balance(text, "Corrección de Imputaciones")
saldo_final = finals[0] if finals else 0.0
return [
FundSnapshot(
fund="VOL",
contract_number=contract,
period_start=period_start,
period_end=period_end,
saldo_final=saldo_final,
**data,
)
]
def parse_rop_fcl(text: str) -> list[FundSnapshot]:
period_start, period_end = _parse_period(text)
# Contract numbers
m_rop = re.search(r"\s*Contrato\s*ROP:\s*(\S+)", text)
m_fcl = re.search(r"\s*Contrato\s*FCL:\s*(\S+)", text)
contract_rop = m_rop.group(1) if m_rop else ""
contract_fcl = m_fcl.group(1) if m_fcl else ""
rop_data: dict[str, float] = {}
fcl_data: dict[str, float] = {}
for label, field in _SUMMARY_FIELDS:
amounts = _extract_summary_value(text, label)
if len(amounts) >= 2:
rop_data[field] = amounts[0]
fcl_data[field] = amounts[1]
elif len(amounts) == 1:
rop_data[field] = amounts[0]
fcl_data[field] = 0.0
else:
rop_data[field] = 0.0
fcl_data[field] = 0.0
# Final balance line (after Corrección since ROP+FCL has no Bonificación)
finals = _find_final_balance(text, "Corrección de Imputaciones")
rop_final = finals[0] if len(finals) >= 1 else 0.0
fcl_final = finals[1] if len(finals) >= 2 else 0.0
return [
FundSnapshot(
fund="ROP",
contract_number=contract_rop,
period_start=period_start,
period_end=period_end,
saldo_final=rop_final,
**rop_data,
),
FundSnapshot(
fund="FCL",
contract_number=contract_fcl,
period_start=period_start,
period_end=period_end,
saldo_final=fcl_final,
**fcl_data,
),
]
def parse_pension_pdf(pdf_bytes: bytes, filename: str = "") -> list[FundSnapshot]:
"""Parse a pension PDF and return fund snapshots.
Raises ValueError for credit card statements or unrecognized formats.
"""
text = extract_text(pdf_bytes)
doc_type = detect_type(text)
if doc_type == "CREDIT_CARD":
raise ValueError(f"'{filename}' is a credit card statement, not a pension extract")
if doc_type == "UNKNOWN":
raise ValueError(f"'{filename}' is not a recognized BAC pension statement")
if doc_type == "VOL":
return parse_vol(text)
else:
return parse_rop_fcl(text)