mirror of
https://github.com/escalante29/WealthySmart.git
synced 2026-05-19 11:28:49 +02:00
All checks were successful
Deploy to VPS / deploy (push) Successful in 48s
Backend: parse BAC pension statement PDFs (VOL, ROP, FCL) via pdftotext, store snapshots with duplicate detection, reject credit card statements. Endpoints: POST /upload, GET /snapshots, GET /fund-summary. Frontend: wire up drag-and-drop upload, load real balances and rendimientos from API, show upload results with error/duplicate feedback. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
226 lines
6.8 KiB
Python
226 lines
6.8 KiB
Python
"""Parse BAC San José Pensiones PDF statements into structured fund snapshots."""
|
|
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import tempfile
|
|
from dataclasses import dataclass
|
|
from datetime import date
|
|
|
|
|
|
@dataclass
class FundSnapshot:
    """Summary figures of one fund for one statement period.

    The field order below is the positional order of the generated
    ``__init__``, so it must not be rearranged.
    """

    # --- identification ---------------------------------------------------
    fund: str  # "ROP", "FCL", or "VOL"
    contract_number: str

    # --- period covered by the statement ----------------------------------
    period_start: date
    period_end: date

    # --- summary rows (one per entry in _SUMMARY_FIELDS) -------------------
    saldo_anterior: float
    aportes: float
    rendimientos: float
    retiros: float
    traslados: float
    comision: float
    correccion: float
    bonificacion: float

    # --- closing balance ---------------------------------------------------
    saldo_final: float
|
|
|
|
|
|
def _find_pdftotext() -> str:
|
|
"""Find pdftotext binary, checking common install paths."""
|
|
import os
|
|
|
|
cmd = shutil.which("pdftotext")
|
|
if cmd:
|
|
return cmd
|
|
for path in ["/opt/homebrew/bin/pdftotext", "/usr/bin/pdftotext", "/usr/local/bin/pdftotext"]:
|
|
if os.path.isfile(path):
|
|
return path
|
|
raise FileNotFoundError("pdftotext not found — install poppler-utils")
|
|
|
|
|
|
def extract_text(pdf_bytes: bytes) -> str:
    """Convert PDF bytes to layout-preserving text using ``pdftotext``.

    The bytes are written to a temporary ``.pdf`` file, which is handed to
    ``pdftotext -layout``; the text is read back from the tool's stdout.

    The output encoding is pinned to UTF-8 on both sides (``-enc UTF-8`` for
    the tool, ``encoding="utf-8"`` for decoding) instead of relying on the
    process locale. The parsers in this module match non-ASCII literals
    ("¢", "Bonificación", "RÉGIMEN OBLIGATORIO"), so a locale-dependent decode
    could silently break every match.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The extracted text.

    Raises:
        FileNotFoundError: if ``pdftotext`` cannot be located.
        ValueError: if ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: if the conversion takes longer than 30 s.
    """
    pdftotext_bin = _find_pdftotext()
    with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
        f.write(pdf_bytes)
        # Make sure the bytes are on disk before the child process opens the file.
        f.flush()
        result = subprocess.run(
            [pdftotext_bin, "-layout", "-enc", "UTF-8", f.name, "-"],
            capture_output=True,
            text=True,
            encoding="utf-8",
            # A stray undecodable byte (e.g. in stderr) must not hide the
            # actual outcome behind a UnicodeDecodeError.
            errors="replace",
            timeout=30,
        )
    if result.returncode != 0:
        raise ValueError(f"pdftotext failed: {result.stderr.strip()}")
    return result.stdout
|
|
|
|
|
|
def detect_type(text: str) -> str:
    """Classify extracted statement text.

    Returns one of 'CREDIT_CARD', 'VOL', 'ROP_FCL', or 'UNKNOWN'. The checks
    run in that order and the first one that matches wins, so text containing
    a credit-card marker is reported as 'CREDIT_CARD' even if it also contains
    pension keywords. All comparisons are case-sensitive substring tests.
    """
    credit_card_markers = ("MARCA DE TARJETA", "ESTADO DE CUENTA", "PAGO MÍNIMO")
    for marker in credit_card_markers:
        if marker in text:
            return "CREDIT_CARD"

    if "FONDO C VOLUNTARIO" in text:
        return "VOL"

    # Either the regime title, or both fund acronyms appearing somewhere.
    names_both_funds = "ROP" in text and "FCL" in text
    if "RÉGIMEN OBLIGATORIO" in text or names_both_funds:
        return "ROP_FCL"

    return "UNKNOWN"
|
|
|
|
|
|
def _parse_amount(s: str) -> float:
|
|
"""Parse '17,819,176.79' or '-12,693.13' into float."""
|
|
cleaned = s.replace(",", "")
|
|
return float(cleaned)
|
|
|
|
|
|
def _find_amounts(line: str) -> list[float]:
    """Return every ¢-prefixed amount found in ``line``, left to right.

    An amount is the ¢ sign, optional whitespace, an optional minus sign,
    digits/commas, and exactly two decimals. Lines without such an amount
    yield an empty list.
    """
    values: list[float] = []
    for hit in re.finditer(r"¢\s*(-?[\d,]+\.\d{2})", line):
        values.append(_parse_amount(hit.group(1)))
    return values
|
|
|
|
|
|
def _parse_period(text: str) -> tuple[date, date]:
|
|
m = re.search(r"DEL\s+(\d{2}/\d{2}/\d{4})\s+AL\s+(\d{2}/\d{2}/\d{4})", text)
|
|
if not m:
|
|
raise ValueError("Could not find period dates (DEL ... AL ...)")
|
|
start = date(int(m.group(1)[6:]), int(m.group(1)[3:5]), int(m.group(1)[:2]))
|
|
end = date(int(m.group(2)[6:]), int(m.group(2)[3:5]), int(m.group(2)[:2]))
|
|
return start, end
|
|
|
|
|
|
def _extract_summary_value(text: str, label: str) -> list[float]:
    """Return the ¢ amounts from the first line that mentions ``label``.

    Lines are scanned top to bottom. The label is matched literally but
    case-insensitively, anywhere in the line. A matching line that carries no
    amounts is skipped and the scan continues. An empty list means no line
    had both the label and at least one amount.
    """
    matcher = re.compile(re.escape(label) + r".*", re.IGNORECASE)
    labelled_rows = (row for row in text.split("\n") if matcher.search(row))
    for row in labelled_rows:
        found = _find_amounts(row)
        if found:
            return found
    return []
|
|
|
|
|
|
_SUMMARY_FIELDS = [
|
|
("Saldo Anterior", "saldo_anterior"),
|
|
("Aportes", "aportes"),
|
|
("Rendimientos", "rendimientos"),
|
|
("Retiros", "retiros"),
|
|
("Traslados", "traslados"),
|
|
("Comisión de Administración", "comision"),
|
|
("Corrección de Imputaciones", "correccion"),
|
|
("Bonificación", "bonificacion"),
|
|
]
|
|
|
|
|
|
def _find_final_balance(text: str, after_label: str = "Bonificación") -> list[float]:
    """Return the amounts on the first amount-bearing line after ``after_label``.

    The statement prints the closing balance on an unlabelled line below the
    last summary row (Bonificación, or Corrección for ROP+FCL). This function
    finds the first line containing ``after_label`` (case-sensitive), then
    returns the ¢ amounts of the next line that has any. Later lines that
    contain the label again are ignored.

    Returns an empty list if the label never appears or no amounts follow it.
    """
    rows = iter(text.split("\n"))

    # Phase 1: consume rows up to and including the first one with the label.
    label_seen = any(after_label in row for row in rows)
    if not label_seen:
        return []

    # Phase 2: the iterator now points just past the label row.
    for row in rows:
        if after_label in row:
            continue
        balance = _find_amounts(row)
        if balance:
            return balance
    return []
|
|
|
|
|
|
def parse_vol(text: str) -> list[FundSnapshot]:
    """Build the single 'VOL' snapshot from a voluntary-fund statement.

    Each summary row contributes its first amount. A row that cannot be found
    is recorded as 0.0, and so is the final balance if it cannot be located.
    The contract number falls back to an empty string.

    Raises:
        ValueError: if the statement period cannot be parsed.
    """
    period_start, period_end = _parse_period(text)

    contract_match = re.search(r"N°\s*Contrato:\s*(\S+)", text)
    contract = "" if contract_match is None else contract_match.group(1)

    # First amount on each labelled row, or 0.0 when the row is missing.
    summary: dict[str, float] = {
        field: next(iter(_extract_summary_value(text, label)), 0.0)
        for label, field in _SUMMARY_FIELDS
    }

    # The balance normally follows Bonificación; fall back to the row after
    # Corrección when nothing is found there.
    closing = _find_final_balance(text, "Bonificación") or _find_final_balance(
        text, "Corrección de Imputaciones"
    )
    saldo_final = closing[0] if closing else 0.0

    snapshot = FundSnapshot(
        fund="VOL",
        contract_number=contract,
        period_start=period_start,
        period_end=period_end,
        saldo_final=saldo_final,
        **summary,
    )
    return [snapshot]
|
|
|
|
|
|
def parse_rop_fcl(text: str) -> list[FundSnapshot]:
    """Build the 'ROP' and 'FCL' snapshots from a combined statement.

    Every summary row is expected to carry two amounts: the first is assigned
    to ROP, the second to FCL. Missing amounts are recorded as 0.0, and the
    same rule applies to the final-balance line. Contract numbers fall back to
    an empty string.

    Returns:
        ``[rop_snapshot, fcl_snapshot]``, always in that order.

    Raises:
        ValueError: if the statement period cannot be parsed.
    """
    period_start, period_end = _parse_period(text)

    rop_match = re.search(r"N°\s*Contrato\s*ROP:\s*(\S+)", text)
    fcl_match = re.search(r"N°\s*Contrato\s*FCL:\s*(\S+)", text)
    contract_rop = "" if rop_match is None else rop_match.group(1)
    contract_fcl = "" if fcl_match is None else fcl_match.group(1)

    rop_data: dict[str, float] = {}
    fcl_data: dict[str, float] = {}
    for label, field in _SUMMARY_FIELDS:
        # Padding with zeros makes positions 0 and 1 always available.
        padded = _extract_summary_value(text, label) + [0.0, 0.0]
        rop_data[field] = padded[0]
        fcl_data[field] = padded[1]

    # ROP+FCL statements have no Bonificación row, so the balance line is the
    # one following Corrección.
    balances = _find_final_balance(text, "Corrección de Imputaciones") + [0.0, 0.0]
    rop_final, fcl_final = balances[0], balances[1]

    rop_snapshot = FundSnapshot(
        fund="ROP",
        contract_number=contract_rop,
        period_start=period_start,
        period_end=period_end,
        saldo_final=rop_final,
        **rop_data,
    )
    fcl_snapshot = FundSnapshot(
        fund="FCL",
        contract_number=contract_fcl,
        period_start=period_start,
        period_end=period_end,
        saldo_final=fcl_final,
        **fcl_data,
    )
    return [rop_snapshot, fcl_snapshot]
|
|
|
|
|
|
def parse_pension_pdf(pdf_bytes: bytes, filename: str = "") -> list[FundSnapshot]:
    """Turn a pension statement PDF into fund snapshots.

    The PDF is converted to text, classified with ``detect_type``, and passed
    to the matching parser: ``parse_vol`` for 'VOL', ``parse_rop_fcl``
    otherwise.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        filename: Only used to make error messages more helpful.

    Raises:
        ValueError: for credit card statements and unrecognized documents.
            Errors raised by text extraction or by the parsers propagate
            unchanged.
    """
    text = extract_text(pdf_bytes)
    doc_type = detect_type(text)

    if doc_type == "VOL":
        return parse_vol(text)

    # Reject anything that is not a pension extract before parsing further.
    if doc_type == "CREDIT_CARD":
        raise ValueError(f"'{filename}' is a credit card statement, not a pension extract")
    if doc_type == "UNKNOWN":
        raise ValueError(f"'{filename}' is not a recognized BAC pension statement")

    return parse_rop_fcl(text)
|