Files
WealthySmart/backend/app/services/municipal_receipt_pdf.py
Carlos Escalante 739a32efd4
All checks were successful
Deploy to VPS / deploy (push) Successful in 58s
Add municipal receipt module and convert navbar to sidebar
- New module: Municipalidad de Belén receipt extraction via pdftotext+regex
  - Backend: MunicipalReceipt + WaterMeterReading models, upload/list/detail/water-consumption endpoints
  - Auto-creates budget Transaction on upload (duplicate-safe via reference)
  - Frontend: ServiciosMunicipales page with summary cards, water consumption bar chart, receipt history, PDF upload
- Convert top navbar to left sidebar with section headers (General, Finanzas, Servicios)
  - Desktop: fixed 220px sidebar, mobile: sheet overlay
  - Grouped nav: Dashboard | Presupuesto, Salarios, Pensiones, Analytics | Municipalidad

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 16:11:51 -06:00

292 lines
8.2 KiB
Python

"""
Extract structured data from Municipalidad de Belén receipts using pdftotext + regex.
"""
import re
import subprocess
import tempfile
from dataclasses import dataclass, field
def _parse_amount(s: str) -> float:
"""Parse a Costa Rican formatted number: '1,875.00' → 1875.00"""
return float(s.replace(",", ""))
def _parse_date(s: str) -> str:
"""Convert dd/mm/yyyy → YYYY-MM-DD"""
d, m, y = s.strip().split("/")
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
def _parse_period(s: str) -> str:
"""Convert mm/yyyy → YYYY-MM"""
m, y = s.strip().split("/")
return f"{y}-{m.zfill(2)}"
@dataclass
class Charge:
    """One charge line taken from the receipt's charge table."""

    # Description text of the charge, as printed on the receipt.
    detail: str
    # Interest amount listed on the charge line.
    interests: float
    # IVA (tax) amount listed on the charge line.
    iva: float
    # Amount billed on the charge line.
    amount: float
@dataclass
class WaterMeter:
    """One water-meter reading row from the receipt."""

    # Billing period, formatted as YYYY-MM.
    period: str
    # Meter identifier, kept as text so leading zeros survive.
    meter_id: str
    # Meter reading at the start of the period.
    reading_previous: int
    # Meter reading at the end of the period.
    reading_current: int
    # Consumption figure printed for the period.
    consumption_m3: int
    # Amounts printed in the row's four money columns.
    agua_potable: float
    serv_ambientales: float
    alcant_sanitario: float
    iva: float
@dataclass
class HistoricalConsumption:
    """One entry of the receipt's previous-months consumption table."""

    # Meter identifier, kept as text so leading zeros survive.
    meter_id: str
    # Period the entry refers to, formatted as YYYY-MM.
    period: str
    # Consumption figure printed for that period.
    consumption_m3: int
@dataclass
class MunicipalReceiptData:
    """Everything extracted from a single municipal receipt.

    The seven header fields are required; totals default to ``0.0`` and the
    list fields default to fresh empty lists.
    """

    # --- Header ---
    receipt_date: str  # YYYY-MM-DD
    due_date: str  # YYYY-MM-DD
    holder_name: str
    holder_cedula: str
    holder_address: str
    account: str
    finca: str

    # --- Charge table and totals ---
    charges: list[Charge] = field(default_factory=list)
    subtotal: float = 0.0
    interests: float = 0.0
    iva: float = 0.0
    total: float = 0.0

    # --- Water consumption ---
    water_meters: list[WaterMeter] = field(default_factory=list)
    historical_consumption: list[HistoricalConsumption] = field(default_factory=list)
def _pdf_to_text(pdf_bytes: bytes) -> str:
    """Convert PDF bytes to text using ``pdftotext -layout``.

    The bytes are written to a temporary ``.pdf`` file which is handed to the
    ``pdftotext`` command-line tool; the extracted text is read from its
    standard output.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        The text emitted by ``pdftotext``.

    Raises:
        ValueError: If ``pdftotext`` exits with a non-zero status.
        subprocess.TimeoutExpired: If the conversion takes longer than 10 s.
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        # Make sure the bytes are on disk before the child process opens the file.
        tmp.flush()
        result = subprocess.run(
            # Pin the output encoding instead of relying on the tool's default.
            ["pdftotext", "-layout", "-enc", "UTF-8", tmp.name, "-"],
            capture_output=True,
            # Decode explicitly as UTF-8: ``text=True`` would use the process
            # locale, which can garble accented labels such as "Cédula".
            encoding="utf-8",
            errors="replace",
            timeout=10,
        )
    if result.returncode != 0:
        raise ValueError(f"pdftotext failed: {result.stderr}")
    return result.stdout
# Regex patterns
# --- Header fields ---
RE_FECHA = re.compile(r"Fecha:\s*(\d{2}/\d{2}/\d{4})")
RE_VENCIMIENTO = re.compile(r"Fecha de vencimiento:\s*(\d{2}/\d{2}/\d{4})")
RE_NOMBRE = re.compile(r"Nombre:\s*(.+)")
RE_CEDULA = re.compile(r"Cédula:\s*(\d+)")
RE_DIRECCION = re.compile(r"Dirección:\s*(.+)")

# Charge line: DETAIL_TEXT account finca interests iva periodo_actual periodo_anterior
RE_CHARGE = re.compile(
    r"^([A-ZÁÉÍÓÚÑ][A-ZÁÉÍÓÚÑ\s.]+?)\s+"
    r"(\d{4})\s+"
    r"(\d{6}---\d{3})\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})\s*$"
)

# --- Totals ---
RE_SUBTOTAL = re.compile(r"Sub-Total:\s+([\d,]+\.\d{2})")
RE_INTERESES = re.compile(r"Intereses:\s+([\d,]+\.\d{2})")
RE_IVA = re.compile(r"IVA\s+([\d,]+\.\d{2})")
# The lookbehind keeps this pattern from matching the "Total:" that is part of
# "Sub-Total:"; without it, a search over text where the sub-total comes first
# would return the sub-total as the grand total.
RE_TOTAL = re.compile(r"(?<![\w-])Total:\s+([\d,]+\.\d{2})")

# Water meter line: period meter_id lec_ant lec_act consumo agua_potable serv_amb alcant iva
RE_WATER_METER = re.compile(
    r"(\d{2}/\d{4})\s+"
    r"(\d{4})\s+"
    r"(\d{5})\s+"
    r"(\d{5})\s+"
    r"(\d+)\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})\s+"
    r"([\d,]+\.\d{2})"
)

# Historical consumption: meter_id period consumption
RE_HISTORICAL = re.compile(
    r"(\d{4})\s+(\d{2}/\d{4})\s+(\d{5})"
)
def extract_municipal_receipt(
    pdf_bytes: bytes, filename: str
) -> dict:
    """Parse a municipal receipt PDF into a nested dict.

    The PDF is converted to text, which is then scanned with the module-level
    regular expressions for header fields, charge lines, totals, water-meter
    rows and the previous-months consumption table.

    Args:
        pdf_bytes: Raw content of the PDF file.
        filename: Used only as a prefix in error messages.

    Returns:
        A dict with the keys ``receipt``, ``charges``, ``totals``,
        ``water_meters`` and ``historical_consumption``.

    Raises:
        ValueError: If the text lacks the "RECIBO MUNICIPAL" marker, the
            receipt date cannot be found, or no charge line matches. Errors
            raised by the PDF-to-text conversion propagate unchanged.
    """
    text = _pdf_to_text(pdf_bytes)
    if "RECIBO MUNICIPAL" not in text:
        raise ValueError(f"{filename}: Not a municipal receipt")

    def first_capture(pattern):
        """Return group 1 of the first match of *pattern* in the text, or None."""
        found = pattern.search(text)
        return None if found is None else found.group(1)

    parsed = MunicipalReceiptData(
        receipt_date="",
        due_date="",
        holder_name="",
        holder_cedula="",
        holder_address="",
        account="",
        finca="",
    )

    # --- Header fields: each one stays "" when its label is not found ---
    issued = first_capture(RE_FECHA)
    if issued is not None:
        parsed.receipt_date = _parse_date(issued)

    due = first_capture(RE_VENCIMIENTO)
    if due is not None:
        parsed.due_date = _parse_date(due)

    name = first_capture(RE_NOMBRE)
    if name is not None:
        parsed.holder_name = name.strip()

    cedula = first_capture(RE_CEDULA)
    if cedula is not None:
        parsed.holder_cedula = cedula.strip()

    address = first_capture(RE_DIRECCION)
    if address is not None:
        # Drop the trailing period(s) from the address.
        parsed.holder_address = address.strip().rstrip(".")

    # --- Charges: matched one stripped line at a time ---
    for raw_line in text.splitlines():
        hit = RE_CHARGE.match(raw_line.strip())
        if hit is None:
            continue
        detail, account, finca, interests, iva, current = hit.group(1, 2, 3, 4, 5, 6)
        # Account and finca are taken from the charge rows; the last row wins.
        parsed.account = account
        parsed.finca = finca
        parsed.charges.append(
            Charge(
                detail=detail.strip(),
                interests=_parse_amount(interests),
                iva=_parse_amount(iva),
                amount=_parse_amount(current),
            )
        )

    # --- Totals: each one stays 0.0 when its label is not found ---
    total_patterns = (
        ("subtotal", RE_SUBTOTAL),
        ("interests", RE_INTERESES),
        ("iva", RE_IVA),
        ("total", RE_TOTAL),
    )
    for attr_name, pattern in total_patterns:
        raw_value = first_capture(pattern)
        if raw_value is not None:
            setattr(parsed, attr_name, _parse_amount(raw_value))

    # --- Water meters: searched across the whole text ---
    parsed.water_meters.extend(
        WaterMeter(
            period=_parse_period(row.group(1)),
            meter_id=row.group(2),
            reading_previous=int(row.group(3)),
            reading_current=int(row.group(4)),
            consumption_m3=int(row.group(5)),
            agua_potable=_parse_amount(row.group(6)),
            serv_ambientales=_parse_amount(row.group(7)),
            alcant_sanitario=_parse_amount(row.group(8)),
            iva=_parse_amount(row.group(9)),
        )
        for row in RE_WATER_METER.finditer(text)
    )

    # --- Historical consumption ---
    # Only the text after the "DETALLE DE CONSUMO MESES ANTERIORES" heading is
    # scanned (up to a second occurrence of the heading, if any).
    segments = text.split("DETALLE DE CONSUMO MESES ANTERIORES")
    if len(segments) >= 2:
        parsed.historical_consumption.extend(
            HistoricalConsumption(
                meter_id=entry.group(1),
                period=_parse_period(entry.group(2)),
                consumption_m3=int(entry.group(3)),
            )
            for entry in RE_HISTORICAL.finditer(segments[1])
        )

    # --- Validation ---
    if not parsed.receipt_date:
        raise ValueError(f"{filename}: Could not parse receipt date")
    if not parsed.charges:
        raise ValueError(f"{filename}: No charges found")

    # --- Build output dict ---
    receipt_section = {
        "type": "RECIBO MUNICIPAL",
        "issuer": {
            "name": "MUNICIPALIDAD DE BELÉN",
            "phone": "(506) 2587-0000",
            "fax": "(506) 2293-3667",
            "website": "www.belen.go.cr",
        },
        "date": parsed.receipt_date,
        "due_date": parsed.due_date,
        "account_holder": {
            "name": parsed.holder_name,
            "cedula": parsed.holder_cedula,
            "address": parsed.holder_address,
        },
        "account": parsed.account,
        "finca": parsed.finca,
    }
    charge_rows = [
        {
            "detail": charge.detail,
            "interests": charge.interests,
            "iva": charge.iva,
            "amount": charge.amount,
        }
        for charge in parsed.charges
    ]
    totals_section = {
        "subtotal": parsed.subtotal,
        "interests": parsed.interests,
        "iva": parsed.iva,
        "total": parsed.total,
    }
    meter_rows = [
        {
            "period": meter.period,
            "meter_id": meter.meter_id,
            "reading_previous": meter.reading_previous,
            "reading_current": meter.reading_current,
            "consumption_m3": meter.consumption_m3,
            "agua_potable": meter.agua_potable,
            "serv_ambientales": meter.serv_ambientales,
            "alcant_sanitario": meter.alcant_sanitario,
            "iva": meter.iva,
        }
        for meter in parsed.water_meters
    ]
    history_rows = [
        {
            "meter_id": past.meter_id,
            "period": past.period,
            "consumption_m3": past.consumption_m3,
        }
        for past in parsed.historical_consumption
    ]
    return {
        "receipt": receipt_section,
        "charges": charge_rows,
        "totals": totals_section,
        "water_meters": meter_rows,
        "historical_consumption": history_rows,
    }