mirror of
https://github.com/escalante29/WealthySmart.git
synced 2026-05-19 08:28:48 +02:00
All checks were successful
Deploy to VPS / deploy (push) Successful in 58s
- New module: Municipalidad de Belén receipt extraction via pdftotext+regex - Backend: MunicipalReceipt + WaterMeterReading models, upload/list/detail/water-consumption endpoints - Auto-creates budget Transaction on upload (duplicate-safe via reference) - Frontend: ServiciosMunicipales page with summary cards, water consumption bar chart, receipt history, PDF upload - Convert top navbar to left sidebar with section headers (General, Finanzas, Servicios) - Desktop: fixed 220px sidebar, mobile: sheet overlay - Grouped nav: Dashboard | Presupuesto, Salarios, Pensiones, Analytics | Municipalidad Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
292 lines
8.2 KiB
Python
292 lines
8.2 KiB
Python
"""
|
|
Extract structured data from Municipalidad de Belén receipts using pdftotext + regex.
|
|
"""
|
|
|
|
import re
|
|
import subprocess
|
|
import tempfile
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
def _parse_amount(s: str) -> float:
|
|
"""Parse a Costa Rican formatted number: '1,875.00' → 1875.00"""
|
|
return float(s.replace(",", ""))
|
|
|
|
|
|
def _parse_date(s: str) -> str:
|
|
"""Convert dd/mm/yyyy → YYYY-MM-DD"""
|
|
d, m, y = s.strip().split("/")
|
|
return f"{y}-{m.zfill(2)}-{d.zfill(2)}"
|
|
|
|
|
|
def _parse_period(s: str) -> str:
|
|
"""Convert mm/yyyy → YYYY-MM"""
|
|
m, y = s.strip().split("/")
|
|
return f"{y}-{m.zfill(2)}"
|
|
|
|
|
|
@dataclass
class Charge:
    """One row of the receipt's charges table, as matched by ``RE_CHARGE``."""

    # Free-text description of the charge (first column of the row).
    detail: str
    # Interest amount listed on the row.
    interests: float
    # IVA amount listed on the row.
    iva: float
    # Amount from the row's "periodo_actual" column.
    amount: float
|
|
|
|
|
|
@dataclass
class WaterMeter:
    """One current-period water meter row, as matched by ``RE_WATER_METER``."""

    # Billing period, normalized to YYYY-MM.
    period: str
    # Meter identifier, kept as the digit string found on the receipt.
    meter_id: str
    # Previous and current meter readings.
    reading_previous: int
    reading_current: int
    # Consumption figure printed on the row.
    consumption_m3: int
    # Amounts from the row's remaining columns, in receipt order.
    agua_potable: float
    serv_ambientales: float
    alcant_sanitario: float
    iva: float
|
|
|
|
|
|
@dataclass
class HistoricalConsumption:
    """One row of the previous-months consumption table (``RE_HISTORICAL``)."""

    # Meter identifier, kept as the digit string found on the receipt.
    meter_id: str
    # Period the consumption belongs to, normalized to YYYY-MM.
    period: str
    # Consumption figure for that period.
    consumption_m3: int
|
|
|
|
|
|
@dataclass
class MunicipalReceiptData:
    """Intermediate container filled in while parsing one receipt."""

    # --- Header ---
    receipt_date: str  # YYYY-MM-DD
    due_date: str  # YYYY-MM-DD
    holder_name: str
    holder_cedula: str
    holder_address: str
    # Account and finca are read from the charge rows; the parser overwrites
    # them on every matching row, so the last row wins.
    account: str
    finca: str

    # --- Charges table ---
    charges: list[Charge] = field(default_factory=list)

    # --- Totals (stay at 0.0 when the label is not found) ---
    subtotal: float = 0.0
    interests: float = 0.0
    iva: float = 0.0
    total: float = 0.0

    # --- Water consumption ---
    water_meters: list[WaterMeter] = field(default_factory=list)
    historical_consumption: list[HistoricalConsumption] = field(default_factory=list)
|
|
|
|
|
|
def _pdf_to_text(pdf_bytes: bytes) -> str:
    """Convert PDF bytes to text using ``pdftotext -layout``.

    The bytes are written to a named temporary file whose path is handed to
    pdftotext; the extracted text is read from the tool's stdout (the
    trailing ``-`` argument).

    Args:
        pdf_bytes: Raw contents of the PDF document.

    Returns:
        The text pdftotext wrote to stdout, decoded as UTF-8.

    Raises:
        ValueError: If pdftotext exits with a non-zero status.
        subprocess.TimeoutExpired: If pdftotext runs longer than 10 seconds.
        FileNotFoundError: If the ``pdftotext`` executable cannot be found.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        # Flush so the child process sees the complete document on disk.
        tmp.flush()
        result = subprocess.run(
            # Pin the output encoding on both sides instead of relying on the
            # tool's default and the process locale: the patterns applied to
            # this text contain accented literals ("Cédula:", "Dirección:").
            ["pdftotext", "-layout", "-enc", "UTF-8", tmp.name, "-"],
            capture_output=True,
            encoding="utf-8",
            timeout=10,
        )
    if result.returncode != 0:
        raise ValueError(f"pdftotext failed: {result.stderr}")
    return result.stdout
|
|
|
|
|
|
# Regex patterns
#
# Header fields: each pattern captures the field's value in group 1.
# Dates are captured in dd/mm/yyyy form.
RE_FECHA = re.compile(r"Fecha:\s*(\d{2}/\d{2}/\d{4})")  # receipt date
RE_VENCIMIENTO = re.compile(r"Fecha de vencimiento:\s*(\d{2}/\d{2}/\d{4})")  # due date
RE_NOMBRE = re.compile(r"Nombre:\s*(.+)")  # rest of the line after the label
RE_CEDULA = re.compile(r"Cédula:\s*(\d+)")  # digits only
RE_DIRECCION = re.compile(r"Dirección:\s*(.+)")  # rest of the line after the label
|
|
|
|
# Charge line: DETAIL_TEXT account finca interests iva periodo_actual periodo_anterior
# Anchored at both ends, so it is meant to be matched against a single
# stripped line rather than searched for in the whole text.
RE_CHARGE = re.compile(
    r"^([A-ZÁÉÍÓÚÑ][A-ZÁÉÍÓÚÑ\s.]+?)\s+"  # 1: detail (uppercase words, dots)
    r"(\d{4})\s+"  # 2: account
    r"(\d{6}---\d{3})\s+"  # 3: finca
    r"([\d,]+\.\d{2})\s+"  # 4: interests
    r"([\d,]+\.\d{2})\s+"  # 5: iva
    r"([\d,]+\.\d{2})\s+"  # 6: periodo_actual
    r"([\d,]+\.\d{2})\s*$"  # 7: periodo_anterior
)
|
|
|
|
# Totals: each pattern captures a comma-grouped amount in group 1.
RE_SUBTOTAL = re.compile(r"Sub-Total:\s+([\d,]+\.\d{2})")
RE_INTERESES = re.compile(r"Intereses:\s+([\d,]+\.\d{2})")
RE_IVA = re.compile(r"IVA\s+([\d,]+\.\d{2})")
# The lookbehind stops this from matching the "Total:" inside "Sub-Total:".
# Without it, search() returns the sub-total as the grand total whenever the
# sub-total line appears first in the text.
RE_TOTAL = re.compile(r"(?<![\w-])Total:\s+([\d,]+\.\d{2})")
|
|
|
|
# Water meter line: period meter_id lec_ant lec_act consumo agua_potable serv_amb alcant iva
# Unanchored: intended for finditer() over the full text.
RE_WATER_METER = re.compile(
    r"(\d{2}/\d{4})\s+"  # 1: period, mm/yyyy
    r"(\d{4})\s+"  # 2: meter id
    r"(\d{5})\s+"  # 3: previous reading
    r"(\d{5})\s+"  # 4: current reading
    r"(\d+)\s+"  # 5: consumption
    r"([\d,]+\.\d{2})\s+"  # 6: agua potable
    r"([\d,]+\.\d{2})\s+"  # 7: servicios ambientales
    r"([\d,]+\.\d{2})\s+"  # 8: alcantarillado sanitario
    r"([\d,]+\.\d{2})"  # 9: iva
)
|
|
|
|
# Historical consumption: meter_id period consumption
# Groups: 1 = meter id (4 digits), 2 = period mm/yyyy, 3 = consumption (5 digits).
RE_HISTORICAL = re.compile(
    r"(\d{4})\s+(\d{2}/\d{4})\s+(\d{5})"
)
|
|
|
|
|
|
def extract_municipal_receipt(
    pdf_bytes: bytes, filename: str
) -> dict:
    """Parse a municipal receipt PDF into a plain dict.

    The PDF is converted to text with ``_pdf_to_text``; the module-level
    ``RE_*`` patterns then pull out the header fields, charge rows, totals,
    current water-meter rows and the previous-months consumption table.

    Args:
        pdf_bytes: Raw bytes of the receipt PDF.
        filename: Used only as a prefix in error messages.

    Returns:
        A dict with the keys ``receipt``, ``charges``, ``totals``,
        ``water_meters`` and ``historical_consumption``.

    Raises:
        ValueError: If the text does not contain "RECIBO MUNICIPAL", the
            receipt date cannot be found, or no charge row matches. Errors
            raised by ``_pdf_to_text`` propagate unchanged.
    """
    text = _pdf_to_text(pdf_bytes)
    if "RECIBO MUNICIPAL" not in text:
        raise ValueError(f"{filename}: Not a municipal receipt")

    def first_capture(pattern):
        """Return group 1 of the pattern's first match in the text, or None."""
        found = pattern.search(text)
        return None if found is None else found.group(1)

    receipt = MunicipalReceiptData(
        receipt_date="",
        due_date="",
        holder_name="",
        holder_cedula="",
        holder_address="",
        account="",
        finca="",
    )

    # --- Header fields (left as "" when the label is not found) ---
    if (raw := first_capture(RE_FECHA)) is not None:
        receipt.receipt_date = _parse_date(raw)
    if (raw := first_capture(RE_VENCIMIENTO)) is not None:
        receipt.due_date = _parse_date(raw)
    if (raw := first_capture(RE_NOMBRE)) is not None:
        receipt.holder_name = raw.strip()
    if (raw := first_capture(RE_CEDULA)) is not None:
        receipt.holder_cedula = raw.strip()
    if (raw := first_capture(RE_DIRECCION)) is not None:
        # Drop surrounding whitespace, then any trailing periods.
        receipt.holder_address = raw.strip().rstrip(".")

    # --- Charges: one candidate row per line of text ---
    for raw_line in text.splitlines():
        row = RE_CHARGE.match(raw_line.strip())
        if row is None:
            continue
        detail, account, finca, interests, iva, amount, _previous = row.groups()
        # Account and finca are overwritten on every row; the last row wins.
        receipt.account = account
        receipt.finca = finca
        receipt.charges.append(
            Charge(
                detail=detail.strip(),
                interests=_parse_amount(interests),
                iva=_parse_amount(iva),
                amount=_parse_amount(amount),
            )
        )

    # --- Totals (each stays at 0.0 when its label is not found) ---
    for attr, pattern in (
        ("subtotal", RE_SUBTOTAL),
        ("interests", RE_INTERESES),
        ("iva", RE_IVA),
        ("total", RE_TOTAL),
    ):
        if (raw := first_capture(pattern)) is not None:
            setattr(receipt, attr, _parse_amount(raw))

    # --- Water meters: every match anywhere in the text ---
    for hit in RE_WATER_METER.finditer(text):
        (
            period,
            meter_id,
            previous,
            current,
            consumed,
            potable,
            ambientales,
            alcantarillado,
            meter_iva,
        ) = hit.groups()
        receipt.water_meters.append(
            WaterMeter(
                period=_parse_period(period),
                meter_id=meter_id,
                reading_previous=int(previous),
                reading_current=int(current),
                consumption_m3=int(consumed),
                agua_potable=_parse_amount(potable),
                serv_ambientales=_parse_amount(ambientales),
                alcant_sanitario=_parse_amount(alcantarillado),
                iva=_parse_amount(meter_iva),
            )
        )

    # --- Historical consumption ---
    # Only the text following the section heading is scanned; pieces[1] is
    # what lies between the first heading and a second one, if any.
    pieces = text.split("DETALLE DE CONSUMO MESES ANTERIORES")
    if len(pieces) >= 2:
        receipt.historical_consumption.extend(
            HistoricalConsumption(
                meter_id=meter_id,
                period=_parse_period(period),
                consumption_m3=int(consumed),
            )
            for meter_id, period, consumed in RE_HISTORICAL.findall(pieces[1])
        )

    # --- Validation ---
    if receipt.receipt_date == "":
        raise ValueError(f"{filename}: Could not parse receipt date")
    if len(receipt.charges) == 0:
        raise ValueError(f"{filename}: No charges found")

    # --- Build output dict ---
    issuer = {
        "name": "MUNICIPALIDAD DE BELÉN",
        "phone": "(506) 2587-0000",
        "fax": "(506) 2293-3667",
        "website": "www.belen.go.cr",
    }
    account_holder = {
        "name": receipt.holder_name,
        "cedula": receipt.holder_cedula,
        "address": receipt.holder_address,
    }
    charges_out = []
    for charge in receipt.charges:
        charges_out.append(
            {
                "detail": charge.detail,
                "interests": charge.interests,
                "iva": charge.iva,
                "amount": charge.amount,
            }
        )
    meters_out = []
    for meter in receipt.water_meters:
        meters_out.append(
            {
                "period": meter.period,
                "meter_id": meter.meter_id,
                "reading_previous": meter.reading_previous,
                "reading_current": meter.reading_current,
                "consumption_m3": meter.consumption_m3,
                "agua_potable": meter.agua_potable,
                "serv_ambientales": meter.serv_ambientales,
                "alcant_sanitario": meter.alcant_sanitario,
                "iva": meter.iva,
            }
        )
    history_out = []
    for entry in receipt.historical_consumption:
        history_out.append(
            {
                "meter_id": entry.meter_id,
                "period": entry.period,
                "consumption_m3": entry.consumption_m3,
            }
        )

    return {
        "receipt": {
            "type": "RECIBO MUNICIPAL",
            "issuer": issuer,
            "date": receipt.receipt_date,
            "due_date": receipt.due_date,
            "account_holder": account_holder,
            "account": receipt.account,
            "finca": receipt.finca,
        },
        "charges": charges_out,
        "totals": {
            "subtotal": receipt.subtotal,
            "interests": receipt.interests,
            "iva": receipt.iva,
            "total": receipt.total,
        },
        "water_meters": meters_out,
        "historical_consumption": history_out,
    }
|