""" Extract structured data from Municipalidad de Belén receipts using pdftotext + regex. """ import re import subprocess import tempfile from dataclasses import dataclass, field def _parse_amount(s: str) -> float: """Parse a Costa Rican formatted number: '1,875.00' → 1875.00""" return float(s.replace(",", "")) def _parse_date(s: str) -> str: """Convert dd/mm/yyyy → YYYY-MM-DD""" d, m, y = s.strip().split("/") return f"{y}-{m.zfill(2)}-{d.zfill(2)}" def _parse_period(s: str) -> str: """Convert mm/yyyy → YYYY-MM""" m, y = s.strip().split("/") return f"{y}-{m.zfill(2)}" @dataclass class Charge: detail: str interests: float iva: float amount: float @dataclass class WaterMeter: period: str meter_id: str reading_previous: int reading_current: int consumption_m3: int agua_potable: float serv_ambientales: float alcant_sanitario: float iva: float @dataclass class HistoricalConsumption: meter_id: str period: str consumption_m3: int @dataclass class MunicipalReceiptData: receipt_date: str # YYYY-MM-DD due_date: str # YYYY-MM-DD holder_name: str holder_cedula: str holder_address: str account: str finca: str charges: list[Charge] = field(default_factory=list) subtotal: float = 0.0 interests: float = 0.0 iva: float = 0.0 total: float = 0.0 water_meters: list[WaterMeter] = field(default_factory=list) historical_consumption: list[HistoricalConsumption] = field(default_factory=list) def _pdf_to_text(pdf_bytes: bytes) -> str: """Convert PDF bytes to text using pdftotext -layout.""" with tempfile.NamedTemporaryFile(suffix=".pdf") as tmp: tmp.write(pdf_bytes) tmp.flush() result = subprocess.run( ["pdftotext", "-layout", tmp.name, "-"], capture_output=True, text=True, timeout=10, ) if result.returncode != 0: raise ValueError(f"pdftotext failed: {result.stderr}") return result.stdout # Regex patterns RE_FECHA = re.compile(r"Fecha:\s*(\d{2}/\d{2}/\d{4})") RE_VENCIMIENTO = re.compile(r"Fecha de vencimiento:\s*(\d{2}/\d{2}/\d{4})") RE_NOMBRE = re.compile(r"Nombre:\s*(.+)") RE_CEDULA = re.compile(r"Cédula:\s*(\d+)") RE_DIRECCION = re.compile(r"Dirección:\s*(.+)") # Charge line: DETAIL_TEXT account finca interests iva periodo_actual periodo_anterior RE_CHARGE = re.compile( r"^([A-ZÁÉÍÓÚÑ][A-ZÁÉÍÓÚÑ\s.]+?)\s+" r"(\d{4})\s+" r"(\d{6}---\d{3})\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})\s*$" ) RE_SUBTOTAL = re.compile(r"Sub-Total:\s+([\d,]+\.\d{2})") RE_INTERESES = re.compile(r"Intereses:\s+([\d,]+\.\d{2})") RE_IVA = re.compile(r"IVA\s+([\d,]+\.\d{2})") RE_TOTAL = re.compile(r"Total:\s+([\d,]+\.\d{2})") # Water meter line: period meter_id lec_ant lec_act consumo agua_potable serv_amb alcant iva RE_WATER_METER = re.compile( r"(\d{2}/\d{4})\s+" r"(\d{4})\s+" r"(\d{5})\s+" r"(\d{5})\s+" r"(\d+)\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})\s+" r"([\d,]+\.\d{2})" ) # Historical consumption: meter_id period consumption RE_HISTORICAL = re.compile( r"(\d{4})\s+(\d{2}/\d{4})\s+(\d{5})" ) def extract_municipal_receipt( pdf_bytes: bytes, filename: str ) -> dict: """Extract structured data from a municipal receipt PDF. Returns a dict matching the target JSON schema. """ text = _pdf_to_text(pdf_bytes) if "RECIBO MUNICIPAL" not in text: raise ValueError(f"{filename}: Not a municipal receipt") data = MunicipalReceiptData( receipt_date="", due_date="", holder_name="", holder_cedula="", holder_address="", account="", finca="", ) # --- Header fields --- m = RE_FECHA.search(text) if m: data.receipt_date = _parse_date(m.group(1)) m = RE_VENCIMIENTO.search(text) if m: data.due_date = _parse_date(m.group(1)) m = RE_NOMBRE.search(text) if m: data.holder_name = m.group(1).strip() m = RE_CEDULA.search(text) if m: data.holder_cedula = m.group(1).strip() m = RE_DIRECCION.search(text) if m: data.holder_address = m.group(1).strip().rstrip(".") # --- Charges --- for line in text.splitlines(): m = RE_CHARGE.match(line.strip()) if m: detail = m.group(1).strip() data.account = m.group(2) data.finca = m.group(3) interests = _parse_amount(m.group(4)) iva = _parse_amount(m.group(5)) amount = _parse_amount(m.group(6)) data.charges.append(Charge(detail=detail, interests=interests, iva=iva, amount=amount)) # --- Totals --- m = RE_SUBTOTAL.search(text) if m: data.subtotal = _parse_amount(m.group(1)) m = RE_INTERESES.search(text) if m: data.interests = _parse_amount(m.group(1)) m = RE_IVA.search(text) if m: data.iva = _parse_amount(m.group(1)) m = RE_TOTAL.search(text) if m: data.total = _parse_amount(m.group(1)) # --- Water meters --- for m in RE_WATER_METER.finditer(text): data.water_meters.append( WaterMeter( period=_parse_period(m.group(1)), meter_id=m.group(2), reading_previous=int(m.group(3)), reading_current=int(m.group(4)), consumption_m3=int(m.group(5)), agua_potable=_parse_amount(m.group(6)), serv_ambientales=_parse_amount(m.group(7)), alcant_sanitario=_parse_amount(m.group(8)), iva=_parse_amount(m.group(9)), ) ) # --- Historical consumption --- # Only parse lines AFTER "DETALLE DE CONSUMO MESES ANTERIORES" hist_section = text.split("DETALLE DE CONSUMO MESES ANTERIORES") if len(hist_section) > 1: for m in RE_HISTORICAL.finditer(hist_section[1]): data.historical_consumption.append( HistoricalConsumption( meter_id=m.group(1), period=_parse_period(m.group(2)), consumption_m3=int(m.group(3)), ) ) # --- Validation --- if not data.receipt_date: raise ValueError(f"{filename}: Could not parse receipt date") if not data.charges: raise ValueError(f"{filename}: No charges found") # --- Build output dict --- return { "receipt": { "type": "RECIBO MUNICIPAL", "issuer": { "name": "MUNICIPALIDAD DE BELÉN", "phone": "(506) 2587-0000", "fax": "(506) 2293-3667", "website": "www.belen.go.cr", }, "date": data.receipt_date, "due_date": data.due_date, "account_holder": { "name": data.holder_name, "cedula": data.holder_cedula, "address": data.holder_address, }, "account": data.account, "finca": data.finca, }, "charges": [ {"detail": c.detail, "interests": c.interests, "iva": c.iva, "amount": c.amount} for c in data.charges ], "totals": { "subtotal": data.subtotal, "interests": data.interests, "iva": data.iva, "total": data.total, }, "water_meters": [ { "period": wm.period, "meter_id": wm.meter_id, "reading_previous": wm.reading_previous, "reading_current": wm.reading_current, "consumption_m3": wm.consumption_m3, "agua_potable": wm.agua_potable, "serv_ambientales": wm.serv_ambientales, "alcant_sanitario": wm.alcant_sanitario, "iva": wm.iva, } for wm in data.water_meters ], "historical_consumption": [ { "meter_id": hc.meter_id, "period": hc.period, "consumption_m3": hc.consumption_m3, } for hc in data.historical_consumption ], }