Files
AI-upflux-docprocessor/code/app.py
2026-02-04 13:29:15 -03:00

205 lines
5.8 KiB
Python

from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import boto3
import json
import time
import io
from pathlib import Path
from urllib.parse import urlparse
from PyPDF2 import PdfReader
from utils.langgraph_agent import RULES, run_agent
# FastAPI application instance; the endpoint decorators in this module register routes on it.
app = FastAPI()
# AWS region passed as region_name to the boto3 S3 and Textract clients created in this module.
AWS_REGION = "us-east-2"
# --- S3 / Textract helpers ---
def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into its bucket and object key.

    The URI is split by hand rather than with a URL parser because S3 object
    keys may contain ``?`` and ``#``. A URL parser treats those characters as
    the start of a query string or fragment and would silently truncate the
    key.

    Args:
        s3_uri: URI of the form ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket, key)`` tuple. Leading slashes are stripped from the key.

    Raises:
        ValueError: If the scheme is not ``s3``, or the bucket or key is empty.
    """
    scheme, separator, remainder = s3_uri.partition("://")
    # The scheme is compared case-insensitively, matching the previous
    # URL-parser based behaviour (which lower-cased the scheme).
    if not separator or scheme.lower() != "s3":
        raise ValueError(f"Not an S3 URI: {s3_uri}")

    # Everything up to the first "/" is the bucket; the rest, verbatim, is the key.
    bucket, _, raw_key = remainder.partition("/")
    key = raw_key.lstrip("/")
    if not bucket or not key:
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    return bucket, key
def get_s3_client():
    """Create and return a new boto3 S3 client for ``AWS_REGION``."""
    service_name = "s3"
    return boto3.client(service_name, region_name=AWS_REGION)
def get_textract_client():
    """Create and return a new boto3 Textract client for ``AWS_REGION``."""
    service_name = "textract"
    return boto3.client(service_name, region_name=AWS_REGION)
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """Return the number of pages in a PDF given as raw bytes.

    Counting is best-effort: if the bytes cannot be read as a PDF for any
    reason, the document is treated as a single page and ``1`` is returned.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_total = len(reader.pages)
    except Exception:
        # Deliberate fallback: an unreadable PDF counts as one page.
        return 1
    return page_total
def extract_text_from_textract_response(response: dict) -> str:
    """Join the text of every ``LINE`` block in a Textract response.

    Args:
        response: A Textract response dict; may be empty or ``None``.

    Returns:
        The ``Text`` of each block whose ``BlockType`` is ``"LINE"``, in
        response order, separated by newlines. An empty string is returned
        for a falsy response or when there are no ``LINE`` blocks.
    """
    if not response:
        return ""

    line_texts = []
    for block in response.get("Blocks", []):
        # Other block types (e.g. pages, words) are skipped.
        if block["BlockType"] == "LINE":
            line_texts.append(block["Text"])
    return "\n".join(line_texts)
def extract_text_from_s3_document(
    bucket: str,
    key: str,
    *,
    poll_interval: float = 2.0,
    max_wait: float = 900.0,
) -> str:
    """Extract the text of an image or PDF stored in S3 using Textract.

    Images (``.png``, ``.jpg``, ``.jpeg``) and single-page PDFs go through the
    synchronous ``detect_document_text`` call. PDFs with more than one page
    are submitted as an asynchronous text-detection job that is polled until
    it finishes. Any other file extension yields an empty string.

    Args:
        bucket: Name of the S3 bucket holding the document.
        key: Object key of the document; its extension selects the strategy.
        poll_interval: Seconds to sleep between status checks of an
            asynchronous job.
        max_wait: Upper bound, in seconds, on how long an asynchronous job is
            polled before giving up.

    Returns:
        The detected text lines joined by newlines, or ``""`` when the file
        type is unsupported or the asynchronous job reports ``FAILED``.

    Raises:
        TimeoutError: If an asynchronous job does not reach a terminal status
            within ``max_wait`` seconds.
    """
    textract = get_textract_client()
    document = {"S3Object": {"Bucket": bucket, "Name": key}}
    file_ext = Path(key).suffix.lower()

    if file_ext in (".png", ".jpg", ".jpeg"):
        return extract_text_from_textract_response(
            textract.detect_document_text(Document=document)
        )

    if file_ext != ".pdf":
        return ""

    # The PDF is downloaded only to count its pages; Textract itself reads the
    # document straight from S3.
    pdf_bytes = get_s3_client().get_object(Bucket=bucket, Key=key)["Body"].read()
    if get_pdf_page_count(pdf_bytes) <= 1:
        return extract_text_from_textract_response(
            textract.detect_document_text(Document=document)
        )

    job = textract.start_document_text_detection(DocumentLocation=document)
    return _collect_textract_job_text(textract, job["JobId"], poll_interval, max_wait)


def _collect_textract_job_text(textract, job_id: str, poll_interval: float, max_wait: float) -> str:
    """Poll an asynchronous Textract job and return the text of all result pages."""
    deadline = time.monotonic() + max_wait
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result["JobStatus"]
        # PARTIAL_SUCCESS is terminal too: return whatever text was detected
        # instead of polling forever.
        if status in ("SUCCEEDED", "PARTIAL_SUCCESS"):
            break
        if status == "FAILED":
            return ""
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Textract job {job_id} did not finish within {max_wait} seconds"
            )
        time.sleep(poll_interval)

    # Results are paginated: keep following NextToken so text beyond the first
    # batch of blocks is not dropped.
    page_texts = [extract_text_from_textract_response(result)]
    next_token = result.get("NextToken")
    while next_token:
        result = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
        page_texts.append(extract_text_from_textract_response(result))
        next_token = result.get("NextToken")
    return "\n".join(text for text in page_texts if text)
# --- Guia processing ---
def process_guia(guia: dict) -> dict:
    """Enrich one guia with attachment text and a per-service agent evaluation.

    The guia is modified in place:

    * every entry of ``guia["anexos"]`` receives a ``"textoExtraido"`` string
      (empty when the attachment has no ``s3://`` URL or extraction fails);
    * ``guia["avaliacaoAgente"]`` is set to a list with one result dict per
      entry of ``guia["servicos"]``.

    Missing or null ``anexos`` / ``servicos`` are treated as empty lists.

    Args:
        guia: The guia payload. Keys read here are ``anexos``, ``servicos``,
            ``atendimento``, ``guia`` and ``historico``.

    Returns:
        The same ``guia`` dict that was passed in, after enrichment.
    """
    # Step 1: Extract text from all anexos
    file_content = _extract_anexos_text(guia.get("anexos") or [])

    # Step 2: For each servico, run the agent
    guia["avaliacaoAgente"] = [
        _evaluate_servico(guia, servico, file_content)
        for servico in guia.get("servicos") or []
    ]
    return guia


def _extract_anexos_text(anexos: list) -> str:
    """Store extracted text on each anexo and return all S3 texts joined together."""
    sections = []
    for anexo_idx, anexo in enumerate(anexos):
        s3_uri = anexo.get("urlAnexo") or anexo.get("URLAnexo", "")
        nome_arquivo = anexo.get("nomeArquivo", f"attachment_{anexo_idx}")

        # Only s3:// URLs can be handed to Textract. Missing, null or
        # non-string values are recorded as "no text" instead of crashing.
        if not isinstance(s3_uri, str) or not s3_uri.startswith("s3://"):
            anexo["textoExtraido"] = ""
            continue

        try:
            bucket, key = parse_s3_uri(s3_uri)
            extracted_text = extract_text_from_s3_document(bucket, key)
        except Exception as e:
            # Best-effort: one unreadable attachment must not abort the guia.
            print(f" Error extracting text from {nome_arquivo}: {e}")
            extracted_text = ""

        anexo["textoExtraido"] = extracted_text
        sections.append(f"--- {nome_arquivo} ---\n{extracted_text}")
    return "\n\n".join(sections)


def _evaluate_servico(guia: dict, servico: dict, file_content: str) -> dict:
    """Run the agent for one servico and return its evaluation record."""
    codigo_servico_raw = str(servico.get("codigoServico", ""))
    # Rules are looked up by the digits of the service code only.
    code = "".join(c for c in codigo_servico_raw if c.isdigit())

    if code not in RULES:
        return {
            "codigoServico": codigo_servico_raw,
            "resultado": "SKIPPED",
            "motivo": f"Codigo '{code}' nao encontrado nas regras",
            "agentOutput": "",
        }

    query_data = {
        "atendimento": guia.get("atendimento", {}),
        "guia": guia.get("guia", {}),
        "servico": servico,
        "historico": guia.get("historico", {}),
    }
    query = json.dumps(query_data, indent=2, ensure_ascii=False)

    try:
        agent_output = run_agent(query, code, file_content)
    except Exception as e:
        print(f" Agent error for servico {codigo_servico_raw}: {e}")
        # A failed run is never reported as approved, whatever words the
        # exception message happens to contain.
        return {
            "codigoServico": codigo_servico_raw,
            "resultado": "Reprovado",
            "agentOutput": f"ERROR: {str(e)}",
        }

    # NOTE(review): a plain substring match also fires on negated phrases such
    # as "nao aprovado" -- confirm the agent's output format rules these out.
    normalized = "".join(
        c for c in agent_output.lower() if c.isalnum() or c == " "
    )
    return {
        "codigoServico": codigo_servico_raw,
        "resultado": "Aprovado" if "aprov" in normalized else "Reprovado",
        "agentOutput": agent_output,
    }
# --- API models ---
class ProcessRequest(BaseModel):
    """Request body accepted by the ``/process`` endpoint."""

    # Returned unchanged in the endpoint's response.
    operadora: dict
    # Each entry is processed independently by process_guia.
    guias: list[dict]
# --- Endpoints ---
@app.post("/process")
async def process(request: ProcessRequest):
    """Process every guia in the request and return the enriched results.

    Guias are handled one after another. A guia that raises is replaced in
    the output by ``{"error": ..., "guia": <codigoGuiaLocal or index_N>}`` so
    that one bad guia does not fail the whole request.

    Args:
        request: Parsed request body with ``operadora`` and ``guias``.

    Returns:
        A dict with ``status`` (always ``"success"``), the echoed
        ``operadora`` and the list of per-guia results under ``guias``.
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    results = []
    for idx, guia in enumerate(request.guias):
        try:
            # process_guia performs blocking network calls and sleep-based
            # polling; running it in a worker thread keeps the event loop
            # free to serve other requests in the meantime.
            enriched = await asyncio.to_thread(process_guia, guia)
            results.append(enriched)
        except Exception as e:
            fallback = f"index_{idx}"
            guia_info = guia.get("guia")
            # The handler itself must not raise when "guia" is null or not a dict.
            if isinstance(guia_info, dict):
                guia_label = guia_info.get("codigoGuiaLocal", fallback)
            else:
                guia_label = fallback
            results.append({
                "error": str(e),
                "guia": guia_label,
            })
    return {
        "status": "success",
        "operadora": request.operadora,
        "guias": results,
    }
@app.get("/health")
async def health():
    """Health-check endpoint: always reports the service as healthy."""
    payload = {"status": "healthy"}
    return payload
@app.get("/rules")
async def get_rules():
    """List the service codes that have an entry in ``RULES``."""
    rule_codes = [*RULES.keys()]
    return {"codes": rule_codes}
if __name__ == "__main__":
    # Run the API with uvicorn when this file is executed as a script:
    # listens on every interface, port 8000.
    uvicorn.run(app, port=8000, host="0.0.0.0")