205 lines
5.8 KiB
Python
205 lines
5.8 KiB
Python
from fastapi import FastAPI
|
|
from pydantic import BaseModel
|
|
import uvicorn
|
|
import boto3
|
|
import json
|
|
import time
|
|
import io
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
from PyPDF2 import PdfReader
|
|
|
|
from utils.langgraph_agent import RULES, run_agent
|
|
|
|
# FastAPI application object; the route decorators further down register
# their endpoints on it.
app = FastAPI()

# AWS region passed to every boto3 client created in this module.
AWS_REGION = "us-east-2"
|
|
|
|
|
|
# --- S3 / Textract helpers ---
|
|
|
|
def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into its bucket and object key.

    ``urlparse`` is used only to validate the scheme and to locate the
    bucket. The key is sliced out of the raw string because ``urlparse``
    treats ``?`` and ``#`` as query/fragment delimiters, while both are
    legal characters in S3 object keys and would otherwise be cut off.

    Args:
        s3_uri: URI of the form ``s3://<bucket>/<key>``.

    Returns:
        A ``(bucket, key)`` tuple. Leading slashes are stripped from the key.

    Raises:
        ValueError: If the scheme is not ``s3`` or if the bucket or the key
            is empty.
    """
    parsed = urlparse(s3_uri)
    if parsed.scheme != "s3":
        raise ValueError(f"Not an S3 URI: {s3_uri}")

    bucket = parsed.netloc
    # Everything after "<scheme>://<bucket>" belongs to the key, including
    # any "?" or "#" characters that urlparse would have split away.
    _, _, after_scheme = s3_uri.partition("://")
    remainder = after_scheme[len(bucket):]
    key = remainder.lstrip("/") if remainder.startswith("/") else ""

    if not bucket or not key:
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    return bucket, key
|
|
|
|
|
|
def get_s3_client():
    """Create a new boto3 S3 client bound to ``AWS_REGION``."""
    s3_client = boto3.client("s3", region_name=AWS_REGION)
    return s3_client
|
|
|
|
|
|
def get_textract_client():
    """Create a new boto3 Textract client bound to ``AWS_REGION``."""
    textract_client = boto3.client("textract", region_name=AWS_REGION)
    return textract_client
|
|
|
|
|
|
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """Return the number of pages in an in-memory PDF.

    This is a best-effort count: if the bytes cannot be read as a PDF for
    any reason, 1 is returned instead of raising.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The page count, or 1 when the document could not be parsed.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_total = len(reader.pages)
    except Exception:
        # Deliberate fallback: an unreadable PDF is treated as single-page.
        return 1
    return page_total
|
|
|
|
|
|
def extract_text_from_textract_response(response: dict) -> str:
    """Join the text of every ``LINE`` block in a Textract response.

    Blocks that lack a ``BlockType`` are skipped and a ``LINE`` block
    without ``Text`` contributes an empty line, so a malformed block does
    not abort the extraction of the whole document.

    Args:
        response: Textract response dictionary. May be empty or ``None``.

    Returns:
        The ``LINE`` texts in response order, separated by newlines, or an
        empty string when there is nothing to extract.
    """
    if not response:
        return ""

    # "Blocks" may be absent or explicitly null; treat both as no blocks.
    blocks = response.get("Blocks") or []
    return "\n".join(
        block.get("Text", "")
        for block in blocks
        if block.get("BlockType") == "LINE"
    )
|
|
|
|
|
|
def _collect_textract_job_text(
    textract,
    job_id: str,
    poll_interval: float,
    timeout: float,
) -> str:
    """Wait for an asynchronous Textract job and return all of its text."""
    deadline = time.monotonic() + timeout
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result["JobStatus"]
        # PARTIAL_SUCCESS is terminal too; return whatever was recognised
        # instead of polling forever.
        if status in ("SUCCEEDED", "PARTIAL_SUCCESS"):
            break
        if status == "FAILED":
            print(f" Textract job {job_id} failed: {result.get('StatusMessage', '')}")
            return ""
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Textract job {job_id} did not finish within {timeout} seconds"
            )
        time.sleep(poll_interval)

    # Results are paginated: keep following NextToken so that long
    # documents are not truncated to the first page of blocks.
    blocks = list(result.get("Blocks") or [])
    next_token = result.get("NextToken")
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id, NextToken=next_token
        )
        blocks.extend(page.get("Blocks") or [])
        next_token = page.get("NextToken")

    return extract_text_from_textract_response({"Blocks": blocks})


def extract_text_from_s3_document(
    bucket: str,
    key: str,
    *,
    poll_interval: float = 2.0,
    timeout: float = 600.0,
) -> str:
    """Extract the text of an S3-hosted image or PDF with Textract.

    Images (``.png``, ``.jpg``, ``.jpeg``) and single-page PDFs go through
    the synchronous ``detect_document_text`` call. Multi-page PDFs are sent
    to the asynchronous API and polled until the job reaches a terminal
    status.

    Args:
        bucket: Name of the S3 bucket holding the document.
        key: Object key of the document; its suffix selects the strategy.
        poll_interval: Seconds to sleep between polls of an async job.
        timeout: Maximum seconds to wait for an async job to finish.

    Returns:
        The extracted text. An empty string is returned for unsupported
        file extensions and for async jobs that end in ``FAILED``.

    Raises:
        TimeoutError: If an async job is still running after ``timeout``.
    """
    file_ext = Path(key).suffix.lower()
    # Bail out before creating any AWS client when the type is unsupported.
    if file_ext not in (".png", ".jpg", ".jpeg", ".pdf"):
        return ""

    textract = get_textract_client()
    s3_location = {"S3Object": {"Bucket": bucket, "Name": key}}

    if file_ext == ".pdf":
        # The PDF is downloaded only to count its pages; Textract itself
        # reads the document straight from S3.
        s3 = get_s3_client()
        pdf_bytes = s3.get_object(Bucket=bucket, Key=key)["Body"].read()
        if get_pdf_page_count(pdf_bytes) > 1:
            job = textract.start_document_text_detection(
                DocumentLocation=s3_location
            )
            return _collect_textract_job_text(
                textract, job["JobId"], poll_interval, timeout
            )

    response = textract.detect_document_text(Document=s3_location)
    return extract_text_from_textract_response(response)
|
|
|
|
|
|
# --- Guia processing ---
|
|
|
|
def _classify_agent_output(agent_output: str) -> str:
    """Map the agent's free-text answer to "Aprovado" or "Reprovado"."""
    # Lowercase and drop everything but letters, digits and spaces before
    # searching for the "aprov" stem.
    normalized = "".join(
        ch for ch in agent_output.lower() if ch.isalnum() or ch == " "
    )
    # NOTE(review): a negated answer such as "nao aprovado" still contains
    # "aprov" and is classified as "Aprovado" - confirm the agent's output
    # format rules this out.
    return "Aprovado" if "aprov" in normalized else "Reprovado"


def process_guia(guia: dict) -> dict:
    """Extract attachment text for a guia and evaluate each of its servicos.

    The guia is enriched in place: every anexo receives a
    ``textoExtraido`` entry and the guia receives an ``avaliacaoAgente``
    list with one result per servico.

    Args:
        guia: Guia payload. The keys read here are ``anexos``,
            ``servicos``, ``atendimento``, ``guia`` and ``historico``; a
            missing or null ``anexos``/``servicos`` is treated as empty.

    Returns:
        The same ``guia`` dictionary, after enrichment.
    """
    # Step 1: Extract text from all anexos
    all_extracted_texts = []
    for anexo_idx, anexo in enumerate(guia.get("anexos") or []):
        s3_uri = anexo.get("urlAnexo") or anexo.get("URLAnexo", "")
        nome_arquivo = anexo.get("nomeArquivo", f"attachment_{anexo_idx}")

        # Attachments that do not live in S3 get no text and are left out
        # of the content handed to the agent.
        if not s3_uri or not s3_uri.startswith("s3://"):
            anexo["textoExtraido"] = ""
            continue

        try:
            bucket, key = parse_s3_uri(s3_uri)
            extracted_text = extract_text_from_s3_document(bucket, key)
        except Exception as e:
            # Best effort: one unreadable attachment must not fail the guia.
            print(f" Error extracting text from {nome_arquivo}: {e}")
            extracted_text = ""

        anexo["textoExtraido"] = extracted_text
        all_extracted_texts.append(f"--- {nome_arquivo} ---\n{extracted_text}")

    file_content = "\n\n".join(all_extracted_texts)

    # Step 2: For each servico, run the agent
    # These sections are the same for every servico, so look them up once.
    atendimento = guia.get("atendimento", {})
    guia_header = guia.get("guia", {})
    historico = guia.get("historico", {})

    avaliacao_resultados = []
    for servico in guia.get("servicos") or []:
        codigo_servico_raw = str(servico.get("codigoServico", ""))
        # Rules are looked up by the digits of the service code only.
        code = "".join(c for c in codigo_servico_raw if c.isdigit())

        if code not in RULES:
            avaliacao_resultados.append({
                "codigoServico": codigo_servico_raw,
                "resultado": "SKIPPED",
                "motivo": f"Codigo '{code}' nao encontrado nas regras",
                "agentOutput": "",
            })
            continue

        query_data = {
            "atendimento": atendimento,
            "guia": guia_header,
            "servico": servico,
            "historico": historico,
        }
        query = json.dumps(query_data, indent=2, ensure_ascii=False)

        try:
            agent_output = run_agent(query, code, file_content)
        except Exception as e:
            print(f" Agent error for servico {codigo_servico_raw}: {e}")
            agent_output = f"ERROR: {str(e)}"
            # Never scan the error text for "aprov": an exception message
            # containing that stem must not turn a failure into an approval.
            resultado = "Reprovado"
        else:
            resultado = _classify_agent_output(agent_output)

        avaliacao_resultados.append({
            "codigoServico": codigo_servico_raw,
            "resultado": resultado,
            "agentOutput": agent_output,
        })

    guia["avaliacaoAgente"] = avaliacao_resultados
    return guia
|
|
|
|
|
|
# --- API models ---
|
|
|
|
class ProcessRequest(BaseModel):
    """Request body accepted by the ``/process`` endpoint."""

    # Echoed back unchanged under "operadora" in the /process response.
    operadora: dict
    # Guias to evaluate; each entry is passed to process_guia.
    guias: list[dict]
|
|
|
|
|
|
# --- Endpoints ---
|
|
|
|
@app.post("/process")
async def process(request: ProcessRequest):
    """Run ``process_guia`` on every guia in the request.

    Guias are processed one at a time, in request order. A guia that
    raises is replaced in the output by an ``{"error", "guia"}`` entry, so
    one failure does not abort the batch.

    Args:
        request: Parsed request body with ``operadora`` and ``guias``.

    Returns:
        A dictionary with ``status``, the echoed ``operadora`` and the list
        of enriched guias (or error entries) under ``guias``.
    """
    # Function-scope import so the module's import block stays untouched.
    import asyncio

    results = []
    for idx, guia in enumerate(request.guias):
        try:
            # process_guia makes blocking AWS calls and sleeps while polling
            # Textract; a worker thread keeps the event loop free to serve
            # other requests in the meantime.
            enriched = await asyncio.to_thread(process_guia, guia)
            results.append(enriched)
        except Exception as e:
            fallback = f"index_{idx}"
            header = guia.get("guia")
            # Building the label must not raise, even if "guia" is not a dict.
            if isinstance(header, dict):
                label = header.get("codigoGuiaLocal", fallback)
            else:
                label = fallback
            results.append({"error": str(e), "guia": label})

    return {
        "status": "success",
        "operadora": request.operadora,
        "guias": results,
    }
|
|
|
|
|
|
@app.get("/health")
async def health():
    """Health probe: always answers with a healthy status."""
    payload = {"status": "healthy"}
    return payload
|
|
|
|
|
|
@app.get("/rules")
async def get_rules():
    """List the codes that have an entry in ``RULES``."""
    known_codes = [*RULES.keys()]
    return {"codes": known_codes}
|
|
|
|
|
|
if __name__ == "__main__":
    # When executed as a script, serve the app with uvicorn on all network
    # interfaces (0.0.0.0), port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|