Add Langfuse EC2 endpoint and API key support

This commit is contained in:
2026-02-26 13:34:45 -03:00
parent 5717cdd254
commit b7c0b92fa3
14 changed files with 487 additions and 41 deletions

View File

@@ -1,4 +1,5 @@
from fastapi import FastAPI
from fastapi import FastAPI, Security, HTTPException
from fastapi.security import APIKeyHeader
from pydantic import BaseModel
import uvicorn
import boto3
@@ -9,11 +10,28 @@ from pathlib import Path
from urllib.parse import urlparse
from PyPDF2 import PdfReader
from datetime import datetime
from utils.langgraph_agent import RULES, run_agent
from utils.secrets_manager import SECRETS
app = FastAPI()
AWS_REGION = "us-east-2"
INPUT_BUCKET="automated-pre-authorization"
OUTPUT_BUCKET = "upflux-doc-analyzer"
VERSION = "v1"
# API Key auth
_api_key_header = APIKeyHeader(name="X-API-Key")
API_KEY = SECRETS["API-KEY"]
AWS_ACCESS_KEY =SECRETS["AWS_ACCESS_KEY"]
AWS_SECRET_KEY = SECRETS["AWS_SECRET_KEY"]
def verify_api_key(api_key: str = Security(_api_key_header)):
    """Validate the X-API-Key request header against the configured key.

    Uses a constant-time comparison so an attacker probing the endpoint
    cannot recover the key byte-by-byte through response-timing
    differences (the original `!=` short-circuits on first mismatch).

    Raises:
        HTTPException: 403 when the supplied key does not match.

    Returns:
        The validated API key string.
    """
    import secrets  # local import: constant-time comparison helper

    if not secrets.compare_digest(str(api_key), str(API_KEY)):
        raise HTTPException(status_code=403, detail="Invalid API key")
    return api_key
# --- S3 / Textract helpers ---
@@ -29,7 +47,13 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
return bucket, key
def get_s3_client():
def get_s3_input_client():
    """Return an S3 client authenticated with the cross-account
    credentials used for reading objects from INPUT_BUCKET."""
    return boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION,
    )
def get_s3_output_client():
    """Return an S3 client that relies on the ambient ECS task role
    credentials for writing to OUTPUT_BUCKET."""
    client = boto3.client("s3", region_name=AWS_REGION)
    return client
@@ -53,42 +77,66 @@ def extract_text_from_textract_response(response: dict) -> str:
)
def extract_text_from_s3_document(bucket: str, key: str) -> str:
s3 = get_s3_client()
def extract_text_from_s3_document(bucket: str, key: str) -> tuple[str, int]:
"""Returns (extracted_text, page_count)."""
s3_input = get_s3_input_client()
s3_output = get_s3_output_client()
textract = get_textract_client()
file_ext = Path(key).suffix.lower()
# Download file bytes using cross-account S3 credentials
obj = s3_input.get_object(Bucket=bucket, Key=key)
file_bytes = obj["Body"].read()
if file_ext in [".png", ".jpg", ".jpeg"]:
# Pass bytes directly to Textract (avoids Textract needing cross-account S3 access)
response = textract.detect_document_text(
Document={"S3Object": {"Bucket": bucket, "Name": key}}
Document={"Bytes": file_bytes}
)
return extract_text_from_textract_response(response)
return extract_text_from_textract_response(response), 1
if file_ext == ".pdf":
obj = s3.get_object(Bucket=bucket, Key=key)
pdf_bytes = obj["Body"].read()
page_count = get_pdf_page_count(pdf_bytes)
page_count = get_pdf_page_count(file_bytes)
if page_count > 1:
# Async API requires S3Object — copy to local bucket Textract can access
temp_key = f"temp_textract/{Path(key).name}"
s3_output.put_object(Bucket=OUTPUT_BUCKET, Key=temp_key, Body=file_bytes)
response = textract.start_document_text_detection(
DocumentLocation={"S3Object": {"Bucket": bucket, "Name": key}}
DocumentLocation={"S3Object": {"Bucket": OUTPUT_BUCKET, "Name": temp_key}}
)
job_id = response["JobId"]
while True:
result = textract.get_document_text_detection(JobId=job_id)
status = result["JobStatus"]
if status == "SUCCEEDED":
return extract_text_from_textract_response(result)
elif status == "FAILED":
return ""
time.sleep(2)
else:
response = textract.detect_document_text(
Document={"S3Object": {"Bucket": bucket, "Name": key}}
)
return extract_text_from_textract_response(response)
try:
# Wait for job to complete
while True:
result = textract.get_document_text_detection(JobId=job_id)
status = result["JobStatus"]
if status == "SUCCEEDED":
break
elif status == "FAILED":
return "", page_count
time.sleep(2)
return ""
# Collect all blocks across paginated results
all_blocks = result.get("Blocks", [])
while "NextToken" in result:
result = textract.get_document_text_detection(
JobId=job_id, NextToken=result["NextToken"]
)
all_blocks.extend(result.get("Blocks", []))
return extract_text_from_textract_response({"Blocks": all_blocks}), page_count
finally:
s3_output.delete_object(Bucket=OUTPUT_BUCKET, Key=temp_key)
else:
# Single-page PDF — pass bytes directly to sync API
response = textract.detect_document_text(
Document={"Bytes": file_bytes}
)
return extract_text_from_textract_response(response), page_count
return "", 0
# --- Guia processing ---
@@ -110,12 +158,14 @@ def process_guia(guia: dict) -> dict:
try:
bucket, key = parse_s3_uri(s3_uri)
extracted_text = extract_text_from_s3_document(bucket, key)
extracted_text, page_count = extract_text_from_s3_document(bucket, key)
except Exception as e:
print(f" Error extracting text from {nome_arquivo}: {e}")
extracted_text = ""
page_count = 0
anexo["error"] = str(e)
anexo["textoExtraido"] = extracted_text
anexo["pageCount"] = page_count
all_extracted_texts.append(f"--- {nome_arquivo} ---\n{extracted_text}")
file_content = "\n\n".join(all_extracted_texts)
@@ -146,15 +196,22 @@ def process_guia(guia: dict) -> dict:
query = json.dumps(query_data, indent=2, ensure_ascii=False)
try:
agent_output = run_agent(query, code, file_content)
result = run_agent(query, code, file_content)
agent_output = result["response"]
input_tokens = result["input_tokens"]
output_tokens = result["output_tokens"]
except Exception as e:
print(f" Agent error for servico {codigo_servico_raw}: {e}")
agent_output = f"ERROR: {str(e)}"
input_tokens = 0
output_tokens = 0
avaliacao_resultados.append({
"codigoServico": codigo_servico_raw,
"resultado": "Aprovado" if "aprov" in "".join(c for c in agent_output.lower() if c.isalnum() or c == ' ') else "Reprovado",
"agentOutput": agent_output
"agentOutput": agent_output,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
})
guia["avaliacaoAgente"] = avaliacao_resultados
@@ -170,7 +227,7 @@ class ProcessRequest(BaseModel):
# --- Endpoints ---
@app.post("/process")
@app.post("/process", dependencies=[Security(verify_api_key)])
async def process(request: ProcessRequest):
results = []
for idx, guia in enumerate(request.guias):
@@ -183,12 +240,30 @@ async def process(request: ProcessRequest):
"guia": guia.get("guia", {}).get("codigoGuiaLocal", f"index_{idx}")
})
return {
response_body = {
"status": "success",
"operadora": request.operadora,
"guias": results
}
# Save result to S3
try:
s3 = get_s3_output_client()
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
for guia_result in results:
numero_guia = guia_result.get("guia", {}).get("codigoGuiaLocal", "unknown")
key = f"{VERSION}/{numero_guia}_{timestamp}.json"
s3.put_object(
Bucket=OUTPUT_BUCKET,
Key=key,
Body=json.dumps(guia_result, ensure_ascii=False),
ContentType="application/json",
)
except Exception as e:
print(f"Error saving to S3: {e}")
return response_body
@app.get("/health")
async def health():