Add Langfuse EC2 endpoint and API key support

This commit is contained in:
2026-02-26 13:34:45 -03:00
parent 5717cdd254
commit b7c0b92fa3
14 changed files with 487 additions and 41 deletions

View File

@@ -1,4 +1,5 @@
from fastapi import FastAPI
from fastapi import FastAPI, Security, HTTPException
from fastapi.security import APIKeyHeader
from pydantic import BaseModel
import uvicorn
import boto3
@@ -9,11 +10,28 @@ from pathlib import Path
from urllib.parse import urlparse
from PyPDF2 import PdfReader
from datetime import datetime
from utils.langgraph_agent import RULES, run_agent
from utils.secrets_manager import SECRETS
app = FastAPI()
AWS_REGION = "us-east-2"
INPUT_BUCKET="automated-pre-authorization"
OUTPUT_BUCKET = "upflux-doc-analyzer"
VERSION = "v1"
# API Key auth
_api_key_header = APIKeyHeader(name="X-API-Key")
API_KEY = SECRETS["API-KEY"]
AWS_ACCESS_KEY =SECRETS["AWS_ACCESS_KEY"]
AWS_SECRET_KEY = SECRETS["AWS_SECRET_KEY"]
def verify_api_key(api_key: str = Security(_api_key_header)):
    """Validate the X-API-Key request header against the configured key.

    Uses a constant-time comparison so an attacker probing the endpoint
    cannot recover the key byte-by-byte through response-timing
    differences (the original `!=` short-circuits on first mismatch).

    Raises:
        HTTPException: 403 when the supplied key does not match.

    Returns:
        The validated API key string.
    """
    import secrets  # local import: constant-time comparison helper

    if not secrets.compare_digest(str(api_key), str(API_KEY)):
        raise HTTPException(status_code=403, detail="Invalid API key")
    return api_key
# --- S3 / Textract helpers ---
@@ -29,7 +47,13 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
return bucket, key
def get_s3_client():
def get_s3_input_client():
    """Return an S3 client authenticated with the cross-account
    credentials used for reading objects from INPUT_BUCKET."""
    return boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION,
    )
def get_s3_output_client():
    """Return an S3 client that relies on the ambient ECS task role
    credentials for writing to OUTPUT_BUCKET."""
    client = boto3.client("s3", region_name=AWS_REGION)
    return client
@@ -53,42 +77,66 @@ def extract_text_from_textract_response(response: dict) -> str:
)
def extract_text_from_s3_document(bucket: str, key: str) -> str:
s3 = get_s3_client()
def extract_text_from_s3_document(bucket: str, key: str) -> tuple[str, int]:
"""Returns (extracted_text, page_count)."""
s3_input = get_s3_input_client()
s3_output = get_s3_output_client()
textract = get_textract_client()
file_ext = Path(key).suffix.lower()
# Download file bytes using cross-account S3 credentials
obj = s3_input.get_object(Bucket=bucket, Key=key)
file_bytes = obj["Body"].read()
if file_ext in [".png", ".jpg", ".jpeg"]:
# Pass bytes directly to Textract (avoids Textract needing cross-account S3 access)
response = textract.detect_document_text(
Document={"S3Object": {"Bucket": bucket, "Name": key}}
Document={"Bytes": file_bytes}
)
return extract_text_from_textract_response(response)
return extract_text_from_textract_response(response), 1
if file_ext == ".pdf":
obj = s3.get_object(Bucket=bucket, Key=key)
pdf_bytes = obj["Body"].read()
page_count = get_pdf_page_count(pdf_bytes)
page_count = get_pdf_page_count(file_bytes)
if page_count > 1:
# Async API requires S3Object — copy to local bucket Textract can access
temp_key = f"temp_textract/{Path(key).name}"
s3_output.put_object(Bucket=OUTPUT_BUCKET, Key=temp_key, Body=file_bytes)
response = textract.start_document_text_detection(
DocumentLocation={"S3Object": {"Bucket": bucket, "Name": key}}
DocumentLocation={"S3Object": {"Bucket": OUTPUT_BUCKET, "Name": temp_key}}
)
job_id = response["JobId"]
while True:
result = textract.get_document_text_detection(JobId=job_id)
status = result["JobStatus"]
if status == "SUCCEEDED":
return extract_text_from_textract_response(result)
elif status == "FAILED":
return ""
time.sleep(2)
else:
response = textract.detect_document_text(
Document={"S3Object": {"Bucket": bucket, "Name": key}}
)
return extract_text_from_textract_response(response)
try:
# Wait for job to complete
while True:
result = textract.get_document_text_detection(JobId=job_id)
status = result["JobStatus"]
if status == "SUCCEEDED":
break
elif status == "FAILED":
return "", page_count
time.sleep(2)
return ""
# Collect all blocks across paginated results
all_blocks = result.get("Blocks", [])
while "NextToken" in result:
result = textract.get_document_text_detection(
JobId=job_id, NextToken=result["NextToken"]
)
all_blocks.extend(result.get("Blocks", []))
return extract_text_from_textract_response({"Blocks": all_blocks}), page_count
finally:
s3_output.delete_object(Bucket=OUTPUT_BUCKET, Key=temp_key)
else:
# Single-page PDF — pass bytes directly to sync API
response = textract.detect_document_text(
Document={"Bytes": file_bytes}
)
return extract_text_from_textract_response(response), page_count
return "", 0
# --- Guia processing ---
@@ -110,12 +158,14 @@ def process_guia(guia: dict) -> dict:
try:
bucket, key = parse_s3_uri(s3_uri)
extracted_text = extract_text_from_s3_document(bucket, key)
extracted_text, page_count = extract_text_from_s3_document(bucket, key)
except Exception as e:
print(f" Error extracting text from {nome_arquivo}: {e}")
extracted_text = ""
page_count = 0
anexo["error"] = str(e)
anexo["textoExtraido"] = extracted_text
anexo["pageCount"] = page_count
all_extracted_texts.append(f"--- {nome_arquivo} ---\n{extracted_text}")
file_content = "\n\n".join(all_extracted_texts)
@@ -146,15 +196,22 @@ def process_guia(guia: dict) -> dict:
query = json.dumps(query_data, indent=2, ensure_ascii=False)
try:
agent_output = run_agent(query, code, file_content)
result = run_agent(query, code, file_content)
agent_output = result["response"]
input_tokens = result["input_tokens"]
output_tokens = result["output_tokens"]
except Exception as e:
print(f" Agent error for servico {codigo_servico_raw}: {e}")
agent_output = f"ERROR: {str(e)}"
input_tokens = 0
output_tokens = 0
avaliacao_resultados.append({
"codigoServico": codigo_servico_raw,
"resultado": "Aprovado" if "aprov" in "".join(c for c in agent_output.lower() if c.isalnum() or c == ' ') else "Reprovado",
"agentOutput": agent_output
"agentOutput": agent_output,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
})
guia["avaliacaoAgente"] = avaliacao_resultados
@@ -170,7 +227,7 @@ class ProcessRequest(BaseModel):
# --- Endpoints ---
@app.post("/process")
@app.post("/process", dependencies=[Security(verify_api_key)])
async def process(request: ProcessRequest):
results = []
for idx, guia in enumerate(request.guias):
@@ -183,12 +240,30 @@ async def process(request: ProcessRequest):
"guia": guia.get("guia", {}).get("codigoGuiaLocal", f"index_{idx}")
})
return {
response_body = {
"status": "success",
"operadora": request.operadora,
"guias": results
}
# Save result to S3
try:
s3 = get_s3_output_client()
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
for guia_result in results:
numero_guia = guia_result.get("guia", {}).get("codigoGuiaLocal", "unknown")
key = f"{VERSION}/{numero_guia}_{timestamp}.json"
s3.put_object(
Bucket=OUTPUT_BUCKET,
Key=key,
Body=json.dumps(guia_result, ensure_ascii=False),
ContentType="application/json",
)
except Exception as e:
print(f"Error saving to S3: {e}")
return response_body
@app.get("/health")
async def health():