AI-upflux-docprocessor/code/app.py

from fastapi import FastAPI, Security, HTTPException
from fastapi.security import APIKeyHeader
from pydantic import BaseModel
import uvicorn
import boto3
import asyncio
import json
import time
import io
from pathlib import Path
from urllib.parse import urlparse
from PyPDF2 import PdfReader,PdfWriter
from datetime import datetime
from utils.langgraph_agent import RULES, run_agent
from utils.secrets_manager import SECRETS
app = FastAPI()

AWS_REGION = "us-east-2"
OUTPUT_BUCKET = "upflux-doc-analyzer"
VERSION = "v1"

# API Key auth
_api_key_header = APIKeyHeader(name="X-API-Key")
API_KEY = SECRETS["API-KEY"]

AWS_ACCESS_KEY =SECRETS["AWS_ACCESS_KEY"]
AWS_SECRET_KEY = SECRETS["AWS_SECRET_KEY"]
_s3_input = boto3.client("s3",aws_access_key_id=AWS_ACCESS_KEY,aws_secret_access_key=AWS_SECRET_KEY,region_name=AWS_REGION)
_s3_output = boto3.client("s3", region_name=AWS_REGION)
_textract  = boto3.client("textract", region_name=AWS_REGION)
def verify_api_key(api_key: str = Security(_api_key_header)):
    if api_key != API_KEY:
        raise HTTPException(status_code=403, detail="Invalid API key")
    return api_key


# --- S3 / Textract helpers ---

def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    parsed = urlparse(s3_uri)
    if parsed.scheme != "s3":
        raise ValueError(f"Not an S3 URI: {s3_uri}")
    bucket = parsed.netloc
    key = parsed.path.lstrip("/")
    if not bucket or not key:
        raise ValueError(f"Invalid S3 URI: {s3_uri}")
    return bucket, key


def extract_text_from_textract_response(response: dict) -> str:
    if not response:
        return ""
    return "\n".join(
        block["Text"] for block in response.get("Blocks", [])
        if block["BlockType"] == "LINE"
    )


def _split_pdf_pages(pdf_bytes: bytes) -> list[bytes]:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    pages = []
    for page in reader.pages:
        writer = PdfWriter()
        writer.add_page(page)
        buf = io.BytesIO()
        writer.write(buf)
        pages.append(buf.getvalue())
    return pages

def _textract_detect_bytes(file_bytes: bytes) -> str:
    response = _textract.detect_document_text(Document={"Bytes": file_bytes})
    return extract_text_from_textract_response(response)

async def extract_text_from_s3_document(bucket: str, key: str) -> tuple[str, int]:
    file_bytes = await asyncio.to_thread(
        lambda: _s3_input.get_object(Bucket=bucket, Key=key)["Body"].read()
    )
    file_ext = Path(key).suffix.lower()

    if file_ext in [".png", ".jpg", ".jpeg"]:
        text = await asyncio.to_thread(_textract_detect_bytes, file_bytes)
        return text, 1

    if file_ext == ".pdf":
        page_bytes_list = await asyncio.to_thread(_split_pdf_pages, file_bytes)
        texts = await asyncio.gather(*[
            asyncio.to_thread(_textract_detect_bytes, p) for p in page_bytes_list
        ])
        return "\n".join(texts), len(page_bytes_list)

    return "", 0
async def process_guia(guia: dict) -> dict:
    guia_code = guia.get("guia", {}).get("codigoGuiaLocal", "unknown")
    t_start = time.time()

    # Step 1: Extract text from all anexos
    anexos = guia.get("anexos", [])
    all_extracted_texts = []

    async def _extract_anexo(anexo_idx: int, anexo: dict):
        s3_uri = anexo.get("urlAnexo") or anexo.get("URLAnexo", "")
        nome_arquivo = anexo.get("nomeArquivo", f"attachment_{anexo_idx}")
        if not s3_uri or not s3_uri.startswith("s3://"):
            anexo["textoExtraido"] = ""
            return None
        t0 = time.time()
        try:
            bucket, key = parse_s3_uri(s3_uri)
            extracted_text, page_count = await extract_text_from_s3_document(bucket, key)
        except Exception as e:
            extracted_text = ""
            page_count = 0
            anexo["error"] = str(e)
        anexo["textoExtraido"] = extracted_text
        anexo["pageCount"] = page_count
        anexo["tempoExtracaoSegundos"] = round(time.time() - t0, 2)
        return f"--- {nome_arquivo} ---\n{extracted_text}"

    t_extracao_start = time.time()
    parts = await asyncio.gather(*[_extract_anexo(i, a) for i, a in enumerate(anexos)])
    file_content = "\n\n".join(p for p in parts if p)
    t_extracao = round(time.time() - t_extracao_start, 2)

    # Step 2: For each servico, run the agent
    servicos = guia.get("servicos", [])
    avaliacao_resultados = []

    async def _run_servico(servico: dict) -> dict:
        codigo_servico_raw = str(servico.get("codigoServico", ""))
        code = "".join(c for c in codigo_servico_raw if c.isdigit())

        if code not in RULES:
            return {
                "codigoServico": codigo_servico_raw,
                "resultado": "SKIPPED",
                "motivo": f"Codigo '{code}' nao encontrado nas regras",
                "agentOutput": "",
                "tempoAgentSegundos": 0,
            }

        query_data = {
            "atendimento": guia.get("atendimento", {}),
            "guia": guia.get("guia", {}),
            "servico": servico,
            "historico": guia.get("historico", {})
        }
        query = json.dumps(query_data, indent=2, ensure_ascii=False)

        t0 = time.time()
        try:
            result = await run_agent(query, code, file_content)
            agent_output = result["response"]
            input_tokens = result["input_tokens"]
            output_tokens = result["output_tokens"]
        except Exception as e:
            print(f"  Agent error for servico {codigo_servico_raw}: {e}")
            agent_output = f"ERROR: {str(e)}"
            input_tokens = 0
            output_tokens = 0

        return {
            "codigoServico": codigo_servico_raw,
            "resultado": "Aprovado" if "".join(c for c in agent_output.lower() if c.isalpha()).startswith("aprov") else "Reprovado",
            "agentOutput": agent_output,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "tempoAgentSegundos": round(time.time() - t0, 2),
        }

    t_agent_start = time.time()
    guia["avaliacaoAgente"] = list(await asyncio.gather(*[_run_servico(s) for s in servicos]))
    t_agent = round(time.time() - t_agent_start, 2)

    guia["tempoProcessamento"] = {
        "extracaoSegundos": t_extracao,
        "agentSegundos": t_agent,
        "totalSegundos": round(time.time() - t_start, 2),
    }

    return guia


# --- API models ---

class ProcessRequest(BaseModel):
    operadora: dict
    guias: list[dict]


# --- Endpoints ---

@app.post("/process", dependencies=[Security(verify_api_key)])
async def process(request: ProcessRequest):
    raw_results = await asyncio.gather(
        *[process_guia(guia) for guia in request.guias],
        return_exceptions=True
    )
    results = [
        {"error": str(r), "guia": request.guias[i].get("guia", {}).get("codigoGuiaLocal", f"index_{i}")}
        if isinstance(r, Exception) else r
        for i, r in enumerate(raw_results)
    ]

    response_body = {
        "status": "success",
        "operadora": request.operadora,
        "guias": results
    }

    # Save result to S3
    # Save result to S3
    async def _save_guia(guia_result: dict):
        numero_guia = guia_result.get("guia", {}).get("codigoGuiaLocal", "unknown")
        key = f"{VERSION}/{numero_guia}_{timestamp}.json"
        await asyncio.to_thread(
            _s3_output.put_object,
            Bucket=OUTPUT_BUCKET,
            Key=key,
            Body=json.dumps(guia_result, ensure_ascii=False),
            ContentType="application/json",
        )

    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        await asyncio.gather(*[_save_guia(g) for g in results])
    except Exception as e:
        print(f"Error saving to S3: {e}")

    return response_body


@app.get("/health")
async def health():
    return {"status": "healthy"}


@app.get("/rules")
async def get_rules():
    return {"codes": list(RULES.keys())}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)