Feat: Add parallel and async processing for agent calls and OCR

2026-03-11 17:14:47 -03:00
parent b7c0b92fa3
commit 08be8e314d
8 changed files with 193 additions and 1507 deletions
--- a/code/app.py
+++ b/code/app.py
@@ -3,21 +3,19 @@ from fastapi.security import APIKeyHeader
 from pydantic import BaseModel
 import uvicorn
 import boto3
 import asyncio
 import json
 import time
 import io
 from pathlib import Path
 from urllib.parse import urlparse
-from PyPDF2 import PdfReader
+from PyPDF2 import PdfReader,PdfWriter
 from datetime import datetime
 from utils.langgraph_agent import RULES, run_agent
 from utils.secrets_manager import SECRETS
 app = FastAPI()
 AWS_REGION = "us-east-2"
 INPUT_BUCKET="automated-pre-authorization"
 OUTPUT_BUCKET = "upflux-doc-analyzer"
 VERSION = "v1"
@@ -27,7 +25,9 @@ API_KEY = SECRETS["API-KEY"]
 AWS_ACCESS_KEY =SECRETS["AWS_ACCESS_KEY"]
 AWS_SECRET_KEY = SECRETS["AWS_SECRET_KEY"]
-
+_s3_input = boto3.client("s3",aws_access_key_id=AWS_ACCESS_KEY,aws_secret_access_key=AWS_SECRET_KEY,region_name=AWS_REGION)
 _s3_output = boto3.client("s3", region_name=AWS_REGION)
 _textract  = boto3.client("textract", region_name=AWS_REGION)
 def verify_api_key(api_key: str = Security(_api_key_header)):
    if api_key != API_KEY:
        raise HTTPException(status_code=403, detail="Invalid API key")
@@ -47,27 +47,6 @@ def parse_s3_uri(s3_uri: str) -> tuple[str, str]:
    return bucket, key
 def get_s3_input_client():
    """Build the S3 client used to read from INPUT_BUCKET.

    The client authenticates with the explicit cross-account access key
    pair (AWS_ACCESS_KEY / AWS_SECRET_KEY) and targets AWS_REGION.
    """
    credentials = {
        "aws_access_key_id": AWS_ACCESS_KEY,
        "aws_secret_access_key": AWS_SECRET_KEY,
    }
    return boto3.client("s3", region_name=AWS_REGION, **credentials)
 def get_s3_output_client():
    """Build the S3 client used for OUTPUT_BUCKET.

    No explicit credentials are passed; the original note states that the
    ECS task role is what authorizes this client.
    """
    output_client = boto3.client("s3", region_name=AWS_REGION)
    return output_client
 def get_textract_client():
    """Build a Textract client bound to AWS_REGION (no explicit credentials)."""
    textract_client = boto3.client("textract", region_name=AWS_REGION)
    return textract_client
 def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """Return how many pages the PDF held in ``pdf_bytes`` contains.

    This is a best-effort count: if the bytes cannot be opened or parsed
    for any reason, the document is reported as having a single page.
    """
    try:
        document = PdfReader(io.BytesIO(pdf_bytes))
        total_pages = len(document.pages)
    except Exception:
        # Deliberate fallback: an unreadable document counts as one page.
        total_pages = 1
    return total_pages
 def extract_text_from_textract_response(response: dict) -> str:
    if not response:
        return ""
@@ -77,115 +56,87 @@ def extract_text_from_textract_response(response: dict) -> str:
    )
-def extract_text_from_s3_document(bucket: str, key: str) -> tuple[str, int]:
+def _split_pdf_pages(pdf_bytes: bytes) -> list[bytes]:
-    """Returns (extracted_text, page_count)."""
+    reader = PdfReader(io.BytesIO(pdf_bytes))
-    s3_input = get_s3_input_client()
+    pages = []
-    s3_output = get_s3_output_client()
+    for page in reader.pages:
-    textract = get_textract_client()
+        writer = PdfWriter()
        writer.add_page(page)
        buf = io.BytesIO()
        writer.write(buf)
        pages.append(buf.getvalue())
    return pages
 def _textract_detect_bytes(file_bytes: bytes) -> str:
    """Run Textract text detection on raw document bytes and return the text.

    The bytes are sent inline through the module-level ``_textract`` client,
    and the response is converted to text by
    ``extract_text_from_textract_response``.
    """
    return extract_text_from_textract_response(
        _textract.detect_document_text(Document={"Bytes": file_bytes})
    )
 async def extract_text_from_s3_document(bucket: str, key: str) -> tuple[str, int]:
    file_bytes = await asyncio.to_thread(
        lambda: _s3_input.get_object(Bucket=bucket, Key=key)["Body"].read()
    )
    file_ext = Path(key).suffix.lower()
    # Download file bytes using cross-account S3 credentials
    obj = s3_input.get_object(Bucket=bucket, Key=key)
    file_bytes = obj["Body"].read()
    if file_ext in [".png", ".jpg", ".jpeg"]:
-        # Pass bytes directly to Textract (avoids Textract needing cross-account S3 access)
+        text = await asyncio.to_thread(_textract_detect_bytes, file_bytes)
-        response = textract.detect_document_text(
+        return text, 1
            Document={"Bytes": file_bytes}
        )
        return extract_text_from_textract_response(response), 1
    if file_ext == ".pdf":
-        page_count = get_pdf_page_count(file_bytes)
+        page_bytes_list = await asyncio.to_thread(_split_pdf_pages, file_bytes)
-
+        texts = await asyncio.gather(*[
-        if page_count > 1:
+            asyncio.to_thread(_textract_detect_bytes, p) for p in page_bytes_list
-            # Async API requires S3Object — copy to local bucket Textract can access
+        ])
-            temp_key = f"temp_textract/{Path(key).name}"
+        return "\n".join(texts), len(page_bytes_list)
            s3_output.put_object(Bucket=OUTPUT_BUCKET, Key=temp_key, Body=file_bytes)
            response = textract.start_document_text_detection(
                DocumentLocation={"S3Object": {"Bucket": OUTPUT_BUCKET, "Name": temp_key}}
            )
            job_id = response["JobId"]
            try:
                # Wait for job to complete
                while True:
                    result = textract.get_document_text_detection(JobId=job_id)
                    status = result["JobStatus"]
                    if status == "SUCCEEDED":
                        break
                    elif status == "FAILED":
                        return "", page_count
                    time.sleep(2)
                # Collect all blocks across paginated results
                all_blocks = result.get("Blocks", [])
                while "NextToken" in result:
                    result = textract.get_document_text_detection(
                        JobId=job_id, NextToken=result["NextToken"]
                    )
                    all_blocks.extend(result.get("Blocks", []))
                return extract_text_from_textract_response({"Blocks": all_blocks}), page_count
            finally:
                s3_output.delete_object(Bucket=OUTPUT_BUCKET, Key=temp_key)
        else:
            # Single-page PDF — pass bytes directly to sync API
            response = textract.detect_document_text(
                Document={"Bytes": file_bytes}
            )
            return extract_text_from_textract_response(response), page_count
    return "", 0
-
+async def process_guia(guia: dict) -> dict:
 # --- Guia processing ---
 def process_guia(guia: dict) -> dict:
    guia_code = guia.get("guia", {}).get("codigoGuiaLocal", "unknown")
    t_start = time.time()
    # Step 1: Extract text from all anexos
    anexos = guia.get("anexos", [])
    all_extracted_texts = []
-    for anexo_idx, anexo in enumerate(anexos):
+    async def _extract_anexo(anexo_idx: int, anexo: dict):
        s3_uri = anexo.get("urlAnexo") or anexo.get("URLAnexo", "")
        nome_arquivo = anexo.get("nomeArquivo", f"attachment_{anexo_idx}")
        if not s3_uri or not s3_uri.startswith("s3://"):
            anexo["textoExtraido"] = ""
-            continue
+            return None
-
+        t0 = time.time()
        try:
            bucket, key = parse_s3_uri(s3_uri)
-            extracted_text, page_count = extract_text_from_s3_document(bucket, key)
+            extracted_text, page_count = await extract_text_from_s3_document(bucket, key)
        except Exception as e:
            extracted_text = ""
            page_count = 0
            anexo["error"] = str(e)
        anexo["textoExtraido"] = extracted_text
        anexo["pageCount"] = page_count
-        all_extracted_texts.append(f"--- {nome_arquivo} ---\n{extracted_text}")
+        anexo["tempoExtracaoSegundos"] = round(time.time() - t0, 2)
        return f"--- {nome_arquivo} ---\n{extracted_text}"
-    file_content = "\n\n".join(all_extracted_texts)
+    t_extracao_start = time.time()
    parts = await asyncio.gather(*[_extract_anexo(i, a) for i, a in enumerate(anexos)])
    file_content = "\n\n".join(p for p in parts if p)
    t_extracao = round(time.time() - t_extracao_start, 2)
    # Step 2: For each servico, run the agent
    servicos = guia.get("servicos", [])
    avaliacao_resultados = []
-    for servico in servicos:
+    async def _run_servico(servico: dict) -> dict:
        codigo_servico_raw = str(servico.get("codigoServico", ""))
        code = "".join(c for c in codigo_servico_raw if c.isdigit())
        if code not in RULES:
-            avaliacao_resultados.append({
+            return {
                "codigoServico": codigo_servico_raw,
                "resultado": "SKIPPED",
                "motivo": f"Codigo '{code}' nao encontrado nas regras",
-                "agentOutput": ""
+                "agentOutput": "",
-            })
+                "tempoAgentSegundos": 0,
-            continue
+            }
        query_data = {
            "atendimento": guia.get("atendimento", {}),
@@ -195,8 +146,9 @@ def process_guia(guia: dict) -> dict:
        }
        query = json.dumps(query_data, indent=2, ensure_ascii=False)
        t0 = time.time()
        try:
-            result = run_agent(query, code, file_content)
+            result = await run_agent(query, code, file_content)
            agent_output = result["response"]
            input_tokens = result["input_tokens"]
            output_tokens = result["output_tokens"]
@@ -206,15 +158,25 @@ def process_guia(guia: dict) -> dict:
            input_tokens = 0
            output_tokens = 0
-        avaliacao_resultados.append({
+        return {
            "codigoServico": codigo_servico_raw,
-            "resultado": "Aprovado" if "aprov" in "".join(c for c in agent_output.lower() if c.isalnum() or c == ' ') else "Reprovado",
+            "resultado": "Aprovado" if "".join(c for c in agent_output.lower() if c.isalpha()).startswith("aprov") else "Reprovado",
            "agentOutput": agent_output,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
-        })
+            "tempoAgentSegundos": round(time.time() - t0, 2),
        }
    t_agent_start = time.time()
    guia["avaliacaoAgente"] = list(await asyncio.gather(*[_run_servico(s) for s in servicos]))
    t_agent = round(time.time() - t_agent_start, 2)
    guia["tempoProcessamento"] = {
        "extracaoSegundos": t_extracao,
        "agentSegundos": t_agent,
        "totalSegundos": round(time.time() - t_start, 2),
    }
    guia["avaliacaoAgente"] = avaliacao_resultados
    return guia
@@ -229,16 +191,15 @@ class ProcessRequest(BaseModel):
@app.post("/process", dependencies=[Security(verify_api_key)])
 async def process(request: ProcessRequest):
-    results = []
+    raw_results = await asyncio.gather(
-    for idx, guia in enumerate(request.guias):
+        *[process_guia(guia) for guia in request.guias],
-        try:
+        return_exceptions=True
-            enriched = process_guia(guia)
+    )
-            results.append(enriched)
+    results = [
-        except Exception as e:
+        {"error": str(r), "guia": request.guias[i].get("guia", {}).get("codigoGuiaLocal", f"index_{i}")}
-            results.append({
+        if isinstance(r, Exception) else r
-                "error": str(e),
+        for i, r in enumerate(raw_results)
-                "guia": guia.get("guia", {}).get("codigoGuiaLocal", f"index_{idx}")
+    ]
            })
    response_body = {
        "status": "success",
@@ -247,18 +208,21 @@ async def process(request: ProcessRequest):
    }
    # Save result to S3
    # Save result to S3
    async def _save_guia(guia_result: dict):
        numero_guia = guia_result.get("guia", {}).get("codigoGuiaLocal", "unknown")
        key = f"{VERSION}/{numero_guia}_{timestamp}.json"
        await asyncio.to_thread(
            _s3_output.put_object,
            Bucket=OUTPUT_BUCKET,
            Key=key,
            Body=json.dumps(guia_result, ensure_ascii=False),
            ContentType="application/json",
        )
    try:
        s3 = get_s3_output_client()
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        for guia_result in results:
+        await asyncio.gather(*[_save_guia(g) for g in results])
            numero_guia = guia_result.get("guia", {}).get("codigoGuiaLocal", "unknown")
            key = f"{VERSION}/{numero_guia}_{timestamp}.json"
            s3.put_object(
                Bucket=OUTPUT_BUCKET,
                Key=key,
                Body=json.dumps(guia_result, ensure_ascii=False),
                ContentType="application/json",
            )
    except Exception as e:
        print(f"Error saving to S3: {e}")
--- a/code/utils/langgraph_agent.py
+++ b/code/utils/langgraph_agent.py
@@ -7,6 +7,7 @@ This agent demonstrates a basic ReAct-style agent with tool calling capabilities
 import boto3
 import csv
 import json
 import asyncio
 from pathlib import Path
 from typing import Annotated, TypedDict, Literal
 from langgraph.graph import StateGraph, END
@@ -202,7 +203,7 @@ Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Todas as solicitações devem ser direcionadas para análise da Auditoria Médica,
-com a documentação mínima.""",
+com a Documentação.""",
 "4034906":"""Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
@@ -230,7 +231,7 @@ Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Todas as solicitações devem ser direcionadas para análise da Auditoria Médica,
-com a documentação mínima.""",
+com a Documentação.""",
 "20103190":"""Autorizar sem o parecer da Auditoria Médica desde que a justificativa indique pelo
 menos uma das patologias abaixo ou CID’s relacionados:
 ·Incontinência urinária (CID R32);
@@ -272,7 +273,7 @@ Para beneficiário Unimed 0032 (atendimento local):
 Esse código não requer análise da Auditoria Médica, analisar baseado na cobertura e
 limite contratual do beneficiário.
 Caso seja necessário análise da auditoria médica devido alguma mensagem de
-negação, consultar o item "Documentação Mínima".
+negação, consultar o item "Documentação".
 Para beneficiário Unimed 0032 (atendimento em outra singular/Exportado):
 A liberação do procedimento não está condicionada a solicitação dos materiais, pois
 o mesmo poderá ser realizado sem a utilização da "Alça de Polipectomia" e da
@@ -286,7 +287,7 @@ Para beneficiário Unimed 0032 (atendimento local):
 Esse código não requer análise da Auditoria Médica, analisar baseado na cobertura e
 limite contratual do beneficiário.
 Caso seja necessário análise da auditoria médica devido alguma mensagem de
-negação, consultar o item "Documentação Mínima".
+negação, consultar o item "Documentação".
 Para beneficiário Unimed 0032 (atendimento em outra singular/Exportado):
 A liberação do procedimento não está condicionada a solicitação dos materiais, pois
 o mesmo poderá ser realizado sem a utilização da "Alça de Polipectomia" e da
@@ -311,53 +312,53 @@ Encaminhar para deliberação da Unimed origem.""",
 }
 MIN_DOC={
-    "20103190":"""DOCUMENTAÇÃO MÍNIMA
+    "20103190":"""Documentação
 Justificativa Médica e/ou indicação clínica.""",
-    "20203020":"""DOCUMENTAÇÃO MÍNIMA:
+    "20203020":"""Documentação:
 ·
 Justificativa Médica e/ou indicação clínica.""",
-"31303293":"""DOCUMENTAÇÃO MÍNIMA
+"31303293":"""Documentação
 Beneficiário 0032 (atendidos em Curitiba e em Outras cidades):
 ·Justificativa médica e/ou indicação clínica;
 ·Formulário de Solicitação - DIU Hormonal (ver anexo do script).
 Beneficiário Intercâmbio/Outras Unimeds:
 ·
 Relatório Médico Detalhado (conforme racionalização).""",
-"40202542":"""DOCUMENTAÇÃO MÍNIMA
+"40202542":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa Médica e/ou indicação clínica.""",
-"40202550":"""DOCUMENTAÇÃO MÍNIMA
+"40202550":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa Médica e/ou indicação clínica.""",
-"40304906":"""DOCUMENTAÇÃO MÍNIMA
+"40304906":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa e/ou indicação clínica.""",
-"4034906":"""DOCUMENTAÇÃO MÍNIMA
+"4034906":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa e/ou indicação clínica.""",
-"40314626":"""DOCUMENTAÇÃO MÍNIMA
+"40314626":"""Documentação
 Não há;""",
-"40314618":"""DOCUMENTAÇÃO MÍNIMA
+"40314618":"""Documentação
 Não há;""",
-"40323676":"""DOCUMENTAÇÃO MÍNIMA
+"40323676":"""Documentação
 Não há;""",
 "40901254":"""Justificativa Médica e/ou indicação clínica informando a idade gestacional e Laudo
 do 1º exame sonográfico gestacional realizado.""",
-"40901262":"""DOCUMENTAÇÃO MÍNIMA
+"40901262":"""Documentação
 Relatório médico informando a idade gestacional + laudo do 1o exame sonográfico
 gestacional realizado.""",
-"4091262":"""DOCUMENTAÇÃO MÍNIMA
+"4091262":"""Documentação
 Relatório médico informando a idade gestacional + laudo do 1o exame sonográfico
 gestacional realizado.""",
-"4101230":"""DOCUMENTAÇÃO MÍNIMA
+"4101230":"""Documentação
 Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
@@ -370,7 +371,7 @@ esforço físico;
 e/ ou TIMI risk;
 ·
 Laudos de exames cardiológicos recentes.""",
-"4101230":"""DOCUMENTAÇÃO MÍNIMA
+"4101230":"""Documentação
 Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
@@ -383,7 +384,7 @@ esforço físico;
 e/ ou TIMI risk;
 ·
 Laudos de exames cardiológicos recentes.""",
-"41501144":"""DOCUMENTAÇÃO MÍNIMA
+"41501144":"""Documentação
 PARA BENEFICIÁRIOS 0032 SENDO ATENDIDO EM CURITIBA, FORA (EXPORTADO),
 E PAC:
 ·
@@ -407,18 +408,18 @@ Esclarecemos que o envio de retinografia, poderá eventualmente ser solicitado p
 auditoria para fins de elucidação diagnóstica.
 PARA BENEFICIÁRIOS INTERCÂMBIO NACIONAL:
 SEGUIR DOCUMENTAÇÃO CONFORME RACIONALIZAÇÃO.""",
-"31005101":"""DOCUMENTAÇÃO MÍNIMA:
+"31005101":"""Documentação:
 ·Relatório médico detalhado;
 ·Laudo RX e/ou tomografia e/ou ressonância e/ou ultrassonografia.""",
-"31005470":"""DOCUMENTAÇÃO MÍNIMA:
+"31005470":"""Documentação:
 ·Relatório médico detalhado;
 ·Laudo RX e/ou tomografia e/ou ressonância e/ou ultrassonografia.""",
-"40808122":"""DOCUMENTAÇÃO MÍNIMA
+"40808122":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa e/ou indicação clínica.""",
-"40808130":"""DOCUMENTAÇÃO MÍNIMA
+"40808130":"""Documentação
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
@@ -531,7 +532,7 @@ def create_agent(file_content: str = ""):
    return workflow.compile()
-def run_agent(query: str, code: str, file_content: str = "") -> str:
+async def run_agent(query: str, code: str, file_content: str = "") -> str:
    """
    Run the agent with a given query.
@@ -544,14 +545,17 @@ def run_agent(query: str, code: str, file_content: str = "") -> str:
        The agent's final response
    """
    agent = create_agent(file_content)
-    SYSTEM_PROMPT = """You are a AI assistant responsible to check if a person is Allowed or Denied acces to medical procedure based on the following rules:
+    SYSTEM_PROMPT = """You are a AI assistant responsible to check if a person is Allowed or Denied acces to medical procedure based on the inpout data. There are a few always accepted criteira in which, any of them been met, even a single one, will be accepted, these criteria been:
-<rules>
+<auto-accept-criteria>
 """+RULES[code]+""""
-<\rules>
+<\auto-accept-criteria>
-Also this is the required documentation or infomation for aproval, equivalent information to the required one in another is allowed, do try to indetify every document as one of these if it fits, or contains the required information, or not identified:
+If those criteria aren´t met, you can check the documents to see if the following information are present, if so, aprove the procedure:
-"""+MIN_DOC[code]+""""
+<additional-information>"""+MIN_DOC[code]+"""
 <\additional-information>
 If the additional information is not present, but any of the auto-accept-criteira are met, allow the procedure.
 If there aren´t any auto-accept criteria present, check the documents for the additional information, and if they are all present, even if not in the exact type of document especified in them, allow the procedure.
 Your capabilities:
- You can check the OCR of anexed documents if the json input is not enough to determinate if it should be aproved, using the check tool.
+- You can check the OCR of all the documents anexed, at the same time, if the json input is not enough to determinate if it should be aproved, using the check tool.
 For every document, check if the name of the person in json is present, and at output list every document and if it belongs to the person in the request.
 Start your answer with either:
@@ -559,8 +563,38 @@ Start your answer with either:
 Reprovado: If there aren't any rules met.
 And list the document classification and the met criteira, in case of aprovation. Be really precise and succint.
- Start the response with either Aprovado or Reprovado, do not add any characters before either of them, even "*" """
+ Start the response with either Aprovado or Reprovado, do not add any characters before either of them, even "*".
  You must start the message with the result, either Aprovado or Reprovado. It must be the first word at the output.
   <examples>
   Exemplos de saída:
    Aprovado
    Critério:
    Idade superior a 25 anos
    Documentos anexados:
    Nome Documento 1 - Pertence a pessoa FUlana
    Nome documento 2- Pertence a pessoa FUlana
    Reprovado
    Critério:
    Nenhum crtiério preenchido e informações para aprovação faltando no documento
    Aprovado
    Critério:
    Fornecidos documentos nescessários contendo analise médica e pedido de exame
    Documentos anexados:
    Guia - Pertence a pessoa Fulana, contém pedido de exame, mesmo não sendo pedido de exame
    Laudo- Pertence a pessoa Fula, contém análise médica
    <\examples>
    <answer_format>
    (Aprovado ou Reprovado)
    Critério:
    ######
    Documentos anexados:
    ######
    ######
   <\answer_format>
    Follow the answer format strictly, do not list the person data nor start with phrases like (Vou analisar a solicitação de ####### par a NOME DA PESSOA) or anything like it. Just stick with the format, dont add anything else """
    user_message = query
    if file_content:
        user_message += "\n\n<documentos_anexados>\n" + file_content + "\n</documentos_anexados>"
@@ -572,13 +606,10 @@ Start your answer with either:
        ]
    }
    print(f"\nUser: {query}")
    print("-" * 50)
    # Run the agent
    langfuse_handler = CallbackHandler()
    config = {"callbacks": [langfuse_handler]}
-    final_state = agent.invoke(initial_state, config=config)
+    final_state = await agent.ainvoke(initial_state, config=config)
    # Get the final response
    final_message = final_state["messages"][-1]
@@ -593,9 +624,7 @@ Start your answer with either:
            input_tokens += usage.get("input_tokens", 0)
            output_tokens += usage.get("output_tokens", 0)
-    langfuse.flush()
+    await asyncio.to_thread(langfuse.flush)
    print(f"Agent: {response}")
    print(f"Tokens - input: {input_tokens}, output: {output_tokens}")
    return {"response": response, "input_tokens": input_tokens, "output_tokens": output_tokens}
--- a/infra/ecs_alb/main.tf
+++ b/infra/ecs_alb/main.tf
@@ -331,6 +331,45 @@ resource "aws_ecs_service" "app" {
    Name = "${var.app_name}-service"
  }
 }
 # Autoscaling
 # Registers the ECS service's desired task count as a scalable target,
 # bounded between 1 and 3 tasks.
 resource "aws_appautoscaling_target" "ecs" {
  max_capacity       = 3
  min_capacity       = 1
  # Identifies the service as "service/<cluster name>/<service name>".
  resource_id        = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.app.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
 }
 # Target-tracking scaling policy driven by the service's average CPU
 # utilization; it is attached to the scalable target "ecs".
 resource "aws_appautoscaling_policy" "cpu" {
  name               = "${var.app_name}-cpu-autoscaling"
  policy_type        = "TargetTrackingScaling"
  # The three identifiers below are read from the scalable target so the
  # policy always refers to the same resource.
  resource_id        = aws_appautoscaling_target.ecs.resource_id
  scalable_dimension = aws_appautoscaling_target.ecs.scalable_dimension
  service_namespace  = aws_appautoscaling_target.ecs.service_namespace
  target_tracking_scaling_policy_configuration {
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageCPUUtilization"
    }
    # Value the policy tracks for the metric above.
    target_value = 70.0
  }
 }
 # Target-tracking scaling policy driven by the service's average memory
 # utilization; it is attached to the scalable target "ecs".
 resource "aws_appautoscaling_policy" "memory" {
  name               = "${var.app_name}-memory-autoscaling"
  policy_type        = "TargetTrackingScaling"
  # The three identifiers below are read from the scalable target so the
  # policy always refers to the same resource.
  resource_id        = aws_appautoscaling_target.ecs.resource_id
  scalable_dimension = aws_appautoscaling_target.ecs.scalable_dimension
  service_namespace  = aws_appautoscaling_target.ecs.service_namespace
  target_tracking_scaling_policy_configuration {
    predefined_metric_specification {
      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
    }
    # Value the policy tracks for the metric above.
    target_value = 80.0
  }
 }
 #ECS Task Role (for application to call AWS services)
 resource "aws_iam_role" "ecs_task_role" {
  name = "${var.app_name}-ecs-task-role"
--- a/infra/ecs_alb/variables.tf
+++ b/infra/ecs_alb/variables.tf
@@ -1,7 +1,7 @@
 variable "aws_region" {
  description = "AWS region"
  type        = string
-  default     = "us-east-1"
+  default     = "us-east-2"
 }
 variable "app_name" {
--- a/scripts/langgraph_agent.py
+++ b/scripts/langgraph_agent.py
@@ -1,703 +0,0 @@
 #!/usr/bin/env python3
 """
 Simple LangGraph agent using AWS Bedrock.
 This agent demonstrates a basic ReAct-style agent with tool calling capabilities.
 """
 import boto3
 import csv
 import json
 from pathlib import Path
 from typing import Annotated, TypedDict, Literal
 from langgraph.graph import StateGraph, END
 from langgraph.graph.message import add_messages
 from langchain_aws import ChatBedrock
 from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage
 from langchain_core.tools import tool
 # Global variable to store file content for the check tool
 # (main() assigns the concatenated OCR text of the current guide to it
 # before calling run_agent, and the `check` tool returns it as-is).
 FILE = ""
 # NOTE(review): CODE is not referenced anywhere in the visible part of this
 # script - presumably a leftover; confirm before relying on it.
 CODE=""
 # Base paths
 # Directory that contains this script; the other paths are relative to it.
 SCRIPTS_DIR = Path(__file__).parent
 # Folder whose *.json files main() iterates over.
 JSON_OUTPUT_DIR = SCRIPTS_DIR / "json_output"
 # Folder whose sub-folders are matched against numeroGuia by prefix.
 TEXTRACT_OUTPUT_DIR = SCRIPTS_DIR / "textract_output"
 # Authorization protocol text per procedure code (the digits of codigoServico).
 # run_agent interpolates the selected text verbatim into the system prompt.
 # Several keys share one protocol (including near-duplicate keys such as
 # "4091262" / "40901262"), so each text is defined once and referenced by
 # every key that uses it; copies can no longer drift apart.
 _RULES_40808122_40808130 = """- Mulheres acima de 45 anos ou menopausada
 - Homens com mais de 70 anos;
 - Osteogênese imperfeita (para esta patologia, poderá haver a liberação de (02) dois 
 exames ao ano - cada 180 dias);
 - RX com osteopenia ou fratura patológica;
 - Antecedente pessoal de fratura após os 40 anos: punho, ombros, vértebras, quadril;
 - Parente de primeiro grau com osteoporose.
 - Mulheres com massa corporal <20kg/m2 ou peso < 57,8kg;
 - Menopausa antes dos 45 anos ou hipogonasismo crônico (falência ovariana 
 precoce);
 - Uso de glicocorticóides (>=7,5 prednizona/ dia equivalente por mais três meses, ou 
 presença de síndrome de cushing;
 - Hiperparatireoidismo primário;
 - Uso prolongado de anticonvulsivantes (< 10 anos);
 - Síndrome de má absorção crônica ou desnutrição doenças inflamatória intestinal 
 (independente da causa: bariatrica, celiacos, intolerancia a lactose).
 - Quimioterapia, se sobrevida esperada for longa (< 5 anos);
 - Diminuição documentada de altura;
 - Presença de cifose após menopausa.
 - Imobilização prolongada"""
 _RULES_31303293 = """Mirena e Kyleena possuem critérios diferentes para autorização.
 Autorizar sem o parecer da Auditoria Médica, somente quando for beneficiária com
 idade entre 18 e 45 anos e se o formulário estiver preenchido conforme as quatro
 validações abaixo:
 · Identificação completa do paciente e médico assistente;
 · Campo 1.1: selecionado: Mirena ou Kyleena;
 · Campo 1.2: selecionado:
 · Mirena: Menorragia idiopática (sangramento) ou Anticoncepção; ou Kyleena:
 Anticoncepção;
 · Se os itens 2 e 3 estiverem descritos como “NÃO”, “NÃO SE APLICA”, “- “ou EM
 BRANCO.
 Encaminhar para a auditoria médica (Mirena ou Kyleena) somente quando:
 - Solicitações que incluam outros eventos além do implante de DIU;
 - Constar pedido de liberação de anestesia;
 - No caso de prazo intervalar - Incluir na perícia, a negação gerada e qual a finalidade
 da análise do auditor, informar também a data da última liberação do DIU e a
 necessidade de avaliação por conta do prazo de repetição.
 - Campo 1.1 selecionado: “Outros”;
 - Campo 1.2 selecionado:
 Mirena: “Outros” ou nenhuma opção selecionada;Kyleena: Menorragia idiopática (Sangramento) ou “Outros”;
 - Se qualquer campo dos itens 2 e 3 descreverem alguma contraindicação.
 - Beneficiária menor de 18 anos ou acima de 45 anos."""
 _RULES_31005470 = """Autorizar sem o parecer da Auditoria Médica se constar no pedido médico ou
 laudos, uma das justificativas abaixo:
 ·Cálculo biliar;
 ·Colelitíase;
 ·Litíase;
 ·Pólipos acima de 01 cm.
 Caso a indicação seja outra ou pólipo menor que 01 cm, encaminhar para
 auditoria."""
 _RULES_41501012 = """Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Autorizar sem o parecer da Auditoria Médica solicitações de beneficiários acima de
 60 anos.Autorizar independentemente da idade solicitações em que a justificativa indique
 pelo menos uma das patologias abaixo:
 ·
 4)Catarata senil (CID’S H25-1, H25-2, H25-8, H25-9, H26-1, H26-2, H26-3, H26-
 ·Outros transtornos do cristalino.
 BANDEJA
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 - Bandeja 143 - AML - OFTALMOLOGIA - obrigatoriamente com a documentação
 mínima.
 - Bandeja 51 - AML - SEM INFORMAÇÃO MÍNIMA PARA ANÁLISE: Quando for
 solicitação eletiva e não houver indicação clínica.
 Botão Prioridade: Em caso de beneficiário Intercâmbio, PAC, GLP ou SAC Empresa, é
 obrigatório selecionar a bandeja de retorno no botão prioridade, para correta
 devolução ao responsável.
 ATENDIMENTO INTERCÃMBIO
 IMPORTADO (Beneficiários de Outras Unimed's sendo atendidos em Curitiba e
 região).
 1°Passo: Necessário documentação conforme planilha de racionalização.
 2° Passo: Gerar a solicitação em sistema com documentação em anexo.
 - Se tiver a documentação, anexar no sistema (ATF não necessita anexar).
 - Se não tiver documentação, solicitar que beneficiário providencie para dar entrada.3° Passo: Verificar no monitor de intercâmbio, se o processo está pendente de
 perícia.
 - Se tiver, abrir sala CHAT e anexar as documentações (ATF não necessita anexar).
 - Se não tiver, abrir GPU (ATF não necessita abertura).
 4° Passo: Deixar processo pendente na bandeja para monitoramento.
 -Se trafegou, bandeja 921.
 -Se não trafegou, bandeja 11.
 5° Passo: Registrar em anotações administrativas as informações pertinentes ao
 processo."""
 _RULES_40901254 = """Autorizar no ato do atendimento a 1ª solicitação entre o período (DE 11- 14 SEMANAS
 DE GESTAÇÃO).
 ou
 Se o evento foi gerado decorrente da consulta obstétrica 1.01.01987 - CONSULTA
 OBSTÉTRICA - 2ª AVALIAÇÃO (ATÉ 10 SEMANAS DE GESTAÇÃO)."""
 _RULES_40901262 = """Autorizar no ato do atendimento a 1ª solicitação entre o período de 18 a 24 semanas
 de gestação ou se o evento foi gerado decorrente da consulta obstétrica 1.01.01989
 - CONSULTA OBSTÉTRICA - 4ª AVALIAÇÃO (ATÉ 18 SEMANAS DE GESTAÇÃO).A repetição poderá ser autorizada desde que se enquadre em um dos critérios
 abaixo:
 ·Anomalia fetal diagnosticada em exame de rotina.
 ·Antecedentes de doenças hereditárias.
 ·Consanguinidade.
 ·Exposição a drogas.
 ·Idade materna avançada.
 ·Infecções pré-natais."""
 _RULES_40304906 = """Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Poderá ser liberada de imediato as solicitações eletivas que contenha a indicação
 abaixo, conforme DUT:
 •
 Avaliação de pacientes adultos com sinais e sintomas de trombose
 venosa profunda dos membros inferiores;
 URGÊNCIA/EMERGÊNCIA (INTERNAMENTO e AMBULATORIAL)
 Poderá ser liberada de imediato as solicitações de Urgência e Emergência em regime
 Internamento ou em situação de Observação em Unidades de Urgência que
 contenha a indicação:
 •
 Pacientes adultos com sinais e sintomas de embolia pulmonar.
 •
 Pneumonia ou síndrome respiratória aguda grave, com quadro suspeito ou
 confirmado de infecção pelo SARS-CoV-2 (COVID 19).
 PLANOS NÃO REGULAMENTADOS E PLANOS ADAPTADOS
 Os planos não regulamentados (UNIPLAN, UNIPLAN 2000 E NOVO UNIPLAN) e
 adaptados, asseguram cobertura ilimitada para exames laboratoriais, pois não
 seguem o Rol e as diretrizes de utilização. Desta forma podem ser autorizados
 segundo regras contratuais."""
 # Fix: the source text started with the truncated word "utorizar".
 _RULES_20203020 = """Autorizar sem o parecer da Auditoria Médica desde
 que a justificativa indique pelo menos uma das patologias abaixo ou CID’s
 relacionados:·Incontinência urinária (CID R32);
 ·Disfunção miccional (CID N39);
 ·Incontinência de tensão (“stress”) (CID N39.3);
 ·Síndrome da bexiga hiperativa;
 ·Distúrbios do assoalho pélvico;
 ·Incontinência fecal (CID R15);
 ·Outros transtornos funcionais do intestino (CID K59);
 ·Fortalecimento do Assoalho Pélvico pré e/ou pós-parto."""
 _RULES_41001230 = """PROTOCOLO DE LIBERAÇÃO
 Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Todas as solicitações devem ser direcionadas para análise da Auditoria Médica,
 com the documentação mínima."""
 # Fix: the complete "40323676" protocol had been pasted into the middle of the
 # sentence "Se a solicitação ... estiver enquadrada" in the children section,
 # feeding the model rules of an unrelated procedure. The splice is removed.
 _RULES_20103190 = """Autorizar sem o parecer da Auditoria Médica desde que a justificativa indique pelo
 menos uma das patologias abaixo ou CID’s relacionados:
 ·Incontinência urinária (CID R32);
 ·Disfunção miccional (CID N39);
 ·Incontinência de tensão (“stress”) (CID N39.3);·Síndrome da bexiga hiperativa;
 ·Distúrbios do assoalho pélvico;
 ·Incontinência fecal (CID R15);
 ·Outros transtornos funcionais do intestino (CID K59);
 ·Fortalecimento do Assoalho Pélvico pré e/ou pós-parto.
 BENEFICIARIOS ADULTOS (ACIMA DE 18 ANOS):
 Se a solicitação estiver enquadrada nas indicações acima poderá ser autorizado sem
 encaminhar para o AML até 10 quantidades ao mês;
 Ponto de atenção - pertinente 10 sessões total ao mês: considerar códigos
 associados ou individuais (eletroestimulação do assoalho pélvico, disfunção vesico
 uretral, biofeedback, reabilitação perineal)
 BENEFICIARIOS CRIANÇAS (ATÉ 17 ANOS E 11 MESES):
 Se a solicitação estiver enquadrada nas indicações acima poderá ser autorizado sem
 encaminhar para o AML até 04 quantidades ao mês;
 Ponto de atenção - pertinente 04 sessões ao mês: para cada modalidade
 (eletroestimulação do assoalho pélvico, disfunção vesico uretral, biofeedback,
 reabilitação perineal)
 Acima desta quantidade encaminhar para a análise da Auditoria Médica com
 justificativa médica."""
 _RULES_40202542_40202550 = """Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Esse código não requer análise da Auditoria Médica, analisar baseado na cobertura e
 limite contratual do beneficiário.
 Caso seja necessário análise da auditoria médica devido alguma mensagem de
 negação, consultar o item "Documentação Mínima".
 Para beneficiário Unimed 0032 (atendimento em outra singular/Exportado):
 A liberação do procedimento não está condicionada a solicitação dos materiais, pois
 o mesmo poderá ser realizado sem a utilização da "Alça de Polipectomia" e da
 “Agulha de Esclerose ou Injetor”.
 Quando houver solicitação dos materiais posteriormente à autorização do evento,
 não há a necessidade de análise da AML para os materiais. Deverá ser cadastrado
 conforme informações acima e, liberar. (Circular n.º 056/2008 de 01/08/2008)."""
 _RULES_40323676 = """Este evento está parametrizado para autorizar automaticamente para beneficiários
 carteirinha 0032.
 · Para beneficiários PAC, início do cartão 09759032, as solicitações destes exames
 devem ser liberadas de imediato, revertendo a guia no motivo: 9 - AUTORIZADO
 CONFORME PROTOCOLO DE LIBERAÇÃO· Para beneficiários 0032 no intercâmbio exportado, as solicitações destes exames
 devem ser liberadas de imediato, revertendo a guia no motivo: 9 - AUTORIZADO
 CONFORME PROTOCOLO DE LIBERAÇÃO
 NÃO NEGAR ATENDIMENTO.
 ATENÇÃO: Exame contratualizado para realização na Unimed Laboratório,
 exclusivamente na Megaunidade.
 Telefone de contato para mais informações: (41) 3021-5252.
 ATENDIMENTO INTERCÂMBIO
 Encaminhar para deliberação da Unimed origem."""
 RULES = {
    "40808130": _RULES_40808122_40808130,
    "40808122": _RULES_40808122_40808130,
    "31303293": _RULES_31303293,
    "31005470": _RULES_31005470,
    "41501012": _RULES_41501012,
    "40901254": _RULES_40901254,
    "40901262": _RULES_40901262,
    "4091262": _RULES_40901262,
    "40304906": _RULES_40304906,
    "20203020": _RULES_20203020,
    "41001230": _RULES_41001230,
    "4034906": _RULES_40304906,
    "4101230": _RULES_41001230,
    "20103190": _RULES_20103190,
    "40202542": _RULES_40202542_40202550,
    "40202550": _RULES_40202542_40202550,
    "40323676": _RULES_40323676,
 }
 # Minimum-documentation text per procedure code; run_agent interpolates the
 # selected text verbatim into the system prompt.
 # Fix: the key "4101230" used to appear twice with the same text while
 # "41001230" (a key of RULES) was missing, so run_agent raised KeyError for
 # that code. Texts shared by several codes are defined once and reused.
 _MIN_DOC_40202542_40202550 = """DOCUMENTAÇÃO MÍNIMA
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa Médica e/ou indicação clínica."""
 _MIN_DOC_40304906 = """DOCUMENTAÇÃO MÍNIMA
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa e/ou indicação clínica."""
 _MIN_DOC_NONE_REQUIRED = """DOCUMENTAÇÃO MÍNIMA
 Não há;"""
 _MIN_DOC_40901262 = """DOCUMENTAÇÃO MÍNIMA
 Relatório médico informando a idade gestacional + laudo do 1o exame sonográfico
 gestacional realizado."""
 _MIN_DOC_41001230 = """DOCUMENTAÇÃO MÍNIMA
 Para beneficiário de outras Unimed’s (importado)
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 ·
 Relatório Médico detalhado com histórico indicando a necessidade de
 realização do exame, além de sinais e sintomas, se possui limitação para teste de
 esforço físico;
 ·
 Índice de pré-teste segundo os critérios de Diamons e Forrester revisados
 e/ ou TIMI risk;
 ·
 Laudos de exames cardiológicos recentes."""
 _MIN_DOC_31005101_31005470 = """DOCUMENTAÇÃO MÍNIMA:
 ·Relatório médico detalhado;
 ·Laudo RX e/ou tomografia e/ou ressonância e/ou ultrassonografia."""
 _MIN_DOC_40808122_40808130 = """DOCUMENTAÇÃO MÍNIMA
 Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÂMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa e/ou indicação clínica."""
 MIN_DOC = {
    "20103190": """DOCUMENTAÇÃO MÍNIMA
 Justificativa Médica e/ou indicação clínica.""",
    "20203020": """DOCUMENTAÇÃO MÍNIMA:
 ·
 Justificativa Médica e/ou indicação clínica.""",
    "31303293": """DOCUMENTAÇÃO MÍNIMA
 Beneficiário 0032 (atendidos em Curitiba e em Outras cidades):
 ·Justificativa médica e/ou indicação clínica;
 ·Formulário de Solicitação - DIU Hormonal (ver anexo do script).
 Beneficiário Intercâmbio/Outras Unimeds:
 ·
 Relatório Médico Detalhado (conforme racionalização).""",
    "40202542": _MIN_DOC_40202542_40202550,
    "40202550": _MIN_DOC_40202542_40202550,
    "40304906": _MIN_DOC_40304906,
    "4034906": _MIN_DOC_40304906,
    "40314626": _MIN_DOC_NONE_REQUIRED,
    "40314618": _MIN_DOC_NONE_REQUIRED,
    "40323676": _MIN_DOC_NONE_REQUIRED,
    "40901254": """Justificativa Médica e/ou indicação clínica informando a idade gestacional e Laudo
 do 1º exame sonográfico gestacional realizado.""",
    "40901262": _MIN_DOC_40901262,
    "4091262": _MIN_DOC_40901262,
    "41001230": _MIN_DOC_41001230,
    "4101230": _MIN_DOC_41001230,
    "41501144": """DOCUMENTAÇÃO MÍNIMA
 PARA BENEFICIÁRIOS 0032 SENDO ATENDIDO EM CURITIBA, FORA (EXPORTADO),
 E PAC:
 ·
 Relatório Médico descrevendo a necessidade de realização do exame.·
 Se o pedido for para avaliação de glaucoma, é necessário incluir laudo e
 imagem da retinografia.
 Atenção: Para as demais situações, NÃO é necessário enviar o laudo da retinografia.
 PARA BENEFICIÁRIOS INTERCÂMBIO ESTADUAL, CONFORME DETERMINAÇÃO
 DO CERS
 ·Relatório médico descrevendo a necessidade da realização do exame;
 ·Laudo e imagem de retinografia.
 Atenção: NÃO É OBRIGATÓRIO o envio do laudo e imagem de retinografia para:
 > Pacientes com diagnóstico da doença já confirmado e que estejam em tratamento
 com aplicação de antiangiogênicos, para as patologias abaixo:
 ·Doença Macular Relacionada à Idade (DMRI);
 ·Oclusões Vasculares Retinianas;
 ·Edema Macular secundário a Diabetes Mellitus.
 Caso a informação referente ao tratamento não conste no relatório médico, consultar
 eventos recentes para identificar se o beneficiário está em tratamento.
 Esclarecemos que o envio de retinografia, poderá eventualmente ser solicitado pela
 auditoria para fins de elucidação diagnóstica.
 PARA BENEFICIÁRIOS INTERCÂMBIO NACIONAL:
 SEGUIR DOCUMENTAÇÃO CONFORME RACIONALIZAÇÃO.""",
    "31005101": _MIN_DOC_31005101_31005470,
    "31005470": _MIN_DOC_31005101_31005470,
    "40808122": _MIN_DOC_40808122_40808130,
    "40808130": _MIN_DOC_40808122_40808130,
    "41501012": """Para beneficiário de outras Unimed’s (importado):
 Ver item ATENDIMENTO INTERCÃMBIO.
 Para beneficiário Unimed 0032 (atendimento local):
 Justificativa médica e/ou indicação clínica.""",
 }
 # NOTE: the agent's system prompt is not defined here; run_agent() builds it per call.
 # Define tools the agent can use
@tool
 def check(expression: str) -> str:
    """Retrieves the values of the files associated with the input for aditional information, if the json is not enough"""
    # NOTE(review): LangChain's @tool decorator presumably exposes the docstring
    # above to the model as the tool description, so its wording is part of the
    # runtime behaviour and is deliberately left untouched - confirm.
    # `expression` is ignored: whatever the model passes, the tool returns the
    # module-level FILE string, which main() fills before each run_agent call.
    return FILE
 # Define the agent state
 class AgentState(TypedDict):
    """State carried through the LangGraph workflow built in create_agent."""
    # Conversation history; annotated with langgraph's add_messages, which is
    # presumably the reducer that merges each node's returned messages into
    # this list - confirm against the langgraph documentation.
    messages: Annotated[list, add_messages]
 def get_bedrock_client():
    """Build a boto3 client for the "bedrock-runtime" service in us-east-2."""
    bedrock_runtime = boto3.client(
        "bedrock-runtime",
        region_name="us-east-2",
    )
    return bedrock_runtime
 def create_llm():
    """Instantiate the ChatBedrock model bound to the application inference profile."""
    # Settings are collected first so the constructor call stays a one-liner.
    llm_settings = {
        "model_id": "arn:aws:bedrock:us-east-2:232048051668:application-inference-profile/uy4xskop19zn",
        "region_name": "us-east-2",
        "provider": "anthropic",
    }
    return ChatBedrock(**llm_settings)
 def create_agent():
    """Assemble and compile the agent graph: an LLM node that loops through a tool node.

    The "agent" node calls the tool-enabled model; when its reply carries tool
    calls the "tools" node executes them and control returns to "agent",
    otherwise the graph ends.
    """
    model = create_llm()
    available_tools = [check]
    model_with_tools = model.bind_tools(available_tools)
    # Lookup table used by the tool node to resolve a requested tool by name.
    tools_by_name = {entry.name: entry for entry in available_tools}
    def call_model(state: AgentState) -> dict:
        """Send the accumulated messages to the model and return its reply."""
        reply = model_with_tools.invoke(state["messages"])
        return {"messages": [reply]}
    def call_tools(state: AgentState) -> dict:
        """Run every tool call requested by the most recent message."""
        requests = state["messages"][-1].tool_calls
        outputs = []
        for request in requests:
            requested_name = request["name"]
            requested_args = request["args"]
            if requested_name not in tools_by_name:
                # Unknown tool: report it back to the model instead of failing.
                outputs.append(
                    ToolMessage(
                        content=f"Tool {requested_name} not found",
                        tool_call_id=request["id"],
                    )
                )
                continue
            outcome = tools_by_name[requested_name].invoke(requested_args)
            outputs.append(
                ToolMessage(content=str(outcome), tool_call_id=request["id"])
            )
        return {"messages": outputs}
    def should_continue(state: AgentState) -> Literal["tools", "end"]:
        """Route to the tool node while the last message still requests tools."""
        pending_calls = getattr(state["messages"][-1], "tool_calls", None)
        return "tools" if pending_calls else "end"
    graph = StateGraph(AgentState)
    graph.add_node("agent", call_model)
    graph.add_node("tools", call_tools)
    graph.set_entry_point("agent")
    # After the model speaks: either execute tools or finish.
    routes = {"tools": "tools", "end": END}
    graph.add_conditional_edges("agent", should_continue, routes)
    # Tool results always flow back to the model.
    graph.add_edge("tools", "agent")
    return graph.compile()
 def run_agent(query: str, code: str) -> str:
    """
    Run the agent on one authorization request.
    A new agent graph is created on every call. The system prompt is assembled
    from the RULES and MIN_DOC entries for ``code``; ``query`` is sent as the
    human message. The query and the answer are printed to stdout.
    Args:
        query: The user's question or request (main() passes the guide's JSON dump)
        code: Procedure code used as the key into RULES and MIN_DOC
    Returns:
        The agent's final response
    Raises:
        KeyError: if ``code`` is missing from RULES or MIN_DOC
    """
    agent = create_agent()
    # Two defects of the previous prompt are fixed here:
    # - the closing tag was written "<\rules>", which Python reads as
    #   "<" + carriage return + "ules>", so the rules block was never closed;
    # - each interpolation was followed by four quote characters, which left a
    #   stray '"' right after the rules text and after the documentation text.
    # The instruction wording itself (including "Aproved"/"Reproved", which
    # downstream consumers may match on) is intentionally unchanged.
    SYSTEM_PROMPT = """You are a AI assistant responsible to check if a person is Allowed or Denied acces to medical procedure based on the following rules:
 <rules>
 """+RULES[code]+"""
 </rules>
 Also this is the required documentation for aproval, try to indetify every document as one of these, or not identified:
 """+MIN_DOC[code]+"""
 Your capabilities:
 - You can check the OCR of anexed documents if the json input is not enough to determinate if it should be aproved, using the check tool.
 Start your answer with either:
 Aproved: if any of the rules are met, firts look only at the input, then check the file
 Reproved: If there aren't any rules met.
 And list the document classification and the met criteira, in case of aprovation. Be really precise and succint"""
    initial_state = {
        "messages": [
            SystemMessage(content=SYSTEM_PROMPT),
            HumanMessage(content=query)
        ]
    }
    print(f"\nUser: {query}")
    print("-" * 50)
    # Run the agent
    final_state = agent.invoke(initial_state)
    # The last message in the final state is the model's closing answer.
    final_message = final_state["messages"][-1]
    response = final_message.content if hasattr(final_message, "content") else str(final_message)
    print(f"Agent: {response}")
    return response
 def get_textract_folder_for_guia(numero_guia: str) -> Path | None:
    """Find the textract output folder whose name starts with the numeroGuia.

    Args:
        numero_guia: Guide number used as a name prefix.
    Returns:
        The matching sub-folder of TEXTRACT_OUTPUT_DIR, or None when there is
        no match or the output directory itself does not exist. When several
        folders share the prefix, the one that sorts first is returned.
    """
    # A missing output directory means "nothing processed yet", not a crash.
    if not TEXTRACT_OUTPUT_DIR.is_dir():
        return None
    # iterdir() yields entries in arbitrary order; sorting makes the choice
    # reproducible when more than one folder starts with the same prefix.
    matches = sorted(
        entry
        for entry in TEXTRACT_OUTPUT_DIR.iterdir()
        if entry.is_dir() and entry.name.startswith(numero_guia)
    )
    return matches[0] if matches else None
 def load_txt_files_from_folder(folder: Path) -> str:
    """Return the text of every *.txt file in ``folder`` as one string.

    Files are taken in sorted order; each one is preceded by a
    ``--- <file name> ---`` header line and sections are separated by a blank
    line. An empty string is returned when the folder has no .txt files.
    """
    sections = [
        f"--- {path.name} ---\n{path.read_text()}"
        for path in sorted(folder.glob("*.txt"))
    ]
    return "\n\n".join(sections)
 def _is_missing_guia(numero_guia) -> bool:
    """Return True when numeroGuia is None or a float NaN."""
    import math  # function-scope import: the file's import block stays untouched
    return numero_guia is None or (
        isinstance(numero_guia, float) and math.isnan(numero_guia)
    )
 def _record_skip(results: list, base_row: dict, skip_reason: str) -> None:
    """Print the skip reason and append a SKIPPED row built from ``base_row``."""
    print(f"  Skipping: {skip_reason}")
    results.append({**base_row, "agent_output": f"SKIPPED: {skip_reason}"})
 def main():
    """Run the agent over every JSON file in JSON_OUTPUT_DIR and save a CSV.

    For each guide the matching textract folder is located, the procedure code
    is validated, the OCR text is loaded into the FILE global (read by the
    ``check`` tool) and the agent is queried. One row per input file - either
    the agent output or a "SKIPPED: <reason>" marker - is written to
    ``agent_results.csv`` next to this script.
    """
    global FILE
    print("=" * 60)
    print("LangGraph Agent with AWS Bedrock")
    print("=" * 60)
    output_csv_path = SCRIPTS_DIR / "agent_results.csv"
    fieldnames = ["numeroGuia", "nomeBeneficiario", "Resultado_da_analise", "Por_que_libera", "agent_output"]
    # Fields removed from the payload before it is shown to the agent.
    fields_to_drop = ["Resultado_da_analise", "Onde_foi_liberado", "Canal_de_entrada", "Por_que_libera?"]
    results = []
    for json_file in sorted(JSON_OUTPUT_DIR.glob("*.json")):
        print(f"\nProcessing: {json_file.name}")
        print("-" * 60)
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        numero_guia = data.get("numeroGuia")
        # Columns shared by every outcome of this file; built once instead of
        # being repeated in each skip branch.
        base_row = {
            "numeroGuia": str(numero_guia) if numero_guia else "",
            "nomeBeneficiario": data.get("nomeBeneficiario", ""),
            "Resultado_da_analise": data.get("Resultado_da_analise", ""),
            "Por_que_libera": data.get("Por_que_libera?", ""),
        }
        if _is_missing_guia(numero_guia):
            _record_skip(results, base_row, "numeroGuia is NaN or missing")
            continue
        numero_guia = str(numero_guia)
        base_row["numeroGuia"] = numero_guia
        print(f"  numeroGuia: {numero_guia}")
        textract_folder = get_textract_folder_for_guia(numero_guia)
        if textract_folder is None:
            _record_skip(results, base_row, f"No textract folder found for {numero_guia}")
            continue
        print(f"  Textract folder: {textract_folder.name}")
        # Keep only the digits of codigoServico (drops dots/special chars).
        codigo_servico = data.get("codigoServico", "")
        code = "".join(c for c in str(codigo_servico) if c.isdigit())
        if code not in RULES:
            _record_skip(results, base_row, f"codigoServico '{codigo_servico}' (code: {code}) not in RULES")
            continue
        # run_agent also indexes MIN_DOC; without this guard a code missing
        # there raised KeyError and the whole batch was lost before the CSV
        # was written.
        if code not in MIN_DOC:
            _record_skip(results, base_row, f"codigoServico '{codigo_servico}' (code: {code}) not in MIN_DOC")
            continue
        print(f"  codigoServico: {codigo_servico} -> code: {code}")
        # OCR text is read only now, after every skip condition has passed,
        # so skipped guides no longer cost a pass over their text files.
        FILE = load_txt_files_from_folder(textract_folder)
        for field in fields_to_drop:
            data.pop(field, None)
        query = json.dumps(data, indent=2, ensure_ascii=False)
        agent_output = run_agent(query, code)
        results.append({**base_row, "agent_output": agent_output})
        print("\n" + "=" * 60)
    with open(output_csv_path, "w", encoding="utf-8", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
    print(f"\nResults saved to: {output_csv_path}")
 if __name__ == "__main__":
    main()
--- a/scripts/process_images_batch.py
+++ b/scripts/process_images_batch.py
@@ -1,404 +0,0 @@
 #!/usr/bin/env python3
 """
 Batch process images from S3 using AWS Textract.
 Iterates through folders (prefixes) in an S3 bucket and processes any PDF, PNG, or JPEG files
 that haven't been processed yet (checking for existing textract output files).
 Saves both JSON and plain text outputs locally.
 """
 import boto3
 import json
 import sys
 import os
 import io
 from pathlib import Path
 from typing import Dict, List, Optional
 import time
 from PyPDF2 import PdfReader
 def get_s3_client():
    """Build a boto3 S3 client bound to the us-east-2 region."""
    region = "us-east-2"
    return boto3.client('s3', region_name=region)
 def get_textract_client():
    """Build a boto3 Textract client bound to the us-east-2 region."""
    region = "us-east-2"
    return boto3.client('textract', region_name=region)
 def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Count the pages of an in-memory PDF.

    Any failure while parsing is reported on stdout and the document is
    then treated as a single-page PDF.

    Args:
        pdf_bytes: Raw content of the PDF file
    Returns:
        int: Page count, or 1 when the count cannot be determined
    """
    try:
        page_total = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
    except Exception as exc:
        print(f"    Warning: Could not determine page count: {exc}")
        page_total = 1
    return page_total
 def is_already_processed(s3_key: str, output_dir: Path) -> bool:
    """
    Tell whether a Textract JSON output already exists for an S3 object.

    The lookup uses only the key's file stem, i.e. it checks for
    ``<stem>_textract.json`` directly inside ``output_dir``.

    Args:
        s3_key: S3 object key
        output_dir: Directory where output files are stored
    Returns:
        bool: True if the JSON output file exists, False otherwise
    """
    marker_name = Path(s3_key).stem + "_textract.json"
    return (output_dir / marker_name).exists()
 def process_image_from_s3(bucket_name: str, s3_key: str) -> Optional[Dict]:
    """
    Process a document stored in S3 with AWS Textract.

    Supports PDF, PNG, and JPEG formats. Images and single-page PDFs go
    through the sync API (detect_document_text); multi-page PDFs go through
    the async API (start_document_text_detection) and are polled until the
    job reaches a terminal state.

    Args:
        bucket_name: S3 bucket name
        s3_key: S3 object key
    Returns:
        dict: Textract response containing the detected text. For async jobs
            the 'Blocks' list holds the blocks of every result page.
        None: when the object cannot be accessed, the extension is not
            supported, the async job fails, or any other error occurs
            (errors are printed, never raised).
    """
    textract = get_textract_client()
    s3 = get_s3_client()
    # Same S3 location structure is used by both the sync and async calls
    s3_document = {'S3Object': {'Bucket': bucket_name, 'Name': s3_key}}
    try:
        # Verify the object exists first
        try:
            s3.head_object(Bucket=bucket_name, Key=s3_key)
        except Exception as e:
            print(f"    Error accessing S3 object: {str(e)}")
            print(f"    Bucket: {bucket_name}")
            print(f"    Key: {s3_key}")
            return None
        file_ext = Path(s3_key).suffix.lower()
        # For images (PNG, JPEG), always use sync API
        if file_ext in ('.png', '.jpg', '.jpeg'):
            print(f"    Processing image with sync API")
            return textract.detect_document_text(Document=s3_document)
        # Anything that is neither an image nor a PDF is not processed
        if file_ext != '.pdf':
            return None
        # Download the PDF only to count its pages and pick the API
        pdf_bytes = s3.get_object(Bucket=bucket_name, Key=s3_key)['Body'].read()
        page_count = get_pdf_page_count(pdf_bytes)
        print(f"    PDF has {page_count} page(s)")
        if page_count <= 1:
            print(f"    Using sync API (detect_document_text) for single-page PDF")
            return textract.detect_document_text(Document=s3_document)
        print(f"    Using async API (start_document_text_detection) for multi-page PDF")
        started = textract.start_document_text_detection(DocumentLocation=s3_document)
        job_id = started['JobId']
        print(f"    Started async job: {job_id}")
        return _wait_for_text_detection(textract, job_id)
    except Exception as e:
        print(f"    Error processing {s3_key}: {str(e)}")
        return None


 def _wait_for_text_detection(textract, job_id: str, poll_seconds: float = 2) -> Optional[Dict]:
    """Poll an async Textract text-detection job and return its merged result, or None on failure."""
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        if status != 'IN_PROGRESS':
            break
        time.sleep(poll_seconds)
    if status == 'FAILED':
        print(f"    Job failed: {result.get('StatusMessage', 'Unknown error')}")
        return None
    # SUCCEEDED or PARTIAL_SUCCESS: both are terminal, so stop polling.
    # Results are paginated; follow NextToken so no blocks are dropped.
    blocks = list(result.get('Blocks', []))
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
    result['Blocks'] = blocks
    # The token is exhausted once every page has been merged
    result.pop('NextToken', None)
    return result
 def extract_text_from_response(response: Dict) -> str:
    """
    Join the text of every LINE block of a Textract response.

    Args:
        response: Textract API response (may be None or empty)
    Returns:
        str: One line of text per LINE block, newline-separated; an empty
            string when the response is falsy
    """
    if not response:
        return ""
    return '\n'.join(
        block['Text']
        for block in response.get('Blocks', [])
        if block['BlockType'] == 'LINE'
    )
 def save_textract_output(s3_key: str, response: Dict, output_dir: Path):
    """
    Write a Textract response to disk as a JSON file plus a plain text file.

    Both files are named after the stem of the S3 key. A non-empty response
    is updated in place with 'extracted_text' and 'source_s3_key' before
    being serialized.

    Args:
        s3_key: S3 object key
        response: Textract API response
        output_dir: Directory to save output files (created if missing)
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(s3_key).stem
    plain_text = extract_text_from_response(response)
    # Enrich the response so the JSON file is self-describing
    if response:
        response.update(extracted_text=plain_text, source_s3_key=s3_key)
    # JSON output
    json_path = output_dir / f"{stem}_textract.json"
    with json_path.open('w', encoding='utf-8') as handle:
        json.dump(response, handle, indent=2, ensure_ascii=False)
    print(f"  ✓ Saved JSON to: {json_path.name}")
    # Plain text output
    text_path = output_dir / f"{stem}.txt"
    text_path.write_text(plain_text, encoding='utf-8')
    print(f"  ✓ Saved text to: {text_path.name}")
 def get_supported_images_from_s3(bucket_name: str, prefix: str) -> List[str]:
    """
    List the supported documents stored directly under an S3 prefix.

    Only .pdf, .png, .jpg and .jpeg keys are kept, and any file whose name
    contains 'script' (case-insensitive) is left out.

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)
    Returns:
        Sorted list of S3 keys for supported files
    """
    s3 = get_s3_client()
    allowed_suffixes = ('.pdf', '.png', '.jpg', '.jpeg')
    # A non-empty prefix must end with '/' to address a folder
    if prefix and not prefix.endswith('/'):
        prefix = f"{prefix}/"
    listing = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix, Delimiter='/'
    )
    keys = []
    for page in listing:
        for entry in page.get('Contents', []):
            candidate = Path(entry['Key'])
            # Folder placeholders and other file types have no supported suffix
            if candidate.suffix.lower() not in allowed_suffixes:
                continue
            if 'script' in candidate.name.lower():
                continue
            keys.append(entry['Key'])
    keys.sort()
    return keys
 def get_folders_from_s3(bucket_name: str, base_prefix: str = '') -> List[str]:
    """
    List the immediate sub-folders (common prefixes) under an S3 prefix.

    Args:
        bucket_name: S3 bucket name
        base_prefix: Base prefix to search under
    Returns:
        List of folder prefixes, in the order S3 returns them
    """
    s3 = get_s3_client()
    # A non-empty prefix must end with '/' to address a folder
    if base_prefix and not base_prefix.endswith('/'):
        base_prefix = f"{base_prefix}/"
    listing = s3.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=base_prefix, Delimiter='/'
    )
    return [
        entry['Prefix']
        for page in listing
        for entry in page.get('CommonPrefixes', [])
    ]
 def process_folder(bucket_name: str, prefix: str, output_base_dir: Path, skip_existing: bool = True):
    """
    Run Textract over every supported document of one S3 folder.

    Outputs go to ``output_base_dir/<last prefix component>`` ('root' when
    the prefix is empty). A per-folder summary is printed at the end.

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)
        output_base_dir: Base directory for output files
        skip_existing: Whether to skip already processed files
    """
    folder_name = prefix.rstrip('/').split('/')[-1] or 'root'
    output_dir = output_base_dir / folder_name
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"Processing folder: {prefix}")
    print(f"{rule}")
    images = get_supported_images_from_s3(bucket_name, prefix)
    if not images:
        print(f"  No supported images found (PDF, PNG, JPEG)")
        return
    print(f"  Found {len(images)} image(s)")
    done = skipped = failed = 0
    for s3_key in images:
        print(f"\n  Processing: {Path(s3_key).name}")
        # Files with an existing JSON output are not sent to Textract again
        if skip_existing and is_already_processed(s3_key, output_dir):
            print(f"  ⊘ Skipped (already processed)")
            skipped += 1
            continue
        response = process_image_from_s3(bucket_name, s3_key)
        if not response:
            failed += 1
            continue
        # Persist both the JSON and the plain text outputs
        save_textract_output(s3_key, response, output_dir)
        num_blocks = len(response.get('Blocks', []))
        text_length = len(extract_text_from_response(response))
        print(f"  ℹ Extracted {text_length} characters, {num_blocks} blocks")
        done += 1
        # Small delay to avoid rate limiting
        time.sleep(0.5)
    print(f"\n  Summary for {folder_name}:")
    print(f"    Processed: {done}")
    print(f"    Skipped: {skipped}")
    print(f"    Errors: {failed}")
 def main():
    """
    Script entry point: process every folder found under the base prefix.

    The bucket name and base prefix come from the command line when given,
    otherwise from the S3_BUCKET_NAME and S3_BASE_PREFIX environment
    variables (the prefix defaults to 'imagens'). Exits with status 1 when
    no bucket name is available.
    """
    cli_args = sys.argv[1:]
    # Command-line arguments take precedence over environment variables
    bucket_name = cli_args[0] if cli_args else os.environ.get('S3_BUCKET_NAME')
    if len(cli_args) > 1:
        base_prefix = cli_args[1]
    else:
        base_prefix = os.environ.get('S3_BASE_PREFIX', 'imagens')
    if not bucket_name:
        usage_lines = (
            "Error: S3 bucket name not provided.",
            "\nUsage:",
            "  python process_images_batch.py <bucket_name> [base_prefix]",
            "\nOr set environment variables:",
            "  export S3_BUCKET_NAME=my-bucket",
            "  export S3_BASE_PREFIX=imagens",
            "  python process_images_batch.py",
        )
        for line in usage_lines:
            print(line)
        sys.exit(1)
    # Outputs are written next to this script
    output_base_dir = Path(__file__).parent / "textract_output"
    print(f"S3 Bucket: {bucket_name}")
    print(f"Base prefix: {base_prefix}")
    print(f"Output directory: {output_base_dir}")
    print(f"\nScanning S3 bucket for folders...")
    folders = get_folders_from_s3(bucket_name, base_prefix)
    if folders:
        print(f"\nFound {len(folders)} folder(s) to process")
    else:
        # No sub-folders: fall back to the files stored in the base prefix
        print(f"\nNo subdirectories found under '{base_prefix}'.")
        print("Processing files in the base prefix instead...")
        folders = [base_prefix]
    started_at = time.time()
    for prefix in folders:
        try:
            process_folder(bucket_name, prefix, output_base_dir)
        except Exception as exc:
            # One failing folder must not stop the rest of the batch
            print(f"\nError processing folder {prefix}: {exc}")
            import traceback
            traceback.print_exc()
    total_time = time.time() - started_at
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"Batch processing complete!")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"{rule}")
 # Run main() only when this file is executed as a script, not when it is imported
 if __name__ == '__main__':
    main()
--- a/scripts/textract.py
+++ b/scripts/textract.py
@@ -1,209 +0,0 @@
 #!/usr/bin/env python3
 """
 Simple script to invoke AWS Textract on a PDF, PNG, or JPEG file.
 Extracts text and returns the detected content.
 """
 import boto3
 import sys
 import io
 from pathlib import Path
 from PyPDF2 import PdfReader
 def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Return how many pages an in-memory PDF has.

    If the PDF cannot be parsed, a warning is printed and 1 is returned.

    Args:
        pdf_bytes: PDF file content as bytes
    Returns:
        int: Number of pages in the PDF (1 when it cannot be determined)
    """
    stream = io.BytesIO(pdf_bytes)
    try:
        reader = PdfReader(stream)
        count = len(reader.pages)
    except Exception as exc:
        print(f"Warning: Could not determine page count: {exc}")
        return 1
    return count
 def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.

    Uses the async API (start_document_text_detection) for multi-page PDFs,
    and the sync API (detect_document_text) for single-page PDFs and images.
    The file type is validated before any AWS client is created.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if document is in S3
    Returns:
        dict: Textract response containing detected text. For async jobs the
            'Blocks' list holds the blocks of every result page; a failed job
            is returned as-is (its 'JobStatus' is 'FAILED').
    Raises:
        ValueError: If the extension is not .pdf/.png/.jpg/.jpeg, or if a
            local multi-page PDF is given (the async API requires S3).
    """
    import time  # local import: this module has no top-level import of time

    file_ext = Path(pdf_path).suffix.lower()
    is_image = file_ext in ('.png', '.jpg', '.jpeg')
    # Fail fast on unsupported types, before touching AWS
    if not is_image and file_ext != '.pdf':
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")

    textract = boto3.client('textract')
    # Same S3 location structure is used by both the sync and async calls
    s3_document = {'S3Object': {'Bucket': bucket_name, 'Name': pdf_path}}

    # For images (PNG, JPEG), always use sync API
    if is_image:
        print(f"Processing image file with sync API")
        if bucket_name:
            return textract.detect_document_text(Document=s3_document)
        with open(pdf_path, 'rb') as file:
            file_bytes = file.read()
        return textract.detect_document_text(Document={'Bytes': file_bytes})

    # For PDFs, read the content to count pages and decide which API to use
    if bucket_name:
        # The S3 client is only needed when the document lives in S3
        s3 = boto3.client('s3')
        pdf_bytes = s3.get_object(Bucket=bucket_name, Key=pdf_path)['Body'].read()
    else:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()
    page_count = get_pdf_page_count(pdf_bytes)
    print(f"PDF has {page_count} page(s)")

    if page_count <= 1:
        print("Using sync API (detect_document_text) for single-page PDF")
        document = s3_document if bucket_name else {'Bytes': pdf_bytes}
        return textract.detect_document_text(Document=document)

    print("Using async API (start_document_text_detection) for multi-page PDF")
    if not bucket_name:
        # Note: Textract async API requires S3
        raise ValueError(
            "Multi-page PDFs must be processed from S3. "
            "Please upload the file to S3 first."
        )
    response = textract.start_document_text_detection(DocumentLocation=s3_document)
    job_id = response['JobId']
    print(f"Started Textract job: {job_id}")
    # Poll until the job leaves IN_PROGRESS; this also ends on
    # PARTIAL_SUCCESS, which would otherwise loop forever
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        print(f"Job status: {status}")
        if status != 'IN_PROGRESS':
            break
        time.sleep(2)
    if status == 'FAILED':
        return result
    # Results are paginated; follow NextToken so no blocks are dropped
    blocks = list(result.get('Blocks', []))
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
    result['Blocks'] = blocks
    # The token is exhausted once every page has been merged
    result.pop('NextToken', None)
    return result
 def extract_text_from_response(response: dict) -> str:
    """
    Collect the text of all LINE blocks of a Textract response.

    Args:
        response: Textract API response
    Returns:
        str: The LINE texts joined with newlines ('' when there are none)
    """
    all_blocks = response.get('Blocks', [])
    line_texts = [item['Text'] for item in all_blocks if item['BlockType'] == 'LINE']
    return '\n'.join(line_texts)
 def main():
    """
    Command-line entry point: run Textract on one document and print its text.

    Usage: ``python textract.py <pdf_path> [s3_bucket]``. Without a bucket
    the path must point to an existing local file. Exits with status 1 when
    the path argument is missing or the local file does not exist.
    """
    if len(sys.argv) < 2:
        # The usage text must name this script (textract.py) so the printed
        # commands can be copied as-is
        print("Usage: python textract.py <pdf_path> [s3_bucket]")
        print("\nExamples:")
        print("  python textract.py document.pdf")
        print("  python textract.py path/to/doc.pdf my-bucket")
        sys.exit(1)
    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None
    # A local path is only checked when no bucket is given; with a bucket the
    # path is an S3 key
    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)
    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")
    # Process PDF
    response = process_pdf_with_textract(pdf_path, bucket_name)
    # Extract and display text
    text = extract_text_from_response(response)
    print("\n" + "="*80)
    print("EXTRACTED TEXT")
    print("="*80)
    print(text)
    print("="*80)
    # Print summary; blocks without a 'Page' key are counted as page 1
    blocks = response.get('Blocks', [])
    num_blocks = len(blocks)
    num_pages = len({b.get('Page', 1) for b in blocks})
    print(f"\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {num_blocks}")
    print(f"  Text length: {len(text)} characters")
 # Run main() only when this file is executed as a script, not when it is imported
 if __name__ == '__main__':
    main()
--- a/scripts/tojson.py
+++ b/scripts/tojson.py
@@ -1,30 +0,0 @@
 import pandas as pd
 import json
 from pathlib import Path
 # Configuration
 CSV_FILE = "guias.csv"
 OUTPUT_DIR = "json_output"
 ENCODING = "utf-8"
 # Make sure the destination directory exists before any file is written
 Path(OUTPUT_DIR).mkdir(exist_ok=True)
 # Load the whole CSV into a DataFrame
 df = pd.read_csv(CSV_FILE, encoding=ENCODING)
 # Write one JSON file per row, except for the first data row (index 0).
 # NOTE(review): presumably that row is "row 2" of the CSV once the header
 # line is counted - confirm against the input file.
 for index, row in df.iterrows():
    row_number = index + 1
    if index == 0:
        print(f"⊗ Skipped row {row_number}")
        continue
    # Each remaining row goes to its own file, named after its 1-based position
    output_file = f"{OUTPUT_DIR}/row_{row_number}.json"
    record = row.to_dict()
    with open(output_file, 'w', encoding=ENCODING) as json_file:
        json.dump(record, json_file, indent=2, ensure_ascii=False)
    print(f"✓ Created {output_file}")
 print(f"\nDone! Created {len(df) - 1} JSON files in '{OUTPUT_DIR}/' directory")