Adds initial files

code/app.py (new file, 156 lines)
@@ -0,0 +1,156 @@
"""
Simple LangGraph agent using the @tool decorator.

Clean implementation with decorator-based tool definitions.
"""

from typing import Annotated, TypedDict

import uvicorn
from fastapi import FastAPI, Header
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.tools import tool
from langchain_aws import ChatBedrock

app = FastAPI()


# Approval rules referenced by the system prompt
rules = """- Women over 45 years old, or postmenopausal;
- Men over 70 years old;
- Osteogenesis imperfecta (for this condition, two (02) exams per year may be authorized - one every 180 days);
- X-ray showing osteopenia or a pathological fracture;
- Personal history of fracture after age 40: wrist, shoulders, vertebrae, hip;
- First-degree relative with osteoporosis;
- Women with body mass index < 20 kg/m2 or weight < 57.8 kg;
- Menopause before age 45, or chronic hypogonadism (premature ovarian failure);
- Glucocorticoid use (>= 7.5 prednisone/day equivalent for more than three months), or presence of Cushing's syndrome;
- Primary hyperparathyroidism;
- Prolonged use of anticonvulsants (< 10 years);
- Chronic malabsorption syndrome, malnutrition, or inflammatory bowel disease (regardless of cause: bariatric surgery, celiac disease, lactose intolerance);
- Chemotherapy, if expected survival is long (< 5 years);
- Documented loss of height;
- Presence of kyphosis after menopause;
- Prolonged immobilization"""

SYSTEM_PROMPT = (
    "You are an assistant; your job is to approve or reject a procedure "
    "based on the following rules:\n"
    + rules
    + "\nYour input will be a JSON document. Evaluate it against the rules and return:\n"
    "Approved: if at least one of the criteria is met.\n"
    "Rejected: if none of the criteria are met."
)


# Define tools using the @tool decorator
@tool
def add(a: int, b: int) -> int:
    """Add two numbers together.

    Args:
        a: First number
        b: Second number
    """
    return a + b


@tool
def multiply(a: int, b: int) -> int:
    """Multiply two numbers.

    Args:
        a: First number
        b: Second number
    """
    return a * b


@tool
def get_word_length(word: str) -> int:
    """Get the length of a word.

    Args:
        word: The word to measure
    """
    return len(word)


@tool
def search_info(topic: str) -> str:
    """Search for information about a topic (mock implementation).

    Args:
        topic: The topic to search for
    """
    # Mock response - replace with an actual search/API call
    return f"Information about {topic}: This is a mock response. In production, this would return real data."


# Define agent state
class AgentState(TypedDict):
    messages: Annotated[list, add_messages]


# Define tools list
tools = [add, multiply, get_word_length, search_info]


# Agent node
def call_model(state: AgentState):
    """Call the LLM with the current state and tools."""
    model = ChatBedrock(
        model_id="arn:aws:bedrock:us-east-2:232048051668:application-inference-profile/uy4xskop19zn",
        region_name="us-east-2",
        provider="anthropic"
    )

    model_with_tools = model.bind_tools(tools)

    messages = [SystemMessage(content=SYSTEM_PROMPT)] + state["messages"]

    response = model_with_tools.invoke(messages)

    return {"messages": [response]}


# Build the graph
def create_agent():
    """Create and compile the agent graph."""
    workflow = StateGraph(AgentState)

    # Add nodes
    workflow.add_node("agent", call_model)
    workflow.add_node("tools", ToolNode(tools))

    # Add edges
    workflow.add_edge(START, "agent")
    workflow.add_conditional_edges("agent", tools_condition)
    workflow.add_edge("tools", "agent")

    return workflow.compile()


# Main execution
@app.post("/")
async def root(json: str = Header(...)):
    agent = create_agent()
    query = json
    result = agent.invoke(
        {"messages": [HumanMessage(content=query)]},
        config={"recursion_limit": 10}
    )

    final_message = result["messages"][-1]
    return {"status": "success", "message": final_message.content}


@app.get("/health")
async def health():
    return {"status": "healthy"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
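
Usage note (not part of the committed files): a minimal sketch of how a client might call this service once it is running locally. It assumes the `requests` package is available on the client side (it is not in requirements.txt), and the payload field names are illustrative only, since this commit does not define an input schema. Note that the endpoint reads the payload from a request header named `json`, not from the request body.

# Hypothetical client call for the endpoint defined in code/app.py above.
import json as jsonlib
import requests  # assumed client-side dependency, not part of this commit

payload = {"patient_age": 72, "sex": "male", "procedure": "bone densitometry"}  # illustrative fields

resp = requests.post(
    "http://localhost:8000/",
    headers={"json": jsonlib.dumps(payload)},  # payload travels in the 'json' header
    timeout=60,
)
print(resp.json())  # e.g. {"status": "success", "message": "Approved: ..."}

print(requests.get("http://localhost:8000/health", timeout=5).json())  # {"status": "healthy"}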

code/dockerfile (new file, 15 lines)
@@ -0,0 +1,15 @@
FROM python:3.12

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install curl for health checks
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

COPY app.py .

EXPOSE 8000

CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

code/requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
langgraph
langchain-aws
langchain
PyPDF2

infra/.terraform.lock.hcl (generated, new file, 25 lines)
@@ -0,0 +1,25 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.

provider "registry.terraform.io/hashicorp/aws" {
  version     = "6.27.0"
  constraints = "~> 6.27"
  hashes = [
    "h1:bixp2PSsP5ZGBczGCxcbSDn6lF5QFlUXlNroq9cdab4=",
    "zh:177a24b806c72e8484b5cabc93b2b38e3d770ae6f745a998b54d6619fd0e8129",
    "zh:4ac4a85c14fb868a3306b542e6a56c10bd6c6d5a67bc0c9b8f6a9060cf5f3be7",
    "zh:552652185bc85c8ba1da1d65dea47c454728a5c6839c458b6dcd3ce71c19ccfc",
    "zh:60284b8172d09aee91eae0856f09855eaf040ce3a58d6933602ae17c53f8ed04",
    "zh:6be38d156756ca61fb8e7c752cc5d769cd709686700ac4b230f40a6e95b5dbc9",
    "zh:7a409138fae4ef42e3a637e37cb9efedf96459e28a3c764fc4e855e8db9a7485",
    "zh:8070cf5224ed1ed3a3e9a59f7c30ff88bf071c7567165275d477c1738a56c064",
    "zh:894439ef340a9a79f69cd759e27ad11c7826adeca27be1b1ca82b3c9702fa300",
    "zh:89d035eebf08a97c89374ff06040955ddc09f275ecca609d0c9d58d149bef5cf",
    "zh:985b1145d724fc1f38369099e4a5087141885740fd6c0b1dbc492171e73c2e49",
    "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
    "zh:a80b47ae8d1475201c86bd94a5dcb9dd4da5e8b73102a90820b68b66b76d50fd",
    "zh:d3395be1556210f82199b9166a6b2e677cee9c4b67e96e63f6c3a98325ad7ab0",
    "zh:db0b869d09657f6f1e4110b56093c5fcdf9dbdd97c020db1e577b239c0adcbce",
    "zh:ffc72e680370ae7c21f9bd3082c6317730df805c6797427839a6b6b7e9a26a01",
  ]
}

infra/ecr/main.tf (new file, 32 lines)
@@ -0,0 +1,32 @@
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.27"
    }
  }
}

provider "aws" {
  region = var.aws_region
}

resource "aws_ecr_repository" "app" {
  name                 = var.repository_name
  image_tag_mutability = "MUTABLE" # or "IMMUTABLE"

  image_scanning_configuration {
    scan_on_push = true
  }

  encryption_configuration {
    encryption_type = "AES256" # or "KMS" for customer managed keys
  }

  tags = {
    Name        = var.repository_name
    Environment = var.environment
    ManagedBy   = "Terraform"
  }
}

infra/ecr/outputs.tf (new file, 14 lines)
@@ -0,0 +1,14 @@
output "repository_url" {
  description = "ECR repository URL"
  value       = aws_ecr_repository.app.repository_url
}

output "repository_arn" {
  description = "ECR repository ARN"
  value       = aws_ecr_repository.app.arn
}

output "repository_name" {
  description = "ECR repository name"
  value       = aws_ecr_repository.app.name
}

infra/ecr/terraform.tfstate (new file, 75 lines)
@@ -0,0 +1,75 @@
{
  "version": 4,
  "terraform_version": "1.14.3",
  "serial": 6,
  "lineage": "b2b2331d-cf66-169e-d25b-38e0528505fc",
  "outputs": {
    "repository_arn": {
      "value": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
      "type": "string"
    },
    "repository_name": {
      "value": "upflux-doc-analyser",
      "type": "string"
    },
    "repository_url": {
      "value": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
      "type": "string"
    }
  },
  "resources": [
    {
      "mode": "managed",
      "type": "aws_ecr_repository",
      "name": "app",
      "provider": "provider[\"registry.terraform.io/hashicorp/aws\"]",
      "instances": [
        {
          "schema_version": 0,
          "attributes": {
            "arn": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
            "encryption_configuration": [
              {
                "encryption_type": "AES256",
                "kms_key": ""
              }
            ],
            "force_delete": null,
            "id": "upflux-doc-analyser",
            "image_scanning_configuration": [
              {
                "scan_on_push": true
              }
            ],
            "image_tag_mutability": "MUTABLE",
            "image_tag_mutability_exclusion_filter": [],
            "name": "upflux-doc-analyser",
            "region": "us-east-2",
            "registry_id": "232048051668",
            "repository_url": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
            "tags": {
              "Environment": "dev",
              "ManagedBy": "Terraform",
              "Name": "upflux-doc-analyser"
            },
            "tags_all": {
              "Environment": "dev",
              "ManagedBy": "Terraform",
              "Name": "upflux-doc-analyser"
            },
            "timeouts": null
          },
          "sensitive_attributes": [],
          "identity_schema_version": 0,
          "identity": {
            "account_id": "232048051668",
            "name": "upflux-doc-analyser",
            "region": "us-east-2"
          },
          "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiZGVsZXRlIjoxMjAwMDAwMDAwMDAwfX0="
        }
      ]
    }
  ],
  "check_results": null
}

infra/ecr/terraform.tfvars (new file, 3 lines)
@@ -0,0 +1,3 @@
aws_region      = "us-east-2"
repository_name = "upflux-doc-analyser"
environment     = "dev"

infra/ecr/variable.tf (new file, 16 lines)
@@ -0,0 +1,16 @@
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

variable "repository_name" {
  description = "Name of the ECR repository"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

infra/ecs_alb/main.tf (new file, 249 lines)
@@ -0,0 +1,249 @@
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.27"
    }
  }
}

provider "aws" {
  region = var.aws_region
}

# Get current AWS account
data "aws_caller_identity" "current" {}

# Reference existing VPC
data "aws_vpc" "existing" {
  id = var.vpc_id
}

# Reference existing public subnets
data "aws_subnet" "public" {
  count = length(var.public_subnet_ids)
  id    = var.public_subnet_ids[count.index]
}

# Reference existing private subnets (for ECS tasks)
data "aws_subnet" "private" {
  count = length(var.private_subnet_ids)
  id    = var.private_subnet_ids[count.index]
}

# Security Group for ALB (in public subnets)
resource "aws_security_group" "alb" {
  name        = "${var.app_name}-alb-sg"
  description = "Allow inbound traffic to ALB"
  vpc_id      = data.aws_vpc.existing.id

  ingress {
    from_port   = 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["3.14.44.224/32"]
    description = "Allow HTTP from the allowed source IP only"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound"
  }

  tags = {
    Name = "${var.app_name}-alb-sg"
  }
}

# Security Group for ECS Tasks (in private subnets)
resource "aws_security_group" "ecs_tasks" {
  name        = "${var.app_name}-ecs-tasks-sg"
  description = "Allow inbound traffic from ALB"
  vpc_id      = data.aws_vpc.existing.id

  ingress {
    from_port       = 8000
    to_port         = 8000
    protocol        = "tcp"
    security_groups = [aws_security_group.alb.id]
    description     = "Allow traffic from ALB"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound"
  }

  tags = {
    Name = "${var.app_name}-ecs-tasks-sg"
  }
}

# Application Load Balancer (in public subnets)
resource "aws_lb" "main" {
  name               = "${var.app_name}-alb"
  internal           = false
  load_balancer_type = "application"
  security_groups    = [aws_security_group.alb.id]
  subnets            = var.public_subnet_ids

  enable_deletion_protection = false

  tags = {
    Name = "${var.app_name}-alb"
  }
}

# Target Group
resource "aws_lb_target_group" "app" {
  name        = "${var.app_name}-tg"
  port        = 8000
  protocol    = "HTTP"
  vpc_id      = data.aws_vpc.existing.id
  target_type = "ip"

  health_check {
    enabled             = true
    healthy_threshold   = 2
    interval            = 30
    matcher             = "200"
    path                = "/health"
    port                = "traffic-port"
    protocol            = "HTTP"
    timeout             = 5
    unhealthy_threshold = 3
  }

  deregistration_delay = 30

  tags = {
    Name = "${var.app_name}-tg"
  }
}

# ALB Listener
resource "aws_lb_listener" "app" {
  load_balancer_arn = aws_lb.main.arn
  port              = "80"
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.app.arn
  }
}

# ECS Cluster
resource "aws_ecs_cluster" "main" {
  name = "${var.app_name}-cluster"

  tags = {
    Name = "${var.app_name}-cluster"
  }
}

# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "app" {
  name              = "/ecs/${var.app_name}"
  retention_in_days = 7

  tags = {
    Name = "${var.app_name}-logs"
  }
}

# ECS Task Execution Role
resource "aws_iam_role" "ecs_task_execution_role" {
  name = "${var.app_name}-ecs-task-execution-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ecs-tasks.amazonaws.com"
      }
    }]
  })
}

resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" {
  role       = aws_iam_role.ecs_task_execution_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

# ECS Task Definition
resource "aws_ecs_task_definition" "app" {
  family                   = var.app_name
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = var.fargate_cpu
  memory                   = var.fargate_memory
  execution_role_arn       = aws_iam_role.ecs_task_execution_role.arn

  container_definitions = jsonencode([{
    name  = var.app_name
    image = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${var.aws_region}.amazonaws.com/${var.ecr_repository_name}:${var.image_tag}"

    portMappings = [{
      containerPort = 8000
      hostPort      = 8000
      protocol      = "tcp"
    }]

    logConfiguration = {
      logDriver = "awslogs"
      options = {
        "awslogs-group"         = aws_cloudwatch_log_group.app.name
        "awslogs-region"        = var.aws_region
        "awslogs-stream-prefix" = "ecs"
      }
    }

    healthCheck = {
      command     = ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval    = 30
      timeout     = 5
      retries     = 3
      startPeriod = 60
    }
  }])

  tags = {
    Name = "${var.app_name}-task"
  }
}

# ECS Service (tasks in private subnets)
resource "aws_ecs_service" "app" {
  name            = "${var.app_name}-service"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.app.arn
  desired_count   = var.app_count
  launch_type     = "FARGATE"

  network_configuration {
    security_groups  = [aws_security_group.ecs_tasks.id]
    subnets          = var.private_subnet_ids # ECS tasks in private subnets
    assign_public_ip = false # No public IP needed with NAT gateway
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.app.arn
    container_name   = var.app_name
    container_port   = 8000
  }

  depends_on = [aws_lb_listener.app]

  tags = {
    Name = "${var.app_name}-service"
  }
}

infra/ecs_alb/terraform.tfvars (new file, 23 lines)
@@ -0,0 +1,23 @@
aws_region = "us-east-2"
app_name   = "upflux-doc-analyser"

# Replace these with your actual IDs
vpc_id = "vpc-0270f02aee3bf1b8d"

# Your public subnets (where the ALB will be)
public_subnet_ids = [
  "subnet-088bc49c54ec8f028", # Public subnet 1
  "subnet-003f1693910a99afb"  # Public subnet 2
]

# Your private subnets (where the ECS tasks will run)
private_subnet_ids = [
  "subnet-045f73d784beed091", # Private subnet 1
  "subnet-06e660f44bf141442"  # Private subnet 2
]

ecr_repository_name = "upflux-doc-analyser"
image_tag           = "latest"
fargate_cpu         = "256"
fargate_memory      = "512"
app_count           = 1

infra/ecs_alb/variables.tf (new file, 56 lines)
@@ -0,0 +1,56 @@
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

variable "app_name" {
  description = "Application name"
  type        = string
  default     = "fastapi-app"
}

variable "vpc_id" {
  description = "Existing VPC ID"
  type        = string
}

variable "public_subnet_ids" {
  description = "List of existing public subnet IDs (for ALB)"
  type        = list(string)
}

variable "private_subnet_ids" {
  description = "List of existing private subnet IDs (for ECS tasks)"
  type        = list(string)
}

variable "ecr_repository_name" {
  description = "ECR repository name"
  type        = string
  default     = "fastapi-app"
}

variable "image_tag" {
  description = "Docker image tag"
  type        = string
  default     = "latest"
}

variable "fargate_cpu" {
  description = "Fargate CPU units"
  type        = string
  default     = "256"
}

variable "fargate_memory" {
  description = "Fargate memory in MB"
  type        = string
  default     = "512"
}

variable "app_count" {
  description = "Number of tasks to run"
  type        = number
  default     = 1
}

infra/terraform.tfvars (new file, 3 lines)
@@ -0,0 +1,3 @@
aws_region      = "us-east-2"
repository_name = "upflux-doc-analyser"
environment     = "dev"

scripts/process_images_batch.py (new executable file, 404 lines)
@@ -0,0 +1,404 @@
#!/usr/bin/env python3
"""
Batch process images from S3 using AWS Textract.
Iterates through folders (prefixes) in an S3 bucket and processes any PDF, PNG, or JPEG files
that haven't been processed yet (checking for existing Textract output files).
Saves both JSON and plain text outputs locally.
"""

import boto3
import json
import sys
import os
import io
from pathlib import Path
from typing import Dict, List, Optional
import time

from PyPDF2 import PdfReader


def get_s3_client():
    """Initialize and return an AWS S3 client."""
    return boto3.client('s3', region_name="us-east-2")


def get_textract_client():
    """Initialize and return an AWS Textract client."""
    return boto3.client('textract', region_name="us-east-2")


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Get the number of pages in a PDF file.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        return len(pdf_reader.pages)
    except Exception as e:
        print(f"  Warning: Could not determine page count: {str(e)}")
        return 1


def is_already_processed(s3_key: str, output_dir: Path) -> bool:
    """
    Check if an image has already been processed by looking for its output file.

    Args:
        s3_key: S3 object key
        output_dir: Directory where output files are stored

    Returns:
        bool: True if the output file exists, False otherwise
    """
    filename = Path(s3_key).stem
    output_file = output_dir / f"{filename}_textract.json"
    return output_file.exists()


def process_image_from_s3(bucket_name: str, s3_key: str) -> Optional[Dict]:
    """
    Process an image file from S3 with AWS Textract.
    Supports PDF, PNG, and JPEG formats.
    Uses the async API (start_document_text_detection) for multi-page PDFs,
    and the sync API (detect_document_text) for single-page PDFs and images.

    Args:
        bucket_name: S3 bucket name
        s3_key: S3 object key

    Returns:
        dict: Textract response containing detected text, or None if processing failed
    """
    textract = get_textract_client()
    s3 = get_s3_client()

    try:
        # Verify the object exists first
        try:
            s3.head_object(Bucket=bucket_name, Key=s3_key)
        except Exception as e:
            print(f"  Error accessing S3 object: {str(e)}")
            print(f"  Bucket: {bucket_name}")
            print(f"  Key: {s3_key}")
            return None

        file_ext = Path(s3_key).suffix.lower()

        # For images (PNG, JPEG), always use the sync API
        if file_ext in ['.png', '.jpg', '.jpeg']:
            print("  Processing image with sync API")
            response = textract.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': bucket_name,
                        'Name': s3_key
                    }
                }
            )
            return response

        # For PDFs, check the page count to decide which API to use
        if file_ext == '.pdf':
            # Download the PDF to check the page count
            response = s3.get_object(Bucket=bucket_name, Key=s3_key)
            pdf_bytes = response['Body'].read()
            page_count = get_pdf_page_count(pdf_bytes)

            print(f"  PDF has {page_count} page(s)")

            # Use the async API for multi-page PDFs
            if page_count > 1:
                print("  Using async API (start_document_text_detection) for multi-page PDF")
                response = textract.start_document_text_detection(
                    DocumentLocation={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': s3_key
                        }
                    }
                )
                job_id = response['JobId']
                print(f"  Started async job: {job_id}")

                # Wait for the job to complete
                while True:
                    result = textract.get_document_text_detection(JobId=job_id)
                    status = result['JobStatus']

                    if status == 'SUCCEEDED':
                        return result
                    elif status == 'FAILED':
                        print(f"  Job failed: {result.get('StatusMessage', 'Unknown error')}")
                        return None

                    time.sleep(2)
            else:
                # Use the sync API for single-page PDFs
                print("  Using sync API (detect_document_text) for single-page PDF")
                response = textract.detect_document_text(
                    Document={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': s3_key
                        }
                    }
                )
                return response

    except Exception as e:
        print(f"  Error processing {s3_key}: {str(e)}")
        return None


def extract_text_from_response(response: Dict) -> str:
    """
    Extract plain text from a Textract response.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text
    """
    if not response:
        return ""

    text_lines = []
    for block in response.get('Blocks', []):
        if block['BlockType'] == 'LINE':
            text_lines.append(block['Text'])

    return '\n'.join(text_lines)


def save_textract_output(s3_key: str, response: Dict, output_dir: Path):
    """
    Save the Textract response to a JSON file and a plain text file locally.

    Args:
        s3_key: S3 object key
        response: Textract API response
        output_dir: Directory to save output files
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    filename = Path(s3_key).stem

    # Extract text
    extracted_text = extract_text_from_response(response)

    # Save JSON output
    json_output_file = output_dir / f"{filename}_textract.json"
    if response:
        response['extracted_text'] = extracted_text
        response['source_s3_key'] = s3_key

    with open(json_output_file, 'w', encoding='utf-8') as f:
        json.dump(response, f, indent=2, ensure_ascii=False)

    print(f"  ✓ Saved JSON to: {json_output_file.name}")

    # Save plain text output
    text_output_file = output_dir / f"{filename}.txt"
    with open(text_output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    print(f"  ✓ Saved text to: {text_output_file.name}")


def get_supported_images_from_s3(bucket_name: str, prefix: str) -> List[str]:
    """
    Get the list of supported image files in an S3 prefix (folder).
    Filters out files containing 'script' (case-insensitive).

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)

    Returns:
        List of S3 keys for supported image files
    """
    s3 = get_s3_client()
    supported_extensions = {'.pdf', '.png', '.jpg', '.jpeg'}
    images = []

    # Ensure the prefix ends with / if it's not empty
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')

    for page in pages:
        for obj in page.get('Contents', []):
            key = obj['Key']
            file_path = Path(key)

            # Check that it's a file (not a folder) with a supported extension
            if file_path.suffix.lower() in supported_extensions:
                # Filter out files containing 'script' (case-insensitive)
                if 'script' not in file_path.name.lower():
                    images.append(key)

    return sorted(images)


def get_folders_from_s3(bucket_name: str, base_prefix: str = '') -> List[str]:
    """
    Get the list of folders (prefixes) in an S3 bucket.

    Args:
        bucket_name: S3 bucket name
        base_prefix: Base prefix to search under

    Returns:
        List of folder prefixes
    """
    s3 = get_s3_client()
    folders = []

    # Ensure the prefix ends with / if it's not empty
    if base_prefix and not base_prefix.endswith('/'):
        base_prefix += '/'

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=base_prefix, Delimiter='/')

    for page in pages:
        for prefix_info in page.get('CommonPrefixes', []):
            folders.append(prefix_info['Prefix'])

    return folders


def process_folder(bucket_name: str, prefix: str, output_base_dir: Path, skip_existing: bool = True):
    """
    Process all images in an S3 folder (prefix).

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)
        output_base_dir: Base directory for output files
        skip_existing: Whether to skip already processed files
    """
    folder_name = prefix.rstrip('/').split('/')[-1] or 'root'
    output_dir = output_base_dir / folder_name

    print(f"\n{'='*80}")
    print(f"Processing folder: {prefix}")
    print(f"{'='*80}")

    images = get_supported_images_from_s3(bucket_name, prefix)

    if not images:
        print("  No supported images found (PDF, PNG, JPEG)")
        return

    print(f"  Found {len(images)} image(s)")

    processed_count = 0
    skipped_count = 0
    error_count = 0

    for s3_key in images:
        filename = Path(s3_key).name
        print(f"\n  Processing: {filename}")

        # Check if already processed
        if skip_existing and is_already_processed(s3_key, output_dir):
            print("  ⊘ Skipped (already processed)")
            skipped_count += 1
            continue

        # Process with Textract
        response = process_image_from_s3(bucket_name, s3_key)

        if response:
            # Save output (both JSON and text)
            save_textract_output(s3_key, response, output_dir)

            # Print summary
            num_blocks = len(response.get('Blocks', []))
            text_length = len(extract_text_from_response(response))
            print(f"  ℹ Extracted {text_length} characters, {num_blocks} blocks")

            processed_count += 1

            # Small delay to avoid rate limiting
            time.sleep(0.5)
        else:
            error_count += 1

    print(f"\n  Summary for {folder_name}:")
    print(f"    Processed: {processed_count}")
    print(f"    Skipped: {skipped_count}")
    print(f"    Errors: {error_count}")


def main():
    """Main entry point for the script."""
    # Get the bucket name from the environment or command line
    bucket_name = os.environ.get('S3_BUCKET_NAME')
    base_prefix = os.environ.get('S3_BASE_PREFIX', 'imagens')

    if len(sys.argv) > 1:
        bucket_name = sys.argv[1]
    if len(sys.argv) > 2:
        base_prefix = sys.argv[2]

    if not bucket_name:
        print("Error: S3 bucket name not provided.")
        print("\nUsage:")
        print("  python process_images_batch.py <bucket_name> [base_prefix]")
        print("\nOr set environment variables:")
        print("  export S3_BUCKET_NAME=my-bucket")
        print("  export S3_BASE_PREFIX=imagens")
        print("  python process_images_batch.py")
        sys.exit(1)

    # Get the output directory
    script_dir = Path(__file__).parent
    output_base_dir = script_dir / "textract_output"

    print(f"S3 Bucket: {bucket_name}")
    print(f"Base prefix: {base_prefix}")
    print(f"Output directory: {output_base_dir}")

    # Get all folders (prefixes) in the bucket
    print("\nScanning S3 bucket for folders...")
    folders = get_folders_from_s3(bucket_name, base_prefix)

    if not folders:
        print(f"\nNo subdirectories found under '{base_prefix}'.")
        print("Processing files in the base prefix instead...")
        folders = [base_prefix]
    else:
        print(f"\nFound {len(folders)} folder(s) to process")

    # Process each folder
    total_start = time.time()

    for prefix in folders:
        try:
            process_folder(bucket_name, prefix, output_base_dir)
        except Exception as e:
            print(f"\nError processing folder {prefix}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    total_time = time.time() - total_start

    print(f"\n{'='*80}")
    print("Batch processing complete!")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"{'='*80}")


if __name__ == '__main__':
    main()

scripts/textract.py (new file, 209 lines)
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF file.
Extracts text and returns the detected content.
"""

import boto3
import sys
import io
from pathlib import Path

from PyPDF2 import PdfReader


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Get the number of pages in a PDF file.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        return len(pdf_reader.pages)
    except Exception as e:
        print(f"Warning: Could not determine page count: {str(e)}")
        return 1


def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.
    Uses the async API (start_document_text_detection) for multi-page PDFs,
    and the sync API (detect_document_text) for single-page PDFs and images.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if the document is in S3

    Returns:
        dict: Textract response containing detected text
    """
    textract = boto3.client('textract')
    file_ext = Path(pdf_path).suffix.lower()

    # For images (PNG, JPEG), always use the sync API
    if file_ext in ['.png', '.jpg', '.jpeg']:
        print("Processing image file with sync API")

        if bucket_name:
            response = textract.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': bucket_name,
                        'Name': pdf_path
                    }
                }
            )
        else:
            with open(pdf_path, 'rb') as file:
                file_bytes = file.read()
            response = textract.detect_document_text(
                Document={'Bytes': file_bytes}
            )

        return response

    # For PDFs, check the page count to decide which API to use
    if file_ext == '.pdf':
        s3 = boto3.client('s3')

        # Determine the number of pages
        if bucket_name:
            # Download the PDF from S3 to check the page count
            response = s3.get_object(Bucket=bucket_name, Key=pdf_path)
            pdf_bytes = response['Body'].read()
        else:
            # Read the local PDF
            with open(pdf_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

        page_count = get_pdf_page_count(pdf_bytes)
        print(f"PDF has {page_count} page(s)")

        # Use the async API for multi-page PDFs
        if page_count > 1:
            print("Using async API (start_document_text_detection) for multi-page PDF")

            if bucket_name:
                # Process from S3
                response = textract.start_document_text_detection(
                    DocumentLocation={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': pdf_path
                        }
                    }
                )
            else:
                # For local files with multiple pages, we need to use S3
                # Note: the Textract async API requires S3
                raise ValueError(
                    "Multi-page PDFs must be processed from S3. "
                    "Please upload the file to S3 first."
                )

            job_id = response['JobId']
            print(f"Started Textract job: {job_id}")

            # Wait for the job to complete
            import time
            while True:
                result = textract.get_document_text_detection(JobId=job_id)
                status = result['JobStatus']
                print(f"Job status: {status}")

                if status in ['SUCCEEDED', 'FAILED']:
                    break
                time.sleep(2)

            return result
        else:
            # Use the sync API for single-page PDFs
            print("Using sync API (detect_document_text) for single-page PDF")

            if bucket_name:
                response = textract.detect_document_text(
                    Document={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': pdf_path
                        }
                    }
                )
            else:
                response = textract.detect_document_text(
                    Document={'Bytes': pdf_bytes}
                )

            return response

    # Unsupported file type
    raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")


def extract_text_from_response(response: dict) -> str:
    """
    Extract plain text from a Textract response.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text
    """
    text_lines = []

    for block in response.get('Blocks', []):
        if block['BlockType'] == 'LINE':
            text_lines.append(block['Text'])

    return '\n'.join(text_lines)


def main():
    if len(sys.argv) < 2:
        print("Usage: python textract.py <pdf_path> [s3_bucket]")
        print("\nExamples:")
        print("  python textract.py document.pdf")
        print("  python textract.py path/to/doc.pdf my-bucket")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process the PDF
    response = process_pdf_with_textract(pdf_path, bucket_name)

    # Extract and display the text
    text = extract_text_from_response(response)

    print("\n" + "="*80)
    print("EXTRACTED TEXT")
    print("="*80)
    print(text)
    print("="*80)

    # Print a summary
    num_blocks = len(response.get('Blocks', []))
    num_pages = len(set(b.get('Page', 1) for b in response.get('Blocks', [])))

    print(f"\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {num_blocks}")
    print(f"  Text length: {len(text)} characters")


if __name__ == '__main__':
    main()
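
Usage note (not part of the committed files): a minimal sketch of how the helpers in scripts/textract.py could be reused from another script, assuming the scripts directory is on the Python path and AWS credentials are configured; the file name "exam_request.pdf" is a hypothetical example.

# Hypothetical reuse of the helpers defined in scripts/textract.py above.
from textract import process_pdf_with_textract, extract_text_from_response

response = process_pdf_with_textract("exam_request.pdf")  # local single-page PDF uses the sync API
print(extract_text_from_response(response))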

scripts/tojson.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import pandas as pd
import json
from pathlib import Path

# Configuration
CSV_FILE = "guias.csv"
OUTPUT_DIR = "json_output"
ENCODING = "utf-8"

# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# Read CSV
df = pd.read_csv(CSV_FILE, encoding=ENCODING)

# Convert each row to JSON, skipping the first row (index 0)
for index, row in df.iterrows():
    # Skip the first row (index 0)
    if index == 0:
        print(f"⊗ Skipped row {index + 1}")
        continue

    # Save to an individual JSON file
    output_file = f"{OUTPUT_DIR}/row_{index + 1}.json"
    with open(output_file, 'w', encoding=ENCODING) as json_file:
        json.dump(row.to_dict(), json_file, indent=2, ensure_ascii=False)

    print(f"✓ Created {output_file}")

print(f"\nDone! Created {len(df) - 1} JSON files in '{OUTPUT_DIR}/' directory")