Adds initial files

2026-01-20 13:48:13 -03:00
parent 12176c50c1
commit 6026870c5c
16 changed files with 1316 additions and 0 deletions

code/app.py Normal file

@@ -0,0 +1,156 @@
"""
Simple LangGraph agent using the @tool decorator.
Clean implementation with decorator-based tool definitions.
"""
from typing import Annotated, TypedDict

import uvicorn
from fastapi import FastAPI, Header
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.tools import tool
from langchain_aws import ChatBedrock

app = FastAPI()
# Approval rules injected into the system prompt
rules = """- Women over 45 years old or postmenopausal;
- Men over 70 years old;
- Osteogenesis imperfecta (for this condition, two (02) exams per year may be
authorized - one every 180 days);
- X-ray showing osteopenia or a pathological fracture;
- Personal history of fracture after age 40: wrist, shoulders, vertebrae, hip;
- First-degree relative with osteoporosis;
- Women with body mass index < 20 kg/m2 or weight < 57.8 kg;
- Menopause before age 45 or chronic hypogonadism (premature ovarian
failure);
- Glucocorticoid use (>= 7.5 mg/day of prednisone or equivalent for more than
three months), or presence of Cushing's syndrome;
- Primary hyperparathyroidism;
- Prolonged use of anticonvulsants (< 10 years);
- Chronic malabsorption syndrome or malnutrition, inflammatory bowel disease
(regardless of cause: bariatric surgery, celiac disease, lactose intolerance);
- Chemotherapy, if the expected survival is long (< 5 years);
- Documented loss of height;
- Kyphosis after menopause;
- Prolonged immobilization"""
SYSTEM_PROMPT = (
    "You are an assistant. Your job is to approve or reject a procedure "
    "based on the following rules:\n" + rules + "\n"
    "Your input will be a JSON document. Evaluate it against the rules and return:\n"
    "Approved: if at least one of the criteria is met.\n"
    "Reproved: if none of the criteria are met."
)
# Define tools using @tool decorator
@tool
def add(a: int, b: int) -> int:
"""Add two numbers together.
Args:
a: First number
b: Second number
"""
return a + b
@tool
def multiply(a: int, b: int) -> int:
"""Multiply two numbers.
Args:
a: First number
b: Second number
"""
return a * b
@tool
def get_word_length(word: str) -> int:
"""Get the length of a word.
Args:
word: The word to measure
"""
return len(word)
@tool
def search_info(topic: str) -> str:
"""Search for information about a topic (mock implementation).
Args:
topic: The topic to search for
"""
# Mock response - replace with actual search/API
return f"Information about {topic}: This is a mock response. In production, this would return real data."
# Define agent state
class AgentState(TypedDict):
messages: Annotated[list, add_messages]
# Define tools list
tools = [add, multiply, get_word_length, search_info]
# Agent node
def call_model(state: AgentState):
"""Call the LLM with current state and tools."""
model = ChatBedrock(
model_id="arn:aws:bedrock:us-east-2:232048051668:application-inference-profile/uy4xskop19zn",
region_name="us-east-2",
provider="anthropic"
)
model_with_tools = model.bind_tools(tools)
messages = [
SystemMessage(content=SYSTEM_PROMPT)
] + state["messages"]
response = model_with_tools.invoke(messages)
return {"messages": [response]}
# Build the graph
def create_agent():
"""Create and compile the agent graph."""
workflow = StateGraph(AgentState)
# Add nodes
workflow.add_node("agent", call_model)
workflow.add_node("tools", ToolNode(tools))
# Add edges
workflow.add_edge(START, "agent")
workflow.add_conditional_edges("agent", tools_condition)
workflow.add_edge("tools", "agent")
return workflow.compile()
# Main endpoint: the payload arrives in the "json" request header
@app.post("/")
async def root(payload: str = Header(..., alias="json")):
    agent = create_agent()
    result = agent.invoke(
        {"messages": [HumanMessage(content=payload)]},
        config={"recursion_limit": 10}
    )
    final_message = result["messages"][-1]
    return {"status": "success", "message": final_message.content}
@app.get("/health")
async def health():
return {"status": "healthy"}
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)
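
For reference, a minimal client sketch (not part of the commit): it assumes the service is running locally on port 8000 and that `requests` is available. Note that the payload travels in a custom `json` request header, not the request body, and the fields shown are purely illustrative.

import requests

# Hypothetical payload; the prompt does not mandate a fixed schema
payload = '{"sexo": "F", "idade": 52, "procedimento": "densitometria ossea"}'

resp = requests.post(
    "http://localhost:8000/",
    headers={"json": payload},  # the endpoint reads this header
    timeout=60,
)
print(resp.json())  # e.g. {"status": "success", "message": "Approved: ..."}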

code/dockerfile Normal file

@@ -0,0 +1,15 @@
FROM python:3.12
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install curl for health checks
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
COPY app.py .
EXPOSE 8000
CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

code/requirements.txt Normal file

@@ -0,0 +1,6 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
langgraph
langchain-aws
langchain
PyPDF2

infra/.terraform.lock.hcl generated Normal file

@@ -0,0 +1,25 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.
provider "registry.terraform.io/hashicorp/aws" {
version = "6.27.0"
constraints = "~> 6.27"
hashes = [
"h1:bixp2PSsP5ZGBczGCxcbSDn6lF5QFlUXlNroq9cdab4=",
"zh:177a24b806c72e8484b5cabc93b2b38e3d770ae6f745a998b54d6619fd0e8129",
"zh:4ac4a85c14fb868a3306b542e6a56c10bd6c6d5a67bc0c9b8f6a9060cf5f3be7",
"zh:552652185bc85c8ba1da1d65dea47c454728a5c6839c458b6dcd3ce71c19ccfc",
"zh:60284b8172d09aee91eae0856f09855eaf040ce3a58d6933602ae17c53f8ed04",
"zh:6be38d156756ca61fb8e7c752cc5d769cd709686700ac4b230f40a6e95b5dbc9",
"zh:7a409138fae4ef42e3a637e37cb9efedf96459e28a3c764fc4e855e8db9a7485",
"zh:8070cf5224ed1ed3a3e9a59f7c30ff88bf071c7567165275d477c1738a56c064",
"zh:894439ef340a9a79f69cd759e27ad11c7826adeca27be1b1ca82b3c9702fa300",
"zh:89d035eebf08a97c89374ff06040955ddc09f275ecca609d0c9d58d149bef5cf",
"zh:985b1145d724fc1f38369099e4a5087141885740fd6c0b1dbc492171e73c2e49",
"zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
"zh:a80b47ae8d1475201c86bd94a5dcb9dd4da5e8b73102a90820b68b66b76d50fd",
"zh:d3395be1556210f82199b9166a6b2e677cee9c4b67e96e63f6c3a98325ad7ab0",
"zh:db0b869d09657f6f1e4110b56093c5fcdf9dbdd97c020db1e577b239c0adcbce",
"zh:ffc72e680370ae7c21f9bd3082c6317730df805c6797427839a6b6b7e9a26a01",
]
}

infra/ecr/main.tf Normal file

@@ -0,0 +1,32 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 6.27"
}
}
}
provider "aws" {
region = var.aws_region
}
resource "aws_ecr_repository" "app" {
name = var.repository_name
image_tag_mutability = "MUTABLE" # or "IMMUTABLE"
image_scanning_configuration {
scan_on_push = true
}
encryption_configuration {
encryption_type = "AES256" # or "KMS" for customer managed keys
}
tags = {
Name = var.repository_name
Environment = var.environment
ManagedBy = "Terraform"
}
}

infra/ecr/outputs.tf Normal file

@@ -0,0 +1,14 @@
output "repository_url" {
description = "ECR repository URL"
value = aws_ecr_repository.app.repository_url
}
output "repository_arn" {
description = "ECR repository ARN"
value = aws_ecr_repository.app.arn
}
output "repository_name" {
description = "ECR repository name"
value = aws_ecr_repository.app.name
}

infra/ecr/terraform.tfstate generated Normal file

@@ -0,0 +1,75 @@
{
"version": 4,
"terraform_version": "1.14.3",
"serial": 6,
"lineage": "b2b2331d-cf66-169e-d25b-38e0528505fc",
"outputs": {
"repository_arn": {
"value": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
"type": "string"
},
"repository_name": {
"value": "upflux-doc-analyser",
"type": "string"
},
"repository_url": {
"value": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
"type": "string"
}
},
"resources": [
{
"mode": "managed",
"type": "aws_ecr_repository",
"name": "app",
"provider": "provider[\"registry.terraform.io/hashicorp/aws\"]",
"instances": [
{
"schema_version": 0,
"attributes": {
"arn": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
"encryption_configuration": [
{
"encryption_type": "AES256",
"kms_key": ""
}
],
"force_delete": null,
"id": "upflux-doc-analyser",
"image_scanning_configuration": [
{
"scan_on_push": true
}
],
"image_tag_mutability": "MUTABLE",
"image_tag_mutability_exclusion_filter": [],
"name": "upflux-doc-analyser",
"region": "us-east-2",
"registry_id": "232048051668",
"repository_url": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
"tags": {
"Environment": "dev",
"ManagedBy": "Terraform",
"Name": "upflux-doc-analyser"
},
"tags_all": {
"Environment": "dev",
"ManagedBy": "Terraform",
"Name": "upflux-doc-analyser"
},
"timeouts": null
},
"sensitive_attributes": [],
"identity_schema_version": 0,
"identity": {
"account_id": "232048051668",
"name": "upflux-doc-analyser",
"region": "us-east-2"
},
"private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiZGVsZXRlIjoxMjAwMDAwMDAwMDAwfX0="
}
]
}
],
"check_results": null
}

infra/ecr/terraform.tfvars Normal file

@@ -0,0 +1,3 @@
aws_region = "us-east-2"
repository_name = "upflux-doc-analyser"
environment = "dev"

infra/ecr/variable.tf Normal file

@@ -0,0 +1,16 @@
variable "aws_region" {
description = "AWS region"
type = string
default = "us-east-1"
}
variable "repository_name" {
description = "Name of the ECR repository"
type = string
}
variable "environment" {
description = "Environment name"
type = string
default = "dev"
}

infra/ecs_alb/main.tf Normal file

@@ -0,0 +1,249 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 6.27"
}
}
}
provider "aws" {
region = var.aws_region
}
# Get current AWS account
data "aws_caller_identity" "current" {}
# Reference existing VPC
data "aws_vpc" "existing" {
id = var.vpc_id
}
# Reference existing public subnets
data "aws_subnet" "public" {
count = length(var.public_subnet_ids)
id = var.public_subnet_ids[count.index]
}
# Reference existing private subnets (for ECS tasks)
data "aws_subnet" "private" {
count = length(var.private_subnet_ids)
id = var.private_subnet_ids[count.index]
}
# Security Group for ALB (in public subnets)
resource "aws_security_group" "alb" {
name = "${var.app_name}-alb-sg"
description = "Allow inbound traffic to ALB"
vpc_id = data.aws_vpc.existing.id
ingress {
from_port = 80
to_port = 80
protocol = "tcp"
cidr_blocks = ["3.14.44.224/32"]
description = "Allow HTTP from internet"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
description = "Allow all outbound"
}
tags = {
Name = "${var.app_name}-alb-sg"
}
}
# Security Group for ECS Tasks (in private subnets)
resource "aws_security_group" "ecs_tasks" {
name = "${var.app_name}-ecs-tasks-sg"
description = "Allow inbound traffic from ALB"
vpc_id = data.aws_vpc.existing.id
ingress {
from_port = 8000
to_port = 8000
protocol = "tcp"
security_groups = [aws_security_group.alb.id]
description = "Allow traffic from ALB"
}
egress {
from_port = 0
to_port = 0
protocol = "-1"
cidr_blocks = ["0.0.0.0/0"]
description = "Allow all outbound"
}
tags = {
Name = "${var.app_name}-ecs-tasks-sg"
}
}
# Application Load Balancer (in public subnets)
resource "aws_lb" "main" {
name = "${var.app_name}-alb"
internal = false
load_balancer_type = "application"
security_groups = [aws_security_group.alb.id]
subnets = var.public_subnet_ids
enable_deletion_protection = false
tags = {
Name = "${var.app_name}-alb"
}
}
# Target Group
resource "aws_lb_target_group" "app" {
name = "${var.app_name}-tg"
port = 8000
protocol = "HTTP"
vpc_id = data.aws_vpc.existing.id
target_type = "ip"
health_check {
enabled = true
healthy_threshold = 2
interval = 30
matcher = "200"
path = "/health"
port = "traffic-port"
protocol = "HTTP"
timeout = 5
unhealthy_threshold = 3
}
deregistration_delay = 30
tags = {
Name = "${var.app_name}-tg"
}
}
# ALB Listener
resource "aws_lb_listener" "app" {
load_balancer_arn = aws_lb.main.arn
port = "80"
protocol = "HTTP"
default_action {
type = "forward"
target_group_arn = aws_lb_target_group.app.arn
}
}
# ECS Cluster
resource "aws_ecs_cluster" "main" {
name = "${var.app_name}-cluster"
tags = {
Name = "${var.app_name}-cluster"
}
}
# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "app" {
name = "/ecs/${var.app_name}"
retention_in_days = 7
tags = {
Name = "${var.app_name}-logs"
}
}
# ECS Task Execution Role
resource "aws_iam_role" "ecs_task_execution_role" {
name = "${var.app_name}-ecs-task-execution-role"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "ecs-tasks.amazonaws.com"
}
}]
})
}
resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" {
role = aws_iam_role.ecs_task_execution_role.name
policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}
# ECS Task Definition
resource "aws_ecs_task_definition" "app" {
family = var.app_name
network_mode = "awsvpc"
requires_compatibilities = ["FARGATE"]
cpu = var.fargate_cpu
memory = var.fargate_memory
execution_role_arn = aws_iam_role.ecs_task_execution_role.arn
container_definitions = jsonencode([{
name = var.app_name
image = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${var.aws_region}.amazonaws.com/${var.ecr_repository_name}:${var.image_tag}"
portMappings = [{
containerPort = 8000
hostPort = 8000
protocol = "tcp"
}]
logConfiguration = {
logDriver = "awslogs"
options = {
"awslogs-group" = aws_cloudwatch_log_group.app.name
"awslogs-region" = var.aws_region
"awslogs-stream-prefix" = "ecs"
}
}
healthCheck = {
command = ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
interval = 30
timeout = 5
retries = 3
startPeriod = 60
}
}])
tags = {
Name = "${var.app_name}-task"
}
}
# ECS Service (tasks in private subnets)
resource "aws_ecs_service" "app" {
name = "${var.app_name}-service"
cluster = aws_ecs_cluster.main.id
task_definition = aws_ecs_task_definition.app.arn
desired_count = var.app_count
launch_type = "FARGATE"
network_configuration {
security_groups = [aws_security_group.ecs_tasks.id]
subnets = var.private_subnet_ids # ECS tasks in private subnets
assign_public_ip = false # No public IP needed with NAT gateway
}
load_balancer {
target_group_arn = aws_lb_target_group.app.arn
container_name = var.app_name
container_port = 8000
}
depends_on = [aws_lb_listener.app]
tags = {
Name = "${var.app_name}-service"
}
}

infra/ecs_alb/terraform.tfvars Normal file

@@ -0,0 +1,23 @@
aws_region = "us-east-2"
app_name = "upflux-doc-analyser"
# Replace these with your actual IDs
vpc_id = "vpc-0270f02aee3bf1b8d"
# Your public subnets (where ALB will be)
public_subnet_ids = [
"subnet-088bc49c54ec8f028", # Public subnet 1
"subnet-003f1693910a99afb" # Public subnet 2
]
# Your private subnets (where ECS tasks will run)
private_subnet_ids = [
"subnet-045f73d784beed091", # Private subnet 1
"subnet-06e660f44bf141442" # Private subnet 2
]
ecr_repository_name = "upflux-doc-analyser"
image_tag = "latest"
fargate_cpu = "256"
fargate_memory = "512"
app_count = 1
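
After `terraform apply`, a quick smoke test (a sketch, not part of the commit) can look the ALB up by its generated name and hit /health. It assumes boto3 credentials are configured and that the caller's IP matches the single CIDR allowed by the ALB security group (3.14.44.224/32).

import urllib.request
import boto3

# The ALB is named "${app_name}-alb", i.e. upflux-doc-analyser-alb with these tfvars
elbv2 = boto3.client("elbv2", region_name="us-east-2")
lb = elbv2.describe_load_balancers(Names=["upflux-doc-analyser-alb"])["LoadBalancers"][0]

with urllib.request.urlopen(f"http://{lb['DNSName']}/health", timeout=10) as r:
    print(r.read().decode())  # expected: {"status":"healthy"}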

infra/ecs_alb/variables.tf Normal file

@@ -0,0 +1,56 @@
variable "aws_region" {
description = "AWS region"
type = string
default = "us-east-1"
}
variable "app_name" {
description = "Application name"
type = string
default = "fastapi-app"
}
variable "vpc_id" {
description = "Existing VPC ID"
type = string
}
variable "public_subnet_ids" {
description = "List of existing public subnet IDs (for ALB)"
type = list(string)
}
variable "private_subnet_ids" {
description = "List of existing private subnet IDs (for ECS tasks)"
type = list(string)
}
variable "ecr_repository_name" {
description = "ECR repository name"
type = string
default = "fastapi-app"
}
variable "image_tag" {
description = "Docker image tag"
type = string
default = "latest"
}
variable "fargate_cpu" {
description = "Fargate CPU units"
type = string
default = "256"
}
variable "fargate_memory" {
description = "Fargate memory in MB"
type = string
default = "512"
}
variable "app_count" {
description = "Number of tasks to run"
type = number
default = 1
}

infra/terraform.tfvars Normal file

@@ -0,0 +1,3 @@
aws_region = "us-east-2"
repository_name = "upflux-doc-analyser"
environment = "dev"

scripts/process_images_batch.py Executable file

@@ -0,0 +1,404 @@
#!/usr/bin/env python3
"""
Batch process images from S3 using AWS Textract.
Iterates through folders (prefixes) in an S3 bucket and processes any PDF, PNG, or JPEG files
that haven't been processed yet (checking for existing textract output files).
Saves both JSON and plain text outputs locally.
"""
import boto3
import json
import sys
import os
import io
from pathlib import Path
from typing import Dict, List, Optional
import time
from PyPDF2 import PdfReader
def get_s3_client():
    """Initialize and return AWS S3 client."""
    return boto3.client('s3', region_name="us-east-2")
def get_textract_client():
    """Initialize and return AWS Textract client."""
    return boto3.client('textract', region_name="us-east-2")
def get_pdf_page_count(pdf_bytes: bytes) -> int:
"""
Get the number of pages in a PDF file.
Args:
pdf_bytes: PDF file content as bytes
Returns:
int: Number of pages in the PDF
"""
try:
pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
return len(pdf_reader.pages)
except Exception as e:
print(f" Warning: Could not determine page count: {str(e)}")
return 1
def is_already_processed(s3_key: str, output_dir: Path) -> bool:
"""
Check if an image has already been processed by looking for output file.
Args:
s3_key: S3 object key
output_dir: Directory where output files are stored
Returns:
bool: True if output file exists, False otherwise
"""
filename = Path(s3_key).stem
output_file = output_dir / f"{filename}_textract.json"
return output_file.exists()
def process_image_from_s3(bucket_name: str, s3_key: str) -> Dict:
"""
Process an image file from S3 with AWS Textract.
Supports PDF, PNG, and JPEG formats.
Uses async API (start_document_text_detection) for multi-page PDFs,
and sync API (detect_document_text) for single-page PDFs and images.
Args:
bucket_name: S3 bucket name
s3_key: S3 object key
Returns:
dict: Textract response containing detected text
"""
textract = get_textract_client()
s3 = get_s3_client()
try:
# Verify the object exists first
try:
s3.head_object(Bucket=bucket_name, Key=s3_key)
except Exception as e:
print(f" Error accessing S3 object: {str(e)}")
print(f" Bucket: {bucket_name}")
print(f" Key: {s3_key}")
return None
file_ext = Path(s3_key).suffix.lower()
# For images (PNG, JPEG), always use sync API
if file_ext in ['.png', '.jpg', '.jpeg']:
print(f" Processing image with sync API")
response = textract.detect_document_text(
Document={
'S3Object': {
'Bucket': bucket_name,
'Name': s3_key
}
}
)
return response
# For PDFs, check page count to decide which API to use
if file_ext == '.pdf':
# Download PDF to check page count
response = s3.get_object(Bucket=bucket_name, Key=s3_key)
pdf_bytes = response['Body'].read()
page_count = get_pdf_page_count(pdf_bytes)
print(f" PDF has {page_count} page(s)")
# Use async API for multi-page PDFs
if page_count > 1:
print(f" Using async API (start_document_text_detection) for multi-page PDF")
response = textract.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': bucket_name,
'Name': s3_key
}
}
)
job_id = response['JobId']
print(f" Started async job: {job_id}")
# Wait for job to complete
while True:
    result = textract.get_document_text_detection(JobId=job_id)
    status = result['JobStatus']
    if status == 'SUCCEEDED':
        # Textract paginates results: follow NextToken to collect all blocks
        next_token = result.get('NextToken')
        while next_token:
            page = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
            result['Blocks'].extend(page.get('Blocks', []))
            next_token = page.get('NextToken')
        return result
    elif status == 'FAILED':
        print(f" Job failed: {result.get('StatusMessage', 'Unknown error')}")
        return None
    time.sleep(2)
else:
# Use sync API for single-page PDFs
print(f" Using sync API (detect_document_text) for single-page PDF")
response = textract.detect_document_text(
Document={
'S3Object': {
'Bucket': bucket_name,
'Name': s3_key
}
}
)
return response
except Exception as e:
print(f" Error processing {s3_key}: {str(e)}")
return None
def extract_text_from_response(response: Dict) -> str:
"""
Extract plain text from Textract response.
Args:
response: Textract API response
Returns:
str: Extracted text
"""
if not response:
return ""
text_lines = []
for block in response.get('Blocks', []):
if block['BlockType'] == 'LINE':
text_lines.append(block['Text'])
return '\n'.join(text_lines)
def save_textract_output(s3_key: str, response: Dict, output_dir: Path):
"""
Save Textract response to JSON file and plain text file locally.
Args:
s3_key: S3 object key
response: Textract API response
output_dir: Directory to save output files
"""
output_dir.mkdir(parents=True, exist_ok=True)
filename = Path(s3_key).stem
# Extract text
extracted_text = extract_text_from_response(response)
# Save JSON output
json_output_file = output_dir / f"{filename}_textract.json"
if response:
response['extracted_text'] = extracted_text
response['source_s3_key'] = s3_key
with open(json_output_file, 'w', encoding='utf-8') as f:
json.dump(response, f, indent=2, ensure_ascii=False)
print(f" ✓ Saved JSON to: {json_output_file.name}")
# Save plain text output
text_output_file = output_dir / f"{filename}.txt"
with open(text_output_file, 'w', encoding='utf-8') as f:
f.write(extracted_text)
print(f" ✓ Saved text to: {text_output_file.name}")
def get_supported_images_from_s3(bucket_name: str, prefix: str) -> List[str]:
"""
Get list of supported image files in an S3 prefix (folder).
Filters out files containing 'script' (case-insensitive).
Args:
bucket_name: S3 bucket name
prefix: S3 prefix (folder path)
Returns:
List of S3 keys for supported image files
"""
s3 = get_s3_client()
supported_extensions = {'.pdf', '.png', '.jpg', '.jpeg'}
images = []
# Ensure prefix ends with / if it's not empty
if prefix and not prefix.endswith('/'):
prefix += '/'
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
for page in pages:
for obj in page.get('Contents', []):
key = obj['Key']
file_path = Path(key)
# Check if it's a file (not a folder) and has supported extension
if file_path.suffix.lower() in supported_extensions:
# Filter out files containing 'script' (case-insensitive)
if 'script' not in file_path.name.lower():
images.append(key)
return sorted(images)
def get_folders_from_s3(bucket_name: str, base_prefix: str = '') -> List[str]:
"""
Get list of folders (prefixes) in S3 bucket.
Args:
bucket_name: S3 bucket name
base_prefix: Base prefix to search under
Returns:
List of folder prefixes
"""
s3 = get_s3_client()
folders = []
# Ensure prefix ends with / if it's not empty
if base_prefix and not base_prefix.endswith('/'):
base_prefix += '/'
paginator = s3.get_paginator('list_objects_v2')
pages = paginator.paginate(Bucket=bucket_name, Prefix=base_prefix, Delimiter='/')
for page in pages:
for prefix_info in page.get('CommonPrefixes', []):
folders.append(prefix_info['Prefix'])
return folders
def process_folder(bucket_name: str, prefix: str, output_base_dir: Path, skip_existing: bool = True):
"""
Process all images in an S3 folder (prefix).
Args:
bucket_name: S3 bucket name
prefix: S3 prefix (folder path)
output_base_dir: Base directory for output files
skip_existing: Whether to skip already processed files
"""
folder_name = prefix.rstrip('/').split('/')[-1] or 'root'
output_dir = output_base_dir / folder_name
print(f"\n{'='*80}")
print(f"Processing folder: {prefix}")
print(f"{'='*80}")
images = get_supported_images_from_s3(bucket_name, prefix)
if not images:
print(f" No supported images found (PDF, PNG, JPEG)")
return
print(f" Found {len(images)} image(s)")
processed_count = 0
skipped_count = 0
error_count = 0
for s3_key in images:
filename = Path(s3_key).name
print(f"\n Processing: {filename}")
# Check if already processed
if skip_existing and is_already_processed(s3_key, output_dir):
print(f" ⊘ Skipped (already processed)")
skipped_count += 1
continue
# Process with Textract
response = process_image_from_s3(bucket_name, s3_key)
if response:
# Save output (both JSON and text)
save_textract_output(s3_key, response, output_dir)
# Print summary
num_blocks = len(response.get('Blocks', []))
text_length = len(extract_text_from_response(response))
print(f" Extracted {text_length} characters, {num_blocks} blocks")
processed_count += 1
# Small delay to avoid rate limiting
time.sleep(0.5)
else:
error_count += 1
print(f"\n Summary for {folder_name}:")
print(f" Processed: {processed_count}")
print(f" Skipped: {skipped_count}")
print(f" Errors: {error_count}")
def main():
"""Main entry point for the script."""
# Get bucket name from environment or command line
bucket_name = os.environ.get('S3_BUCKET_NAME')
base_prefix = os.environ.get('S3_BASE_PREFIX', 'imagens')
if len(sys.argv) > 1:
bucket_name = sys.argv[1]
if len(sys.argv) > 2:
base_prefix = sys.argv[2]
if not bucket_name:
print("Error: S3 bucket name not provided.")
print("\nUsage:")
print(" python process_images_batch.py <bucket_name> [base_prefix]")
print("\nOr set environment variables:")
print(" export S3_BUCKET_NAME=my-bucket")
print(" export S3_BASE_PREFIX=imagens")
print(" python process_images_batch.py")
sys.exit(1)
# Get output directory
script_dir = Path(__file__).parent
output_base_dir = script_dir / "textract_output"
print(f"S3 Bucket: {bucket_name}")
print(f"Base prefix: {base_prefix}")
print(f"Output directory: {output_base_dir}")
# Get all folders (prefixes) in the bucket
print(f"\nScanning S3 bucket for folders...")
folders = get_folders_from_s3(bucket_name, base_prefix)
if not folders:
print(f"\nNo subdirectories found under '{base_prefix}'.")
print("Processing files in the base prefix instead...")
folders = [base_prefix]
else:
print(f"\nFound {len(folders)} folder(s) to process")
# Process each folder
total_start = time.time()
for prefix in folders:
try:
process_folder(bucket_name, prefix, output_base_dir)
except Exception as e:
print(f"\nError processing folder {prefix}: {str(e)}")
import traceback
traceback.print_exc()
continue
total_time = time.time() - total_start
print(f"\n{'='*80}")
print(f"Batch processing complete!")
print(f"Total time: {total_time:.2f} seconds")
print(f"{'='*80}")
if __name__ == '__main__':
main()
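
Besides the CLI entry point, the helpers can be driven programmatically; a sketch (the bucket and prefix names are hypothetical) for reprocessing a single prefix:

from pathlib import Path

from process_images_batch import process_folder

# Already-processed files are detected via their <name>_textract.json outputs
process_folder(
    bucket_name="my-docs-bucket",   # hypothetical
    prefix="imagens/lote-01/",      # hypothetical
    output_base_dir=Path("textract_output"),
    skip_existing=True,
)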

scripts/textract.py Normal file

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF file.
Extracts text and returns the detected content.
"""
import boto3
import sys
import io
from pathlib import Path
from PyPDF2 import PdfReader
def get_pdf_page_count(pdf_bytes: bytes) -> int:
"""
Get the number of pages in a PDF file.
Args:
pdf_bytes: PDF file content as bytes
Returns:
int: Number of pages in the PDF
"""
try:
pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
return len(pdf_reader.pages)
except Exception as e:
print(f"Warning: Could not determine page count: {str(e)}")
return 1
def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
"""
Process a document file (PDF, PNG, JPEG) with AWS Textract.
Uses async API (start_document_text_detection) for multi-page PDFs,
and sync API (detect_document_text) for single-page PDFs and images.
Args:
pdf_path: Path to the document file (local path or S3 key)
bucket_name: Optional S3 bucket name if document is in S3
Returns:
dict: Textract response containing detected text
"""
textract = boto3.client('textract')
file_ext = Path(pdf_path).suffix.lower()
# For images (PNG, JPEG), always use sync API
if file_ext in ['.png', '.jpg', '.jpeg']:
print(f"Processing image file with sync API")
if bucket_name:
response = textract.detect_document_text(
Document={
'S3Object': {
'Bucket': bucket_name,
'Name': pdf_path
}
}
)
else:
with open(pdf_path, 'rb') as file:
file_bytes = file.read()
response = textract.detect_document_text(
Document={'Bytes': file_bytes}
)
return response
# For PDFs, check page count to decide which API to use
if file_ext == '.pdf':
s3 = boto3.client('s3')
# Determine number of pages
if bucket_name:
# Download PDF from S3 to check page count
response = s3.get_object(Bucket=bucket_name, Key=pdf_path)
pdf_bytes = response['Body'].read()
else:
# Read local PDF
with open(pdf_path, 'rb') as pdf_file:
pdf_bytes = pdf_file.read()
page_count = get_pdf_page_count(pdf_bytes)
print(f"PDF has {page_count} page(s)")
# Use async API for multi-page PDFs
if page_count > 1:
print("Using async API (start_document_text_detection) for multi-page PDF")
if bucket_name:
# Process from S3
response = textract.start_document_text_detection(
DocumentLocation={
'S3Object': {
'Bucket': bucket_name,
'Name': pdf_path
}
}
)
else:
# For local files with multiple pages, we need to use S3
# Note: Textract async API requires S3
raise ValueError(
"Multi-page PDFs must be processed from S3. "
"Please upload the file to S3 first."
)
job_id = response['JobId']
print(f"Started Textract job: {job_id}")
# Wait for job to complete
import time
while True:
    result = textract.get_document_text_detection(JobId=job_id)
    status = result['JobStatus']
    print(f"Job status: {status}")
    if status in ['SUCCEEDED', 'FAILED']:
        break
    time.sleep(2)
if status == 'SUCCEEDED':
    # Textract paginates results: follow NextToken to collect all blocks
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(JobId=job_id, NextToken=next_token)
        result['Blocks'].extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
return result
else:
# Use sync API for single-page PDFs
print("Using sync API (detect_document_text) for single-page PDF")
if bucket_name:
response = textract.detect_document_text(
Document={
'S3Object': {
'Bucket': bucket_name,
'Name': pdf_path
}
}
)
else:
response = textract.detect_document_text(
Document={'Bytes': pdf_bytes}
)
return response
# Unsupported file type
raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")
def extract_text_from_response(response: dict) -> str:
"""
Extract plain text from Textract response.
Args:
response: Textract API response
Returns:
str: Extracted text
"""
text_lines = []
for block in response.get('Blocks', []):
if block['BlockType'] == 'LINE':
text_lines.append(block['Text'])
return '\n'.join(text_lines)
def main():
if len(sys.argv) < 2:
print("Usage: python textract_pdf.py <pdf_path> [s3_bucket]")
print("\nExamples:")
print(" python textract_pdf.py document.pdf")
print(" python textract_pdf.py path/to/doc.pdf my-bucket")
sys.exit(1)
pdf_path = sys.argv[1]
bucket_name = sys.argv[2] if len(sys.argv) > 2 else None
if not bucket_name and not Path(pdf_path).exists():
print(f"Error: File not found: {pdf_path}")
sys.exit(1)
print(f"Processing PDF: {pdf_path}")
if bucket_name:
print(f"Using S3 bucket: {bucket_name}")
# Process PDF
response = process_pdf_with_textract(pdf_path, bucket_name)
# Extract and display text
text = extract_text_from_response(response)
print("\n" + "="*80)
print("EXTRACTED TEXT")
print("="*80)
print(text)
print("="*80)
# Print summary
num_blocks = len(response.get('Blocks', []))
num_pages = len(set(b.get('Page', 1) for b in response.get('Blocks', [])))
print(f"\nSummary:")
print(f" Pages processed: {num_pages}")
print(f" Total blocks: {num_blocks}")
print(f" Text length: {len(text)} characters")
if __name__ == '__main__':
main()
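
The same logic is importable (run from the scripts/ directory so the local module, not the PyPI textract package, is picked up); a minimal sketch with a hypothetical S3 bucket and key, exercising the page-count-based sync/async selection:

from textract import process_pdf_with_textract, extract_text_from_response

# Multi-page PDFs in S3 go through the async job API; single-page
# documents and images use the synchronous detect_document_text call
response = process_pdf_with_textract("reports/guia-01.pdf", bucket_name="my-docs-bucket")
print(extract_text_from_response(response))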

scripts/tojson.py Normal file

@@ -0,0 +1,30 @@
import pandas as pd
import json
from pathlib import Path
# Configuration
CSV_FILE = "guias.csv"
OUTPUT_DIR = "json_output"
ENCODING = "utf-8"
# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)
# Read CSV
df = pd.read_csv(CSV_FILE, encoding=ENCODING)
# Convert each row to JSON, skipping the first row (index 0)
for index, row in df.iterrows():
    # Skip the first row (index 0)
    if index == 0:
print(f"⊗ Skipped row {index + 1}")
continue
# Save to individual JSON file
output_file = f"{OUTPUT_DIR}/row_{index + 1}.json"
with open(output_file, 'w', encoding=ENCODING) as json_file:
json.dump(row.to_dict(), json_file, indent=2, ensure_ascii=False)
print(f"✓ Created {output_file}")
print(f"\nDone! Created {len(df) - 1} JSON files in '{OUTPUT_DIR}/' directory")