From 6026870c5ce187baafcba06c1e842f15ea5cec86 Mon Sep 17 00:00:00 2001
From: DNXBrasil
Date: Tue, 20 Jan 2026 13:48:13 -0300
Subject: [PATCH] Adds initial files

---
 code/app.py                     | 156 ++++++++++++
 code/dockerfile                 |  15 ++
 code/requirements.txt           |   6 +
 infra/.terraform.lock.hcl       |  25 ++
 infra/ecr/main.tf               |  32 +++
 infra/ecr/outputs.tf            |  14 ++
 infra/ecr/terraform.tfstate     |  75 ++++++
 infra/ecr/terraform.tfvars      |   3 +
 infra/ecr/variable.tf           |  16 ++
 infra/ecs_alb/main.tf           | 249 ++++++++++++++++++++
 infra/ecs_alb/terraform.tfvars  |  23 ++
 infra/ecs_alb/variables.tf      |  56 +++++
 infra/terraform.tfvars          |   3 +
 scripts/process_images_batch.py | 404 ++++++++++++++++++++++++++++++++
 scripts/textract.py             | 209 +++++++++++++++++
 scripts/tojson.py               |  30 +++
 16 files changed, 1316 insertions(+)
 create mode 100644 code/app.py
 create mode 100644 code/dockerfile
 create mode 100644 code/requirements.txt
 create mode 100644 infra/.terraform.lock.hcl
 create mode 100644 infra/ecr/main.tf
 create mode 100644 infra/ecr/outputs.tf
 create mode 100644 infra/ecr/terraform.tfstate
 create mode 100644 infra/ecr/terraform.tfvars
 create mode 100644 infra/ecr/variable.tf
 create mode 100644 infra/ecs_alb/main.tf
 create mode 100644 infra/ecs_alb/terraform.tfvars
 create mode 100644 infra/ecs_alb/variables.tf
 create mode 100644 infra/terraform.tfvars
 create mode 100755 scripts/process_images_batch.py
 create mode 100644 scripts/textract.py
 create mode 100644 scripts/tojson.py

diff --git a/code/app.py b/code/app.py
new file mode 100644
index 0000000..2b3e674
--- /dev/null
+++ b/code/app.py
@@ -0,0 +1,156 @@
+"""
+Simple LangGraph agent using the @tool decorator.
+
+Clean implementation with decorator-based tool definitions.
+"""
+from fastapi import FastAPI, Header
+import uvicorn
+
+from typing import Annotated, TypedDict
+from langgraph.graph import StateGraph, START, END
+from langgraph.graph.message import add_messages
+from langgraph.prebuilt import ToolNode, tools_condition
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.tools import tool
+from langchain_aws import ChatBedrock
+app = FastAPI()

+# Approval criteria referenced by SYSTEM_PROMPT below (bone-densitometry rules)
+rules = """- Women over 45 or postmenopausal
+- Men over 70;
+- Osteogenesis imperfecta (for this pathology, up to (02) two exams per year
+may be authorized - one every 180 days);
+- X-ray with osteopenia or a pathological fracture;
+- Personal history of fracture after age 40: wrist, shoulder, vertebrae, hip;
+- First-degree relative with osteoporosis.
+- Women with body mass index < 20 kg/m2 or weight < 57.8 kg;
+- Menopause before age 45 or chronic hypogonadism (premature ovarian
+failure);
+- Glucocorticoid use (>= 7.5 mg/day prednisone or equivalent for more than
+three months), or presence of Cushing's syndrome;
+- Primary hyperparathyroidism;
+- Prolonged use of anticonvulsants (< 10 years);
+- Chronic malabsorption syndrome or malnutrition, inflammatory bowel disease
+(regardless of cause: bariatric surgery, celiac disease, lactose intolerance).
+- Chemotherapy, if expected survival is long (< 5 years);
+- Documented loss of height;
+- Presence of kyphosis after menopause.
+- Prolonged immobilization"""
+SYSTEM_PROMPT = ("""
+You are an assistant; your job is to approve or reject a procedure based on the following rules:"""
++ rules + """
+Your input will be a JSON document; evaluate it against the rules and return:
+Approved: if at least one of the criteria is met
+Reproved: if none of the criteria are met.""")
+@tool
+def add(a: int, b: int) -> int:
+    """Add two numbers together.
+
+    Args:
+        a: First number
+        b: Second number
+    """
+    return a + b
+
+
+@tool
+def multiply(a: int, b: int) -> int:
+    """Multiply two numbers.
+
+    Args:
+        a: First number
+        b: Second number
+    """
+    return a * b
+
+
+@tool
+def get_word_length(word: str) -> int:
+    """Get the length of a word.
+
+    Args:
+        word: The word to measure
+    """
+    return len(word)
+
+
+@tool
+def search_info(topic: str) -> str:
+    """Search for information about a topic (mock implementation).
+
+    Args:
+        topic: The topic to search for
+    """
+    # Mock response - replace with actual search/API
+    return f"Information about {topic}: This is a mock response. In production, this would return real data."
+
+
+# Define agent state
+class AgentState(TypedDict):
+    messages: Annotated[list, add_messages]
+
+
+# Define tools list
+tools = [add, multiply, get_word_length, search_info]
+
+
+# Agent node
+def call_model(state: AgentState):
+    """Call the LLM with current state and tools."""
+
+    model = ChatBedrock(
+        model_id="arn:aws:bedrock:us-east-2:232048051668:application-inference-profile/uy4xskop19zn",
+        region_name="us-east-2",
+        provider="anthropic"
+    )
+
+    model_with_tools = model.bind_tools(tools)
+
+    messages = [
+        SystemMessage(content=SYSTEM_PROMPT)
+    ] + state["messages"]
+
+    response = model_with_tools.invoke(messages)
+
+    return {"messages": [response]}
+
+
+# Build the graph
+def create_agent():
+    """Create and compile the agent graph."""
+
+    workflow = StateGraph(AgentState)
+
+    # Add nodes
+    workflow.add_node("agent", call_model)
+    workflow.add_node("tools", ToolNode(tools))
+
+    # Add edges
+    workflow.add_edge(START, "agent")
+    workflow.add_conditional_edges("agent", tools_condition)
+    workflow.add_edge("tools", "agent")
+
+    return workflow.compile()
+
+
+# HTTP endpoints
+
+
+@app.post("/")
+async def root(json: str = Header(...)):
+    agent = create_agent()
+    query = json
+    result = agent.invoke(
+        {"messages": [HumanMessage(content=query)]},
+        config={"recursion_limit": 10}
+    )
+
+    final_message = result["messages"][-1]
+    return {"status": "success", "message": final_message.content}
+
+@app.get("/health")
+async def health():
+    return {"status": "healthy"}
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/code/dockerfile b/code/dockerfile
new file mode 100644
index 0000000..8349787
--- /dev/null
+++ b/code/dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.12
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install curl for health checks
+RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*
+
+COPY app.py .
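+
+# Port 8000 below must match containerPort in the ECS task definition (infra/ecs_alb/main.tf)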
+ +EXPOSE 8000 + +CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/code/requirements.txt b/code/requirements.txt new file mode 100644 index 0000000..5e6ca5c --- /dev/null +++ b/code/requirements.txt @@ -0,0 +1,6 @@ +fastapi==0.104.1 +uvicorn[standard]==0.24.0 +langgraph +langchain-aws +langchain +PyPDF2 \ No newline at end of file diff --git a/infra/.terraform.lock.hcl b/infra/.terraform.lock.hcl new file mode 100644 index 0000000..d39b37b --- /dev/null +++ b/infra/.terraform.lock.hcl @@ -0,0 +1,25 @@ +# This file is maintained automatically by "terraform init". +# Manual edits may be lost in future updates. + +provider "registry.terraform.io/hashicorp/aws" { + version = "6.27.0" + constraints = "~> 6.27" + hashes = [ + "h1:bixp2PSsP5ZGBczGCxcbSDn6lF5QFlUXlNroq9cdab4=", + "zh:177a24b806c72e8484b5cabc93b2b38e3d770ae6f745a998b54d6619fd0e8129", + "zh:4ac4a85c14fb868a3306b542e6a56c10bd6c6d5a67bc0c9b8f6a9060cf5f3be7", + "zh:552652185bc85c8ba1da1d65dea47c454728a5c6839c458b6dcd3ce71c19ccfc", + "zh:60284b8172d09aee91eae0856f09855eaf040ce3a58d6933602ae17c53f8ed04", + "zh:6be38d156756ca61fb8e7c752cc5d769cd709686700ac4b230f40a6e95b5dbc9", + "zh:7a409138fae4ef42e3a637e37cb9efedf96459e28a3c764fc4e855e8db9a7485", + "zh:8070cf5224ed1ed3a3e9a59f7c30ff88bf071c7567165275d477c1738a56c064", + "zh:894439ef340a9a79f69cd759e27ad11c7826adeca27be1b1ca82b3c9702fa300", + "zh:89d035eebf08a97c89374ff06040955ddc09f275ecca609d0c9d58d149bef5cf", + "zh:985b1145d724fc1f38369099e4a5087141885740fd6c0b1dbc492171e73c2e49", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:a80b47ae8d1475201c86bd94a5dcb9dd4da5e8b73102a90820b68b66b76d50fd", + "zh:d3395be1556210f82199b9166a6b2e677cee9c4b67e96e63f6c3a98325ad7ab0", + "zh:db0b869d09657f6f1e4110b56093c5fcdf9dbdd97c020db1e577b239c0adcbce", + "zh:ffc72e680370ae7c21f9bd3082c6317730df805c6797427839a6b6b7e9a26a01", + ] +} diff --git a/infra/ecr/main.tf b/infra/ecr/main.tf new file mode 100644 index 0000000..5436ac6 --- /dev/null +++ b/infra/ecr/main.tf @@ -0,0 +1,32 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 6.27" + } + } +} + + +provider "aws" { + region = var.aws_region +} + +resource "aws_ecr_repository" "app" { + name = var.repository_name + image_tag_mutability = "MUTABLE" # or "IMMUTABLE" + + image_scanning_configuration { + scan_on_push = true + } + + encryption_configuration { + encryption_type = "AES256" # or "KMS" for customer managed keys + } + + tags = { + Name = var.repository_name + Environment = var.environment + ManagedBy = "Terraform" + } +} \ No newline at end of file diff --git a/infra/ecr/outputs.tf b/infra/ecr/outputs.tf new file mode 100644 index 0000000..d749a2a --- /dev/null +++ b/infra/ecr/outputs.tf @@ -0,0 +1,14 @@ +output "repository_url" { + description = "ECR repository URL" + value = aws_ecr_repository.app.repository_url +} + +output "repository_arn" { + description = "ECR repository ARN" + value = aws_ecr_repository.app.arn +} + +output "repository_name" { + description = "ECR repository name" + value = aws_ecr_repository.app.name +} \ No newline at end of file diff --git a/infra/ecr/terraform.tfstate b/infra/ecr/terraform.tfstate new file mode 100644 index 0000000..6c2add3 --- /dev/null +++ b/infra/ecr/terraform.tfstate @@ -0,0 +1,75 @@ +{ + "version": 4, + "terraform_version": "1.14.3", + "serial": 6, + "lineage": "b2b2331d-cf66-169e-d25b-38e0528505fc", + "outputs": { + "repository_arn": { + "value": 
"arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser", + "type": "string" + }, + "repository_name": { + "value": "upflux-doc-analyser", + "type": "string" + }, + "repository_url": { + "value": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser", + "type": "string" + } + }, + "resources": [ + { + "mode": "managed", + "type": "aws_ecr_repository", + "name": "app", + "provider": "provider[\"registry.terraform.io/hashicorp/aws\"]", + "instances": [ + { + "schema_version": 0, + "attributes": { + "arn": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser", + "encryption_configuration": [ + { + "encryption_type": "AES256", + "kms_key": "" + } + ], + "force_delete": null, + "id": "upflux-doc-analyser", + "image_scanning_configuration": [ + { + "scan_on_push": true + } + ], + "image_tag_mutability": "MUTABLE", + "image_tag_mutability_exclusion_filter": [], + "name": "upflux-doc-analyser", + "region": "us-east-2", + "registry_id": "232048051668", + "repository_url": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser", + "tags": { + "Environment": "dev", + "ManagedBy": "Terraform", + "Name": "upflux-doc-analyser" + }, + "tags_all": { + "Environment": "dev", + "ManagedBy": "Terraform", + "Name": "upflux-doc-analyser" + }, + "timeouts": null + }, + "sensitive_attributes": [], + "identity_schema_version": 0, + "identity": { + "account_id": "232048051668", + "name": "upflux-doc-analyser", + "region": "us-east-2" + }, + "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiZGVsZXRlIjoxMjAwMDAwMDAwMDAwfX0=" + } + ] + } + ], + "check_results": null +} diff --git a/infra/ecr/terraform.tfvars b/infra/ecr/terraform.tfvars new file mode 100644 index 0000000..15972c9 --- /dev/null +++ b/infra/ecr/terraform.tfvars @@ -0,0 +1,3 @@ +aws_region = "us-east-2" +repository_name = "upflux-doc-analyser" +environment = "dev" \ No newline at end of file diff --git a/infra/ecr/variable.tf b/infra/ecr/variable.tf new file mode 100644 index 0000000..27853f3 --- /dev/null +++ b/infra/ecr/variable.tf @@ -0,0 +1,16 @@ +variable "aws_region" { + description = "AWS region" + type = string + default = "us-east-1" +} + +variable "repository_name" { + description = "Name of the ECR repository" + type = string +} + +variable "environment" { + description = "Environment name" + type = string + default = "dev" +} \ No newline at end of file diff --git a/infra/ecs_alb/main.tf b/infra/ecs_alb/main.tf new file mode 100644 index 0000000..6dfb2c9 --- /dev/null +++ b/infra/ecs_alb/main.tf @@ -0,0 +1,249 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 6.27" + } + } +} + +provider "aws" { + region = var.aws_region +} + +# Get current AWS account +data "aws_caller_identity" "current" {} + +# Reference existing VPC +data "aws_vpc" "existing" { + id = var.vpc_id +} + +# Reference existing public subnets +data "aws_subnet" "public" { + count = length(var.public_subnet_ids) + id = var.public_subnet_ids[count.index] +} + +# Reference existing private subnets (for ECS tasks) +data "aws_subnet" "private" { + count = length(var.private_subnet_ids) + id = var.private_subnet_ids[count.index] +} + +# Security Group for ALB (in public subnets) +resource "aws_security_group" "alb" { + name = "${var.app_name}-alb-sg" + description = "Allow inbound traffic to ALB" + vpc_id = data.aws_vpc.existing.id + + ingress { + from_port = 80 + to_port = 80 + protocol = "tcp" + cidr_blocks = ["3.14.44.224/32"] + description = "Allow HTTP from internet" + } + 
+ egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + description = "Allow all outbound" + } + + tags = { + Name = "${var.app_name}-alb-sg" + } +} + +# Security Group for ECS Tasks (in private subnets) +resource "aws_security_group" "ecs_tasks" { + name = "${var.app_name}-ecs-tasks-sg" + description = "Allow inbound traffic from ALB" + vpc_id = data.aws_vpc.existing.id + + ingress { + from_port = 8000 + to_port = 8000 + protocol = "tcp" + security_groups = [aws_security_group.alb.id] + description = "Allow traffic from ALB" + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + description = "Allow all outbound" + } + + tags = { + Name = "${var.app_name}-ecs-tasks-sg" + } +} + +# Application Load Balancer (in public subnets) +resource "aws_lb" "main" { + name = "${var.app_name}-alb" + internal = false + load_balancer_type = "application" + security_groups = [aws_security_group.alb.id] + subnets = var.public_subnet_ids + + enable_deletion_protection = false + + tags = { + Name = "${var.app_name}-alb" + } +} + +# Target Group +resource "aws_lb_target_group" "app" { + name = "${var.app_name}-tg" + port = 8000 + protocol = "HTTP" + vpc_id = data.aws_vpc.existing.id + target_type = "ip" + + health_check { + enabled = true + healthy_threshold = 2 + interval = 30 + matcher = "200" + path = "/health" + port = "traffic-port" + protocol = "HTTP" + timeout = 5 + unhealthy_threshold = 3 + } + + deregistration_delay = 30 + + tags = { + Name = "${var.app_name}-tg" + } +} + +# ALB Listener +resource "aws_lb_listener" "app" { + load_balancer_arn = aws_lb.main.arn + port = "80" + protocol = "HTTP" + + default_action { + type = "forward" + target_group_arn = aws_lb_target_group.app.arn + } +} + +# ECS Cluster +resource "aws_ecs_cluster" "main" { + name = "${var.app_name}-cluster" + + tags = { + Name = "${var.app_name}-cluster" + } +} + +# CloudWatch Log Group +resource "aws_cloudwatch_log_group" "app" { + name = "/ecs/${var.app_name}" + retention_in_days = 7 + + tags = { + Name = "${var.app_name}-logs" + } +} + +# ECS Task Execution Role +resource "aws_iam_role" "ecs_task_execution_role" { + name = "${var.app_name}-ecs-task-execution-role" + + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { + Service = "ecs-tasks.amazonaws.com" + } + }] + }) +} + +resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" { + role = aws_iam_role.ecs_task_execution_role.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +# ECS Task Definition +resource "aws_ecs_task_definition" "app" { + family = var.app_name + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = var.fargate_cpu + memory = var.fargate_memory + execution_role_arn = aws_iam_role.ecs_task_execution_role.arn + + container_definitions = jsonencode([{ + name = var.app_name + image = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${var.aws_region}.amazonaws.com/${var.ecr_repository_name}:${var.image_tag}" + + portMappings = [{ + containerPort = 8000 + hostPort = 8000 + protocol = "tcp" + }] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.app.name + "awslogs-region" = var.aws_region + "awslogs-stream-prefix" = "ecs" + } + } + + healthCheck = { + command = ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval = 30 + timeout = 5 + 
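+        # startPeriod below gives the container 60 s to boot before failed
+        # curl probes count toward the retry limit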
retries = 3 + startPeriod = 60 + } + }]) + + tags = { + Name = "${var.app_name}-task" + } +} + +# ECS Service (tasks in private subnets) +resource "aws_ecs_service" "app" { + name = "${var.app_name}-service" + cluster = aws_ecs_cluster.main.id + task_definition = aws_ecs_task_definition.app.arn + desired_count = var.app_count + launch_type = "FARGATE" + + network_configuration { + security_groups = [aws_security_group.ecs_tasks.id] + subnets = var.private_subnet_ids # ECS tasks in private subnets + assign_public_ip = false # No public IP needed with NAT gateway + } + + load_balancer { + target_group_arn = aws_lb_target_group.app.arn + container_name = var.app_name + container_port = 8000 + } + + depends_on = [aws_lb_listener.app] + + tags = { + Name = "${var.app_name}-service" + } +} \ No newline at end of file diff --git a/infra/ecs_alb/terraform.tfvars b/infra/ecs_alb/terraform.tfvars new file mode 100644 index 0000000..a27df00 --- /dev/null +++ b/infra/ecs_alb/terraform.tfvars @@ -0,0 +1,23 @@ +aws_region = "us-east-2" +app_name = "upflux-doc-analyser" + +# Replace these with your actual IDs +vpc_id = "vpc-0270f02aee3bf1b8d" + +# Your public subnets (where ALB will be) +public_subnet_ids = [ + "subnet-088bc49c54ec8f028", # Public subnet 1 + "subnet-003f1693910a99afb" # Public subnet 2 +] + +# Your private subnets (where ECS tasks will run) +private_subnet_ids = [ + "subnet-045f73d784beed091", # Private subnet 1 + "subnet-06e660f44bf141442" # Private subnet 2 +] + +ecr_repository_name = "upflux-doc-analyser" +image_tag = "latest" +fargate_cpu = "256" +fargate_memory = "512" +app_count = 1 \ No newline at end of file diff --git a/infra/ecs_alb/variables.tf b/infra/ecs_alb/variables.tf new file mode 100644 index 0000000..ad2cb2d --- /dev/null +++ b/infra/ecs_alb/variables.tf @@ -0,0 +1,56 @@ +variable "aws_region" { + description = "AWS region" + type = string + default = "us-east-1" +} + +variable "app_name" { + description = "Application name" + type = string + default = "fastapi-app" +} + +variable "vpc_id" { + description = "Existing VPC ID" + type = string +} + +variable "public_subnet_ids" { + description = "List of existing public subnet IDs (for ALB)" + type = list(string) +} + +variable "private_subnet_ids" { + description = "List of existing private subnet IDs (for ECS tasks)" + type = list(string) +} + +variable "ecr_repository_name" { + description = "ECR repository name" + type = string + default = "fastapi-app" +} + +variable "image_tag" { + description = "Docker image tag" + type = string + default = "latest" +} + +variable "fargate_cpu" { + description = "Fargate CPU units" + type = string + default = "256" +} + +variable "fargate_memory" { + description = "Fargate memory in MB" + type = string + default = "512" +} + +variable "app_count" { + description = "Number of tasks to run" + type = number + default = 1 +} \ No newline at end of file diff --git a/infra/terraform.tfvars b/infra/terraform.tfvars new file mode 100644 index 0000000..f671cbf --- /dev/null +++ b/infra/terraform.tfvars @@ -0,0 +1,3 @@ +aws_region = "us-east-2" +repository_name = "upflux-doc-analyser" +environment = "dev" diff --git a/scripts/process_images_batch.py b/scripts/process_images_batch.py new file mode 100755 index 0000000..df863ee --- /dev/null +++ b/scripts/process_images_batch.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python3 +""" +Batch process images from S3 using AWS Textract. 
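+
+Usage:
+    python process_images_batch.py <bucket_name> [base_prefix]
+(or set the S3_BUCKET_NAME / S3_BASE_PREFIX environment variables).
+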
+Iterates through folders (prefixes) in an S3 bucket and processes any PDF, PNG, or JPEG files +that haven't been processed yet (checking for existing textract output files). +Saves both JSON and plain text outputs locally. +""" + +import boto3 +import json +import sys +import os +import io +from pathlib import Path +from typing import Dict, List, Optional +import time +from PyPDF2 import PdfReader + + +def get_s3_client(): + """Initialize and return AWS S3 client.""" + return boto3.client('s3',region_name="us-east-2") + + +def get_textract_client(): + """Initialize and return AWS Textract client.""" + return boto3.client('textract',region_name="us-east-2") + + +def get_pdf_page_count(pdf_bytes: bytes) -> int: + """ + Get the number of pages in a PDF file. + + Args: + pdf_bytes: PDF file content as bytes + + Returns: + int: Number of pages in the PDF + """ + try: + pdf_reader = PdfReader(io.BytesIO(pdf_bytes)) + return len(pdf_reader.pages) + except Exception as e: + print(f" Warning: Could not determine page count: {str(e)}") + return 1 + + +def is_already_processed(s3_key: str, output_dir: Path) -> bool: + """ + Check if an image has already been processed by looking for output file. + + Args: + s3_key: S3 object key + output_dir: Directory where output files are stored + + Returns: + bool: True if output file exists, False otherwise + """ + filename = Path(s3_key).stem + output_file = output_dir / f"{filename}_textract.json" + return output_file.exists() + + +def process_image_from_s3(bucket_name: str, s3_key: str) -> Dict: + """ + Process an image file from S3 with AWS Textract. + Supports PDF, PNG, and JPEG formats. + Uses async API (start_document_text_detection) for multi-page PDFs, + and sync API (detect_document_text) for single-page PDFs and images. 
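+    Note: the async branch returns only the first page of results from
+    get_document_text_detection; any NextToken is not followed (see the
+    pagination sketch at the end of this patch).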
+ + Args: + bucket_name: S3 bucket name + s3_key: S3 object key + + Returns: + dict: Textract response containing detected text + """ + textract = get_textract_client() + s3 = get_s3_client() + + try: + # Verify the object exists first + try: + s3.head_object(Bucket=bucket_name, Key=s3_key) + except Exception as e: + print(f" Error accessing S3 object: {str(e)}") + print(f" Bucket: {bucket_name}") + print(f" Key: {s3_key}") + return None + + file_ext = Path(s3_key).suffix.lower() + + # For images (PNG, JPEG), always use sync API + if file_ext in ['.png', '.jpg', '.jpeg']: + print(f" Processing image with sync API") + response = textract.detect_document_text( + Document={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': s3_key + } + } + ) + return response + + # For PDFs, check page count to decide which API to use + if file_ext == '.pdf': + # Download PDF to check page count + response = s3.get_object(Bucket=bucket_name, Key=s3_key) + pdf_bytes = response['Body'].read() + page_count = get_pdf_page_count(pdf_bytes) + + print(f" PDF has {page_count} page(s)") + + # Use async API for multi-page PDFs + if page_count > 1: + print(f" Using async API (start_document_text_detection) for multi-page PDF") + response = textract.start_document_text_detection( + DocumentLocation={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': s3_key + } + } + ) + job_id = response['JobId'] + print(f" Started async job: {job_id}") + + # Wait for job to complete + while True: + result = textract.get_document_text_detection(JobId=job_id) + status = result['JobStatus'] + + if status == 'SUCCEEDED': + return result + elif status == 'FAILED': + print(f" Job failed: {result.get('StatusMessage', 'Unknown error')}") + return None + + time.sleep(2) + else: + # Use sync API for single-page PDFs + print(f" Using sync API (detect_document_text) for single-page PDF") + response = textract.detect_document_text( + Document={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': s3_key + } + } + ) + return response + + except Exception as e: + print(f" Error processing {s3_key}: {str(e)}") + return None + + +def extract_text_from_response(response: Dict) -> str: + """ + Extract plain text from Textract response. + + Args: + response: Textract API response + + Returns: + str: Extracted text + """ + if not response: + return "" + + text_lines = [] + for block in response.get('Blocks', []): + if block['BlockType'] == 'LINE': + text_lines.append(block['Text']) + + return '\n'.join(text_lines) + + +def save_textract_output(s3_key: str, response: Dict, output_dir: Path): + """ + Save Textract response to JSON file and plain text file locally. 
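+    The JSON copy is augmented with 'extracted_text' and 'source_s3_key'
+    keys so each output file records where it came from.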
+ + Args: + s3_key: S3 object key + response: Textract API response + output_dir: Directory to save output files + """ + output_dir.mkdir(parents=True, exist_ok=True) + filename = Path(s3_key).stem + + # Extract text + extracted_text = extract_text_from_response(response) + + # Save JSON output + json_output_file = output_dir / f"{filename}_textract.json" + if response: + response['extracted_text'] = extracted_text + response['source_s3_key'] = s3_key + + with open(json_output_file, 'w', encoding='utf-8') as f: + json.dump(response, f, indent=2, ensure_ascii=False) + + print(f" ✓ Saved JSON to: {json_output_file.name}") + + # Save plain text output + text_output_file = output_dir / f"{filename}.txt" + with open(text_output_file, 'w', encoding='utf-8') as f: + f.write(extracted_text) + + print(f" ✓ Saved text to: {text_output_file.name}") + + +def get_supported_images_from_s3(bucket_name: str, prefix: str) -> List[str]: + """ + Get list of supported image files in an S3 prefix (folder). + Filters out files containing 'script' (case-insensitive). + + Args: + bucket_name: S3 bucket name + prefix: S3 prefix (folder path) + + Returns: + List of S3 keys for supported image files + """ + s3 = get_s3_client() + supported_extensions = {'.pdf', '.png', '.jpg', '.jpeg'} + images = [] + + # Ensure prefix ends with / if it's not empty + if prefix and not prefix.endswith('/'): + prefix += '/' + + paginator = s3.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/') + + for page in pages: + for obj in page.get('Contents', []): + key = obj['Key'] + file_path = Path(key) + + # Check if it's a file (not a folder) and has supported extension + if file_path.suffix.lower() in supported_extensions: + # Filter out files containing 'script' (case-insensitive) + if 'script' not in file_path.name.lower(): + images.append(key) + + return sorted(images) + + +def get_folders_from_s3(bucket_name: str, base_prefix: str = '') -> List[str]: + """ + Get list of folders (prefixes) in S3 bucket. + + Args: + bucket_name: S3 bucket name + base_prefix: Base prefix to search under + + Returns: + List of folder prefixes + """ + s3 = get_s3_client() + folders = [] + + # Ensure prefix ends with / if it's not empty + if base_prefix and not base_prefix.endswith('/'): + base_prefix += '/' + + paginator = s3.get_paginator('list_objects_v2') + pages = paginator.paginate(Bucket=bucket_name, Prefix=base_prefix, Delimiter='/') + + for page in pages: + for prefix_info in page.get('CommonPrefixes', []): + folders.append(prefix_info['Prefix']) + + return folders + + +def process_folder(bucket_name: str, prefix: str, output_base_dir: Path, skip_existing: bool = True): + """ + Process all images in an S3 folder (prefix). 
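+    Files whose names contain 'script' are filtered out, and a 0.5 s pause
+    between documents softens Textract rate limiting.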
+
+    Args:
+        bucket_name: S3 bucket name
+        prefix: S3 prefix (folder path)
+        output_base_dir: Base directory for output files
+        skip_existing: Whether to skip already processed files
+    """
+    folder_name = prefix.rstrip('/').split('/')[-1] or 'root'
+    output_dir = output_base_dir / folder_name
+
+    print(f"\n{'='*80}")
+    print(f"Processing folder: {prefix}")
+    print(f"{'='*80}")
+
+    images = get_supported_images_from_s3(bucket_name, prefix)
+
+    if not images:
+        print("  No supported images found (PDF, PNG, JPEG)")
+        return
+
+    print(f"  Found {len(images)} image(s)")
+
+    processed_count = 0
+    skipped_count = 0
+    error_count = 0
+
+    for s3_key in images:
+        filename = Path(s3_key).name
+        print(f"\n  Processing: {filename}")
+
+        # Check if already processed
+        if skip_existing and is_already_processed(s3_key, output_dir):
+            print("    ⊘ Skipped (already processed)")
+            skipped_count += 1
+            continue
+
+        # Process with Textract
+        response = process_image_from_s3(bucket_name, s3_key)
+
+        if response:
+            # Save output (both JSON and text)
+            save_textract_output(s3_key, response, output_dir)
+
+            # Print summary
+            num_blocks = len(response.get('Blocks', []))
+            text_length = len(extract_text_from_response(response))
+            print(f"    ℹ Extracted {text_length} characters, {num_blocks} blocks")
+
+            processed_count += 1
+
+            # Small delay to avoid rate limiting
+            time.sleep(0.5)
+        else:
+            error_count += 1
+
+    print(f"\n  Summary for {folder_name}:")
+    print(f"    Processed: {processed_count}")
+    print(f"    Skipped: {skipped_count}")
+    print(f"    Errors: {error_count}")
+
+
+def main():
+    """Main entry point for the script."""
+    # Get bucket name from environment or command line
+    bucket_name = os.environ.get('S3_BUCKET_NAME')
+    base_prefix = os.environ.get('S3_BASE_PREFIX', 'imagens')
+
+    if len(sys.argv) > 1:
+        bucket_name = sys.argv[1]
+    if len(sys.argv) > 2:
+        base_prefix = sys.argv[2]
+
+    if not bucket_name:
+        print("Error: S3 bucket name not provided.")
+        print("\nUsage:")
+        print("  python process_images_batch.py <bucket_name> [base_prefix]")
+        print("\nOr set environment variables:")
+        print("  export S3_BUCKET_NAME=my-bucket")
+        print("  export S3_BASE_PREFIX=imagens")
+        print("  python process_images_batch.py")
+        sys.exit(1)
+
+    # Get output directory
+    script_dir = Path(__file__).parent
+    output_base_dir = script_dir / "textract_output"
+
+    print(f"S3 Bucket: {bucket_name}")
+    print(f"Base prefix: {base_prefix}")
+    print(f"Output directory: {output_base_dir}")
+
+    # Get all folders (prefixes) in the bucket
+    print("\nScanning S3 bucket for folders...")
+    folders = get_folders_from_s3(bucket_name, base_prefix)
+
+    if not folders:
+        print(f"\nNo subdirectories found under '{base_prefix}'.")
+        print("Processing files in the base prefix instead...")
+        folders = [base_prefix]
+    else:
+        print(f"\nFound {len(folders)} folder(s) to process")
+
+    # Process each folder
+    total_start = time.time()
+
+    for prefix in folders:
+        try:
+            process_folder(bucket_name, prefix, output_base_dir)
+        except Exception as e:
+            print(f"\nError processing folder {prefix}: {str(e)}")
+            import traceback
+            traceback.print_exc()
+            continue
+
+    total_time = time.time() - total_start
+
+    print(f"\n{'='*80}")
+    print("Batch processing complete!")
+    print(f"Total time: {total_time:.2f} seconds")
+    print(f"{'='*80}")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/textract.py b/scripts/textract.py
new file mode 100644
index 0000000..b358df5
--- /dev/null
+++ b/scripts/textract.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Simple script to invoke AWS
Textract on a PDF file. +Extracts text and returns the detected content. +""" + +import boto3 +import sys +import io +from pathlib import Path +from PyPDF2 import PdfReader + + +def get_pdf_page_count(pdf_bytes: bytes) -> int: + """ + Get the number of pages in a PDF file. + + Args: + pdf_bytes: PDF file content as bytes + + Returns: + int: Number of pages in the PDF + """ + try: + pdf_reader = PdfReader(io.BytesIO(pdf_bytes)) + return len(pdf_reader.pages) + except Exception as e: + print(f"Warning: Could not determine page count: {str(e)}") + return 1 + + +def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict: + """ + Process a document file (PDF, PNG, JPEG) with AWS Textract. + Uses async API (start_document_text_detection) for multi-page PDFs, + and sync API (detect_document_text) for single-page PDFs and images. + + Args: + pdf_path: Path to the document file (local path or S3 key) + bucket_name: Optional S3 bucket name if document is in S3 + + Returns: + dict: Textract response containing detected text + """ + textract = boto3.client('textract') + file_ext = Path(pdf_path).suffix.lower() + + # For images (PNG, JPEG), always use sync API + if file_ext in ['.png', '.jpg', '.jpeg']: + print(f"Processing image file with sync API") + + if bucket_name: + response = textract.detect_document_text( + Document={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': pdf_path + } + } + ) + else: + with open(pdf_path, 'rb') as file: + file_bytes = file.read() + response = textract.detect_document_text( + Document={'Bytes': file_bytes} + ) + + return response + + # For PDFs, check page count to decide which API to use + if file_ext == '.pdf': + s3 = boto3.client('s3') + + # Determine number of pages + if bucket_name: + # Download PDF from S3 to check page count + response = s3.get_object(Bucket=bucket_name, Key=pdf_path) + pdf_bytes = response['Body'].read() + else: + # Read local PDF + with open(pdf_path, 'rb') as pdf_file: + pdf_bytes = pdf_file.read() + + page_count = get_pdf_page_count(pdf_bytes) + print(f"PDF has {page_count} page(s)") + + # Use async API for multi-page PDFs + if page_count > 1: + print("Using async API (start_document_text_detection) for multi-page PDF") + + if bucket_name: + # Process from S3 + response = textract.start_document_text_detection( + DocumentLocation={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': pdf_path + } + } + ) + else: + # For local files with multiple pages, we need to use S3 + # Note: Textract async API requires S3 + raise ValueError( + "Multi-page PDFs must be processed from S3. " + "Please upload the file to S3 first." + ) + + job_id = response['JobId'] + print(f"Started Textract job: {job_id}") + + # Wait for job to complete + import time + while True: + result = textract.get_document_text_detection(JobId=job_id) + status = result['JobStatus'] + print(f"Job status: {status}") + + if status in ['SUCCEEDED', 'FAILED']: + break + time.sleep(2) + + return result + else: + # Use sync API for single-page PDFs + print("Using sync API (detect_document_text) for single-page PDF") + + if bucket_name: + response = textract.detect_document_text( + Document={ + 'S3Object': { + 'Bucket': bucket_name, + 'Name': pdf_path + } + } + ) + else: + response = textract.detect_document_text( + Document={'Bytes': pdf_bytes} + ) + + return response + + # Unsupported file type + raise ValueError(f"Unsupported file type: {file_ext}. 
Supported types: .pdf, .png, .jpg, .jpeg")
+
+
+def extract_text_from_response(response: dict) -> str:
+    """
+    Extract plain text from Textract response.
+
+    Args:
+        response: Textract API response
+
+    Returns:
+        str: Extracted text
+    """
+    text_lines = []
+
+    for block in response.get('Blocks', []):
+        if block['BlockType'] == 'LINE':
+            text_lines.append(block['Text'])
+
+    return '\n'.join(text_lines)
+
+
+def main():
+    if len(sys.argv) < 2:
+        print("Usage: python textract.py <pdf_path> [s3_bucket]")
+        print("\nExamples:")
+        print("  python textract.py document.pdf")
+        print("  python textract.py path/to/doc.pdf my-bucket")
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None
+
+    if not bucket_name and not Path(pdf_path).exists():
+        print(f"Error: File not found: {pdf_path}")
+        sys.exit(1)
+
+    print(f"Processing document: {pdf_path}")
+    if bucket_name:
+        print(f"Using S3 bucket: {bucket_name}")
+
+    # Process document
+    response = process_pdf_with_textract(pdf_path, bucket_name)
+
+    # Extract and display text
+    text = extract_text_from_response(response)
+
+    print("\n" + "="*80)
+    print("EXTRACTED TEXT")
+    print("="*80)
+    print(text)
+    print("="*80)
+
+    # Print summary
+    num_blocks = len(response.get('Blocks', []))
+    num_pages = len(set(b.get('Page', 1) for b in response.get('Blocks', [])))
+
+    print("\nSummary:")
+    print(f"  Pages processed: {num_pages}")
+    print(f"  Total blocks: {num_blocks}")
+    print(f"  Text length: {len(text)} characters")
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/scripts/tojson.py b/scripts/tojson.py
new file mode 100644
index 0000000..ae6fd38
--- /dev/null
+++ b/scripts/tojson.py
@@ -0,0 +1,30 @@
+import pandas as pd
+import json
+from pathlib import Path
+
+# Configuration
+CSV_FILE = "guias.csv"
+OUTPUT_DIR = "json_output"
+ENCODING = "utf-8"
+
+# Create output directory
+Path(OUTPUT_DIR).mkdir(exist_ok=True)
+
+# Read CSV
+df = pd.read_csv(CSV_FILE, encoding=ENCODING)
+
+# Convert each row to JSON, skipping the first data row (index 0)
+for index, row in df.iterrows():
+    # Skip the first data row (index 0)
+    if index == 0:
+        print(f"⊗ Skipped row {index + 1}")
+        continue
+
+    # Save to individual JSON file
+    output_file = f"{OUTPUT_DIR}/row_{index + 1}.json"
+    with open(output_file, 'w', encoding=ENCODING) as json_file:
+        json.dump(row.to_dict(), json_file, indent=2, ensure_ascii=False)
+
+    print(f"✓ Created {output_file}")
+
+print(f"\nDone! Created {len(df) - 1} JSON files in '{OUTPUT_DIR}/' directory")
\ No newline at end of file
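
Two usage sketches follow; they are not part of the committed code.

The API in code/app.py reads its payload from an HTTP header named "json"
rather than from the request body, so clients must place the guide JSON in
that header. A minimal Python smoke test, assuming a placeholder ALB DNS name
from the ecs_alb stack and hypothetical guide fields:

    import json
    import urllib.request

    ALB_URL = "http://<alb-dns-name>"  # placeholder: DNS name of the ALB created by infra/ecs_alb

    # default json.dumps escapes non-ASCII, keeping the header value header-safe
    payload = json.dumps({"idade": 72, "sexo": "M"})  # hypothetical guide fields
    req = urllib.request.Request(ALB_URL + "/", data=b"", method="POST",
                                 headers={"json": payload})
    with urllib.request.urlopen(req) as resp:
        print(resp.read().decode("utf-8"))  # {"status": "success", "message": "..."}

Both Textract scripts keep only the first response page of an asynchronous
job; get_document_text_detection paginates its results via NextToken, so
blocks beyond the first page are currently dropped. A minimal sketch of a
helper (hypothetical name, not in the patch) that drains every page:

    def get_all_blocks(textract, job_id: str) -> list:
        """Collect Blocks from every result page of an async Textract job."""
        blocks, token = [], None
        while True:
            kwargs = {"JobId": job_id}
            if token:
                kwargs["NextToken"] = token
            result = textract.get_document_text_detection(**kwargs)
            blocks.extend(result.get("Blocks", []))
            token = result.get("NextToken")
            if not token:
                return blocks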