Adds initial files

code/app.py (new file, 156 lines)
@@ -0,0 +1,156 @@
"""
Simple LangGraph agent using the @tool decorator.

Clean implementation with decorator-based tool definitions.
"""

from typing import Annotated, TypedDict

import uvicorn
from fastapi import FastAPI, Header
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.tools import tool
from langchain_aws import ChatBedrock

app = FastAPI()


# Approval rules referenced by the system prompt
rules = """- Women over 45 years old, or postmenopausal;
- Men over 70 years old;
- Osteogenesis imperfecta (for this condition, two (02) exams per year may be authorized - one every 180 days);
- X-ray showing osteopenia or a pathological fracture;
- Personal history of fracture after age 40: wrist, shoulders, vertebrae, hip;
- First-degree relative with osteoporosis;
- Women with body mass index < 20 kg/m2 or weight < 57.8 kg;
- Menopause before age 45, or chronic hypogonadism (premature ovarian failure);
- Glucocorticoid use (>= 7.5 prednisone/day equivalent for more than three months), or presence of Cushing's syndrome;
- Primary hyperparathyroidism;
- Prolonged use of anticonvulsants (< 10 years);
- Chronic malabsorption syndrome, malnutrition, or inflammatory bowel disease (regardless of cause: bariatric surgery, celiac disease, lactose intolerance);
- Chemotherapy, if expected survival is long (< 5 years);
- Documented loss of height;
- Presence of kyphosis after menopause;
- Prolonged immobilization"""

SYSTEM_PROMPT = (
    "You are an assistant; your job is to approve or reject a procedure "
    "based on the following rules:\n"
    + rules
    + "\nYour input will be a JSON document. Evaluate it against the rules and return:\n"
    "Approved: if at least one of the criteria is met.\n"
    "Rejected: if none of the criteria are met."
)


# Define tools using the @tool decorator
@tool
def add(a: int, b: int) -> int:
    """Add two numbers together.

    Args:
        a: First number
        b: Second number
    """
    return a + b


@tool
def multiply(a: int, b: int) -> int:
    """Multiply two numbers.

    Args:
        a: First number
        b: Second number
    """
    return a * b


@tool
def get_word_length(word: str) -> int:
    """Get the length of a word.

    Args:
        word: The word to measure
    """
    return len(word)


@tool
def search_info(topic: str) -> str:
    """Search for information about a topic (mock implementation).

    Args:
        topic: The topic to search for
    """
    # Mock response - replace with an actual search/API call
    return f"Information about {topic}: This is a mock response. In production, this would return real data."


# Define agent state
class AgentState(TypedDict):
    messages: Annotated[list, add_messages]


# Define tools list
tools = [add, multiply, get_word_length, search_info]


# Agent node
def call_model(state: AgentState):
    """Call the LLM with the current state and tools."""
    model = ChatBedrock(
        model_id="arn:aws:bedrock:us-east-2:232048051668:application-inference-profile/uy4xskop19zn",
        region_name="us-east-2",
        provider="anthropic"
    )

    model_with_tools = model.bind_tools(tools)

    messages = [SystemMessage(content=SYSTEM_PROMPT)] + state["messages"]

    response = model_with_tools.invoke(messages)

    return {"messages": [response]}


# Build the graph
def create_agent():
    """Create and compile the agent graph."""
    workflow = StateGraph(AgentState)

    # Add nodes
    workflow.add_node("agent", call_model)
    workflow.add_node("tools", ToolNode(tools))

    # Add edges
    workflow.add_edge(START, "agent")
    workflow.add_conditional_edges("agent", tools_condition)
    workflow.add_edge("tools", "agent")

    return workflow.compile()


# Main execution
@app.post("/")
async def root(json: str = Header(...)):
    agent = create_agent()
    query = json
    result = agent.invoke(
        {"messages": [HumanMessage(content=query)]},
        config={"recursion_limit": 10}
    )

    final_message = result["messages"][-1]
    return {"status": "success", "message": final_message.content}


@app.get("/health")
async def health():
    return {"status": "healthy"}


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
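
Usage note (not part of the committed files): a minimal sketch of how a client might call this service once it is running locally. It assumes the `requests` package is available on the client side (it is not in requirements.txt), and the payload field names are illustrative only, since this commit does not define an input schema. Note that the endpoint reads the payload from a request header named `json`, not from the request body.

# Hypothetical client call for the endpoint defined in code/app.py above.
import json as jsonlib
import requests  # assumed client-side dependency, not part of this commit

payload = {"patient_age": 72, "sex": "male", "procedure": "bone densitometry"}  # illustrative fields

resp = requests.post(
    "http://localhost:8000/",
    headers={"json": jsonlib.dumps(payload)},  # payload travels in the 'json' header
    timeout=60,
)
print(resp.json())  # e.g. {"status": "success", "message": "Approved: ..."}

print(requests.get("http://localhost:8000/health", timeout=5).json())  # {"status": "healthy"}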

code/dockerfile (new file, 15 lines)
@@ -0,0 +1,15 @@
FROM python:3.12

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install curl for health checks
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*

COPY app.py .

EXPOSE 8000

CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

code/requirements.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
langgraph
langchain-aws
langchain
PyPDF2

infra/.terraform.lock.hcl (generated, new file, 25 lines)
@@ -0,0 +1,25 @@
# This file is maintained automatically by "terraform init".
# Manual edits may be lost in future updates.

provider "registry.terraform.io/hashicorp/aws" {
  version     = "6.27.0"
  constraints = "~> 6.27"
  hashes = [
    "h1:bixp2PSsP5ZGBczGCxcbSDn6lF5QFlUXlNroq9cdab4=",
    "zh:177a24b806c72e8484b5cabc93b2b38e3d770ae6f745a998b54d6619fd0e8129",
    "zh:4ac4a85c14fb868a3306b542e6a56c10bd6c6d5a67bc0c9b8f6a9060cf5f3be7",
    "zh:552652185bc85c8ba1da1d65dea47c454728a5c6839c458b6dcd3ce71c19ccfc",
    "zh:60284b8172d09aee91eae0856f09855eaf040ce3a58d6933602ae17c53f8ed04",
    "zh:6be38d156756ca61fb8e7c752cc5d769cd709686700ac4b230f40a6e95b5dbc9",
    "zh:7a409138fae4ef42e3a637e37cb9efedf96459e28a3c764fc4e855e8db9a7485",
    "zh:8070cf5224ed1ed3a3e9a59f7c30ff88bf071c7567165275d477c1738a56c064",
    "zh:894439ef340a9a79f69cd759e27ad11c7826adeca27be1b1ca82b3c9702fa300",
    "zh:89d035eebf08a97c89374ff06040955ddc09f275ecca609d0c9d58d149bef5cf",
    "zh:985b1145d724fc1f38369099e4a5087141885740fd6c0b1dbc492171e73c2e49",
    "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
    "zh:a80b47ae8d1475201c86bd94a5dcb9dd4da5e8b73102a90820b68b66b76d50fd",
    "zh:d3395be1556210f82199b9166a6b2e677cee9c4b67e96e63f6c3a98325ad7ab0",
    "zh:db0b869d09657f6f1e4110b56093c5fcdf9dbdd97c020db1e577b239c0adcbce",
    "zh:ffc72e680370ae7c21f9bd3082c6317730df805c6797427839a6b6b7e9a26a01",
  ]
}

infra/ecr/main.tf (new file, 32 lines)
@@ -0,0 +1,32 @@
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.27"
    }
  }
}

provider "aws" {
  region = var.aws_region
}

resource "aws_ecr_repository" "app" {
  name                 = var.repository_name
  image_tag_mutability = "MUTABLE" # or "IMMUTABLE"

  image_scanning_configuration {
    scan_on_push = true
  }

  encryption_configuration {
    encryption_type = "AES256" # or "KMS" for customer managed keys
  }

  tags = {
    Name        = var.repository_name
    Environment = var.environment
    ManagedBy   = "Terraform"
  }
}

infra/ecr/outputs.tf (new file, 14 lines)
@@ -0,0 +1,14 @@
output "repository_url" {
  description = "ECR repository URL"
  value       = aws_ecr_repository.app.repository_url
}

output "repository_arn" {
  description = "ECR repository ARN"
  value       = aws_ecr_repository.app.arn
}

output "repository_name" {
  description = "ECR repository name"
  value       = aws_ecr_repository.app.name
}

infra/ecr/terraform.tfstate (new file, 75 lines)
@@ -0,0 +1,75 @@
{
  "version": 4,
  "terraform_version": "1.14.3",
  "serial": 6,
  "lineage": "b2b2331d-cf66-169e-d25b-38e0528505fc",
  "outputs": {
    "repository_arn": {
      "value": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
      "type": "string"
    },
    "repository_name": {
      "value": "upflux-doc-analyser",
      "type": "string"
    },
    "repository_url": {
      "value": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
      "type": "string"
    }
  },
  "resources": [
    {
      "mode": "managed",
      "type": "aws_ecr_repository",
      "name": "app",
      "provider": "provider[\"registry.terraform.io/hashicorp/aws\"]",
      "instances": [
        {
          "schema_version": 0,
          "attributes": {
            "arn": "arn:aws:ecr:us-east-2:232048051668:repository/upflux-doc-analyser",
            "encryption_configuration": [
              {
                "encryption_type": "AES256",
                "kms_key": ""
              }
            ],
            "force_delete": null,
            "id": "upflux-doc-analyser",
            "image_scanning_configuration": [
              {
                "scan_on_push": true
              }
            ],
            "image_tag_mutability": "MUTABLE",
            "image_tag_mutability_exclusion_filter": [],
            "name": "upflux-doc-analyser",
            "region": "us-east-2",
            "registry_id": "232048051668",
            "repository_url": "232048051668.dkr.ecr.us-east-2.amazonaws.com/upflux-doc-analyser",
            "tags": {
              "Environment": "dev",
              "ManagedBy": "Terraform",
              "Name": "upflux-doc-analyser"
            },
            "tags_all": {
              "Environment": "dev",
              "ManagedBy": "Terraform",
              "Name": "upflux-doc-analyser"
            },
            "timeouts": null
          },
          "sensitive_attributes": [],
          "identity_schema_version": 0,
          "identity": {
            "account_id": "232048051668",
            "name": "upflux-doc-analyser",
            "region": "us-east-2"
          },
          "private": "eyJlMmJmYjczMC1lY2FhLTExZTYtOGY4OC0zNDM2M2JjN2M0YzAiOnsiZGVsZXRlIjoxMjAwMDAwMDAwMDAwfX0="
        }
      ]
    }
  ],
  "check_results": null
}

infra/ecr/terraform.tfvars (new file, 3 lines)
@@ -0,0 +1,3 @@
aws_region      = "us-east-2"
repository_name = "upflux-doc-analyser"
environment     = "dev"

infra/ecr/variable.tf (new file, 16 lines)
@@ -0,0 +1,16 @@
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

variable "repository_name" {
  description = "Name of the ECR repository"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

infra/ecs_alb/main.tf (new file, 249 lines)
@@ -0,0 +1,249 @@
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 6.27"
    }
  }
}

provider "aws" {
  region = var.aws_region
}

# Get current AWS account
data "aws_caller_identity" "current" {}

# Reference existing VPC
data "aws_vpc" "existing" {
  id = var.vpc_id
}

# Reference existing public subnets
data "aws_subnet" "public" {
  count = length(var.public_subnet_ids)
  id    = var.public_subnet_ids[count.index]
}

# Reference existing private subnets (for ECS tasks)
data "aws_subnet" "private" {
  count = length(var.private_subnet_ids)
  id    = var.private_subnet_ids[count.index]
}

# Security Group for ALB (in public subnets)
resource "aws_security_group" "alb" {
  name        = "${var.app_name}-alb-sg"
  description = "Allow inbound traffic to ALB"
  vpc_id      = data.aws_vpc.existing.id

  ingress {
    from_port   = 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["3.14.44.224/32"]
    description = "Allow HTTP from the allowed source IP only"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound"
  }

  tags = {
    Name = "${var.app_name}-alb-sg"
  }
}

# Security Group for ECS Tasks (in private subnets)
resource "aws_security_group" "ecs_tasks" {
  name        = "${var.app_name}-ecs-tasks-sg"
  description = "Allow inbound traffic from ALB"
  vpc_id      = data.aws_vpc.existing.id

  ingress {
    from_port       = 8000
    to_port         = 8000
    protocol        = "tcp"
    security_groups = [aws_security_group.alb.id]
    description     = "Allow traffic from ALB"
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound"
  }

  tags = {
    Name = "${var.app_name}-ecs-tasks-sg"
  }
}

# Application Load Balancer (in public subnets)
resource "aws_lb" "main" {
  name               = "${var.app_name}-alb"
  internal           = false
  load_balancer_type = "application"
  security_groups    = [aws_security_group.alb.id]
  subnets            = var.public_subnet_ids

  enable_deletion_protection = false

  tags = {
    Name = "${var.app_name}-alb"
  }
}

# Target Group
resource "aws_lb_target_group" "app" {
  name        = "${var.app_name}-tg"
  port        = 8000
  protocol    = "HTTP"
  vpc_id      = data.aws_vpc.existing.id
  target_type = "ip"

  health_check {
    enabled             = true
    healthy_threshold   = 2
    interval            = 30
    matcher             = "200"
    path                = "/health"
    port                = "traffic-port"
    protocol            = "HTTP"
    timeout             = 5
    unhealthy_threshold = 3
  }

  deregistration_delay = 30

  tags = {
    Name = "${var.app_name}-tg"
  }
}

# ALB Listener
resource "aws_lb_listener" "app" {
  load_balancer_arn = aws_lb.main.arn
  port              = "80"
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.app.arn
  }
}

# ECS Cluster
resource "aws_ecs_cluster" "main" {
  name = "${var.app_name}-cluster"

  tags = {
    Name = "${var.app_name}-cluster"
  }
}

# CloudWatch Log Group
resource "aws_cloudwatch_log_group" "app" {
  name              = "/ecs/${var.app_name}"
  retention_in_days = 7

  tags = {
    Name = "${var.app_name}-logs"
  }
}

# ECS Task Execution Role
resource "aws_iam_role" "ecs_task_execution_role" {
  name = "${var.app_name}-ecs-task-execution-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ecs-tasks.amazonaws.com"
      }
    }]
  })
}

resource "aws_iam_role_policy_attachment" "ecs_task_execution_role_policy" {
  role       = aws_iam_role.ecs_task_execution_role.name
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
}

# ECS Task Definition
resource "aws_ecs_task_definition" "app" {
  family                   = var.app_name
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = var.fargate_cpu
  memory                   = var.fargate_memory
  execution_role_arn       = aws_iam_role.ecs_task_execution_role.arn

  container_definitions = jsonencode([{
    name  = var.app_name
    image = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${var.aws_region}.amazonaws.com/${var.ecr_repository_name}:${var.image_tag}"

    portMappings = [{
      containerPort = 8000
      hostPort      = 8000
      protocol      = "tcp"
    }]

    logConfiguration = {
      logDriver = "awslogs"
      options = {
        "awslogs-group"         = aws_cloudwatch_log_group.app.name
        "awslogs-region"        = var.aws_region
        "awslogs-stream-prefix" = "ecs"
      }
    }

    healthCheck = {
      command     = ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"]
      interval    = 30
      timeout     = 5
      retries     = 3
      startPeriod = 60
    }
  }])

  tags = {
    Name = "${var.app_name}-task"
  }
}

# ECS Service (tasks in private subnets)
resource "aws_ecs_service" "app" {
  name            = "${var.app_name}-service"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.app.arn
  desired_count   = var.app_count
  launch_type     = "FARGATE"

  network_configuration {
    security_groups  = [aws_security_group.ecs_tasks.id]
    subnets          = var.private_subnet_ids # ECS tasks in private subnets
    assign_public_ip = false # No public IP needed with NAT gateway
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.app.arn
    container_name   = var.app_name
    container_port   = 8000
  }

  depends_on = [aws_lb_listener.app]

  tags = {
    Name = "${var.app_name}-service"
  }
}

infra/ecs_alb/terraform.tfvars (new file, 23 lines)
@@ -0,0 +1,23 @@
aws_region = "us-east-2"
app_name   = "upflux-doc-analyser"

# Replace these with your actual IDs
vpc_id = "vpc-0270f02aee3bf1b8d"

# Your public subnets (where the ALB will be)
public_subnet_ids = [
  "subnet-088bc49c54ec8f028", # Public subnet 1
  "subnet-003f1693910a99afb"  # Public subnet 2
]

# Your private subnets (where the ECS tasks will run)
private_subnet_ids = [
  "subnet-045f73d784beed091", # Private subnet 1
  "subnet-06e660f44bf141442"  # Private subnet 2
]

ecr_repository_name = "upflux-doc-analyser"
image_tag           = "latest"
fargate_cpu         = "256"
fargate_memory      = "512"
app_count           = 1

infra/ecs_alb/variables.tf (new file, 56 lines)
@@ -0,0 +1,56 @@
variable "aws_region" {
  description = "AWS region"
  type        = string
  default     = "us-east-1"
}

variable "app_name" {
  description = "Application name"
  type        = string
  default     = "fastapi-app"
}

variable "vpc_id" {
  description = "Existing VPC ID"
  type        = string
}

variable "public_subnet_ids" {
  description = "List of existing public subnet IDs (for ALB)"
  type        = list(string)
}

variable "private_subnet_ids" {
  description = "List of existing private subnet IDs (for ECS tasks)"
  type        = list(string)
}

variable "ecr_repository_name" {
  description = "ECR repository name"
  type        = string
  default     = "fastapi-app"
}

variable "image_tag" {
  description = "Docker image tag"
  type        = string
  default     = "latest"
}

variable "fargate_cpu" {
  description = "Fargate CPU units"
  type        = string
  default     = "256"
}

variable "fargate_memory" {
  description = "Fargate memory in MB"
  type        = string
  default     = "512"
}

variable "app_count" {
  description = "Number of tasks to run"
  type        = number
  default     = 1
}

infra/terraform.tfvars (new file, 3 lines)
@@ -0,0 +1,3 @@
aws_region      = "us-east-2"
repository_name = "upflux-doc-analyser"
environment     = "dev"

scripts/process_images_batch.py (new executable file, 404 lines)
@@ -0,0 +1,404 @@
#!/usr/bin/env python3
"""
Batch process images from S3 using AWS Textract.
Iterates through folders (prefixes) in an S3 bucket and processes any PDF, PNG, or JPEG files
that haven't been processed yet (checking for existing Textract output files).
Saves both JSON and plain text outputs locally.
"""

import boto3
import json
import sys
import os
import io
from pathlib import Path
from typing import Dict, List, Optional
import time

from PyPDF2 import PdfReader


def get_s3_client():
    """Initialize and return an AWS S3 client."""
    return boto3.client('s3', region_name="us-east-2")


def get_textract_client():
    """Initialize and return an AWS Textract client."""
    return boto3.client('textract', region_name="us-east-2")


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Get the number of pages in a PDF file.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        return len(pdf_reader.pages)
    except Exception as e:
        print(f"  Warning: Could not determine page count: {str(e)}")
        return 1


def is_already_processed(s3_key: str, output_dir: Path) -> bool:
    """
    Check if an image has already been processed by looking for its output file.

    Args:
        s3_key: S3 object key
        output_dir: Directory where output files are stored

    Returns:
        bool: True if the output file exists, False otherwise
    """
    filename = Path(s3_key).stem
    output_file = output_dir / f"{filename}_textract.json"
    return output_file.exists()


def process_image_from_s3(bucket_name: str, s3_key: str) -> Optional[Dict]:
    """
    Process an image file from S3 with AWS Textract.
    Supports PDF, PNG, and JPEG formats.
    Uses the async API (start_document_text_detection) for multi-page PDFs,
    and the sync API (detect_document_text) for single-page PDFs and images.

    Args:
        bucket_name: S3 bucket name
        s3_key: S3 object key

    Returns:
        dict: Textract response containing detected text, or None if processing failed
    """
    textract = get_textract_client()
    s3 = get_s3_client()

    try:
        # Verify the object exists first
        try:
            s3.head_object(Bucket=bucket_name, Key=s3_key)
        except Exception as e:
            print(f"  Error accessing S3 object: {str(e)}")
            print(f"  Bucket: {bucket_name}")
            print(f"  Key: {s3_key}")
            return None

        file_ext = Path(s3_key).suffix.lower()

        # For images (PNG, JPEG), always use the sync API
        if file_ext in ['.png', '.jpg', '.jpeg']:
            print("  Processing image with sync API")
            response = textract.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': bucket_name,
                        'Name': s3_key
                    }
                }
            )
            return response

        # For PDFs, check the page count to decide which API to use
        if file_ext == '.pdf':
            # Download the PDF to check the page count
            response = s3.get_object(Bucket=bucket_name, Key=s3_key)
            pdf_bytes = response['Body'].read()
            page_count = get_pdf_page_count(pdf_bytes)

            print(f"  PDF has {page_count} page(s)")

            # Use the async API for multi-page PDFs
            if page_count > 1:
                print("  Using async API (start_document_text_detection) for multi-page PDF")
                response = textract.start_document_text_detection(
                    DocumentLocation={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': s3_key
                        }
                    }
                )
                job_id = response['JobId']
                print(f"  Started async job: {job_id}")

                # Wait for the job to complete
                while True:
                    result = textract.get_document_text_detection(JobId=job_id)
                    status = result['JobStatus']

                    if status == 'SUCCEEDED':
                        return result
                    elif status == 'FAILED':
                        print(f"  Job failed: {result.get('StatusMessage', 'Unknown error')}")
                        return None

                    time.sleep(2)
            else:
                # Use the sync API for single-page PDFs
                print("  Using sync API (detect_document_text) for single-page PDF")
                response = textract.detect_document_text(
                    Document={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': s3_key
                        }
                    }
                )
                return response

    except Exception as e:
        print(f"  Error processing {s3_key}: {str(e)}")
        return None


def extract_text_from_response(response: Dict) -> str:
    """
    Extract plain text from a Textract response.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text
    """
    if not response:
        return ""

    text_lines = []
    for block in response.get('Blocks', []):
        if block['BlockType'] == 'LINE':
            text_lines.append(block['Text'])

    return '\n'.join(text_lines)


def save_textract_output(s3_key: str, response: Dict, output_dir: Path):
    """
    Save the Textract response to a JSON file and a plain text file locally.

    Args:
        s3_key: S3 object key
        response: Textract API response
        output_dir: Directory to save output files
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    filename = Path(s3_key).stem

    # Extract text
    extracted_text = extract_text_from_response(response)

    # Save JSON output
    json_output_file = output_dir / f"{filename}_textract.json"
    if response:
        response['extracted_text'] = extracted_text
        response['source_s3_key'] = s3_key

    with open(json_output_file, 'w', encoding='utf-8') as f:
        json.dump(response, f, indent=2, ensure_ascii=False)

    print(f"  ✓ Saved JSON to: {json_output_file.name}")

    # Save plain text output
    text_output_file = output_dir / f"{filename}.txt"
    with open(text_output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    print(f"  ✓ Saved text to: {text_output_file.name}")


def get_supported_images_from_s3(bucket_name: str, prefix: str) -> List[str]:
    """
    Get the list of supported image files in an S3 prefix (folder).
    Filters out files containing 'script' (case-insensitive).

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)

    Returns:
        List of S3 keys for supported image files
    """
    s3 = get_s3_client()
    supported_extensions = {'.pdf', '.png', '.jpg', '.jpeg'}
    images = []

    # Ensure the prefix ends with / if it's not empty
    if prefix and not prefix.endswith('/'):
        prefix += '/'

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')

    for page in pages:
        for obj in page.get('Contents', []):
            key = obj['Key']
            file_path = Path(key)

            # Check that it's a file (not a folder) with a supported extension
            if file_path.suffix.lower() in supported_extensions:
                # Filter out files containing 'script' (case-insensitive)
                if 'script' not in file_path.name.lower():
                    images.append(key)

    return sorted(images)


def get_folders_from_s3(bucket_name: str, base_prefix: str = '') -> List[str]:
    """
    Get the list of folders (prefixes) in an S3 bucket.

    Args:
        bucket_name: S3 bucket name
        base_prefix: Base prefix to search under

    Returns:
        List of folder prefixes
    """
    s3 = get_s3_client()
    folders = []

    # Ensure the prefix ends with / if it's not empty
    if base_prefix and not base_prefix.endswith('/'):
        base_prefix += '/'

    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=base_prefix, Delimiter='/')

    for page in pages:
        for prefix_info in page.get('CommonPrefixes', []):
            folders.append(prefix_info['Prefix'])

    return folders


def process_folder(bucket_name: str, prefix: str, output_base_dir: Path, skip_existing: bool = True):
    """
    Process all images in an S3 folder (prefix).

    Args:
        bucket_name: S3 bucket name
        prefix: S3 prefix (folder path)
        output_base_dir: Base directory for output files
        skip_existing: Whether to skip already processed files
    """
    folder_name = prefix.rstrip('/').split('/')[-1] or 'root'
    output_dir = output_base_dir / folder_name

    print(f"\n{'='*80}")
    print(f"Processing folder: {prefix}")
    print(f"{'='*80}")

    images = get_supported_images_from_s3(bucket_name, prefix)

    if not images:
        print("  No supported images found (PDF, PNG, JPEG)")
        return

    print(f"  Found {len(images)} image(s)")

    processed_count = 0
    skipped_count = 0
    error_count = 0

    for s3_key in images:
        filename = Path(s3_key).name
        print(f"\n  Processing: {filename}")

        # Check if already processed
        if skip_existing and is_already_processed(s3_key, output_dir):
            print("  ⊘ Skipped (already processed)")
            skipped_count += 1
            continue

        # Process with Textract
        response = process_image_from_s3(bucket_name, s3_key)

        if response:
            # Save output (both JSON and text)
            save_textract_output(s3_key, response, output_dir)

            # Print summary
            num_blocks = len(response.get('Blocks', []))
            text_length = len(extract_text_from_response(response))
            print(f"  ℹ Extracted {text_length} characters, {num_blocks} blocks")

            processed_count += 1

            # Small delay to avoid rate limiting
            time.sleep(0.5)
        else:
            error_count += 1

    print(f"\n  Summary for {folder_name}:")
    print(f"    Processed: {processed_count}")
    print(f"    Skipped: {skipped_count}")
    print(f"    Errors: {error_count}")


def main():
    """Main entry point for the script."""
    # Get the bucket name from the environment or command line
    bucket_name = os.environ.get('S3_BUCKET_NAME')
    base_prefix = os.environ.get('S3_BASE_PREFIX', 'imagens')

    if len(sys.argv) > 1:
        bucket_name = sys.argv[1]
    if len(sys.argv) > 2:
        base_prefix = sys.argv[2]

    if not bucket_name:
        print("Error: S3 bucket name not provided.")
        print("\nUsage:")
        print("  python process_images_batch.py <bucket_name> [base_prefix]")
        print("\nOr set environment variables:")
        print("  export S3_BUCKET_NAME=my-bucket")
        print("  export S3_BASE_PREFIX=imagens")
        print("  python process_images_batch.py")
        sys.exit(1)

    # Get the output directory
    script_dir = Path(__file__).parent
    output_base_dir = script_dir / "textract_output"

    print(f"S3 Bucket: {bucket_name}")
    print(f"Base prefix: {base_prefix}")
    print(f"Output directory: {output_base_dir}")

    # Get all folders (prefixes) in the bucket
    print("\nScanning S3 bucket for folders...")
    folders = get_folders_from_s3(bucket_name, base_prefix)

    if not folders:
        print(f"\nNo subdirectories found under '{base_prefix}'.")
        print("Processing files in the base prefix instead...")
        folders = [base_prefix]
    else:
        print(f"\nFound {len(folders)} folder(s) to process")

    # Process each folder
    total_start = time.time()

    for prefix in folders:
        try:
            process_folder(bucket_name, prefix, output_base_dir)
        except Exception as e:
            print(f"\nError processing folder {prefix}: {str(e)}")
            import traceback
            traceback.print_exc()
            continue

    total_time = time.time() - total_start

    print(f"\n{'='*80}")
    print("Batch processing complete!")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"{'='*80}")


if __name__ == '__main__':
    main()

scripts/textract.py (new file, 209 lines)
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF file.
Extracts text and returns the detected content.
"""

import boto3
import sys
import io
from pathlib import Path

from PyPDF2 import PdfReader


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Get the number of pages in a PDF file.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        return len(pdf_reader.pages)
    except Exception as e:
        print(f"Warning: Could not determine page count: {str(e)}")
        return 1


def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.
    Uses the async API (start_document_text_detection) for multi-page PDFs,
    and the sync API (detect_document_text) for single-page PDFs and images.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if the document is in S3

    Returns:
        dict: Textract response containing detected text
    """
    textract = boto3.client('textract')
    file_ext = Path(pdf_path).suffix.lower()

    # For images (PNG, JPEG), always use the sync API
    if file_ext in ['.png', '.jpg', '.jpeg']:
        print("Processing image file with sync API")

        if bucket_name:
            response = textract.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': bucket_name,
                        'Name': pdf_path
                    }
                }
            )
        else:
            with open(pdf_path, 'rb') as file:
                file_bytes = file.read()
            response = textract.detect_document_text(
                Document={'Bytes': file_bytes}
            )

        return response

    # For PDFs, check the page count to decide which API to use
    if file_ext == '.pdf':
        s3 = boto3.client('s3')

        # Determine the number of pages
        if bucket_name:
            # Download the PDF from S3 to check the page count
            response = s3.get_object(Bucket=bucket_name, Key=pdf_path)
            pdf_bytes = response['Body'].read()
        else:
            # Read the local PDF
            with open(pdf_path, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

        page_count = get_pdf_page_count(pdf_bytes)
        print(f"PDF has {page_count} page(s)")

        # Use the async API for multi-page PDFs
        if page_count > 1:
            print("Using async API (start_document_text_detection) for multi-page PDF")

            if bucket_name:
                # Process from S3
                response = textract.start_document_text_detection(
                    DocumentLocation={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': pdf_path
                        }
                    }
                )
            else:
                # For local files with multiple pages, we need to use S3
                # Note: the Textract async API requires S3
                raise ValueError(
                    "Multi-page PDFs must be processed from S3. "
                    "Please upload the file to S3 first."
                )

            job_id = response['JobId']
            print(f"Started Textract job: {job_id}")

            # Wait for the job to complete
            import time
            while True:
                result = textract.get_document_text_detection(JobId=job_id)
                status = result['JobStatus']
                print(f"Job status: {status}")

                if status in ['SUCCEEDED', 'FAILED']:
                    break
                time.sleep(2)

            return result
        else:
            # Use the sync API for single-page PDFs
            print("Using sync API (detect_document_text) for single-page PDF")

            if bucket_name:
                response = textract.detect_document_text(
                    Document={
                        'S3Object': {
                            'Bucket': bucket_name,
                            'Name': pdf_path
                        }
                    }
                )
            else:
                response = textract.detect_document_text(
                    Document={'Bytes': pdf_bytes}
                )

            return response

    # Unsupported file type
    raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")


def extract_text_from_response(response: dict) -> str:
    """
    Extract plain text from a Textract response.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text
    """
    text_lines = []

    for block in response.get('Blocks', []):
        if block['BlockType'] == 'LINE':
            text_lines.append(block['Text'])

    return '\n'.join(text_lines)


def main():
    if len(sys.argv) < 2:
        print("Usage: python textract.py <pdf_path> [s3_bucket]")
        print("\nExamples:")
        print("  python textract.py document.pdf")
        print("  python textract.py path/to/doc.pdf my-bucket")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process the PDF
    response = process_pdf_with_textract(pdf_path, bucket_name)

    # Extract and display the text
    text = extract_text_from_response(response)

    print("\n" + "="*80)
    print("EXTRACTED TEXT")
    print("="*80)
    print(text)
    print("="*80)

    # Print a summary
    num_blocks = len(response.get('Blocks', []))
    num_pages = len(set(b.get('Page', 1) for b in response.get('Blocks', [])))

    print(f"\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {num_blocks}")
    print(f"  Text length: {len(text)} characters")


if __name__ == '__main__':
    main()
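
Usage note (not part of the committed files): a minimal sketch of how the helpers in scripts/textract.py could be reused from another script, assuming the scripts directory is on the Python path and AWS credentials are configured; the file name "exam_request.pdf" is a hypothetical example.

# Hypothetical reuse of the helpers defined in scripts/textract.py above.
from textract import process_pdf_with_textract, extract_text_from_response

response = process_pdf_with_textract("exam_request.pdf")  # local single-page PDF uses the sync API
print(extract_text_from_response(response))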

scripts/tojson.py (new file, 30 lines)
@@ -0,0 +1,30 @@
import pandas as pd
import json
from pathlib import Path

# Configuration
CSV_FILE = "guias.csv"
OUTPUT_DIR = "json_output"
ENCODING = "utf-8"

# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# Read CSV
df = pd.read_csv(CSV_FILE, encoding=ENCODING)

# Convert each row to JSON, skipping the first row (index 0)
for index, row in df.iterrows():
    # Skip the first row (index 0)
    if index == 0:
        print(f"⊗ Skipped row {index + 1}")
        continue

    # Save to an individual JSON file
    output_file = f"{OUTPUT_DIR}/row_{index + 1}.json"
    with open(output_file, 'w', encoding=ENCODING) as json_file:
        json.dump(row.to_dict(), json_file, indent=2, ensure_ascii=False)

    print(f"✓ Created {output_file}")

print(f"\nDone! Created {len(df) - 1} JSON files in '{OUTPUT_DIR}/' directory")