Files
2026-05-14 14:07:04 -03:00

181 lines
6.5 KiB
Python

import boto3
import os
import tempfile
import json
from urllib.parse import urlparse
from diagram_processor import DiagramProcessor
def parse_s3_path(s3_path):
    """Split an S3 URI into its bucket and key components.

    Args:
        s3_path: URI of the form 's3://bucket-name/path/to/file.pdf'.

    Returns:
        Tuple (bucket, key).

    Raises:
        ValueError: If the path does not use the 's3://' scheme.
    """
    if not s3_path.startswith('s3://'):
        raise ValueError(f"Invalid S3 path: {s3_path}. Must start with 's3://'")
    parts = urlparse(s3_path)
    # netloc is the bucket; the key is the path with its leading '/' removed
    return parts.netloc, parts.path.lstrip('/')
def download_from_s3(s3_path, local_path):
    """Fetch an object from S3 and save it to a local file.

    Args:
        s3_path: Source object as an 's3://bucket/key' URI.
        local_path: Destination path on the local filesystem.
    """
    bucket, key = parse_s3_path(s3_path)
    client = boto3.client('s3')
    print(f"Downloading from S3: {s3_path}")
    client.download_file(bucket, key, local_path)
    print(f"Downloaded to: {local_path}")
def _format_match(match):
    """Flatten one raw match dict from the processor into the clean response schema.

    Handles both 'two_labels' matches (text above and below the object) and the
    default single-label match shape. Confidences and distances are rounded to
    two decimals for a stable, readable payload.
    """
    match_type = match.get('match_type', 'vm_label')
    if match_type == 'two_labels':
        return {
            'object_name': match['object_name'],
            'object_confidence': round(match['object_confidence'], 2),
            'match_type': match_type,
            'text_top': match['text_top'],
            'text_top_confidence': round(match['text_confidence_top'], 2),
            'text_bottom': match['text_bottom'],
            'text_bottom_confidence': round(match['text_confidence_bottom'], 2),
            'object_bbox': match['object_bbox'],
            'text_bbox_top': match['text_bbox_top'],
            'text_bbox_bottom': match['text_bbox_bottom']
        }
    return {
        'object_name': match['object_name'],
        'object_confidence': round(match['object_confidence'], 2),
        'match_type': match_type,
        'text': match['text'],
        'text_confidence': round(match['text_confidence'], 2),
        'distance_pixels': round(match['distance_pixels'], 2),
        'object_bbox': match['object_bbox'],
        'text_bbox': match['text_bbox']
    }


def execute(s3_path):
    """
    Function A - Process diagram from S3 and return matches only

    Downloads the diagram into a temporary workspace, runs DiagramProcessor
    over it, and returns only the label/block matching results (plus anything
    that could not be matched). All intermediate files live in a temp dir
    that is removed when processing finishes.

    Args:
        s3_path: S3 path to diagram (e.g., 's3://my-bucket/diagrams/diagram.pdf')

    Returns:
        Dictionary with matches of labels and blocks; on failure, a dict with
        'status': 'error' and the error message (no exception is propagated).
    """
    print("Function A - Diagram Processing")
    print(f"Input S3 path: {s3_path}")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download diagram from S3 into the temp workspace
        bucket, key = parse_s3_path(s3_path)
        input_file = os.path.join(temp_dir, os.path.basename(key))
        download_from_s3(s3_path, input_file)

        # Processor writes its intermediate artifacts here
        output_dir = os.path.join(temp_dir, 'output')
        os.makedirs(output_dir, exist_ok=True)

        print("\nInitializing DiagramProcessor...")
        processor = DiagramProcessor(
            region=os.environ.get('AWS_REGION', 'us-east-1'),
            custom_labels_arn=os.environ.get('CUSTOM_LABELS_ARN', 'arn:aws:rekognition:us-east-1:173378533286:project/labels-valvula/version/labels-valvula.2025-11-24T15.44.16/1764009856090')
        )

        print("\nProcessing diagram...")
        try:
            results = processor.process_single_diagram(
                diagram_path=input_file,
                output_base_dir=output_dir,
                grid_size=(5, 5),
                overlap_percent=10,
                # Fix: the original list contained r'\+' twice; the duplicate
                # entry was redundant and has been removed.
                keep_regex_list=[r'\+', r'.*[Xx].*', r'\*', r'\\'],
                min_confidence=80,
                custom_labels_confidence=60,
                iou_threshold=0.3,
                matching_max_distance=200
            )

            # Keep only the matching section of the processor output
            matching_results = results['matching_results']
            formatted_matches = [
                _format_match(match) for match in matching_results['matches']
            ]

            # Objects detected but never paired with a text label
            unmatched_objects = [
                {
                    'name': obj['Name'],
                    'confidence': round(obj['Confidence'], 2),
                    'bbox': obj['global_bbox']
                }
                for obj in matching_results['unmatched_objects']
            ]

            # Text labels detected but never paired with an object
            unmatched_texts = [
                {
                    'text': text['text'],
                    'confidence': round(text['confidence'], 2),
                    'bbox': text['global_bbox']
                }
                for text in matching_results['unmatched_texts']
            ]

            response = {
                'status': 'success',
                'input_s3_path': s3_path,
                'summary': {
                    'total_matches': len(formatted_matches),
                    'unmatched_objects': len(unmatched_objects),
                    'unmatched_texts': len(unmatched_texts),
                    'matching_rate': f"{matching_results['matching_rate']*100:.1f}%"
                },
                'matches': formatted_matches,
                'unmatched_objects': unmatched_objects,
                'unmatched_texts': unmatched_texts
            }

            print("\n" + "="*80)
            print("PROCESSING COMPLETE")
            print("="*80)
            print(f"Total matches: {len(formatted_matches)}")
            print(f"Matching rate: {matching_results['matching_rate']*100:.1f}%")
            print(f"Unmatched objects: {len(unmatched_objects)}")
            print(f"Unmatched texts: {len(unmatched_texts)}")

            return response
        except Exception as e:
            # Best-effort error reporting: callers receive a structured error
            # dict rather than an exception (preserves original contract).
            error_message = f"Error processing diagram: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()
            return {
                'status': 'error',
                'error': error_message,
                'input_s3_path': s3_path
            }