import json
import os
import tempfile
import traceback
from urllib.parse import urlparse

import boto3

from diagram_processor import DiagramProcessor

_S3_SCHEME_PREFIX = 's3://'

# Fallback Rekognition Custom Labels model version, used when the
# CUSTOM_LABELS_ARN environment variable is not set.
_DEFAULT_CUSTOM_LABELS_ARN = (
    'arn:aws:rekognition:us-east-1:173378533286:project/labels-valvula/'
    'version/labels-valvula.2025-11-24T15.44.16/1764009856090'
)


def parse_s3_path(s3_path):
    """
    Parse S3 path into bucket and key

    The path is split by hand rather than with ``urlparse``: URL parsing
    treats ``?`` and ``#`` as query/fragment delimiters and would silently
    truncate object keys that contain those characters.

    Args:
        s3_path: S3 path like 's3://bucket-name/path/to/file.pdf'

    Returns:
        Tuple (bucket, key). Leading slashes are stripped from the key.

    Raises:
        ValueError: If the path does not start with 's3://' or has no
            bucket name.
    """
    if not s3_path.startswith(_S3_SCHEME_PREFIX):
        raise ValueError(f"Invalid S3 path: {s3_path}. Must start with 's3://'")

    # Everything up to the first '/' is the bucket; the rest is the key,
    # kept verbatim (including any '?' or '#').
    bucket, _, key = s3_path[len(_S3_SCHEME_PREFIX):].partition('/')
    if not bucket:
        raise ValueError(f"Invalid S3 path: {s3_path}. Missing bucket name")

    return bucket, key.lstrip('/')


def download_from_s3(s3_path, local_path):
    """
    Download file from S3

    Args:
        s3_path: S3 path (s3://bucket/key)
        local_path: Local file path to save to

    Raises:
        ValueError: If s3_path is rejected by parse_s3_path.
    """
    bucket, key = parse_s3_path(s3_path)

    s3_client = boto3.client('s3')

    print(f"Downloading from S3: {s3_path}")
    s3_client.download_file(bucket, key, local_path)
    print(f"Downloaded to: {local_path}")


def _format_match(match):
    """Build the output dict for one match, rounding confidences to 2 places."""
    match_type = match.get('match_type', 'vm_label')

    if match_type == 'two_labels':
        return {
            'object_name': match['object_name'],
            'object_confidence': round(match['object_confidence'], 2),
            'match_type': match_type,
            'text_top': match['text_top'],
            'text_top_confidence': round(match['text_confidence_top'], 2),
            'text_bottom': match['text_bottom'],
            'text_bottom_confidence': round(match['text_confidence_bottom'], 2),
            'object_bbox': match['object_bbox'],
            'text_bbox_top': match['text_bbox_top'],
            'text_bbox_bottom': match['text_bbox_bottom'],
        }

    return {
        'object_name': match['object_name'],
        'object_confidence': round(match['object_confidence'], 2),
        'match_type': match_type,
        'text': match['text'],
        'text_confidence': round(match['text_confidence'], 2),
        'distance_pixels': round(match['distance_pixels'], 2),
        'object_bbox': match['object_bbox'],
        'text_bbox': match['text_bbox'],
    }


def _format_unmatched_object(obj):
    """Build the output dict for one detected object that has no match."""
    return {
        'name': obj['Name'],
        'confidence': round(obj['Confidence'], 2),
        'bbox': obj['global_bbox'],
    }


def _format_unmatched_text(text):
    """Build the output dict for one detected text that has no match."""
    return {
        'text': text['text'],
        'confidence': round(text['confidence'], 2),
        'bbox': text['global_bbox'],
    }


def execute(s3_path):
    """
    Function A - Process diagram from S3 and return matches only

    Downloads the diagram into a temporary directory, runs
    DiagramProcessor.process_single_diagram on it and reshapes the
    'matching_results' entry of its result into a response dict. The
    temporary directory is removed when the function returns.

    Args:
        s3_path: S3 path to diagram (e.g., 's3://my-bucket/diagrams/diagram.pdf')

    Returns:
        On success, a dictionary with 'status' == 'success', 'input_s3_path',
        'summary', 'matches', 'unmatched_objects' and 'unmatched_texts'.
        If processing or formatting raises, a dictionary with
        'status' == 'error', 'error' and 'input_s3_path'.

    Raises:
        ValueError: If s3_path is rejected by parse_s3_path. Path parsing,
            the download and processor construction happen outside the
            error-handling block, so their exceptions propagate to the caller.
    """
    print("Function A - Diagram Processing")
    print(f"Input S3 path: {s3_path}")

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        # Download diagram from S3, keeping the object's file name locally
        _, key = parse_s3_path(s3_path)
        input_file = os.path.join(temp_dir, os.path.basename(key))
        download_from_s3(s3_path, input_file)

        # Create output directory for processing
        output_dir = os.path.join(temp_dir, 'output')
        os.makedirs(output_dir, exist_ok=True)

        # Initialize processor
        print("\nInitializing DiagramProcessor...")
        processor = DiagramProcessor(
            region=os.environ.get('AWS_REGION', 'us-east-1'),
            custom_labels_arn=os.environ.get(
                'CUSTOM_LABELS_ARN', _DEFAULT_CUSTOM_LABELS_ARN
            ),
        )

        # Process diagram
        print("\nProcessing diagram...")
        try:
            results = processor.process_single_diagram(
                diagram_path=input_file,
                output_base_dir=output_dir,
                grid_size=(5, 5),
                overlap_percent=10,
                # NOTE(review): r'\+' appears twice; the duplicate is kept
                # as-is but may be a typo for another pattern - confirm.
                keep_regex_list=[r'\+', r'\+', r'.*[Xx].*', r'\*', r'\\'],
                min_confidence=80,
                custom_labels_confidence=60,
                iou_threshold=0.3,
                matching_max_distance=200,
            )

            # Extract only the matches
            matching_results = results['matching_results']

            formatted_matches = [
                _format_match(match) for match in matching_results['matches']
            ]
            unmatched_objects = [
                _format_unmatched_object(obj)
                for obj in matching_results['unmatched_objects']
            ]
            unmatched_texts = [
                _format_unmatched_text(text)
                for text in matching_results['unmatched_texts']
            ]

            # 'matching_rate' is scaled by 100 for display as a percentage
            matching_rate = f"{matching_results['matching_rate']*100:.1f}%"

            # Prepare response
            response = {
                'status': 'success',
                'input_s3_path': s3_path,
                'summary': {
                    'total_matches': len(formatted_matches),
                    'unmatched_objects': len(unmatched_objects),
                    'unmatched_texts': len(unmatched_texts),
                    'matching_rate': matching_rate,
                },
                'matches': formatted_matches,
                'unmatched_objects': unmatched_objects,
                'unmatched_texts': unmatched_texts,
            }

            print("\n" + "="*80)
            print("PROCESSING COMPLETE")
            print("="*80)
            print(f"Total matches: {len(formatted_matches)}")
            print(f"Matching rate: {matching_rate}")
            print(f"Unmatched objects: {len(unmatched_objects)}")
            print(f"Unmatched texts: {len(unmatched_texts)}")

            return response

        except Exception as e:
            # Boundary handler: report the failure in the response instead
            # of raising, and dump the traceback for diagnosis.
            error_message = f"Error processing diagram: {str(e)}"
            print(error_message)
            traceback.print_exc()

            return {
                'status': 'error',
                'error': error_message,
                'input_s3_path': s3_path,
            }