Files
2026-05-14 14:07:04 -03:00

181 lines
6.5 KiB
Python

import boto3
import os
import tempfile
import json
from urllib.parse import urlparse
from diagram_processor import DiagramProcessor
def parse_s3_path(s3_path):
    """Split an S3 URI into its bucket and key components.

    Args:
        s3_path: URI of the form 's3://bucket-name/path/to/file.pdf'.

    Returns:
        Tuple (bucket, key).

    Raises:
        ValueError: If the path does not use the 's3://' scheme.
    """
    if not s3_path.startswith('s3://'):
        raise ValueError(f"Invalid S3 path: {s3_path}. Must start with 's3://'")
    parts = urlparse(s3_path)
    # netloc is the bucket; the key is the path with its leading '/' removed
    return parts.netloc, parts.path.lstrip('/')
def download_from_s3(s3_path, local_path):
    """Fetch an object from S3 and save it to a local file.

    Args:
        s3_path: Source object as an 's3://bucket/key' URI.
        local_path: Destination path on the local filesystem.
    """
    bucket, key = parse_s3_path(s3_path)
    client = boto3.client('s3')
    print(f"Downloading from S3: {s3_path}")
    client.download_file(bucket, key, local_path)
    print(f"Downloaded to: {local_path}")
def _format_match(match):
    """Flatten one raw match dict from the processor into the clean response schema.

    Handles both 'two_labels' matches (text above and below the object) and the
    default single-label match shape. Confidences and distances are rounded to
    two decimals for a stable, readable payload.
    """
    match_type = match.get('match_type', 'vm_label')
    if match_type == 'two_labels':
        return {
            'object_name': match['object_name'],
            'object_confidence': round(match['object_confidence'], 2),
            'match_type': match_type,
            'text_top': match['text_top'],
            'text_top_confidence': round(match['text_confidence_top'], 2),
            'text_bottom': match['text_bottom'],
            'text_bottom_confidence': round(match['text_confidence_bottom'], 2),
            'object_bbox': match['object_bbox'],
            'text_bbox_top': match['text_bbox_top'],
            'text_bbox_bottom': match['text_bbox_bottom']
        }
    return {
        'object_name': match['object_name'],
        'object_confidence': round(match['object_confidence'], 2),
        'match_type': match_type,
        'text': match['text'],
        'text_confidence': round(match['text_confidence'], 2),
        'distance_pixels': round(match['distance_pixels'], 2),
        'object_bbox': match['object_bbox'],
        'text_bbox': match['text_bbox']
    }


def execute(s3_path):
    """
    Function A - Process diagram from S3 and return matches only

    Downloads the diagram into a temporary workspace, runs DiagramProcessor
    over it, and returns only the label/block matching results (plus anything
    that could not be matched). All intermediate files live in a temp dir
    that is removed when processing finishes.

    Args:
        s3_path: S3 path to diagram (e.g., 's3://my-bucket/diagrams/diagram.pdf')

    Returns:
        Dictionary with matches of labels and blocks; on failure, a dict with
        'status': 'error' and the error message (no exception is propagated).
    """
    print("Function A - Diagram Processing")
    print(f"Input S3 path: {s3_path}")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Download diagram from S3 into the temp workspace
        bucket, key = parse_s3_path(s3_path)
        input_file = os.path.join(temp_dir, os.path.basename(key))
        download_from_s3(s3_path, input_file)

        # Processor writes its intermediate artifacts here
        output_dir = os.path.join(temp_dir, 'output')
        os.makedirs(output_dir, exist_ok=True)

        print("\nInitializing DiagramProcessor...")
        processor = DiagramProcessor(
            region=os.environ.get('AWS_REGION', 'us-east-1'),
            custom_labels_arn=os.environ.get('CUSTOM_LABELS_ARN', 'arn:aws:rekognition:us-east-1:173378533286:project/labels-valvula/version/labels-valvula.2025-11-24T15.44.16/1764009856090')
        )

        print("\nProcessing diagram...")
        try:
            results = processor.process_single_diagram(
                diagram_path=input_file,
                output_base_dir=output_dir,
                grid_size=(5, 5),
                overlap_percent=10,
                # Fix: the original list contained r'\+' twice; the duplicate
                # entry was redundant and has been removed.
                keep_regex_list=[r'\+', r'.*[Xx].*', r'\*', r'\\'],
                min_confidence=80,
                custom_labels_confidence=60,
                iou_threshold=0.3,
                matching_max_distance=200
            )

            # Keep only the matching section of the processor output
            matching_results = results['matching_results']
            formatted_matches = [
                _format_match(match) for match in matching_results['matches']
            ]

            # Objects detected but never paired with a text label
            unmatched_objects = [
                {
                    'name': obj['Name'],
                    'confidence': round(obj['Confidence'], 2),
                    'bbox': obj['global_bbox']
                }
                for obj in matching_results['unmatched_objects']
            ]

            # Text labels detected but never paired with an object
            unmatched_texts = [
                {
                    'text': text['text'],
                    'confidence': round(text['confidence'], 2),
                    'bbox': text['global_bbox']
                }
                for text in matching_results['unmatched_texts']
            ]

            response = {
                'status': 'success',
                'input_s3_path': s3_path,
                'summary': {
                    'total_matches': len(formatted_matches),
                    'unmatched_objects': len(unmatched_objects),
                    'unmatched_texts': len(unmatched_texts),
                    'matching_rate': f"{matching_results['matching_rate']*100:.1f}%"
                },
                'matches': formatted_matches,
                'unmatched_objects': unmatched_objects,
                'unmatched_texts': unmatched_texts
            }

            print("\n" + "="*80)
            print("PROCESSING COMPLETE")
            print("="*80)
            print(f"Total matches: {len(formatted_matches)}")
            print(f"Matching rate: {matching_results['matching_rate']*100:.1f}%")
            print(f"Unmatched objects: {len(unmatched_objects)}")
            print(f"Unmatched texts: {len(unmatched_texts)}")

            return response
        except Exception as e:
            # Best-effort error reporting: callers receive a structured error
            # dict rather than an exception (preserves original contract).
            error_message = f"Error processing diagram: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()
            return {
                'status': 'error',
                'error': error_message,
                'input_s3_path': s3_path
            }