Initial commit
This commit is contained in:
181
label/infra/code/function_a.py
Normal file
181
label/infra/code/function_a.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import boto3
|
||||
import os
|
||||
import tempfile
|
||||
import json
|
||||
from urllib.parse import urlparse
|
||||
from diagram_processor import DiagramProcessor
|
||||
|
||||
|
||||
def parse_s3_path(s3_path):
    """
    Split an S3 URI into its bucket and key components.

    Args:
        s3_path: S3 path like 's3://bucket-name/path/to/file.pdf'

    Returns:
        Tuple (bucket, key)

    Raises:
        ValueError: if the path does not use the 's3://' scheme.
    """
    if not s3_path.startswith('s3://'):
        raise ValueError(f"Invalid S3 path: {s3_path}. Must start with 's3://'")

    # urlparse puts the bucket in netloc and the object key (with a
    # leading '/') in path.
    parts = urlparse(s3_path)
    return parts.netloc, parts.path.lstrip('/')
|
||||
|
||||
|
||||
def download_from_s3(s3_path, local_path):
    """
    Fetch an object from S3 and store it at a local path.

    Args:
        s3_path: S3 path (s3://bucket/key)
        local_path: Local file path to save to
    """
    bucket_name, object_key = parse_s3_path(s3_path)
    client = boto3.client('s3')

    print(f"Downloading from S3: {s3_path}")
    client.download_file(bucket_name, object_key, local_path)
    print(f"Downloaded to: {local_path}")
|
||||
|
||||
|
||||
def _format_match(match):
    """Convert one raw match record into the clean output schema.

    Records carry either a single label ('vm_label', the default when
    'match_type' is absent) or a top/bottom label pair ('two_labels');
    each variant exposes a different set of text/bbox fields.
    """
    match_type = match.get('match_type', 'vm_label')

    if match_type == 'two_labels':
        return {
            'object_name': match['object_name'],
            'object_confidence': round(match['object_confidence'], 2),
            'match_type': match_type,
            'text_top': match['text_top'],
            'text_top_confidence': round(match['text_confidence_top'], 2),
            'text_bottom': match['text_bottom'],
            'text_bottom_confidence': round(match['text_confidence_bottom'], 2),
            'object_bbox': match['object_bbox'],
            'text_bbox_top': match['text_bbox_top'],
            'text_bbox_bottom': match['text_bbox_bottom'],
        }

    return {
        'object_name': match['object_name'],
        'object_confidence': round(match['object_confidence'], 2),
        'match_type': match_type,
        'text': match['text'],
        'text_confidence': round(match['text_confidence'], 2),
        'distance_pixels': round(match['distance_pixels'], 2),
        'object_bbox': match['object_bbox'],
        'text_bbox': match['text_bbox'],
    }


def execute(s3_path):
    """
    Function A - Process diagram from S3 and return matches only.

    Downloads the diagram into a temporary directory, runs
    DiagramProcessor over it, and returns a JSON-serializable summary
    of label/block matches.

    Args:
        s3_path: S3 path to diagram (e.g., 's3://my-bucket/diagrams/diagram.pdf')

    Returns:
        Dictionary with matches of labels and blocks on success, or a
        dictionary with 'status': 'error' and the error message if
        processing fails.
    """
    # Plain string: the original used an f-string with no placeholders.
    print("Function A - Diagram Processing")
    print(f"Input S3 path: {s3_path}")

    # The temp dir (downloaded input + processor output) is removed on exit.
    with tempfile.TemporaryDirectory() as temp_dir:
        # Download diagram from S3
        bucket, key = parse_s3_path(s3_path)
        input_file = os.path.join(temp_dir, os.path.basename(key))
        download_from_s3(s3_path, input_file)

        # Create output directory for processing
        output_dir = os.path.join(temp_dir, 'output')
        os.makedirs(output_dir, exist_ok=True)

        print("\nInitializing DiagramProcessor...")
        processor = DiagramProcessor(
            region=os.environ.get('AWS_REGION', 'us-east-1'),
            # NOTE(review): the fallback ARN hard-codes one account and model
            # version; prefer making CUSTOM_LABELS_ARN a required setting.
            custom_labels_arn=os.environ.get('CUSTOM_LABELS_ARN', 'arn:aws:rekognition:us-east-1:173378533286:project/labels-valvula/version/labels-valvula.2025-11-24T15.44.16/1764009856090')
        )

        print("\nProcessing diagram...")
        try:
            results = processor.process_single_diagram(
                diagram_path=input_file,
                output_base_dir=output_dir,
                grid_size=(5, 5),
                overlap_percent=10,
                # Fixed: the original list contained r'\+' twice.
                keep_regex_list=[r'\+', r'.*[Xx].*', r'\*', r'\\'],
                min_confidence=80,
                custom_labels_confidence=60,
                iou_threshold=0.3,
                matching_max_distance=200
            )

            matching_results = results['matching_results']

            # Format matches for clean output
            formatted_matches = [
                _format_match(match) for match in matching_results['matches']
            ]

            # Format unmatched objects
            unmatched_objects = [
                {
                    'name': obj['Name'],
                    'confidence': round(obj['Confidence'], 2),
                    'bbox': obj['global_bbox']
                }
                for obj in matching_results['unmatched_objects']
            ]

            # Format unmatched texts
            unmatched_texts = [
                {
                    'text': text['text'],
                    'confidence': round(text['confidence'], 2),
                    'bbox': text['global_bbox']
                }
                for text in matching_results['unmatched_texts']
            ]

            response = {
                'status': 'success',
                'input_s3_path': s3_path,
                'summary': {
                    'total_matches': len(formatted_matches),
                    'unmatched_objects': len(unmatched_objects),
                    'unmatched_texts': len(unmatched_texts),
                    'matching_rate': f"{matching_results['matching_rate']*100:.1f}%"
                },
                'matches': formatted_matches,
                'unmatched_objects': unmatched_objects,
                'unmatched_texts': unmatched_texts
            }

            print("\n" + "="*80)
            print("PROCESSING COMPLETE")
            print("="*80)
            print(f"Total matches: {len(formatted_matches)}")
            print(f"Matching rate: {matching_results['matching_rate']*100:.1f}%")
            print(f"Unmatched objects: {len(unmatched_objects)}")
            print(f"Unmatched texts: {len(unmatched_texts)}")

            return response

        except Exception as e:
            # Top-level boundary: return a structured error payload instead
            # of raising, so callers always receive a response dict.
            error_message = f"Error processing diagram: {str(e)}"
            print(error_message)
            import traceback
            traceback.print_exc()

            return {
                'status': 'error',
                'error': error_message,
                'input_s3_path': s3_path
            }
|
||||
Reference in New Issue
Block a user