"""Utilities for preparing a diagram image dataset.

The pipeline implemented here:

1. ``detect_text`` runs Amazon Textract on PNG files stored in S3 and saves
   the raw responses as JSON files.
2. ``draw_bounding_boxes`` overlays the detected words on the local images.
3. ``remove_text_from_images`` paints detected words white.
4. ``filter_images_by_pattern`` splits images by whether a word matches a regex.
5. ``pick_random_images`` / ``augment_images`` sample and augment the result.
"""

import json
import os
import random
import re
import shutil

import boto3
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageOps, ImageFilter

# Configuration
BUCKET_NAME = 'custom-labels-valvulas-bloco-funcao'
FOLDER_PREFIX = 'splitted_diagrams/'  # Directory in S3 (e.g., 'images/' or '' for root)
REGION = 'us-east-1'
OUTPUT_FOLDER = 'text_json'


def _list_png_files(directory):
    """Return the PNG file names in *directory*, sorted for a stable order."""
    # os.listdir() order is arbitrary; sorting keeps seeded runs reproducible.
    return sorted(f for f in os.listdir(directory) if f.lower().endswith('.png'))


def _load_json(path):
    """Load a JSON file written by ``detect_text`` (always UTF-8)."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _word_blocks(data):
    """Yield the blocks of type ``WORD`` from a parsed Textract response."""
    for block in data.get('Blocks', []):
        if block.get('BlockType') == 'WORD':
            yield block


def _pixel_box(bbox, width, height):
    """Convert a relative bounding box to ``(left, top, box_width, box_height)`` pixels."""
    return (
        int(bbox['Left'] * width),
        int(bbox['Top'] * height),
        int(bbox['Width'] * width),
        int(bbox['Height'] * height),
    )


def _load_label_font():
    """Return a 12pt TrueType font if one can be loaded, else Pillow's default font."""
    for candidate in ("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", "arial.ttf"):
        try:
            return ImageFont.truetype(candidate, 12)
        except (OSError, ImportError):
            # Font file missing/unreadable, or FreeType support not available.
            continue
    return ImageFont.load_default()


def detect_text():
    """Run Textract on every PNG under the configured S3 prefix.

    For each PNG object in ``BUCKET_NAME`` under ``FOLDER_PREFIX`` the raw
    ``detect_document_text`` response is written to
    ``OUTPUT_FOLDER/<image name>.json`` and every detected word is printed
    with its confidence and relative bounding box.

    Returns:
        None. Results are written to disk and printed.
    """
    # Initialize clients
    s3 = boto3.client('s3', region_name=REGION)
    textract = boto3.client('textract', region_name=REGION)

    # A single list_objects_v2 call returns only one page of keys, so
    # paginate to see every object under the prefix.
    paginator = s3.get_paginator('list_objects_v2')
    keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=FOLDER_PREFIX)
        for obj in page.get('Contents', [])
    ]

    if not keys:
        print(f"No files found in {BUCKET_NAME}/{FOLDER_PREFIX}")
        return

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    for key in keys:
        # Skip if not a PNG file
        if not key.lower().endswith('.png'):
            continue

        print(f"\nProcessing: {key}")

        # Detect text using Textract
        result = textract.detect_document_text(
            Document={
                'S3Object': {
                    'Bucket': BUCKET_NAME,
                    'Name': key
                }
            }
        )

        # Save result to JSON file; splitext handles any extension casing and
        # does not touch a '.png' that appears in the middle of the name.
        filename = os.path.splitext(os.path.basename(key))[0] + '.json'
        output_path = os.path.join(OUTPUT_FOLDER, filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f" Saved to: {output_path}")

        # Print detected text (per word)
        word_count = 0
        for block in _word_blocks(result):
            word_count += 1
            text = block['Text']
            confidence = block['Confidence']
            bbox = block['Geometry']['BoundingBox']
            print(f" Word: {text} ({confidence:.1f}%) - BBox: L={bbox['Left']:.3f}, T={bbox['Top']:.3f}, W={bbox['Width']:.3f}, H={bbox['Height']:.3f}")
        print(f" Total words detected: {word_count}")


def draw_bounding_boxes(sectors_dir, json_dir, output_dir='bounding_box_images'):
    """Draw bounding boxes around detected words on the original images.

    Each ``<name>.png`` in ``sectors_dir`` that has a matching
    ``<name>.json`` in ``json_dir`` is saved to ``output_dir`` as
    ``<name>_bbox.png`` with a red rectangle and a "text (confidence%)"
    label for every detected word. Images without a JSON file are skipped
    with a warning.

    Args:
        sectors_dir: Directory containing the original PNG images
        json_dir: Directory containing the JSON text detection files
        output_dir: Directory to save images with bounding boxes
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # The font does not depend on the image, so resolve it once.
    font = _load_label_font()

    for png_file in _list_png_files(sectors_dir):
        stem = os.path.splitext(png_file)[0]
        image_path = os.path.join(sectors_dir, png_file)
        json_path = os.path.join(json_dir, stem + '.json')
        output_path = os.path.join(output_dir, stem + '_bbox.png')

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            continue

        print(f"Processing: {png_file}")

        data = _load_json(json_path)

        with Image.open(image_path) as img:
            # Draw on an RGB(A) canvas so the red annotations stay red even
            # when the source image is grayscale or palette based.
            canvas = img if img.mode in ('RGB', 'RGBA') else img.convert('RGB')
            width, height = canvas.size
            draw = ImageDraw.Draw(canvas)

            word_count = 0
            for block in _word_blocks(data):
                word_count += 1
                left, top, box_width, box_height = _pixel_box(
                    block['Geometry']['BoundingBox'], width, height
                )

                # Draw rectangle around word
                draw.rectangle(
                    [left, top, left + box_width, top + box_height],
                    outline='red',
                    width=2,
                )

                # Label goes just above the box; clamp so that words on the
                # top edge still get a visible label.
                label = f"{block['Text']} ({block['Confidence']:.0f}%)"
                label_pos = (left, max(0, top - 15))
                try:
                    text_bbox = draw.textbbox(label_pos, label, font=font)
                except (AttributeError, ValueError):
                    # Older Pillow: no textbbox(), or no support for this
                    # font type. Fall back to a label without background.
                    draw.text(label_pos, label, fill='red', font=font)
                else:
                    draw.rectangle(text_bbox, fill='red')
                    draw.text(label_pos, label, fill='white', font=font)

            # Save the image with bounding boxes
            canvas.save(output_path)

        print(f" Saved: {output_path} ({word_count} bounding boxes drawn)")

    print(f"\nAll images with bounding boxes saved to: {output_dir}")


def remove_text_from_images(sectors_dir, json_dir, output_dir='cleaned_images', shrink_percent=0, keep_regex_list=None, min_confidence=0):
    """Replace text bounding boxes with white pixels for all images in a directory.

    Args:
        sectors_dir: Directory containing the original PNG images
        json_dir: Directory containing the JSON text detection files
        output_dir: Directory to save cleaned images
        shrink_percent: Percentage to shrink the bounding box (0-100).
            E.g., 10 = shrink by 10%. Values outside 0-100 are clamped.
        keep_regex_list: List of regex patterns. Words for which a pattern
            matches at the start of the word (``re.match``) will NOT be
            removed. Patterns must be valid regexes: use r'\\+' (not '+')
            to keep the "+" symbol. Invalid patterns are reported and ignored.
        min_confidence: Minimum confidence threshold (0-100). Words with
            confidence below this will NOT be removed.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Compile regex patterns once; invalid ones are reported and skipped.
    compiled_patterns = []
    for pattern in keep_regex_list or []:
        try:
            compiled_patterns.append(re.compile(pattern))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{pattern}': {e}")

    # Clamp so that the shrunken box can never get a negative size.
    shrink_factor = min(max(shrink_percent, 0), 100) / 100

    for png_file in _list_png_files(sectors_dir):
        json_file = os.path.splitext(png_file)[0] + '.json'
        image_path = os.path.join(sectors_dir, png_file)
        json_path = os.path.join(json_dir, json_file)
        output_path = os.path.join(output_dir, png_file)

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            continue

        print(f"Processing: {png_file}")

        data = _load_json(json_path)

        word_count = 0
        kept_by_regex = 0
        kept_by_confidence = 0

        with Image.open(image_path) as img:
            width, height = img.size
            draw = ImageDraw.Draw(img)

            for block in _word_blocks(data):
                text = block['Text']
                confidence = block['Confidence']

                # Low-confidence detections are left untouched.
                if confidence < min_confidence:
                    kept_by_confidence += 1
                    print(f" Keeping word: {text} (confidence {confidence:.1f}% < {min_confidence}%)")
                    continue

                # Words matching any keep pattern are left untouched.
                if any(pattern.match(text) for pattern in compiled_patterns):
                    kept_by_regex += 1
                    print(f" Keeping word: {text} (matches pattern)")
                    continue

                word_count += 1
                left, top, box_width, box_height = _pixel_box(
                    block['Geometry']['BoundingBox'], width, height
                )

                # Shrink symmetrically around the box centre.
                if shrink_factor > 0:
                    width_reduction = int(box_width * shrink_factor / 2)
                    height_reduction = int(box_height * shrink_factor / 2)
                    left += width_reduction
                    top += height_reduction
                    box_width -= width_reduction * 2
                    box_height -= height_reduction * 2

                # Draw white rectangle over the text
                draw.rectangle(
                    [(left, top), (left + box_width, top + box_height)],
                    fill='white'
                )

            # Save the modified image
            img.save(output_path)

        total_kept = kept_by_regex + kept_by_confidence
        print(f" Saved: {output_path} ({word_count} words removed, {total_kept} words kept: {kept_by_regex} by regex, {kept_by_confidence} by confidence)")

    print(f"\nAll cleaned images saved to: {output_dir}")


def pick_random_images(source_dir, output_dir, n, seed=None):
    """Pick N random images from source directory and copy them to output directory.

    Args:
        source_dir: Directory containing the cleaned images
        output_dir: Directory to save random sample of images
        n: Number of random images to pick. If larger than the number of
            available PNG files, all of them are used.
        seed: Random seed for reproducibility (optional). When given, the
            module-level ``random`` generator is seeded with it.

    Returns:
        List of selected image filenames
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Sorted listing: the same seed must select the same files on every run.
    png_files = _list_png_files(source_dir)

    if not png_files:
        print(f"No PNG files found in {source_dir}")
        return []

    # Check if n is larger than available files
    if n > len(png_files):
        print(f"Warning: Requested {n} images but only {len(png_files)} available. Using all images.")
        n = len(png_files)

    # Set random seed if provided
    if seed is not None:
        random.seed(seed)

    # Randomly select n images
    selected_files = random.sample(png_files, n)

    print(f"\nPicking {n} random images from {source_dir}:")

    # Copy selected images to output directory
    for filename in selected_files:
        shutil.copy2(os.path.join(source_dir, filename), os.path.join(output_dir, filename))
        print(f" Copied: {filename}")

    print(f"\n{len(selected_files)} random images saved to: {output_dir}")
    return selected_files


def augment_images(source_dir, output_dir='augmented_images', augmentations_per_image=5, brightness_range=(0.7, 1.3), contrast_range=(0.7, 1.3), rotation_range=(-15, 15), blur_probability=0.3, noise_probability=0.3, flip_horizontal=True, flip_vertical=False, seed=None):
    """Apply data augmentation to images in source directory.

    Every augmented image is saved as
    ``<name>_aug<index>_<applied augmentations>.png`` (or ``..._original.png``
    when no augmentation was drawn for that version).

    Args:
        source_dir: Directory containing the original images
        output_dir: Directory to save augmented images
        augmentations_per_image: Number of augmented versions to create per image
        brightness_range: Tuple (min, max) for brightness adjustment (1.0 = original)
        contrast_range: Tuple (min, max) for contrast adjustment (1.0 = original)
        rotation_range: Tuple (min_degrees, max_degrees) for rotation
        blur_probability: Probability of applying blur (0.0 to 1.0)
        noise_probability: Probability of adding noise (0.0 to 1.0)
        flip_horizontal: Whether to include horizontal flips
        flip_vertical: Whether to include vertical flips
        seed: Random seed for reproducibility (optional). When given, the
            module-level ``random`` generator is seeded with it.

    Returns:
        Total number of augmented images created
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Set random seed if provided
    if seed is not None:
        random.seed(seed)

    # Sorted listing: the same seed must produce the same output on every run.
    png_files = _list_png_files(source_dir)

    if not png_files:
        print(f"No PNG files found in {source_dir}")
        return 0

    print(f"\nAugmenting {len(png_files)} images from {source_dir}:")
    print(f"Creating {augmentations_per_image} augmented versions per image")

    total_created = 0

    for png_file in png_files:
        image_path = os.path.join(source_dir, png_file)
        base_name = os.path.splitext(png_file)[0]

        with Image.open(image_path) as img:
            print(f"\nProcessing: {png_file}")

            for aug_idx in range(augmentations_per_image):
                # Start with a copy of the original image
                aug_img = img.copy()
                augmentation_list = []

                # Random brightness adjustment
                if random.random() > 0.5:
                    brightness_factor = random.uniform(*brightness_range)
                    aug_img = ImageEnhance.Brightness(aug_img).enhance(brightness_factor)
                    augmentation_list.append(f"brightness_{brightness_factor:.2f}")

                # Random contrast adjustment
                if random.random() > 0.5:
                    contrast_factor = random.uniform(*contrast_range)
                    aug_img = ImageEnhance.Contrast(aug_img).enhance(contrast_factor)
                    augmentation_list.append(f"contrast_{contrast_factor:.2f}")

                # Random rotation; canvas size is kept, corners are filled white
                if random.random() > 0.5:
                    rotation_angle = random.uniform(*rotation_range)
                    aug_img = aug_img.rotate(rotation_angle, fillcolor='white', expand=False)
                    augmentation_list.append(f"rotate_{rotation_angle:.1f}")

                # Random blur
                if random.random() < blur_probability:
                    blur_radius = random.uniform(0.5, 2.0)
                    aug_img = aug_img.filter(ImageFilter.GaussianBlur(radius=blur_radius))
                    augmentation_list.append(f"blur_{blur_radius:.1f}")

                # Random noise (salt and pepper)
                if random.random() < noise_probability:
                    aug_img = add_noise(aug_img, noise_level=0.02)
                    augmentation_list.append("noise")

                # Random horizontal flip
                if flip_horizontal and random.random() > 0.5:
                    aug_img = ImageOps.mirror(aug_img)
                    augmentation_list.append("flip_h")

                # Random vertical flip
                if flip_vertical and random.random() > 0.5:
                    aug_img = ImageOps.flip(aug_img)
                    augmentation_list.append("flip_v")

                # Save augmented image
                aug_suffix = "_".join(augmentation_list) if augmentation_list else "original"
                output_filename = f"{base_name}_aug{aug_idx}_{aug_suffix}.png"
                output_path = os.path.join(output_dir, output_filename)
                aug_img.save(output_path)
                total_created += 1
                print(f" Created: {output_filename}")

    print(f"\n{total_created} augmented images saved to: {output_dir}")
    return total_created


def add_noise(image, noise_level=0.02):
    """Add salt and pepper noise to an image.

    Args:
        image: PIL Image object
        noise_level: Probability of a pixel being noisy (0.0 to 1.0)

    Returns:
        A new PIL Image of the same mode and size with noise added; the
        input image is not modified.
    """
    # Pixel values must match the number of bands of the image: a single
    # value for one-band modes, a tuple otherwise. An alpha band is kept
    # fully opaque for both salt and pepper.
    # NOTE(review): for palette ('P') images 255/0 are palette indices, not
    # necessarily white/black - convert to RGB first if that matters.
    bands = image.getbands()
    if len(bands) == 1:
        salt, pepper = 255, 0
    else:
        salt = tuple(255 for _ in bands)
        pepper = tuple(255 if band == 'A' else 0 for band in bands)

    noisy_pixels = []
    for pixel in image.getdata():
        if random.random() < noise_level:
            # Randomly choose salt (white) or pepper (black)
            pixel = salt if random.random() > 0.5 else pepper
        noisy_pixels.append(pixel)

    # Start from a copy so palette and metadata of the source are preserved.
    noisy_image = image.copy()
    noisy_image.putdata(noisy_pixels)
    return noisy_image


def filter_images_by_pattern(image_dir, json_dir, output_dir_match, output_dir_no_match, pattern=r'VM-\d{4}'):
    """Split images by whether any detected word matches a pattern.

    Images with at least one word in which the pattern is found
    (``re.search``) are copied to ``output_dir_match``; all other images
    that have a JSON file are copied to ``output_dir_no_match``. Images
    without a JSON file are skipped with a warning.

    Args:
        image_dir: Directory containing the images
        json_dir: Directory containing the JSON text detection files
        output_dir_match: Directory to save images that MATCH the pattern
        output_dir_no_match: Directory to save images that DO NOT match the pattern
        pattern: Regex pattern to match (default: VM-#### where #### is 4 digits)

    Returns:
        Tuple of (matched_count, no_match_count, matched_files, no_match_files).
        ``matched_files`` is a list of ``(filename, matching_words)`` tuples,
        ``no_match_files`` a list of filenames. ``(0, 0, [], [])`` is returned
        for an invalid pattern or when there are no PNG files.
    """
    # Create output directories
    os.makedirs(output_dir_match, exist_ok=True)
    os.makedirs(output_dir_no_match, exist_ok=True)

    # Compile the regex pattern
    try:
        compiled_pattern = re.compile(pattern)
    except re.error as e:
        print(f"Error: Invalid regex pattern '{pattern}': {e}")
        return 0, 0, [], []

    png_files = _list_png_files(image_dir)

    if not png_files:
        print(f"No PNG files found in {image_dir}")
        return 0, 0, [], []

    print(f"\nFiltering images by pattern: {pattern}")
    print(f"Checking {len(png_files)} images...")

    matched_files = []
    no_match_files = []
    skipped_count = 0

    for png_file in png_files:
        json_file = os.path.splitext(png_file)[0] + '.json'
        image_path = os.path.join(image_dir, png_file)
        json_path = os.path.join(json_dir, json_file)

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            skipped_count += 1
            continue

        data = _load_json(json_path)

        # Collect every word in which the pattern is found
        matching_words = [
            block['Text']
            for block in _word_blocks(data)
            if compiled_pattern.search(block['Text'])
        ]

        # Copy image to appropriate folder
        if matching_words:
            shutil.copy2(image_path, os.path.join(output_dir_match, png_file))
            matched_files.append((png_file, matching_words))
            print(f" ✓ MATCH: {png_file} - Found: {', '.join(matching_words)}")
        else:
            shutil.copy2(image_path, os.path.join(output_dir_no_match, png_file))
            no_match_files.append(png_file)
            print(f" ✗ NO MATCH: {png_file}")

    matched_count = len(matched_files)
    no_match_count = len(no_match_files)

    print("\n=== Filtering Summary ===")
    print(f"Pattern: '{pattern}'")
    # Only images that had a JSON file were actually classified.
    print(f"Total images processed: {matched_count + no_match_count}")
    if skipped_count:
        print(f"Images skipped (no JSON): {skipped_count}")
    print(f"Images WITH pattern: {matched_count} (saved to {output_dir_match})")
    print(f"Images WITHOUT pattern: {no_match_count} (saved to {output_dir_no_match})")

    return matched_count, no_match_count, matched_files, no_match_files


if __name__ == "__main__":
    # The steps below are toggled by (un)commenting them; they only run when
    # the file is executed as a script, not when it is imported.

    # matched_count, no_match_count, matched_files, no_match_files = filter_images_by_pattern(
    #     './clean_image', './text_json', './vm_images', './no_vm_images'
    # )

    # Print detailed summary
    # print("\n=== Images WITH VM-#### Pattern ===")
    # for filename, words in matched_files:
    #     print(f"{filename}: {', '.join(words)}")
    # print(f"\n=== Images WITHOUT VM-#### Pattern ===")
    # for filename in no_match_files:
    #     print(f"{filename}")

    # Run text detection
    # detect_text()

    # Draw bounding boxes on original images
    # draw_bounding_boxes('./sectors', './text_json', './bounding_box_images')

    # Remove text from images, but keep words matching the regex patterns
    # Example: Keep "+" symbol and any words starting with "PT" or "FT"
    # NOTE: r'\1' is not a valid regex (no group 1) and is ignored with a warning.
    # remove_text_from_images('./sectors', './text_json', './clean_image', 0, [r'\+',r'.*[Xx].*',r'\1',r'L'],25)

    augment_images(
        './to_augment',
        './test_dataset',
        augmentations_per_image=1,
        rotation_range=(-5, 5),
        blur_probability=0.5,
        noise_probability=0.5,
    )

    # pick_random_images("./clean_image","./dataset",200)