import boto3
import json
import os
import re
from PIL import Image, ImageDraw, ImageFont, ImageEnhance, ImageOps, ImageFilter
import random
import shutil

# Configuration
BUCKET_NAME = 'custom-labels-valvulas-bloco-funcao'
FOLDER_PREFIX = 'splitted_diagrams/'  # Directory in S3 (e.g., 'images/' or '' for root)
REGION = 'us-east-1'
OUTPUT_FOLDER = 'text_json'

def detect_text():
    """Detect text with Amazon Textract for every PNG under the S3 prefix.

    Lists all objects in ``BUCKET_NAME`` under ``FOLDER_PREFIX``, runs
    ``detect_document_text`` on each key ending in ``.png`` (case-insensitive),
    saves the raw Textract response as ``<image stem>.json`` inside
    ``OUTPUT_FOLDER`` and prints every WORD block with its confidence and
    relative bounding box.

    Raises:
        SystemExit: If no objects exist under the prefix (the script stops
            there with exit status 0).
    """
    # Initialize clients
    s3 = boto3.client('s3', region_name=REGION)
    textract = boto3.client('textract', region_name=REGION)

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page instead of silently ignoring the rest of a large folder.
    paginator = s3.get_paginator('list_objects_v2')
    keys = [
        obj['Key']
        for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=FOLDER_PREFIX)
        for obj in page.get('Contents', [])
    ]

    if not keys:
        print(f"No files found in {BUCKET_NAME}/{FOLDER_PREFIX}")
        # Same effect as exit(), without relying on the helper that the
        # site module installs for interactive use.
        raise SystemExit

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Process each PNG file
    for key in keys:
        # Skip if not a PNG file
        if not key.lower().endswith('.png'):
            continue

        print(f"\nProcessing: {key}")

        # Detect text using Textract
        result = textract.detect_document_text(
            Document={
                'S3Object': {
                    'Bucket': BUCKET_NAME,
                    'Name': key
                }
            }
        )

        # splitext copes with any extension casing (e.g. ".Png") and yields
        # the same "<stem>.json" name that the local post-processing
        # functions in this file look up.
        stem = os.path.splitext(os.path.basename(key))[0]
        output_path = os.path.join(OUTPUT_FOLDER, stem + '.json')

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"  Saved to: {output_path}")

        # Print detected text (per word)
        word_count = 0
        for block in result['Blocks']:
            if block['BlockType'] != 'WORD':
                continue
            word_count += 1
            text = block['Text']
            confidence = block['Confidence']
            bbox = block['Geometry']['BoundingBox']
            print(f"  Word: {text} ({confidence:.1f}%) - BBox: L={bbox['Left']:.3f}, T={bbox['Top']:.3f}, W={bbox['Width']:.3f}, H={bbox['Height']:.3f}")

        print(f"  Total words detected: {word_count}")


def _load_label_font(size=12):
    """Return a TrueType label font if one can be loaded, else Pillow's default font."""
    for candidate in ("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", "arial.ttf"):
        try:
            return ImageFont.truetype(candidate, size)
        except (OSError, ImportError):
            # OSError: font file missing or unreadable.
            # ImportError: Pillow was built without FreeType support.
            continue
    return ImageFont.load_default()


def draw_bounding_boxes(sectors_dir, json_dir, output_dir='bounding_box_images'):
    """
    Draw bounding boxes around detected words on original images

    For every PNG in ``sectors_dir`` that has a ``<stem>.json`` Textract
    result in ``json_dir``, draws a red rectangle plus a "text (confidence%)"
    label for each WORD block and saves the result as ``<stem>_bbox.png``.
    Images without a JSON file are skipped with a warning.

    Args:
        sectors_dir: Directory containing the original PNG images
        json_dir: Directory containing the JSON text detection files
        output_dir: Directory to save images with bounding boxes
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Get all PNG files
    png_files = [f for f in os.listdir(sectors_dir) if f.lower().endswith('.png')]

    # The font does not depend on the image: load it once, on first use.
    font = None

    for png_file in png_files:
        stem = os.path.splitext(png_file)[0]

        image_path = os.path.join(sectors_dir, png_file)
        json_path = os.path.join(json_dir, stem + '.json')
        # Built from the stem so files named "*.PNG" also get the suffix
        # (str.replace('.png', ...) left such names unchanged).
        output_path = os.path.join(output_dir, stem + '_bbox.png')

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            continue

        print(f"Processing: {png_file}")

        # The detection files are written as UTF-8 by detect_text.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if font is None:
            font = _load_label_font()

        # The context manager closes the underlying file once we are done.
        with Image.open(image_path) as img:
            width, height = img.size
            draw = ImageDraw.Draw(img)

            # Draw bounding box for each WORD
            word_count = 0
            for block in data['Blocks']:
                if block['BlockType'] != 'WORD':
                    continue
                word_count += 1
                bbox = block['Geometry']['BoundingBox']

                # Convert relative coordinates to absolute pixels
                x1 = int(bbox['Left'] * width)
                y1 = int(bbox['Top'] * height)
                x2 = x1 + int(bbox['Width'] * width)
                y2 = y1 + int(bbox['Height'] * height)

                # Draw rectangle around word
                draw.rectangle([x1, y1, x2, y2], outline='red', width=2)

                label = f"{block['Text']} ({block['Confidence']:.0f}%)"
                # Label sits above the box; clamp so words on the top rows
                # keep their label inside the canvas.
                label_pos = (x1, max(y1 - 15, 0))

                try:
                    # Filled background for better visibility
                    text_bbox = draw.textbbox(label_pos, label, font=font)
                except (AttributeError, ValueError):
                    # textbbox is missing on old Pillow versions and some
                    # versions reject non-TrueType fonts: plain red text.
                    draw.text(label_pos, label, fill='red', font=font)
                else:
                    draw.rectangle(text_bbox, fill='red')
                    draw.text(label_pos, label, fill='white', font=font)

            # Save the image with bounding boxes
            img.save(output_path)

        print(f"  Saved: {output_path} ({word_count} bounding boxes drawn)")

    print(f"\nAll images with bounding boxes saved to: {output_dir}")


def remove_text_from_images(sectors_dir, json_dir, output_dir='cleaned_images', shrink_percent=0, keep_regex_list=None, min_confidence=0):
    """
    Replace text bounding boxes with white pixels for all images in directory

    For every PNG in ``sectors_dir`` that has a ``<stem>.json`` Textract
    result in ``json_dir``, paints a white rectangle over each WORD block
    that is not kept, then saves the image under the same file name in
    ``output_dir``. Images without a JSON file are skipped with a warning.

    Args:
        sectors_dir: Directory containing the original PNG images
        json_dir: Directory containing the JSON text detection files
        output_dir: Directory to save cleaned images
        shrink_percent: Percentage to shrink the bounding box (0-100). E.g., 10 = shrink by 10%.
                        Values outside 0-100 are clamped to that range.
        keep_regex_list: List of regex patterns. Words matching these patterns will NOT be removed.
                        Patterns are applied with re.match, i.e. anchored at the start of the word.
                        Use r'\\+' to keep the "+" symbol (a bare "+" is not a valid regex and
                        is skipped with a warning, like any other invalid pattern).
        min_confidence: Minimum confidence threshold (0-100). Words with confidence below this will NOT be removed.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Compile regex patterns once; invalid ones are reported and ignored
    compiled_patterns = []
    for pattern in keep_regex_list or []:
        try:
            compiled_patterns.append(re.compile(pattern))
        except re.error as e:
            print(f"Warning: Invalid regex pattern '{pattern}': {e}")

    # Clamp so that a value above 100 cannot produce an inverted rectangle
    shrink_factor = min(max(shrink_percent, 0), 100) / 100

    # Get all PNG files
    png_files = [f for f in os.listdir(sectors_dir) if f.lower().endswith('.png')]

    for png_file in png_files:
        # Get corresponding JSON file name
        json_file = os.path.splitext(png_file)[0] + '.json'

        image_path = os.path.join(sectors_dir, png_file)
        json_path = os.path.join(json_dir, json_file)
        output_path = os.path.join(output_dir, png_file)

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            continue

        print(f"Processing: {png_file}")

        # The detection files are written as UTF-8 by detect_text.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        word_count = 0
        kept_by_regex = 0
        kept_by_confidence = 0

        # The context manager closes the underlying file once we are done.
        with Image.open(image_path) as img:
            width, height = img.size
            draw = ImageDraw.Draw(img)

            for block in data['Blocks']:
                if block['BlockType'] != 'WORD':
                    continue

                text = block['Text']
                confidence = block['Confidence']

                # Low-confidence detections are left untouched
                if confidence < min_confidence:
                    kept_by_confidence += 1
                    print(f"  Keeping word: {text} (confidence {confidence:.1f}% < {min_confidence}%)")
                    continue

                # Words matching any keep pattern are left untouched
                if any(pattern.match(text) for pattern in compiled_patterns):
                    kept_by_regex += 1
                    print(f"  Keeping word: {text} (matches pattern)")
                    continue

                word_count += 1
                bbox = block['Geometry']['BoundingBox']

                # Convert relative coordinates to absolute pixels
                left = int(bbox['Left'] * width)
                top = int(bbox['Top'] * height)
                box_width = int(bbox['Width'] * width)
                box_height = int(bbox['Height'] * height)

                # Shrink symmetrically around the box centre
                if shrink_factor > 0:
                    width_reduction = int(box_width * shrink_factor / 2)
                    height_reduction = int(box_height * shrink_factor / 2)

                    left += width_reduction
                    top += height_reduction
                    box_width -= width_reduction * 2
                    box_height -= height_reduction * 2

                # Draw white rectangle over the text
                draw.rectangle(
                    [(left, top), (left + box_width, top + box_height)],
                    fill='white'
                )

            # Save the modified image
            img.save(output_path)

        total_kept = kept_by_regex + kept_by_confidence
        print(f"  Saved: {output_path} ({word_count} words removed, {total_kept} words kept: {kept_by_regex} by regex, {kept_by_confidence} by confidence)")

    print(f"\nAll cleaned images saved to: {output_dir}")
def pick_random_images(source_dir, output_dir, n, seed=None):
    """
    Pick N random images from source directory and copy them to output directory

    Args:
        source_dir: Directory containing the cleaned images
        output_dir: Directory to save random sample of images
        n: Number of random images to pick. If larger than the number of
           PNG files available, all of them are used.
        seed: Random seed for reproducibility (optional). Seeds the global
              ``random`` generator.

    Returns:
        List of selected image filenames
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns names in arbitrary order; sort them so that a given
    # seed always selects the same files.
    png_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith('.png'))

    if not png_files:
        print(f"No PNG files found in {source_dir}")
        return []

    # Check if n is larger than available files
    if n > len(png_files):
        print(f"Warning: Requested {n} images but only {len(png_files)} available. Using all images.")
        n = len(png_files)

    # Set random seed if provided
    if seed is not None:
        random.seed(seed)

    # Randomly select n images
    selected_files = random.sample(png_files, n)

    print(f"\nPicking {n} random images from {source_dir}:")

    # Copy selected images to output directory (copy2 keeps file metadata)
    for filename in selected_files:
        shutil.copy2(os.path.join(source_dir, filename),
                     os.path.join(output_dir, filename))
        print(f"  Copied: {filename}")

    print(f"\n{len(selected_files)} random images saved to: {output_dir}")

    return selected_files
def augment_images(source_dir, output_dir='augmented_images', augmentations_per_image=5, 
                   brightness_range=(0.7, 1.3), contrast_range=(0.7, 1.3), 
                   rotation_range=(-15, 15), blur_probability=0.3, noise_probability=0.3,
                   flip_horizontal=True, flip_vertical=False, seed=None):
    """
    Apply data augmentation to images in source directory

    Each augmented version starts from a copy of the source image.
    Brightness, contrast and rotation are each applied with probability 0.5,
    blur and noise with their own probabilities, and each enabled flip with
    probability 0.5. The transformations that were applied are encoded in
    the output name: ``<stem>_aug<index>_<steps>.png`` (``original`` when
    none was applied).

    Args:
        source_dir: Directory containing the original images
        output_dir: Directory to save augmented images
        augmentations_per_image: Number of augmented versions to create per image
        brightness_range: Tuple (min, max) for brightness adjustment (1.0 = original)
        contrast_range: Tuple (min, max) for contrast adjustment (1.0 = original)
        rotation_range: Tuple (min_degrees, max_degrees) for rotation
        blur_probability: Probability of applying blur (0.0 to 1.0)
        noise_probability: Probability of adding noise (0.0 to 1.0)
        flip_horizontal: Whether to include horizontal flips
        flip_vertical: Whether to include vertical flips
        seed: Random seed for reproducibility (optional). Seeds the global
              ``random`` generator.

    Returns:
        Total number of augmented images created
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Set random seed if provided
    if seed is not None:
        random.seed(seed)

    # os.listdir returns names in arbitrary order; sort them so that a given
    # seed applies the same random draws to the same files on every run.
    png_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith('.png'))

    if not png_files:
        print(f"No PNG files found in {source_dir}")
        return 0

    print(f"\nAugmenting {len(png_files)} images from {source_dir}:")
    print(f"Creating {augmentations_per_image} augmented versions per image")

    total_created = 0

    for png_file in png_files:
        image_path = os.path.join(source_dir, png_file)
        base_name = os.path.splitext(png_file)[0]

        # The context manager closes the source file once all variants exist.
        with Image.open(image_path) as img:
            print(f"\nProcessing: {png_file}")

            for aug_idx in range(augmentations_per_image):
                # Start with a copy of the original image
                aug_img = img.copy()

                augmentation_list = []

                # Random brightness adjustment
                if random.random() > 0.5:
                    brightness_factor = random.uniform(*brightness_range)
                    aug_img = ImageEnhance.Brightness(aug_img).enhance(brightness_factor)
                    augmentation_list.append(f"brightness_{brightness_factor:.2f}")

                # Random contrast adjustment
                if random.random() > 0.5:
                    contrast_factor = random.uniform(*contrast_range)
                    aug_img = ImageEnhance.Contrast(aug_img).enhance(contrast_factor)
                    augmentation_list.append(f"contrast_{contrast_factor:.2f}")

                # Random rotation: canvas size is kept (expand=False) and the
                # uncovered corners are filled with white
                if random.random() > 0.5:
                    rotation_angle = random.uniform(*rotation_range)
                    aug_img = aug_img.rotate(rotation_angle, fillcolor='white', expand=False)
                    augmentation_list.append(f"rotate_{rotation_angle:.1f}")

                # Random blur
                if random.random() < blur_probability:
                    blur_radius = random.uniform(0.5, 2.0)
                    aug_img = aug_img.filter(ImageFilter.GaussianBlur(radius=blur_radius))
                    augmentation_list.append(f"blur_{blur_radius:.1f}")

                # Random noise (salt and pepper)
                if random.random() < noise_probability:
                    aug_img = add_noise(aug_img, noise_level=0.02)
                    augmentation_list.append("noise")

                # Random horizontal flip
                if flip_horizontal and random.random() > 0.5:
                    aug_img = ImageOps.mirror(aug_img)
                    augmentation_list.append("flip_h")

                # Random vertical flip
                if flip_vertical and random.random() > 0.5:
                    aug_img = ImageOps.flip(aug_img)
                    augmentation_list.append("flip_v")

                # Save augmented image
                aug_suffix = "_".join(augmentation_list) if augmentation_list else "original"
                output_filename = f"{base_name}_aug{aug_idx}_{aug_suffix}.png"
                output_path = os.path.join(output_dir, output_filename)

                aug_img.save(output_path)
                total_created += 1

                print(f"  Created: {output_filename}")

    print(f"\n{total_created} augmented images saved to: {output_dir}")
    return total_created


def add_noise(image, noise_level=0.02):
    """
    Add salt and pepper noise to an image

    Every pixel is replaced independently with probability ``noise_level``,
    by "salt" or "pepper" with equal chance. The input image is left
    untouched; the noise is applied to a copy, so the palette and metadata
    of the source are carried over.

    Salt is 255 and pepper is 0 on every band, except that alpha bands are
    always set to 255 so noisy pixels of RGBA/LA images stay opaque instead
    of becoming transparent. For single-band modes the raw values 255/0 are
    written (for palette images these are palette indices, not colours).

    Args:
        image: PIL Image object
        noise_level: Probability of a pixel being noisy (0.0 to 1.0)

    Returns:
        PIL Image with noise added
    """
    bands = image.getbands()
    if len(bands) == 1:
        salt, pepper = 255, 0
    else:
        salt = tuple(255 for _ in bands)
        pepper = tuple(255 if band in ('A', 'a') else 0 for band in bands)

    noisy_image = image.copy()
    pixels = noisy_image.load()
    width, height = noisy_image.size

    # Row-major traversal with the same two random draws per noisy pixel as
    # a flat pass over the pixel data, so seeded runs stay reproducible.
    for y in range(height):
        for x in range(width):
            if random.random() < noise_level:
                # Randomly choose salt (white) or pepper (black)
                pixels[x, y] = salt if random.random() > 0.5 else pepper

    return noisy_image

def filter_images_by_pattern(image_dir, json_dir, output_dir_match, output_dir_no_match, pattern=r'VM-\d{4}'):
    """
    Filter images that contain at least one word matching the specified pattern
    Creates two folders: one with matches and one without matches

    The pattern is applied with re.search to the text of every WORD block of
    the image's ``<stem>.json`` detection file. Images without a JSON file
    are skipped with a warning and copied to neither folder.

    Args:
        image_dir: Directory containing the images
        json_dir: Directory containing the JSON text detection files
        output_dir_match: Directory to save images that MATCH the pattern
        output_dir_no_match: Directory to save images that DO NOT match the pattern
        pattern: Regex pattern to match (default: VM-#### where #### is 4 digits)

    Returns:
        Tuple of (matched_count, no_match_count, matched_files, no_match_files).
        matched_files holds (filename, matching_words) tuples, no_match_files
        holds filenames. An invalid pattern or an empty image directory
        yields (0, 0, [], []).
    """
    # Create output directories
    os.makedirs(output_dir_match, exist_ok=True)
    os.makedirs(output_dir_no_match, exist_ok=True)

    # Compile the regex pattern
    try:
        compiled_pattern = re.compile(pattern)
    except re.error as e:
        print(f"Error: Invalid regex pattern '{pattern}': {e}")
        return 0, 0, [], []

    # Get all PNG files from image directory
    png_files = [f for f in os.listdir(image_dir) if f.lower().endswith('.png')]

    if not png_files:
        print(f"No PNG files found in {image_dir}")
        return 0, 0, [], []

    print(f"\nFiltering images by pattern: {pattern}")
    print(f"Checking {len(png_files)} images...")

    matched_files = []
    no_match_files = []
    skipped_count = 0

    for png_file in png_files:
        # Get corresponding JSON file name
        json_file = os.path.splitext(png_file)[0] + '.json'

        image_path = os.path.join(image_dir, png_file)
        json_path = os.path.join(json_dir, json_file)

        # Check if JSON file exists
        if not os.path.exists(json_path):
            print(f"Warning: JSON not found for {png_file}, skipping...")
            skipped_count += 1
            continue

        # The detection files are written as UTF-8 by detect_text.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Collect every word that contains a match
        matching_words = [
            block['Text']
            for block in data['Blocks']
            if block['BlockType'] == 'WORD' and compiled_pattern.search(block['Text'])
        ]

        # Copy image to appropriate folder
        if matching_words:
            shutil.copy2(image_path, os.path.join(output_dir_match, png_file))
            matched_files.append((png_file, matching_words))
            print(f"  ✓ MATCH: {png_file} - Found: {', '.join(matching_words)}")
        else:
            shutil.copy2(image_path, os.path.join(output_dir_no_match, png_file))
            no_match_files.append(png_file)
            print(f"  ✗ NO MATCH: {png_file}")

    matched_count = len(matched_files)
    no_match_count = len(no_match_files)

    print("\n=== Filtering Summary ===")
    print(f"Pattern: '{pattern}'")
    # Skipped images were never classified, so they are not "processed".
    print(f"Total images processed: {matched_count + no_match_count}")
    if skipped_count:
        print(f"Images skipped (no JSON): {skipped_count}")
    print(f"Images WITH pattern: {matched_count} (saved to {output_dir_match})")
    print(f"Images WITHOUT pattern: {no_match_count} (saved to {output_dir_no_match})")

    return matched_count, no_match_count, matched_files, no_match_files
#matched_count, no_match_count, matched_files, no_match_files = filter_images_by_pattern(
#    './clean_image', './text_json', './vm_images', './no_vm_images'
#)

# Print detailed summary
#print("\n=== Images WITH VM-#### Pattern ===")
#for filename, words in matched_files:
#    print(f"{filename}: {', '.join(words)}")

#print(f"\n=== Images WITHOUT VM-#### Pattern ===")
#for filename in no_match_files:
#    print(f"{filename}")
# Run text detection
#detect_text()

# Draw bounding boxes on original images
#draw_bounding_boxes('./sectors', './text_json', './bounding_box_images')

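# Remove text from images, but keep words matching the regex patterns.
# The example below keeps words that start with "+", contain "x" or "X", or start with "L";
# r'\1' is not a valid regex, so it is skipped with a warning. Words below 25% confidence are kept too.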
#remove_text_from_images('./sectors', './text_json', './clean_image', 0, [r'\+',r'.*[Xx].*',r'\1',r'L'],25)
# Currently active step of the script: write one augmented version of every
# PNG in ./to_augment into ./test_dataset, with a narrower rotation range
# (-5 to 5 degrees) and higher blur/noise probabilities (0.5) than the
# function defaults.
# NOTE(review): this call runs whenever the module is executed or imported;
# consider an `if __name__ == "__main__":` guard if the functions above are
# ever imported from another module.
augment_images('./to_augment', './test_dataset',
               augmentations_per_image=1,
               rotation_range=(-5,5),
               blur_probability=0.5,
               noise_probability=0.5)
#pick_random_images("./clean_image","./dataset",200)