# AI-coodex-rekog-image-labeling/label/cores/sobel.py

#!/usr/bin/env python3
"""
PDF Edge Detection with Color Grouping (Preserving Edge Segregation)

Input: input.pdf
Output: output_sobel/ folder
"""

import cv2
import numpy as np
from pdf2image import convert_from_path
import os
import shutil
from collections import Counter


def clear_output_directory(output_dir):
    """
    Reset ``output_dir`` to an empty directory.

    If something already exists at that path, the whole tree is removed
    first; the directory (including missing parents) is then created.
    """
    leftover_present = os.path.exists(output_dir)
    if leftover_present:
        # Drop whatever a previous run left behind.
        shutil.rmtree(output_dir)

    os.makedirs(output_dir, exist_ok=True)


def enhance_pastel_colors(image_bgr):
    """
    Boost the saturation of pastel pixels and darken unsaturated pixels.

    The image is converted to HSV and edited on a float32 copy:

    - pastel pixels (V > 150 and 10 < S < 100) get S multiplied by 2.5;
    - unsaturated pixels (S <= 10) get V multiplied by 0.3. The mask does
      not look at brightness, so pure white is darkened as well.

    Both products are clipped to [0, 255] before converting back.

    Returns:
        A new BGR uint8 image.
    """
    hsv_f = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2HSV).astype(np.float32)

    # Views into the float copy; writing through them edits hsv_f in place.
    sat = hsv_f[..., 1]
    val = hsv_f[..., 2]

    # Both masks are derived from the untouched channels before any edit.
    is_gray = sat <= 10
    is_pastel = (sat > 10) & (sat < 100) & (val > 150)

    sat[is_pastel] = np.clip(sat[is_pastel] * 2.5, 0, 255)
    val[is_gray] = np.clip(val[is_gray] * 0.3, 0, 255)

    return cv2.cvtColor(hsv_f.astype(np.uint8), cv2.COLOR_HSV2BGR)


def sobel_edge_detection(image):
    """
    Detect edges with a Sobel filter on a color-quantized copy of ``image``.

    Steps:
    1. Quantize in HSV (hue to steps of 5, saturation and value to steps of
       64) and convert back to BGR.
    2. Compute the 3x3 Sobel gradient magnitude of each BGR channel and
       combine the three magnitudes into one strength map, normalized by
       its maximum.
    3. Threshold at 0.10 and apply a morphological close with a 2x2 kernel.

    Returns:
        A BGRA uint8 image of the same height/width. Edge pixels carry the
        colors of ``image`` with alpha 255; everything else is zero. Edge
        pixels whose B, G and R all exceed 240 are made transparent again.
    """
    def gradient_magnitude(channel):
        dx = cv2.Sobel(channel, cv2.CV_64F, 1, 0, ksize=3)
        dy = cv2.Sobel(channel, cv2.CV_64F, 0, 1, ksize=3)
        return np.sqrt(dx**2 + dy**2)

    # Coarsen the colors so that small shade variations do not produce edges.
    hue, sat, val = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2HSV))
    coarse_hsv = cv2.merge([(hue // 5) * 5, (sat // 64) * 64, (val // 64) * 64])
    coarse_bgr = cv2.cvtColor(coarse_hsv, cv2.COLOR_HSV2BGR)

    mag_b, mag_g, mag_r = (gradient_magnitude(ch) for ch in cv2.split(coarse_bgr))
    strength = np.sqrt(mag_b**2 + mag_g**2 + mag_r**2)
    # The epsilon keeps the division defined for a completely flat image.
    strength = strength / (strength.max() + 1e-8)

    binary = (strength > 0.10).astype(np.uint8) * 255
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((2, 2), np.uint8))
    on_edge = binary > 0

    height, width = image.shape[0], image.shape[1]
    result = np.zeros((height, width, 4), dtype=np.uint8)
    result[on_edge, :3] = image[on_edge]
    result[on_edge, 3] = 255

    # Near-white edge pixels are hidden by clearing their alpha.
    near_white = (result[:, :, :3] > 240).all(axis=2)
    result[near_white, 3] = 0

    return result


def analyze_edge_colors(edge_img, edge_mask):
    """
    Decide whether the pixels of one edge contain several distinct hues.

    Only pixels selected by ``edge_mask`` are considered. Pixels whose three
    channels all exceed 240 (white) and pixels with HSV saturation <= 30
    (gray) are ignored. The remaining hues are bucketed in steps of 10 hue
    units; with OpenCV's 8-bit hue range of 0-179 (degrees / 2) each bucket
    spans about 20 degrees. A bucket is "significant" when it holds more
    than 5% of the remaining pixels.

    Returns:
        (has_multiple_colors, num_colors, dominant_hues) where
        ``dominant_hues`` is the list of significant bucket indices.
        ``(False, 0, [])`` is returned when fewer than 10 usable pixels
        remain after either filter.
    """
    nothing_found = (False, 0, [])

    candidates = edge_img[:, :, :3][edge_mask]

    # Not white == at least one channel at or below 240.
    colored = candidates[(candidates <= 240).any(axis=1)]
    if len(colored) < 10:
        return nothing_found

    # cvtColor expects an image, so the pixel list is shaped as an Nx1 image.
    hsv_rows = cv2.cvtColor(colored.reshape(-1, 1, 3), cv2.COLOR_BGR2HSV).reshape(-1, 3)

    vivid = hsv_rows[hsv_rows[:, 1] > 30]
    if len(vivid) < 10:
        return nothing_found

    buckets = (vivid[:, 0] // 10).astype(np.int32)
    bucket_ids, bucket_sizes = np.unique(buckets, return_counts=True)

    dominant = bucket_ids[bucket_sizes > (len(buckets) * 0.05)]
    n_dominant = len(dominant)

    return n_dominant > 1, n_dominant, dominant.tolist()


def split_edge_by_color(edges_bgra, edge_mask, labels, edge_id, num_colors):
    """
    Split one edge into ``num_colors`` sub-edges by K-means on hue/saturation.

    Pixels selected by ``edge_mask`` whose B, G and R all exceed 240 are
    left out of the clustering. ``labels`` and ``edge_id`` are accepted but
    not used by this function.

    Returns:
        List of (sub_edge_image, cluster_id) tuples. Each image has the
        shape and dtype of ``edges_bgra`` and contains only the pixels of
        its cluster. When fewer than 10 non-white pixels are available, the
        edge is returned unsplit as a single ``(image, 0)`` entry.
    """
    rows, cols = np.nonzero(edge_mask)
    masked_bgr = edges_bgra[:, :, :3][edge_mask]

    # Not white == at least one channel at or below 240.
    keep = (masked_bgr <= 240).any(axis=1)
    colored_bgr = masked_bgr[keep]
    rows, cols = rows[keep], cols[keep]

    if len(colored_bgr) < 10:
        # Too few pixels to cluster: hand back the edge as it is.
        whole_edge = np.zeros_like(edges_bgra)
        whole_edge[edge_mask] = edges_bgra[edge_mask]
        return [(whole_edge, 0)]

    # Cluster on hue and saturation only, ignoring brightness.
    hsv_rows = cv2.cvtColor(colored_bgr.reshape(-1, 1, 3), cv2.COLOR_BGR2HSV).reshape(-1, 3)
    hue_sat = hsv_rows[:, :2].astype(np.float32)

    stop_rule = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    _, assignment, _ = cv2.kmeans(hue_sat, num_colors, None, stop_rule, 3, cv2.KMEANS_PP_CENTERS)
    assignment = assignment.flatten()

    def render_cluster(cluster_id):
        # Paint only this cluster's pixels onto a blank canvas.
        member = assignment == cluster_id
        canvas = np.zeros_like(edges_bgra)
        ys, xs = rows[member], cols[member]
        canvas[ys, xs] = edges_bgra[ys, xs]
        return canvas

    return [(render_cluster(cluster_id), cluster_id) for cluster_id in range(num_colors)]


def get_edge_mode_color(edge_img, edge_mask):
    """
    Return the mode (most common) BGR color of an edge.

    Only pixels selected by ``edge_mask`` are considered. Pixels whose B, G
    and R all exceed 240 (white) are ignored. Among the rest, pixels with
    HSV saturation > 30 are preferred; if none qualify, all non-white pixels
    are used instead.

    Args:
        edge_img: image whose first three channels are B, G, R (uint8).
        edge_mask: boolean mask selecting the edge's pixels.

    Returns:
        A uint8 array ``[B, G, R]`` with the most frequent color, or None
        when the edge has no non-white pixels. Ties are resolved in favor of
        the smallest packed value ``B + G*256 + R*65536``.
    """
    bgr = edge_img[:, :, :3]
    pixels = bgr[edge_mask]

    # Filter white
    non_white = pixels[~np.all(pixels > 240, axis=1)]
    if len(non_white) == 0:
        return None

    # cvtColor expects an image, so the pixel list is shaped as an Nx1 image.
    hsv = cv2.cvtColor(non_white.reshape(-1, 1, 3), cv2.COLOR_BGR2HSV).reshape(-1, 3)

    # Prefer saturated pixels; fall back to everything for gray-only edges.
    saturated_pixels = non_white[hsv[:, 1] > 30]
    if len(saturated_pixels) == 0:
        saturated_pixels = non_white

    # Pack each color into a single 24-bit integer so it can be counted.
    packed = (saturated_pixels[:, 0].astype(np.int32)
              | (saturated_pixels[:, 1].astype(np.int32) << 8)
              | (saturated_pixels[:, 2].astype(np.int32) << 16))

    # np.unique only counts the colors that actually occur. A dense histogram
    # (np.bincount) would need up to 2**24 slots per call, which is far too
    # much when this runs once per edge. The values come back sorted, so
    # argmax picks the smallest packed value among equally frequent colors.
    values, counts = np.unique(packed, return_counts=True)
    mode_int = int(values[counts.argmax()])

    return np.array(
        [mode_int & 0xFF, (mode_int >> 8) & 0xFF, (mode_int >> 16) & 0xFF],
        dtype=np.uint8,
    )


def process_and_group_edges(edges_bgra, color_threshold=30):
    """
    Process edges: split multi-color edges, then group by color.
    Edges remain separate (segregated) even within groups.

    Each connected component of the alpha channel is one edge. Edges with
    more than one significant hue are split into sub-edges. Every resulting
    edge gets a mode color, and edges are grouped greedily: the first
    ungrouped edge seeds a group and collects every other ungrouped edge
    whose mode color lies within ``color_threshold`` (Euclidean distance in
    BGR) of the seed's mode color. Edges without a mode color are dropped.

    Only the coordinates and pixel values of each edge are retained between
    the two phases, so memory use is bounded by the image size rather than
    by (number of edges) x (image size).

    Args:
        edges_bgra: BGRA uint8 edge image; non-zero alpha marks edge pixels.
        color_threshold: maximum BGR distance to a group's seed color.

    Returns:
        List of (group_image, mode_color, edge_count) tuples, where
        ``mode_color`` is the seed edge's mode color. Empty when the image
        contains no edges.
    """
    alpha = edges_bgra[:, :, 3]

    # Find connected components (label 0 is the background).
    num_labels, labels = cv2.connectedComponents(alpha)

    print(f"    Found {num_labels - 1} edges")

    if num_labels <= 1:
        return []

    edge_pixels = []  # (ys, xs, bgra_values) for each edge that has a color
    edge_colors = []  # mode BGR color of the edge at the same index
    total_edges = 0   # includes edges that end up without a mode color

    def record_edge(edge_img, mask):
        # Keep just the sparse pixel data of an edge, not a full-size image.
        mode_color = get_edge_mode_color(edge_img, mask)
        if mode_color is None:
            return
        ys, xs = np.nonzero(mask)
        edge_pixels.append((ys, xs, edge_img[ys, xs]))
        edge_colors.append(mode_color)

    # Phase 1: walk over the edges, splitting multi-color ones.
    for edge_id in range(1, num_labels):
        edge_mask = (labels == edge_id)

        if not np.any(edge_mask):
            continue

        has_multiple, num_colors, _ = analyze_edge_colors(edges_bgra, edge_mask)

        if has_multiple:
            print(f"      Edge {edge_id}: {num_colors} colors detected, splitting...")
            sub_edges = split_edge_by_color(edges_bgra, edge_mask, labels, edge_id, num_colors)
            for sub_img, _cluster_id in sub_edges:
                total_edges += 1
                record_edge(sub_img, sub_img[:, :, 3] > 0)
        else:
            total_edges += 1
            record_edge(edges_bgra, edge_mask)

    print(f"    Total edges after splitting: {total_edges}")

    # Phase 2: greedy grouping by distance to each seed's mode color.
    colors = np.asarray(edge_colors, dtype=np.float64).reshape(-1, 3)
    assigned = np.zeros(len(colors), dtype=bool)
    groups = []

    for seed in range(len(colors)):
        if assigned[seed]:
            continue

        # One vectorized distance computation replaces the inner Python loop.
        distances = np.linalg.norm(colors - colors[seed], axis=1)
        members = ~assigned & (distances <= color_threshold)
        members[seed] = True  # the seed always belongs to its own group
        assigned |= members
        member_ids = np.flatnonzero(members)

        # Create group image (edges remain separate!)
        group_img = np.zeros_like(edges_bgra)
        for idx in member_ids:
            ys, xs, values = edge_pixels[idx]
            group_img[ys, xs] = values

        groups.append((group_img, edge_colors[seed], len(member_ids)))

    print(f"    Grouped into {len(groups)} color groups")

    return groups


def process_pdf(pdf_path, output_dir, dpi=200):
    """
    Run the edge-detection pipeline on every page of a PDF.

    ``output_dir`` is cleared first. For each page (numbered from 1) the
    following PNG files are written into it:

    - ``pageNNN_original.png``  - the rasterized page
    - ``pageNNN_enhanced.png``  - after pastel enhancement
    - ``pageNNN_edges.png``     - the BGRA edge image
    - ``pageNNN_groupK.png``    - one image per color group

    Progress is reported on stdout.
    """
    clear_output_directory(output_dir)

    print(f"Processing PDF: {pdf_path}")
    print(f"Converting at {dpi} DPI...\n")

    pages = convert_from_path(pdf_path, dpi=dpi)
    page_total = len(pages)
    print(f"Total pages: {page_total}\n")

    for page_num, pil_image in enumerate(pages, start=1):
        print(f"Page {page_num}/{page_total}...")

        # The page is converted from RGB to the BGR order used by the cv2 calls.
        image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

        print("  - Enhancing pastel colors...")
        enhanced_image = enhance_pastel_colors(image)

        print("  - Detecting edges...")
        edges = sobel_edge_detection(enhanced_image)

        print("  - Processing and grouping edges by color...")
        groups = process_and_group_edges(edges, color_threshold=30)

        base = f"page{page_num:03d}"

        # Intermediate stages, written in pipeline order.
        stage_outputs = (
            ("original", image),
            ("enhanced", enhanced_image),
            ("edges", edges),
        )
        for suffix, picture in stage_outputs:
            cv2.imwrite(os.path.join(output_dir, f"{base}_{suffix}.png"), picture)

        # One image per color group.
        for group_idx, (group_img, mode_color, edge_count) in enumerate(groups, start=1):
            group_path = os.path.join(output_dir, f"{base}_group{group_idx}.png")
            cv2.imwrite(group_path, group_img)
            print(f"    Group {group_idx}: {edge_count} edges, mode color (BGR): {tuple(mode_color)}")

        print(f"  - Saved {len(groups)} group images\n")

    print("Complete!")


def main():
    """
    Process ``input.pdf`` from the current directory into ``output_sobel``.

    Returns:
        0 on success, 1 when the input file is missing or processing raised
        an exception (the error and its traceback are printed).
    """
    pdf_path = "input.pdf"
    output_dir = "output_sobel"

    if not os.path.exists(pdf_path):
        print(f"Error: '{pdf_path}' not found!")
        return 1

    exit_code = 0
    try:
        process_pdf(pdf_path, output_dir)
    except Exception as e:
        # Top-level boundary: report the failure and turn it into a status.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1

    return exit_code


if __name__ == "__main__":
    # The bare `exit` helper is installed by the `site` module for interactive
    # use and is missing under `python -S` or in frozen builds. SystemExit is
    # always available and passes main()'s return value on as the exit status.
    raise SystemExit(main())