# (File-listing metadata from the source viewer — not part of the script.)
# Files — 2026-05-14 14:07:04 -03:00 — 419 lines — 13 KiB — Python
#!/usr/bin/env python3
"""
PDF Industrial Diagram Layer Extractor
Extracts colored layers from PDF diagrams with white backgrounds.
"""
import os
import sys
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
from collections import Counter
import argparse
try:
import fitz # PyMuPDF
except ImportError:
print("Error: PyMuPDF not installed. Install with: pip install PyMuPDF")
sys.exit(1)
def pdf_to_image(pdf_path, dpi=300, page_number=0):
    """
    Render one page of a PDF to a PIL Image.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Rendering resolution in dots per inch (default: 300).
        page_number: Zero-based index of the page to render
            (default: 0, i.e. the first page — original behavior).

    Returns:
        PIL Image object in RGB mode.

    Raises:
        IndexError: If page_number is out of range for the document.
    """
    print(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)
    try:
        if len(doc) > 1:
            print(f" PDF has {len(doc)} pages, processing page {page_number + 1} only")
        page = doc[page_number]
        # PyMuPDF renders at 72 DPI natively; scale up to the requested DPI.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        # pix.samples is a flat RGB byte buffer; wrap it in a PIL Image.
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        print(f" Rendered at {img.size[0]}x{img.size[1]} pixels ({dpi} DPI)")
    finally:
        # Close the document even if rendering fails (the original leaked
        # the handle on any exception raised before doc.close()).
        doc.close()
    return img
def get_dominant_colors(img, n_colors=15, sample_fraction=0.2, random_state=42):
    """
    Identify dominant colors using KMeans clustering.

    Args:
        img: PIL Image object (RGB).
        n_colors: Maximum number of color clusters to detect.
        sample_fraction: Fraction of pixels to sample when fitting
            (sampling speeds up clustering on large renders).
        random_state: Seed for both the pixel sampling and KMeans so the
            detected palette is reproducible across runs.

    Returns:
        List of (color, pixel_count) tuples sorted by frequency, where
        color is an (r, g, b) tuple of ints. Counts cover the FULL image:
        labels are predicted for every pixel, not just the fitted sample.
    """
    print("Analyzing colors...")
    img_array = np.array(img)
    pixels = img_array.reshape(-1, 3)

    # Fit on a random subset for speed. Seeded: the original used unseeded
    # np.random.choice, so the palette varied between runs even though
    # KMeans itself was seeded with random_state=42.
    if sample_fraction < 1.0:
        rng = np.random.default_rng(random_state)
        n_samples = int(len(pixels) * sample_fraction)
        indices = rng.choice(len(pixels), n_samples, replace=False)
        sampled_pixels = pixels[indices]
    else:
        sampled_pixels = pixels

    # Never request more clusters than there are distinct sampled colors.
    n_clusters = min(n_colors, len(np.unique(sampled_pixels, axis=0)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(sampled_pixels)

    # Assign every pixel so counts reflect the whole image.
    colors = kmeans.cluster_centers_.astype(int)
    labels = kmeans.predict(pixels)
    counts = Counter(labels)

    # Sort clusters by how many pixels they cover, most frequent first.
    color_counts = [(tuple(colors[i]), counts[i]) for i in range(len(colors))]
    color_counts.sort(key=lambda x: x[1], reverse=True)
    return color_counts
def is_white(color, threshold=250):
    """Return True when every RGB channel is at or above *threshold* (near-pure white)."""
    # The smallest channel decides: if it clears the threshold, all do.
    return min(color) >= threshold
def color_distance(color1, color2):
    """Return the Euclidean (L2) distance between two RGB colors."""
    delta = np.asarray(color1, dtype=float) - np.asarray(color2, dtype=float)
    return np.sqrt(np.dot(delta, delta))
def merge_similar_colors(color_counts, similarity_threshold=40):
    """
    Collapse visually-similar colors into single entries.

    Greedy grouping: colors are walked in the given order; each not-yet-
    grouped color starts a group and absorbs every later ungrouped color
    within *similarity_threshold* (Euclidean RGB distance, range 0-441).
    Each group is represented by the per-channel mean of its members.

    Args:
        color_counts: List of ((r, g, b), pixel_count) tuples.
        similarity_threshold: Maximum color distance to merge (default: 40).

    Returns:
        List of (avg_color, total_pixel_count) tuples, sorted by count
        descending (most predominant first).
    """
    if not color_counts:
        return []

    merged = []
    consumed = set()
    for idx, (base_color, base_count) in enumerate(color_counts):
        if idx in consumed:
            continue
        # Open a new group seeded by this color.
        consumed.add(idx)
        members = [base_color]
        total = base_count
        # Absorb every later, still-free color close enough to the seed.
        for other_idx in range(idx + 1, len(color_counts)):
            if other_idx in consumed:
                continue
            other_color, other_count = color_counts[other_idx]
            # Euclidean RGB distance (same metric as color_distance()).
            dist = np.sqrt(sum((a - b) ** 2 for a, b in zip(base_color, other_color)))
            if dist <= similarity_threshold:
                members.append(other_color)
                total += other_count
                consumed.add(other_idx)
        # Represent the group by its per-channel mean color.
        avg_color = tuple(int(np.mean([m[ch] for m in members])) for ch in range(3))
        merged.append((avg_color, total))

    # Most predominant layer first.
    merged.sort(key=lambda entry: entry[1], reverse=True)
    return merged
def get_layer_region(img, target_color, tolerance=30):
    """
    Compute the bounding box of all pixels matching *target_color*.

    Args:
        img: PIL Image (or HxWx3 array-like) in RGB.
        target_color: (r, g, b) tuple of the color to match.
        tolerance: Matching tolerance 0-100, scaled onto the maximum RGB
            Euclidean distance (~441); e.g. tolerance=30 accepts pixels
            within ~132 distance of the target.

    Returns:
        Tuple (min_x, min_y, max_x, max_y, pixel_count), or None when no
        pixel matches.
    """
    img_array = np.array(img).astype(np.float32)

    # Per-pixel Euclidean distance to the target color.
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Map tolerance (0-100) onto the RGB distance range (0-441).
    max_distance = np.sqrt(3 * 255 ** 2)
    actual_tolerance = (tolerance / 100.0) * max_distance

    mask = distances <= actual_tolerance
    if not mask.any():
        return None

    # NOTE: the original re-checked len(y_coords) == 0 here; that branch
    # was unreachable once mask.any() passed, so it has been removed.
    y_coords, x_coords = np.where(mask)
    min_x, max_x = x_coords.min(), x_coords.max()
    min_y, max_y = y_coords.min(), y_coords.max()
    return (min_x, min_y, max_x, max_y, mask.sum())
def extract_layer(img, target_color, tolerance=30, min_alpha=128):
    """
    Extract a single colored layer onto a transparent background.

    Args:
        img: PIL Image (or HxWx3 array-like) in RGB.
        target_color: (r, g, b) tuple of the color to extract.
        tolerance: Matching tolerance 0-100, scaled onto the maximum RGB
            Euclidean distance (~441): 30 -> ~132 (moderate antialiasing),
            50 -> ~220 (heavy antialiasing).
        min_alpha: Minimum alpha value to keep (0-255); filters weakly
            matching antialiased "ghost" pixels.

    Returns:
        RGBA PIL Image; non-matching pixels are fully transparent.
    """
    img_array = np.array(img).astype(np.float32)
    h, w, _ = img_array.shape
    # Output starts fully transparent; matched pixels are copied in below.
    output = np.zeros((h, w, 4), dtype=np.uint8)

    # Per-pixel Euclidean distance to the target color.
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Map tolerance (0-100) onto the RGB distance range (0-441).
    max_distance = np.sqrt(3 * 255 ** 2)  # ≈ 441.67
    actual_tolerance = (tolerance / 100.0) * max_distance

    if actual_tolerance <= 0:
        # tolerance=0 means exact match only. The original divided by
        # actual_tolerance below, triggering a divide-by-zero and NaN
        # alpha; treat exact matches as fully opaque instead.
        mask = distances == 0
        alpha = np.where(mask, 255, 0).astype(np.uint8)
    else:
        mask = distances <= actual_tolerance
        # Alpha fades linearly: 255 at an exact match, 0 at the tolerance edge.
        alpha = np.clip(255 * (1 - distances / actual_tolerance), 0, 255).astype(np.uint8)

    # Keep only confidently matched pixels (drops ghost pixels).
    strong_mask = mask & (alpha >= min_alpha)
    output[strong_mask, :3] = img_array[strong_mask].astype(np.uint8)
    output[strong_mask, 3] = alpha[strong_mask]

    return Image.fromarray(output, 'RGBA')
def process_pdf(pdf_path, output_dir='output', dpi=300, tolerance=30,
                min_pixels=100, n_layers=None, merge_threshold=40,
                show_regions=True, min_alpha=128):
    """
    Process a PDF diagram and extract its colored layers to PNG files.

    Pipeline: render the PDF -> detect dominant colors -> drop the white
    background and tiny clusters -> merge similar colors -> optionally
    report each layer's bounding box -> save one RGBA PNG per layer
    into *output_dir* (named <pdf>_layer<N>_<rrr>_<ggg>_<bbb>.png).

    Args:
        pdf_path: Path to PDF file.
        output_dir: Output directory for layers (created if missing).
        dpi: PDF rendering resolution.
        tolerance: Color matching tolerance (0-100).
        min_pixels: Minimum pixels for a color to count as a layer.
        n_layers: Number of layers to extract (None = keep all).
        merge_threshold: Color distance for merging similar layers (0-441).
        show_regions: Display bounding box regions for each layer.
        min_alpha: Minimum alpha value to keep (0-255, removes ghost pixels).
    """
    # Convert PDF to image
    img = pdf_to_image(pdf_path, dpi)
    total_pixels = img.size[0] * img.size[1]
    # Detect colors (requests more clusters than will typically be kept)
    color_counts = get_dominant_colors(img, n_colors=20)
    # Filter out white background; clusters below min_pixels are dropped silently
    layer_colors = []
    print("\nDetected colors (before merging):")
    for color, count in color_counts:
        if is_white(color):
            print(f" RGB{color}: {count:,} pixels - WHITE BACKGROUND (skipped)")
        elif count >= min_pixels:
            percentage = (count / total_pixels) * 100
            layer_colors.append((color, count))
            print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")
    if not layer_colors:
        print("No colored layers found!")
        return
    # Merge similar colors to reduce fragmentation
    print(f"\nMerging similar colors (threshold: {merge_threshold})...")
    merged_layers = merge_similar_colors(layer_colors, merge_threshold)
    print(f"\nMerged layers (predominant first):")
    for color, count in merged_layers:
        percentage = (count / total_pixels) * 100
        print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")
    # Limit layers if specified (merged_layers is sorted, so this keeps the largest)
    if n_layers:
        merged_layers = merged_layers[:n_layers]
        print(f"\nKeeping top {n_layers} layers")
    print(f"\n{len(merged_layers)} layers to extract")
    # Create output directory; base_name is the PDF filename without extension
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    # Analyze regions and extract layers
    if show_regions:
        print("\nAnalyzing layer regions...")
    print("\nExtracting layers...")
    for idx, (color, count) in enumerate(merged_layers, 1):
        percentage = (count / total_pixels) * 100
        if show_regions:
            # Get region information (may be None if nothing matches at this tolerance)
            region = get_layer_region(img, color, tolerance)
            if region:
                min_x, min_y, max_x, max_y, pixel_count = region
                width = max_x - min_x + 1
                height = max_y - min_y + 1
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
                print(f" Region: ({min_x},{min_y}) to ({max_x},{max_y}) - {width}x{height} px")
            else:
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
        else:
            print(f" [{idx}/{len(merged_layers)}] RGB{color}...", end=' ')
        # Extract layer
        layer_img = extract_layer(img, color, tolerance, min_alpha)
        # Save with descriptive filename (zero-padded RGB channels)
        color_name = f"{color[0]:03d}_{color[1]:03d}_{color[2]:03d}"
        output_path = os.path.join(output_dir, f"{base_name}_layer{idx}_{color_name}.png")
        layer_img.save(output_path)
        if not show_regions:
            print(f"✓ Saved")
        else:
            print(f" Saved: {output_path}")
    print(f"\n✓ Complete! {len(merged_layers)} layers saved to: {output_dir}/")
def main():
    """Command-line entry point.

    Parses arguments, validates numeric ranges and the input path, then
    runs process_pdf().

    Returns:
        int: 0 on success, 1 on any validation or processing error.
    """
    parser = argparse.ArgumentParser(
        description='Extract colored layers from PDF industrial diagrams',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic usage
python layer_extractor.py diagram.pdf
# Custom output directory and DPI
python layer_extractor.py diagram.pdf -o layers/ --dpi 600
# Adjust color tolerance
python layer_extractor.py diagram.pdf -t 40
# Extract specific number of layers
python layer_extractor.py diagram.pdf -n 5
"""
    )
    parser.add_argument('pdf', help='Input PDF file')
    parser.add_argument('-o', '--output', default='output',
                        help='Output directory (default: output)')
    parser.add_argument('--dpi', type=int, default=300,
                        help='PDF rendering DPI (default: 300, higher = better quality)')
    parser.add_argument('-t', '--tolerance', type=int, default=30,
                        help='Color matching tolerance 0-100 (default: 30, higher = more lenient)')
    parser.add_argument('-n', '--n-layers', type=int,
                        help='Extract exactly N layers (default: auto-detect all)')
    parser.add_argument('-m', '--min-pixels', type=int, default=100,
                        help='Minimum pixels for valid layer (default: 100)')
    parser.add_argument('--merge', type=int, default=40,
                        help='Color merge threshold 0-441 (default: 40, higher = more aggressive merging)')
    parser.add_argument('--min-alpha', type=int, default=128,
                        help='Minimum alpha value 0-255 (default: 128, higher = remove more ghost pixels)')
    parser.add_argument('--no-regions', action='store_true',
                        help='Disable region analysis output')
    args = parser.parse_args()

    # Validate tolerance range
    if not 0 <= args.tolerance <= 100:
        print(f"Error: Tolerance must be between 0-100 (got {args.tolerance})")
        return 1
    # Validate min_alpha range
    if not 0 <= args.min_alpha <= 255:
        print(f"Error: min-alpha must be between 0-255 (got {args.min_alpha})")
        return 1
    # Validate merge threshold range — previously unchecked, unlike the
    # other numeric options (441 ≈ sqrt(3*255²), the max RGB distance).
    if not 0 <= args.merge <= 441:
        print(f"Error: merge must be between 0-441 (got {args.merge})")
        return 1
    # Validate input path and extension
    if not os.path.isfile(args.pdf):
        print(f"Error: File not found: {args.pdf}")
        return 1
    if not args.pdf.lower().endswith('.pdf'):
        print(f"Error: Input must be a PDF file")
        return 1

    # Process PDF; any failure is reported with a traceback and exit code 1
    try:
        process_pdf(
            args.pdf,
            output_dir=args.output,
            dpi=args.dpi,
            tolerance=args.tolerance,
            min_pixels=args.min_pixels,
            n_layers=args.n_layers,
            merge_threshold=args.merge,
            show_regions=not args.no_regions,
            min_alpha=args.min_alpha
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0
if __name__ == '__main__':
    # Use sys.exit instead of the builtin exit(): the builtin is injected
    # by the site module and may be absent (python -S, frozen apps).
    sys.exit(main())