#!/usr/bin/env python3
"""
PDF Industrial Diagram Layer Extractor
Extracts colored layers from PDF diagrams with white backgrounds.
"""
import os
import sys
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
from collections import Counter
import argparse

try:
    import fitz  # PyMuPDF
except ImportError:
    print("Error: PyMuPDF not installed. Install with: pip install PyMuPDF")
    sys.exit(1)


def pdf_to_image(pdf_path, dpi=300):
    """
    Convert PDF to PIL Image.

    Args:
        pdf_path: Path to PDF file
        dpi: Resolution for rendering (default: 300)

    Returns:
        PIL Image object (RGB, first page only)
    """
    print(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)
    if len(doc) > 1:
        print(f" PDF has {len(doc)} pages, processing first page only")

    # Get first page
    page = doc[0]

    # Render page to image; PDF user space is 72 DPI, so scale by dpi/72
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    pix = page.get_pixmap(matrix=mat, alpha=False)

    # Convert to PIL Image (frombytes expects the size as a (w, h) tuple)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    print(f" Rendered at {img.size[0]}x{img.size[1]} pixels ({dpi} DPI)")

    doc.close()
    return img


def get_dominant_colors(img, n_colors=15, sample_fraction=0.2):
    """
    Identify dominant colors using KMeans clustering.

    Args:
        img: PIL Image object
        n_colors: Maximum number of colors to detect
        sample_fraction: Fraction of pixels to sample

    Returns:
        List of (color, pixel_count) tuples sorted by frequency
    """
    print("Analyzing colors...")
    img_array = np.array(img)
    pixels = img_array.reshape(-1, 3)

    # Sample pixels for speed. Seeded RNG keeps results reproducible,
    # consistent with KMeans(random_state=42) below.
    if sample_fraction < 1.0:
        n_samples = int(len(pixels) * sample_fraction)
        rng = np.random.default_rng(42)
        indices = rng.choice(len(pixels), n_samples, replace=False)
        sampled_pixels = pixels[indices]
    else:
        sampled_pixels = pixels

    # KMeans clustering; never ask for more clusters than distinct colors
    n_clusters = min(n_colors, len(np.unique(sampled_pixels, axis=0)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(sampled_pixels)

    # Get colors and frequencies over the FULL image, not just the sample
    colors = kmeans.cluster_centers_.astype(int)
    labels = kmeans.predict(pixels)
    counts = Counter(labels)

    # Sort by frequency (Counter returns 0 for empty clusters)
    color_counts = [(tuple(colors[i]), counts[i]) for i in range(len(colors))]
    color_counts.sort(key=lambda x: x[1], reverse=True)

    return color_counts


def is_white(color, threshold=250):
    """Check if color is white (strict check for pure white background)."""
    return all(c >= threshold for c in color)


def color_distance(color1, color2):
    """Calculate Euclidean distance between two RGB colors."""
    return np.sqrt(sum((a - b) ** 2 for a, b in zip(color1, color2)))


def merge_similar_colors(color_counts, similarity_threshold=40):
    """
    Merge similar colors into groups to reduce layer fragmentation.

    Args:
        color_counts: List of (color, pixel_count) tuples
        similarity_threshold: Maximum color distance to merge (0-441, default: 40)

    Returns:
        List of merged (color, total_pixel_count) tuples
    """
    if not color_counts:
        return []

    merged = []
    used = set()

    for i, (color1, count1) in enumerate(color_counts):
        if i in used:
            continue

        # Start a new group with this color
        group_colors = [color1]
        group_count = count1
        used.add(i)

        # Greedily absorb later colors close to the group's seed color
        for j, (color2, count2) in enumerate(color_counts):
            if j <= i or j in used:
                continue
            if color_distance(color1, color2) <= similarity_threshold:
                group_colors.append(color2)
                group_count += count2
                used.add(j)

        # Use the average color of the group (truncated per channel).
        # Channel index renamed so it doesn't shadow the outer loop's `i`.
        avg_color = tuple(
            int(np.mean([c[ch] for c in group_colors])) for ch in range(3)
        )
        merged.append((avg_color, group_count))

    # Sort by pixel count (most predominant first)
    merged.sort(key=lambda x: x[1], reverse=True)
    return merged


def get_layer_region(img, target_color, tolerance=30):
    """
    Get the bounding box region of a layer.

    Args:
        img: PIL Image object
        target_color: RGB tuple of target color
        tolerance: Color matching tolerance (0-100, percent of max RGB distance)

    Returns:
        Tuple of (min_x, min_y, max_x, max_y, pixel_count)
        or None if no pixels found
    """
    img_array = np.array(img).astype(np.float32)

    # Calculate color distance for all pixels
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Scale tolerance to the RGB cube diagonal (sqrt(3 * 255^2) ~ 441.67)
    max_distance = np.sqrt(3 * 255 ** 2)
    actual_tolerance = (tolerance / 100.0) * max_distance

    # Find matching pixels
    mask = distances <= actual_tolerance
    if not mask.any():
        return None

    # Get coordinates of matching pixels
    y_coords, x_coords = np.where(mask)
    if len(y_coords) == 0:
        return None

    # Calculate bounding box
    min_x, max_x = x_coords.min(), x_coords.max()
    min_y, max_y = y_coords.min(), y_coords.max()
    pixel_count = mask.sum()

    return (min_x, min_y, max_x, max_y, pixel_count)


def extract_layer(img, target_color, tolerance=30, min_alpha=128):
    """
    Extract a single colored layer.

    Args:
        img: PIL Image object
        target_color: RGB tuple of target color
        tolerance: Color distance as a percentage of the max RGB distance
            (0-100, where 100 = max tolerance)
        min_alpha: Minimum alpha value to keep (0-255, higher = less ghost pixels)

    Returns:
        PIL Image with transparent background
    """
    img_array = np.array(img).astype(np.float32)
    h, w, _ = img_array.shape

    # Create output with alpha channel
    output = np.zeros((h, w, 4), dtype=np.uint8)

    # Calculate color distance for all pixels
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Scale tolerance: max Euclidean distance in RGB is sqrt(3*255^2) ~ 441
    # Map tolerance (0-100) to actual distance (0-441)
    # tolerance=30 maps to ~132 distance (good for moderate antialiasing)
    # tolerance=50 maps to ~220 distance (good for heavy antialiasing)
    max_distance = np.sqrt(3 * 255 ** 2)  # ~ 441.67
    # Floor at a tiny epsilon so tolerance=0 (exact match) doesn't divide
    # by zero and produce NaN alpha values below.
    actual_tolerance = max((tolerance / 100.0) * max_distance, 1e-6)

    # Mask pixels within tolerance
    mask = distances <= actual_tolerance

    # Calculate alpha with gradient based on distance
    alpha = np.clip(255 * (1 - distances / actual_tolerance), 0, 255).astype(np.uint8)

    # Filter out ghost pixels: only keep pixels with alpha >= min_alpha
    strong_mask = mask & (alpha >= min_alpha)

    # Copy matching pixels
    output[strong_mask, :3] = img_array[strong_mask].astype(np.uint8)
    output[strong_mask, 3] = alpha[strong_mask]

    return Image.fromarray(output, 'RGBA')


def process_pdf(pdf_path, output_dir='output', dpi=300, tolerance=30,
                min_pixels=100, n_layers=None, merge_threshold=40,
                show_regions=True, min_alpha=128):
    """
    Process a PDF diagram and extract layers.

    Args:
        pdf_path: Path to PDF file
        output_dir: Output directory for layers
        dpi: PDF rendering resolution
        tolerance: Color matching tolerance
        min_pixels: Minimum pixels for valid layer
        n_layers: Number of layers to extract (None = auto)
        merge_threshold: Color distance for merging similar layers (0-441)
        show_regions: Display bounding box regions for each layer
        min_alpha: Minimum alpha value to keep (0-255, removes ghost pixels)
    """
    # Convert PDF to image
    img = pdf_to_image(pdf_path, dpi)
    total_pixels = img.size[0] * img.size[1]

    # Detect colors
    color_counts = get_dominant_colors(img, n_colors=20)

    # Filter out white background
    layer_colors = []
    print("\nDetected colors (before merging):")
    for color, count in color_counts:
        if is_white(color):
            print(f" RGB{color}: {count:,} pixels - WHITE BACKGROUND (skipped)")
        elif count >= min_pixels:
            percentage = (count / total_pixels) * 100
            layer_colors.append((color, count))
            print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")

    if not layer_colors:
        print("No colored layers found!")
        return

    # Merge similar colors to reduce fragmentation
    print(f"\nMerging similar colors (threshold: {merge_threshold})...")
    merged_layers = merge_similar_colors(layer_colors, merge_threshold)

    print(f"\nMerged layers (predominant first):")
    for color, count in merged_layers:
        percentage = (count / total_pixels) * 100
        print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")

    # Limit layers if specified
    if n_layers:
        merged_layers = merged_layers[:n_layers]
        print(f"\nKeeping top {n_layers} layers")

    print(f"\n{len(merged_layers)} layers to extract")

    # Create output directory
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    # Analyze regions and extract layers
    if show_regions:
        print("\nAnalyzing layer regions...")
    print("\nExtracting layers...")

    for idx, (color, count) in enumerate(merged_layers, 1):
        percentage = (count / total_pixels) * 100

        if show_regions:
            # Get region information
            region = get_layer_region(img, color, tolerance)
            if region:
                min_x, min_y, max_x, max_y, pixel_count = region
                width = max_x - min_x + 1
                height = max_y - min_y + 1
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
                print(f" Region: ({min_x},{min_y}) to ({max_x},{max_y}) - {width}x{height} px")
            else:
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
        else:
            print(f" [{idx}/{len(merged_layers)}] RGB{color}...", end=' ')

        # Extract layer
        layer_img = extract_layer(img, color, tolerance, min_alpha)

        # Save with descriptive filename
        color_name = f"{color[0]:03d}_{color[1]:03d}_{color[2]:03d}"
        output_path = os.path.join(output_dir, f"{base_name}_layer{idx}_{color_name}.png")
        layer_img.save(output_path)

        if not show_regions:
            print(f"✓ Saved")
        else:
            print(f" Saved: {output_path}")

    print(f"\n✓ Complete! {len(merged_layers)} layers saved to: {output_dir}/")


def main():
    parser = argparse.ArgumentParser(
        description='Extract colored layers from PDF industrial diagrams',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python layer_extractor.py diagram.pdf

  # Custom output directory and DPI
  python layer_extractor.py diagram.pdf -o layers/ --dpi 600

  # Adjust color tolerance
  python layer_extractor.py diagram.pdf -t 40

  # Extract specific number of layers
  python layer_extractor.py diagram.pdf -n 5
"""
    )

    parser.add_argument('pdf', help='Input PDF file')
    parser.add_argument('-o', '--output', default='output',
                        help='Output directory (default: output)')
    parser.add_argument('--dpi', type=int, default=300,
                        help='PDF rendering DPI (default: 300, higher = better quality)')
    parser.add_argument('-t', '--tolerance', type=int, default=30,
                        help='Color matching tolerance 0-100 (default: 30, higher = more lenient)')
    parser.add_argument('-n', '--n-layers', type=int,
                        help='Extract exactly N layers (default: auto-detect all)')
    parser.add_argument('-m', '--min-pixels', type=int, default=100,
                        help='Minimum pixels for valid layer (default: 100)')
    parser.add_argument('--merge', type=int, default=40,
                        help='Color merge threshold 0-441 (default: 40, higher = more aggressive merging)')
    parser.add_argument('--min-alpha', type=int, default=128,
                        help='Minimum alpha value 0-255 (default: 128, higher = remove more ghost pixels)')
    parser.add_argument('--no-regions', action='store_true',
                        help='Disable region analysis output')

    args = parser.parse_args()

    # Validate tolerance range
    if not 0 <= args.tolerance <= 100:
        print(f"Error: Tolerance must be between 0-100 (got {args.tolerance})")
        return 1

    # Validate min_alpha range
    if not 0 <= args.min_alpha <= 255:
        print(f"Error: min-alpha must be between 0-255 (got {args.min_alpha})")
        return 1

    # Validate input
    if not os.path.isfile(args.pdf):
        print(f"Error: File not found: {args.pdf}")
        return 1

    if not args.pdf.lower().endswith('.pdf'):
        print(f"Error: Input must be a PDF file")
        return 1

    # Process PDF
    try:
        process_pdf(
            args.pdf,
            output_dir=args.output,
            dpi=args.dpi,
            tolerance=args.tolerance,
            min_pixels=args.min_pixels,
            n_layers=args.n_layers,
            merge_threshold=args.merge,
            show_regions=not args.no_regions,
            min_alpha=args.min_alpha
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == '__main__':
    # sys.exit (not the builtin exit) is the correct way to propagate the
    # return code in scripts; builtin exit() is a site-module convenience.
    sys.exit(main())