# (File-listing metadata from the source viewer — not part of the script.)
# Files — 2026-05-14 14:07:04 -03:00 — 419 lines — 13 KiB — Python
#!/usr/bin/env python3
"""
PDF Industrial Diagram Layer Extractor
Extracts colored layers from PDF diagrams with white backgrounds.
"""
import os
import sys
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans
from collections import Counter
import argparse
try:
import fitz # PyMuPDF
except ImportError:
print("Error: PyMuPDF not installed. Install with: pip install PyMuPDF")
sys.exit(1)
def pdf_to_image(pdf_path, dpi=300, page_number=0):
    """
    Render one page of a PDF to a PIL Image.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Rendering resolution in dots per inch (default: 300).
        page_number: Zero-based index of the page to render
            (default: 0, i.e. the first page — original behavior).

    Returns:
        PIL Image object in RGB mode.

    Raises:
        IndexError: If page_number is out of range for the document.
    """
    print(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)
    try:
        if len(doc) > 1:
            print(f" PDF has {len(doc)} pages, processing page {page_number + 1} only")
        page = doc[page_number]
        # PyMuPDF renders at 72 DPI natively; scale up to the requested DPI.
        mat = fitz.Matrix(dpi / 72, dpi / 72)
        pix = page.get_pixmap(matrix=mat, alpha=False)
        # pix.samples is a flat RGB byte buffer; wrap it in a PIL Image.
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
        print(f" Rendered at {img.size[0]}x{img.size[1]} pixels ({dpi} DPI)")
    finally:
        # Close the document even if rendering fails (the original leaked
        # the handle on any exception raised before doc.close()).
        doc.close()
    return img
def get_dominant_colors(img, n_colors=15, sample_fraction=0.2, random_state=42):
    """
    Identify dominant colors using KMeans clustering.

    Args:
        img: PIL Image object (RGB).
        n_colors: Maximum number of color clusters to detect.
        sample_fraction: Fraction of pixels to sample when fitting
            (sampling speeds up clustering on large renders).
        random_state: Seed for both the pixel sampling and KMeans so the
            detected palette is reproducible across runs.

    Returns:
        List of (color, pixel_count) tuples sorted by frequency, where
        color is an (r, g, b) tuple of ints. Counts cover the FULL image:
        labels are predicted for every pixel, not just the fitted sample.
    """
    print("Analyzing colors...")
    img_array = np.array(img)
    pixels = img_array.reshape(-1, 3)

    # Fit on a random subset for speed. Seeded: the original used unseeded
    # np.random.choice, so the palette varied between runs even though
    # KMeans itself was seeded with random_state=42.
    if sample_fraction < 1.0:
        rng = np.random.default_rng(random_state)
        n_samples = int(len(pixels) * sample_fraction)
        indices = rng.choice(len(pixels), n_samples, replace=False)
        sampled_pixels = pixels[indices]
    else:
        sampled_pixels = pixels

    # Never request more clusters than there are distinct sampled colors.
    n_clusters = min(n_colors, len(np.unique(sampled_pixels, axis=0)))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(sampled_pixels)

    # Assign every pixel so counts reflect the whole image.
    colors = kmeans.cluster_centers_.astype(int)
    labels = kmeans.predict(pixels)
    counts = Counter(labels)

    # Sort clusters by how many pixels they cover, most frequent first.
    color_counts = [(tuple(colors[i]), counts[i]) for i in range(len(colors))]
    color_counts.sort(key=lambda x: x[1], reverse=True)
    return color_counts
def is_white(color, threshold=250):
    """Return True when every RGB channel is at or above *threshold* (near-pure white)."""
    # The smallest channel decides: if it clears the threshold, all do.
    return min(color) >= threshold
def color_distance(color1, color2):
    """Return the Euclidean (L2) distance between two RGB colors."""
    delta = np.asarray(color1, dtype=float) - np.asarray(color2, dtype=float)
    return np.sqrt(np.dot(delta, delta))
def merge_similar_colors(color_counts, similarity_threshold=40):
    """
    Collapse visually-similar colors into single entries.

    Greedy grouping: colors are walked in the given order; each not-yet-
    grouped color starts a group and absorbs every later ungrouped color
    within *similarity_threshold* (Euclidean RGB distance, range 0-441).
    Each group is represented by the per-channel mean of its members.

    Args:
        color_counts: List of ((r, g, b), pixel_count) tuples.
        similarity_threshold: Maximum color distance to merge (default: 40).

    Returns:
        List of (avg_color, total_pixel_count) tuples, sorted by count
        descending (most predominant first).
    """
    if not color_counts:
        return []

    merged = []
    consumed = set()
    for idx, (base_color, base_count) in enumerate(color_counts):
        if idx in consumed:
            continue
        # Open a new group seeded by this color.
        consumed.add(idx)
        members = [base_color]
        total = base_count
        # Absorb every later, still-free color close enough to the seed.
        for other_idx in range(idx + 1, len(color_counts)):
            if other_idx in consumed:
                continue
            other_color, other_count = color_counts[other_idx]
            # Euclidean RGB distance (same metric as color_distance()).
            dist = np.sqrt(sum((a - b) ** 2 for a, b in zip(base_color, other_color)))
            if dist <= similarity_threshold:
                members.append(other_color)
                total += other_count
                consumed.add(other_idx)
        # Represent the group by its per-channel mean color.
        avg_color = tuple(int(np.mean([m[ch] for m in members])) for ch in range(3))
        merged.append((avg_color, total))

    # Most predominant layer first.
    merged.sort(key=lambda entry: entry[1], reverse=True)
    return merged
def get_layer_region(img, target_color, tolerance=30):
    """
    Compute the bounding box of all pixels matching *target_color*.

    Args:
        img: PIL Image (or HxWx3 array-like) in RGB.
        target_color: (r, g, b) tuple of the color to match.
        tolerance: Matching tolerance 0-100, scaled onto the maximum RGB
            Euclidean distance (~441); e.g. tolerance=30 accepts pixels
            within ~132 distance of the target.

    Returns:
        Tuple (min_x, min_y, max_x, max_y, pixel_count), or None when no
        pixel matches.
    """
    img_array = np.array(img).astype(np.float32)

    # Per-pixel Euclidean distance to the target color.
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Map tolerance (0-100) onto the RGB distance range (0-441).
    max_distance = np.sqrt(3 * 255 ** 2)
    actual_tolerance = (tolerance / 100.0) * max_distance

    mask = distances <= actual_tolerance
    if not mask.any():
        return None

    # NOTE: the original re-checked len(y_coords) == 0 here; that branch
    # was unreachable once mask.any() passed, so it has been removed.
    y_coords, x_coords = np.where(mask)
    min_x, max_x = x_coords.min(), x_coords.max()
    min_y, max_y = y_coords.min(), y_coords.max()
    return (min_x, min_y, max_x, max_y, mask.sum())
def extract_layer(img, target_color, tolerance=30, min_alpha=128):
    """
    Extract a single colored layer onto a transparent background.

    Args:
        img: PIL Image (or HxWx3 array-like) in RGB.
        target_color: (r, g, b) tuple of the color to extract.
        tolerance: Matching tolerance 0-100, scaled onto the maximum RGB
            Euclidean distance (~441): 30 -> ~132 (moderate antialiasing),
            50 -> ~220 (heavy antialiasing).
        min_alpha: Minimum alpha value to keep (0-255); filters weakly
            matching antialiased "ghost" pixels.

    Returns:
        RGBA PIL Image; non-matching pixels are fully transparent.
    """
    img_array = np.array(img).astype(np.float32)
    h, w, _ = img_array.shape
    # Output starts fully transparent; matched pixels are copied in below.
    output = np.zeros((h, w, 4), dtype=np.uint8)

    # Per-pixel Euclidean distance to the target color.
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Map tolerance (0-100) onto the RGB distance range (0-441).
    max_distance = np.sqrt(3 * 255 ** 2)  # ≈ 441.67
    actual_tolerance = (tolerance / 100.0) * max_distance

    if actual_tolerance <= 0:
        # tolerance=0 means exact match only. The original divided by
        # actual_tolerance below, triggering a divide-by-zero and NaN
        # alpha; treat exact matches as fully opaque instead.
        mask = distances == 0
        alpha = np.where(mask, 255, 0).astype(np.uint8)
    else:
        mask = distances <= actual_tolerance
        # Alpha fades linearly: 255 at an exact match, 0 at the tolerance edge.
        alpha = np.clip(255 * (1 - distances / actual_tolerance), 0, 255).astype(np.uint8)

    # Keep only confidently matched pixels (drops ghost pixels).
    strong_mask = mask & (alpha >= min_alpha)
    output[strong_mask, :3] = img_array[strong_mask].astype(np.uint8)
    output[strong_mask, 3] = alpha[strong_mask]

    return Image.fromarray(output, 'RGBA')
def process_pdf(pdf_path, output_dir='output', dpi=300, tolerance=30,
                min_pixels=100, n_layers=None, merge_threshold=40,
                show_regions=True, min_alpha=128):
    """
    Process a PDF diagram and extract its colored layers to PNG files.

    Pipeline: render the PDF -> detect dominant colors -> drop the white
    background and tiny clusters -> merge similar colors -> optionally
    report each layer's bounding box -> save one RGBA PNG per layer
    into *output_dir* (named <pdf>_layer<N>_<rrr>_<ggg>_<bbb>.png).

    Args:
        pdf_path: Path to PDF file.
        output_dir: Output directory for layers (created if missing).
        dpi: PDF rendering resolution.
        tolerance: Color matching tolerance (0-100).
        min_pixels: Minimum pixels for a color to count as a layer.
        n_layers: Number of layers to extract (None = keep all).
        merge_threshold: Color distance for merging similar layers (0-441).
        show_regions: Display bounding box regions for each layer.
        min_alpha: Minimum alpha value to keep (0-255, removes ghost pixels).
    """
    # Convert PDF to image
    img = pdf_to_image(pdf_path, dpi)
    total_pixels = img.size[0] * img.size[1]
    # Detect colors (requests more clusters than will typically be kept)
    color_counts = get_dominant_colors(img, n_colors=20)
    # Filter out white background; clusters below min_pixels are dropped silently
    layer_colors = []
    print("\nDetected colors (before merging):")
    for color, count in color_counts:
        if is_white(color):
            print(f" RGB{color}: {count:,} pixels - WHITE BACKGROUND (skipped)")
        elif count >= min_pixels:
            percentage = (count / total_pixels) * 100
            layer_colors.append((color, count))
            print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")
    if not layer_colors:
        print("No colored layers found!")
        return
    # Merge similar colors to reduce fragmentation
    print(f"\nMerging similar colors (threshold: {merge_threshold})...")
    merged_layers = merge_similar_colors(layer_colors, merge_threshold)
    print(f"\nMerged layers (predominant first):")
    for color, count in merged_layers:
        percentage = (count / total_pixels) * 100
        print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")
    # Limit layers if specified (merged_layers is sorted, so this keeps the largest)
    if n_layers:
        merged_layers = merged_layers[:n_layers]
        print(f"\nKeeping top {n_layers} layers")
    print(f"\n{len(merged_layers)} layers to extract")
    # Create output directory; base_name is the PDF filename without extension
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(output_dir, exist_ok=True)
    # Analyze regions and extract layers
    if show_regions:
        print("\nAnalyzing layer regions...")
    print("\nExtracting layers...")
    for idx, (color, count) in enumerate(merged_layers, 1):
        percentage = (count / total_pixels) * 100
        if show_regions:
            # Get region information (may be None if nothing matches at this tolerance)
            region = get_layer_region(img, color, tolerance)
            if region:
                min_x, min_y, max_x, max_y, pixel_count = region
                width = max_x - min_x + 1
                height = max_y - min_y + 1
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
                print(f" Region: ({min_x},{min_y}) to ({max_x},{max_y}) - {width}x{height} px")
            else:
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
        else:
            print(f" [{idx}/{len(merged_layers)}] RGB{color}...", end=' ')
        # Extract layer
        layer_img = extract_layer(img, color, tolerance, min_alpha)
        # Save with descriptive filename (zero-padded RGB channels)
        color_name = f"{color[0]:03d}_{color[1]:03d}_{color[2]:03d}"
        output_path = os.path.join(output_dir, f"{base_name}_layer{idx}_{color_name}.png")
        layer_img.save(output_path)
        if not show_regions:
            print(f"✓ Saved")
        else:
            print(f" Saved: {output_path}")
    print(f"\n✓ Complete! {len(merged_layers)} layers saved to: {output_dir}/")
def main():
    """Command-line entry point.

    Parses arguments, validates numeric ranges and the input path, then
    runs process_pdf().

    Returns:
        int: 0 on success, 1 on any validation or processing error.
    """
    parser = argparse.ArgumentParser(
        description='Extract colored layers from PDF industrial diagrams',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic usage
python layer_extractor.py diagram.pdf
# Custom output directory and DPI
python layer_extractor.py diagram.pdf -o layers/ --dpi 600
# Adjust color tolerance
python layer_extractor.py diagram.pdf -t 40
# Extract specific number of layers
python layer_extractor.py diagram.pdf -n 5
"""
    )
    parser.add_argument('pdf', help='Input PDF file')
    parser.add_argument('-o', '--output', default='output',
                        help='Output directory (default: output)')
    parser.add_argument('--dpi', type=int, default=300,
                        help='PDF rendering DPI (default: 300, higher = better quality)')
    parser.add_argument('-t', '--tolerance', type=int, default=30,
                        help='Color matching tolerance 0-100 (default: 30, higher = more lenient)')
    parser.add_argument('-n', '--n-layers', type=int,
                        help='Extract exactly N layers (default: auto-detect all)')
    parser.add_argument('-m', '--min-pixels', type=int, default=100,
                        help='Minimum pixels for valid layer (default: 100)')
    parser.add_argument('--merge', type=int, default=40,
                        help='Color merge threshold 0-441 (default: 40, higher = more aggressive merging)')
    parser.add_argument('--min-alpha', type=int, default=128,
                        help='Minimum alpha value 0-255 (default: 128, higher = remove more ghost pixels)')
    parser.add_argument('--no-regions', action='store_true',
                        help='Disable region analysis output')
    args = parser.parse_args()

    # Validate tolerance range
    if not 0 <= args.tolerance <= 100:
        print(f"Error: Tolerance must be between 0-100 (got {args.tolerance})")
        return 1
    # Validate min_alpha range
    if not 0 <= args.min_alpha <= 255:
        print(f"Error: min-alpha must be between 0-255 (got {args.min_alpha})")
        return 1
    # Validate merge threshold range — previously unchecked, unlike the
    # other numeric options (441 ≈ sqrt(3*255²), the max RGB distance).
    if not 0 <= args.merge <= 441:
        print(f"Error: merge must be between 0-441 (got {args.merge})")
        return 1
    # Validate input path and extension
    if not os.path.isfile(args.pdf):
        print(f"Error: File not found: {args.pdf}")
        return 1
    if not args.pdf.lower().endswith('.pdf'):
        print(f"Error: Input must be a PDF file")
        return 1

    # Process PDF; any failure is reported with a traceback and exit code 1
    try:
        process_pdf(
            args.pdf,
            output_dir=args.output,
            dpi=args.dpi,
            tolerance=args.tolerance,
            min_pixels=args.min_pixels,
            n_layers=args.n_layers,
            merge_threshold=args.merge,
            show_regions=not args.no_regions,
            min_alpha=args.min_alpha
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0
if __name__ == '__main__':
    # Use sys.exit instead of the builtin exit(): the builtin is injected
    # by the site module and may be absent (python -S, frozen apps).
    sys.exit(main())