Initial commit
This commit is contained in:
418
label/cores/file/layer_extractor.py
Normal file
418
label/cores/file/layer_extractor.py
Normal file
@@ -0,0 +1,418 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PDF Industrial Diagram Layer Extractor
|
||||
Extracts colored layers from PDF diagrams with white backgrounds.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from sklearn.cluster import KMeans
|
||||
from collections import Counter
|
||||
import argparse
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
except ImportError:
|
||||
print("Error: PyMuPDF not installed. Install with: pip install PyMuPDF")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def pdf_to_image(pdf_path, dpi=300):
    """
    Render the first page of a PDF as an RGB PIL Image.

    Args:
        pdf_path: Path to the PDF file to open
        dpi: Rendering resolution in dots per inch (default: 300)

    Returns:
        PIL Image of the first page in RGB mode
    """
    print(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)

    if len(doc) > 1:
        print(f" PDF has {len(doc)} pages, processing first page only")

    # 72 points per inch, so dpi/72 scales page points to output pixels.
    zoom = dpi / 72
    pix = doc[0].get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # Wrap the raw pixmap bytes in a PIL Image.
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    print(f" Rendered at {img.size[0]}x{img.size[1]} pixels ({dpi} DPI)")

    doc.close()
    return img
|
||||
|
||||
|
||||
def get_dominant_colors(img, n_colors=15, sample_fraction=0.2):
    """
    Find the dominant colors of an image via KMeans clustering.

    Args:
        img: PIL Image object (RGB)
        n_colors: Upper bound on the number of color clusters
        sample_fraction: Fraction of pixels used to fit the model

    Returns:
        List of (color, pixel_count) tuples, most frequent first
    """
    print("Analyzing colors...")
    flat = np.array(img).reshape(-1, 3)

    # Fit on a random subset for speed; predict on every pixel afterwards.
    if sample_fraction < 1.0:
        chosen = np.random.choice(len(flat), int(len(flat) * sample_fraction), replace=False)
        training = flat[chosen]
    else:
        training = flat

    # Never ask for more clusters than there are distinct sampled colors.
    k = min(n_colors, len(np.unique(training, axis=0)))
    model = KMeans(n_clusters=k, random_state=42, n_init=10)
    model.fit(training)

    # Cluster centers become the representative colors; count membership
    # over the full image, not just the training sample.
    centers = model.cluster_centers_.astype(int)
    frequency = Counter(model.predict(flat))

    pairs = [(tuple(center), frequency[idx]) for idx, center in enumerate(centers)]
    return sorted(pairs, key=lambda item: item[1], reverse=True)
|
||||
|
||||
|
||||
def is_white(color, threshold=250):
    """Return True if *color* counts as white: every channel >= *threshold*."""
    for channel in color:
        if channel < threshold:
            return False
    return True
|
||||
|
||||
|
||||
def color_distance(color1, color2):
    """Return the Euclidean distance between two RGB colors."""
    total = 0
    for a, b in zip(color1, color2):
        diff = a - b
        total += diff * diff
    return np.sqrt(total)
|
||||
|
||||
|
||||
def merge_similar_colors(color_counts, similarity_threshold=40):
    """
    Collapse near-identical colors into single entries to reduce layer
    fragmentation.

    Greedily walks the list in its given order: each not-yet-used color
    starts a group and absorbs every later unused color within
    *similarity_threshold* (Euclidean RGB distance). Each group is
    represented by the per-channel integer mean of its members.

    Args:
        color_counts: List of (color, pixel_count) tuples
        similarity_threshold: Maximum color distance to merge (0-441, default: 40)

    Returns:
        List of merged (color, total_pixel_count) tuples, largest count first
    """
    if not color_counts:
        return []

    consumed = set()
    result = []

    for idx, (anchor, anchor_count) in enumerate(color_counts):
        if idx in consumed:
            continue
        consumed.add(idx)

        members = [anchor]
        total = anchor_count

        # Absorb every later, still-unused color close enough to the anchor.
        for other_idx in range(idx + 1, len(color_counts)):
            if other_idx in consumed:
                continue
            candidate, candidate_count = color_counts[other_idx]
            gap = np.sqrt(sum((a - b) ** 2 for a, b in zip(anchor, candidate)))
            if gap <= similarity_threshold:
                members.append(candidate)
                total += candidate_count
                consumed.add(other_idx)

        # Represent the group by its average color.
        mean_color = tuple(int(np.mean([m[ch] for m in members])) for ch in range(3))
        result.append((mean_color, total))

    # Most predominant first.
    return sorted(result, key=lambda entry: entry[1], reverse=True)
|
||||
|
||||
|
||||
def get_layer_region(img, target_color, tolerance=30):
    """
    Compute the bounding box of all pixels matching *target_color*.

    Args:
        img: PIL Image object (anything np.array can turn into HxWx3)
        target_color: RGB tuple of the color to locate
        tolerance: Matching tolerance 0-100, as a percentage of the maximum
            possible RGB distance (~441.67)

    Returns:
        (min_x, min_y, max_x, max_y, pixel_count), or None if nothing matches
    """
    pixels = np.array(img).astype(np.float32)

    # Per-pixel Euclidean distance from the target color.
    reference = np.array(target_color, dtype=np.float32)
    dist = np.sqrt(np.sum((pixels - reference) ** 2, axis=2))

    # Map the 0-100 tolerance onto the 0-441.67 distance range.
    limit = (tolerance / 100.0) * np.sqrt(3 * 255 ** 2)
    hits = dist <= limit

    if not hits.any():
        return None

    rows, cols = np.where(hits)
    if len(rows) == 0:
        return None

    # Bounding box in (x, y) order plus the number of matching pixels.
    return (cols.min(), rows.min(), cols.max(), rows.max(), hits.sum())
|
||||
|
||||
|
||||
def extract_layer(img, target_color, tolerance=30, min_alpha=128):
    """
    Extract a single colored layer as an RGBA image with transparent background.

    Args:
        img: PIL Image object (RGB)
        target_color: RGB tuple of target color
        tolerance: Color matching tolerance 0-100, mapped onto the maximum
            Euclidean RGB distance (~441.67); tolerance=30 -> ~132 distance
            (moderate antialiasing), tolerance=50 -> ~220 (heavy antialiasing)
        min_alpha: Minimum alpha value to keep (0-255, higher = less ghost pixels)

    Returns:
        PIL Image in RGBA mode; non-matching pixels are fully transparent
    """
    img_array = np.array(img).astype(np.float32)
    h, w, _ = img_array.shape

    # Create output with alpha channel (all transparent by default)
    output = np.zeros((h, w, 4), dtype=np.uint8)

    # Calculate color distance for all pixels
    target = np.array(target_color, dtype=np.float32)
    distances = np.sqrt(np.sum((img_array - target) ** 2, axis=2))

    # Scale tolerance: max Euclidean distance in RGB is sqrt(3*255^2) ≈ 441.67
    max_distance = np.sqrt(3 * 255 ** 2)
    actual_tolerance = (tolerance / 100.0) * max_distance

    # Mask pixels within tolerance
    mask = distances <= actual_tolerance

    # Alpha fades linearly with distance from the target color.
    # BUG FIX: with tolerance=0, actual_tolerance is 0 and the division
    # produced NaN alpha even for exact-match pixels (NaN -> uint8 is
    # undefined). Treat exact matches as fully opaque in that case.
    if actual_tolerance > 0:
        alpha = np.clip(255 * (1 - distances / actual_tolerance), 0, 255).astype(np.uint8)
    else:
        alpha = np.where(mask, 255, 0).astype(np.uint8)

    # Filter out ghost pixels: only keep pixels with alpha >= min_alpha
    strong_mask = mask & (alpha >= min_alpha)

    # Copy matching pixels; everything else stays fully transparent
    output[strong_mask, :3] = img_array[strong_mask].astype(np.uint8)
    output[strong_mask, 3] = alpha[strong_mask]

    return Image.fromarray(output, 'RGBA')
|
||||
|
||||
|
||||
def process_pdf(pdf_path, output_dir='output', dpi=300, tolerance=30,
                min_pixels=100, n_layers=None, merge_threshold=40,
                show_regions=True, min_alpha=128):
    """
    Process a PDF diagram and extract its colored layers as PNG files.

    Pipeline: render the PDF, cluster its colors, drop the white background,
    merge near-identical colors, then save one RGBA PNG per remaining layer
    into *output_dir* named "<pdf-name>_layer<N>_<RRR>_<GGG>_<BBB>.png".

    Args:
        pdf_path: Path to PDF file
        output_dir: Output directory for layers (created if missing)
        dpi: PDF rendering resolution
        tolerance: Color matching tolerance 0-100 (percent of max RGB distance)
        min_pixels: Minimum pixels for valid layer
        n_layers: Number of layers to extract (None = auto, keep all)
        merge_threshold: Color distance for merging similar layers (0-441)
        show_regions: Display bounding box regions for each layer
        min_alpha: Minimum alpha value to keep (0-255, removes ghost pixels)
    """
    # Convert PDF to image
    img = pdf_to_image(pdf_path, dpi)
    total_pixels = img.size[0] * img.size[1]

    # Detect colors (requesting 20 clusters so the white background gets
    # its own cluster rather than tinting a real layer)
    color_counts = get_dominant_colors(img, n_colors=20)

    # Filter out white background and too-small clusters
    layer_colors = []
    print("\nDetected colors (before merging):")
    for color, count in color_counts:
        if is_white(color):
            print(f" RGB{color}: {count:,} pixels - WHITE BACKGROUND (skipped)")
        elif count >= min_pixels:
            percentage = (count / total_pixels) * 100
            layer_colors.append((color, count))
            print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")

    if not layer_colors:
        print("No colored layers found!")
        return

    # Merge similar colors to reduce fragmentation
    print(f"\nMerging similar colors (threshold: {merge_threshold})...")
    merged_layers = merge_similar_colors(layer_colors, merge_threshold)

    print(f"\nMerged layers (predominant first):")
    for color, count in merged_layers:
        percentage = (count / total_pixels) * 100
        print(f" RGB{color}: {count:,} pixels ({percentage:.1f}%)")

    # Limit layers if specified (merged_layers is already sorted largest-first)
    if n_layers:
        merged_layers = merged_layers[:n_layers]
        print(f"\nKeeping top {n_layers} layers")

    print(f"\n{len(merged_layers)} layers to extract")

    # Create output directory
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    os.makedirs(output_dir, exist_ok=True)

    # Analyze regions and extract layers
    if show_regions:
        print("\nAnalyzing layer regions...")

    print("\nExtracting layers...")
    for idx, (color, count) in enumerate(merged_layers, 1):
        percentage = (count / total_pixels) * 100

        if show_regions:
            # Get region information (bounding box of matching pixels)
            region = get_layer_region(img, color, tolerance)
            if region:
                min_x, min_y, max_x, max_y, pixel_count = region
                width = max_x - min_x + 1
                height = max_y - min_y + 1
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
                print(f" Region: ({min_x},{min_y}) to ({max_x},{max_y}) - {width}x{height} px")
            else:
                print(f" [{idx}/{len(merged_layers)}] RGB{color} - {count:,} px ({percentage:.1f}%)")
        else:
            print(f" [{idx}/{len(merged_layers)}] RGB{color}...", end=' ')

        # Extract layer as an RGBA image with transparent background
        layer_img = extract_layer(img, color, tolerance, min_alpha)

        # Save with descriptive filename (zero-padded RGB components)
        color_name = f"{color[0]:03d}_{color[1]:03d}_{color[2]:03d}"
        output_path = os.path.join(output_dir, f"{base_name}_layer{idx}_{color_name}.png")
        layer_img.save(output_path)

        if not show_regions:
            print(f"✓ Saved")
        else:
            print(f" Saved: {output_path}")

    print(f"\n✓ Complete! {len(merged_layers)} layers saved to: {output_dir}/")
|
||||
|
||||
|
||||
def main():
    """Parse CLI arguments, validate them, and run the extraction pipeline.

    Returns:
        Process exit code: 0 on success, 1 on validation or processing error.
    """
    parser = argparse.ArgumentParser(
        description='Extract colored layers from PDF industrial diagrams',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python layer_extractor.py diagram.pdf

  # Custom output directory and DPI
  python layer_extractor.py diagram.pdf -o layers/ --dpi 600

  # Adjust color tolerance
  python layer_extractor.py diagram.pdf -t 40

  # Extract specific number of layers
  python layer_extractor.py diagram.pdf -n 5
"""
    )

    parser.add_argument('pdf', help='Input PDF file')
    parser.add_argument('-o', '--output', default='output',
                        help='Output directory (default: output)')
    parser.add_argument('--dpi', type=int, default=300,
                        help='PDF rendering DPI (default: 300, higher = better quality)')
    parser.add_argument('-t', '--tolerance', type=int, default=30,
                        help='Color matching tolerance 0-100 (default: 30, higher = more lenient)')
    parser.add_argument('-n', '--n-layers', type=int,
                        help='Extract exactly N layers (default: auto-detect all)')
    parser.add_argument('-m', '--min-pixels', type=int, default=100,
                        help='Minimum pixels for valid layer (default: 100)')
    parser.add_argument('--merge', type=int, default=40,
                        help='Color merge threshold 0-441 (default: 40, higher = more aggressive merging)')
    parser.add_argument('--min-alpha', type=int, default=128,
                        help='Minimum alpha value 0-255 (default: 128, higher = remove more ghost pixels)')
    parser.add_argument('--no-regions', action='store_true',
                        help='Disable region analysis output')

    args = parser.parse_args()

    # Validate tolerance range
    if not 0 <= args.tolerance <= 100:
        print(f"Error: Tolerance must be between 0-100 (got {args.tolerance})")
        return 1

    # Validate min_alpha range
    if not 0 <= args.min_alpha <= 255:
        print(f"Error: min-alpha must be between 0-255 (got {args.min_alpha})")
        return 1

    # Validate input file exists and looks like a PDF
    if not os.path.isfile(args.pdf):
        print(f"Error: File not found: {args.pdf}")
        return 1

    if not args.pdf.lower().endswith('.pdf'):
        print(f"Error: Input must be a PDF file")
        return 1

    # Process PDF; any failure is reported with a traceback rather than
    # crashing, since this is the top-level CLI boundary.
    try:
        process_pdf(
            args.pdf,
            output_dir=args.output,
            dpi=args.dpi,
            tolerance=args.tolerance,
            min_pixels=args.min_pixels,
            n_layers=args.n_layers,
            merge_threshold=args.merge,
            show_regions=not args.no_regions,
            min_alpha=args.min_alpha
        )
    except Exception as e:
        print(f"\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Use sys.exit rather than the site-module builtin exit(): exit() is a
    # convenience for interactive sessions and is not guaranteed to exist
    # (e.g. under python -S), while sys.exit reliably propagates main()'s
    # return value as the process exit status.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user