Files
2026-05-14 14:07:04 -03:00

275 lines
9.9 KiB
Python

import os
import shutil
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
def clear_output_folder(folder):
    """Reset ``folder`` to an empty directory.

    Any existing directory tree at ``folder`` is deleted first, then the
    directory (including missing parents) is created again.
    """
    already_present = os.path.exists(folder)
    if already_present:
        # Start from a clean slate: drop everything from a previous run.
        shutil.rmtree(folder)
    os.makedirs(folder)
def load_image(file_path, dpi=300):
    """Load the first page of a PDF, or an image file, as an RGB image.

    Parameters:
    - file_path: Path to the input. A ``.pdf`` extension (case-insensitive)
      is rasterized with pdf2image; anything else is opened with Pillow.
    - dpi: Rasterization resolution, used for PDFs only.

    Returns:
    A PIL image in ``RGB`` mode.

    Raises:
    ValueError if the PDF conversion produces no pages.
    """
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext == '.pdf':
        print(f"Converting PDF to image (DPI: {dpi})...")
        # Only the first page is used, so do not rasterize the whole
        # document (a long PDF at 300 DPI is slow and memory hungry).
        images = convert_from_path(
            file_path, dpi=dpi, fmt='png', first_page=1, last_page=1
        )
        if not images:
            raise ValueError(f"No pages could be converted from '{file_path}'")
        img = images[0]
        print(f" Converted: {img.size[0]}x{img.size[1]}")
    else:
        print("Loading image...")
        # Image.open is lazy and keeps the file open; convert() forces the
        # pixel data to be read and returns an independent image, so the
        # file handle can be released as soon as the block exits.
        with Image.open(file_path) as source:
            print(f" Loaded: {source.size[0]}x{source.size[1]}")
            img = source.convert('RGB')
    # Convert to RGB
    # NOTE(review): convert('RGB') drops any alpha channel; transparent
    # pixels presumably keep their stored RGB value rather than becoming
    # white background - confirm whether they should be flattened first.
    if img.mode != 'RGB':
        img = img.convert('RGB')
    return img
def find_main_colors(img, color_threshold=30, min_percentage=0.5):
    """
    Find the main distinct colors in an image, ignoring white background.

    Algorithm (greedy, by frequency):
    1. Take the most common remaining color as the base of a new group
    2. Add every remaining color within the distance threshold of the base
       (Euclidean distance in RGB space) to that group
    3. Remove the grouped colors
    4. Repeat until no colors are left
    5. Drop groups smaller than min_percentage

    Parameters:
    - img: RGB image (anything np.asarray turns into an (H, W, 3) array)
    - color_threshold: Maximum distance between colors to group them (0-441)
      Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
      A negative threshold puts every distinct color in its own group.
    - min_percentage: Minimum share of the NON-WHITE pixels, in percent,
      a group needs to count as a "main" color

    Returns:
    List of dicts, in the order the groups were formed, with keys
    'color' (length-3 array, the group's base color), 'count' (pixels in
    the group), 'percentage' (of non-white pixels) and 'num_variants'
    (distinct colors in the group). Empty list if there are no non-white
    pixels.

    Raises:
    ValueError if the image array is not of shape (H, W, 3).
    """
    print(f"\nAnalyzing colors...")
    print(f" Color distance threshold: {color_threshold}")
    print(f" Minimum percentage: {min_percentage}%")
    # Get all pixels
    pixels = np.asarray(img)
    if pixels.ndim != 3 or pixels.shape[2] != 3:
        # reshape(-1, 3) would silently scramble grayscale / RGBA data
        raise ValueError(
            f"Expected an RGB image of shape (H, W, 3), got {pixels.shape}"
        )
    pixels = pixels.reshape(-1, 3)
    total_pixels = len(pixels)
    if total_pixels == 0:
        print(" Error: Image has no pixels!")
        return []
    # Remove white background (>= 250 in all RGB channels)
    is_white = (pixels[:, 0] >= 250) & (pixels[:, 1] >= 250) & (pixels[:, 2] >= 250)
    white_count = int(np.sum(is_white))
    pixels = pixels[~is_white]
    color_pixel_count = len(pixels)
    print(f" Total pixels: {total_pixels:,}")
    print(f" White background: {white_count:,} ({white_count/total_pixels*100:.1f}%) - IGNORED")
    print(f" Color pixels: {color_pixel_count:,} ({color_pixel_count/total_pixels*100:.1f}%)")
    if color_pixel_count == 0:
        print(" Error: Image is entirely white!")
        return []
    # Get unique colors and their counts
    unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)
    print(f" Unique colors found: {len(unique_colors):,}")
    # Greedy grouping by frequency
    print(f"\n Grouping colors (greedy by frequency)...")
    color_groups = []
    # Only the still-ungrouped colors are kept in these arrays; they shrink
    # every round, so later rounds never re-scan colors already grouped.
    colors_left = unique_colors
    counts_left = counts
    # Float copy made once: differences must not wrap around in uint8.
    coords_left = unique_colors.astype(float)
    while len(counts_left) > 0:
        # Most common remaining color (first one wins on ties)
        top = int(np.argmax(counts_left))
        base_color = colors_left[top].copy()
        # Euclidean distance from the base color to every remaining color
        # Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
        diff = coords_left - coords_left[top]
        distances = np.sqrt(np.sum(diff ** 2, axis=1))
        in_group = distances <= color_threshold
        # The base always belongs to its own group. This guarantees progress
        # even for a negative or NaN threshold, which would otherwise match
        # nothing and loop forever.
        in_group[top] = True
        total_count = int(np.sum(counts_left[in_group]))
        num_variants = int(np.count_nonzero(in_group))
        percentage = total_count / color_pixel_count * 100
        color_groups.append({
            'color': base_color,
            'count': total_count,
            'percentage': percentage,
            'num_variants': num_variants
        })
        # Plain ints keep the log readable (NumPy 2 reprs are "np.uint8(…)")
        base_rgb = tuple(int(c) for c in base_color)
        print(f" Group {len(color_groups)}: RGB{base_rgb} -> {num_variants} variants, {percentage:.1f}%")
        # Drop the grouped colors from the working set
        keep = ~in_group
        colors_left = colors_left[keep]
        coords_left = coords_left[keep]
        counts_left = counts_left[keep]
    print(f" Created {len(color_groups)} color groups")
    # Filter by minimum percentage
    color_groups = [g for g in color_groups if g['percentage'] >= min_percentage]
    print(f" Main colors (>= {min_percentage}%): {len(color_groups)}")
    # Verify percentages
    total_percentage = sum(g['percentage'] for g in color_groups)
    print(f" Total percentage: {total_percentage:.1f}%")
    # Display results
    print(f"\n{'='*60}")
    print(f"MAIN COLORS:")
    print(f"{'='*60}")
    for i, group in enumerate(color_groups, 1):
        r, g, b = (int(c) for c in group['color'])
        print(f"{i}. RGB({r:3d}, {g:3d}, {b:3d}) - {group['percentage']:5.1f}% ({group['count']:,} pixels, {group['num_variants']} variants)")
    return color_groups
def create_color_layers(img, color_groups, color_threshold, output_folder='output'):
    """Create one PNG per color group showing only that group's pixels.

    Each non-white color is placed in the FIRST group of ``color_groups``
    whose base color lies within ``color_threshold`` (Euclidean RGB
    distance), so a pixel appears in at most one layer. Pixels matching no
    group, and white background pixels (>= 250 in every channel), stay
    white in all layers.

    Parameters:
    - img: RGB image (PIL image or (H, W, 3) array)
    - color_groups: Sequence of dicts with a 'color' entry (length-3 RGB
      value), in priority order
    - color_threshold: Maximum distance from a group's base color
    - output_folder: Directory for the ``layer_<i>_rgb<r>_<g>_<b>.png``
      files; created if missing

    NOTE(review): if groups were filtered out before this call, colors that
    belonged to a dropped group may be claimed by a later group whose base
    is also within the threshold.
    """
    print(f"\nCreating color layers...")
    # Get all pixels
    pixels = np.array(img)
    h, w = pixels.shape[:2]
    original_pixels = pixels.reshape(-1, 3)
    # Remove white background for grouping
    is_white = (original_pixels[:, 0] >= 250) & (original_pixels[:, 1] >= 250) & (original_pixels[:, 2] >= 250)
    # Get unique colors for matching
    unique_colors, inverse = np.unique(original_pixels[~is_white], axis=0, return_inverse=True)
    # NOTE(review): NumPy 2.0.0 is believed to return a 2-D inverse when
    # axis is given; flattening is harmless on other versions.
    inverse = inverse.reshape(-1)
    # Float copy made once: differences must not wrap around in uint8.
    unique_float = unique_colors.astype(float)
    # Colors not yet claimed by an earlier group
    unclaimed = np.ones(len(unique_colors), dtype=bool)
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    # For each color group, create a layer
    for i, group in enumerate(color_groups, 1):
        base_color = np.asarray(group['color'])
        # Calculate distances from base_color to all unique colors
        diff = unique_float - base_color.astype(float)
        distances = np.sqrt(np.sum(diff ** 2, axis=1))
        # Unique colors of this group: close enough AND not already taken,
        # which keeps the layers mutually exclusive.
        in_group = (distances <= color_threshold) & unclaimed
        unclaimed &= ~in_group
        # Create mask for pixels in this group
        pixel_mask = np.zeros(len(original_pixels), dtype=bool)
        pixel_mask[~is_white] = in_group[inverse]
        # Create layer image (white background)
        layer = np.full((h, w, 3), 255, dtype=np.uint8)
        layer_flat = layer.reshape(-1, 3)  # view: writes go into `layer`
        # Set pixels for this color group
        layer_flat[pixel_mask] = original_pixels[pixel_mask]
        # Save layer
        r, g, b = (int(c) for c in base_color)
        filename = f'layer_{i}_rgb{r}_{g}_{b}.png'
        filepath = os.path.join(output_folder, filename)
        Image.fromarray(layer).save(filepath)
        pixel_count = int(np.count_nonzero(pixel_mask))
        print(f" Layer {i}: {filename} ({pixel_count:,} pixels)")
def save_results(color_groups, output_folder='output'):
    """Save the color palette to ``main_colors.txt`` in ``output_folder``.

    Parameters:
    - color_groups: Sequence of dicts with keys 'color' (three integer-like
      RGB components), 'percentage', 'count' and 'num_variants'
    - output_folder: Destination directory; created if it does not exist
    """
    # Create the folder so the function also works when called on its own.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, 'main_colors.txt')
    lines = [
        "MAIN COLORS (by frequency)\n",
        "="*60 + "\n",
        "Note: White background ignored\n",
        " Similar colors grouped together\n\n",
    ]
    for i, group in enumerate(color_groups, 1):
        # Plain ints so the hex format works for NumPy scalars as well
        r, g, b = (int(c) for c in group['color'])
        lines.append(f"{i}. RGB({r}, {g}, {b})\n")
        lines.append(f" {group['percentage']:.2f}% ({group['count']:,} pixels)\n")
        lines.append(f" {group['num_variants']} color variants\n")
        lines.append(f" Hex: #{r:02X}{g:02X}{b:02X}\n\n")
    # One batched write, with an explicit encoding instead of the locale's
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    print(f"\nResults saved to: {output_path}")
def main(file_path, color_threshold=30, min_percentage=0.5, dpi=300, output_folder='output'):
    """Run the whole pipeline for one input file.

    Steps: reset the output folder, load the input as an image, store a
    copy as ``original.png``, detect the main colors, then write one layer
    image per color plus the text palette. Stops after the detection step
    when no main color is found.
    """
    banner = "=" * 60
    print(banner)
    print("COLOR EXTRACTOR - Find Main Colors")
    print(banner)

    # Start from an empty output folder
    clear_output_folder(output_folder)

    # Read the input and keep a copy next to the results
    print(f"\nInput: {file_path}")
    img = load_image(file_path, dpi)
    img.save(os.path.join(output_folder, 'original.png'))

    # Detect the dominant colors; nothing else to do without any
    color_groups = find_main_colors(img, color_threshold, min_percentage)
    if not color_groups:
        print("\nNo main colors found.")
        return

    # One layer image per color, then the text summary
    create_color_layers(img, color_groups, color_threshold, output_folder)
    save_results(color_groups, output_folder)

    found = len(color_groups)
    print(f"\n{banner}")
    print(f"✓ COMPLETE - Found {found} main colors")
    print(f" Created {found} color layer images")
    print(banner)
if __name__ == "__main__":
    import sys

    # Input file (PDF or image): first command-line argument if given,
    # otherwise the first default name that exists in the working directory.
    default_inputs = ("input.pdf", "input.png")
    if len(sys.argv) > 1:
        file_path = sys.argv[1]
    else:
        file_path = next(
            (name for name in default_inputs if os.path.exists(name)),
            default_inputs[0],
        )
    if not os.path.exists(file_path):
        print(f"Error: '{file_path}' not found!")
        print("Usage: Place your file as 'input.pdf' or 'input.png', or pass its path as the first argument")
    else:
        # Parameters:
        # color_threshold: Distance between colors to group them (0-441)
        #   Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
        #   Examples:
        #     RGB(0,0,0) to RGB(0,0,1) = distance of 1
        #     RGB(0,0,0) to RGB(10,10,10) = distance of ~17
        #     RGB(0,0,0) to RGB(30,30,30) = distance of ~52
        #   Recommended values:
        #     10-20: Very strict - only very similar colors grouped
        #     30-50: Good for most diagrams (RECOMMENDED)
        #     60-100: Loose - more aggressive grouping
        #
        # min_percentage: Minimum % to be a "main" color
        #   0.5: Include colors that are at least 0.5% of image
        #   1.0: Only colors that are at least 1% of image
        #   0.1: Include even small but significant colors
        #
        # dpi: Resolution for PDF conversion (300 recommended)
        main(
            file_path=file_path,
            color_threshold=120,  # Looser than the recommended ranges above: very aggressive grouping
            min_percentage=3,  # Keep only groups with at least 3%
            dpi=300,  # PDF resolution
            output_folder='output'
        )