"""Extract the main colours of a diagram (PDF or image) and split it into layers.

The script ignores the near-white background, greedily groups similar colours
around the most frequent ones, reports the groups that cover at least a given
share of the coloured pixels, and writes one PNG layer per reported group.
"""
import os
import shutil

import numpy as np

# Pillow and pdf2image are only needed for file I/O.  They are imported
# defensively so that the pure-NumPy analysis (find_main_colors) stays
# importable when they are missing; a clear error is raised at the point of use.
try:
    from PIL import Image
except ImportError:  # pragma: no cover - depends on the environment
    Image = None

try:
    from pdf2image import convert_from_path
except ImportError:  # pragma: no cover - depends on the environment
    convert_from_path = None

# A pixel counts as "white background" when every channel is >= this value.
_WHITE_MIN = 250


def _require_pillow():
    """Raise a descriptive ImportError when Pillow is not installed."""
    if Image is None:
        raise ImportError("Pillow is required to load and save images")


def _rgb_pixels(img):
    """Return ``(height, width, flat)`` where ``flat`` is an (N, 3) RGB array.

    Extra channels (e.g. alpha) are dropped instead of being silently folded
    into the RGB triples by a blind ``reshape(-1, 3)``.
    """
    arr = np.asarray(img)
    if arr.ndim != 3 or arr.shape[2] < 3:
        raise ValueError("expected an RGB image with shape (height, width, 3)")
    height, width = arr.shape[:2]
    return height, width, arr[:, :, :3].reshape(-1, 3)


def _white_mask(flat_pixels):
    """Return a boolean mask selecting near-white (background) pixels."""
    return (flat_pixels >= _WHITE_MIN).all(axis=1)


def _rgb_ints(color):
    """Return a colour as a tuple of plain Python ints (stable formatting)."""
    return tuple(int(channel) for channel in color)


def _greedy_color_groups(unique_colors, counts, color_threshold):
    """Greedily assign every unique colour to a group.

    Colours are visited from most to least frequent.  Each colour that is not
    yet assigned becomes the base of a new group, which claims every
    still-unassigned colour within ``color_threshold`` (Euclidean RGB distance).

    Returns:
        (base_indices, labels): ``base_indices[k]`` is the index (into
        ``unique_colors``) of group k's base colour; ``labels[i]`` is the group
        number of ``unique_colors[i]``.

    Raises:
        ValueError: if ``color_threshold`` is negative or NaN.  Such a value
            can never claim even the base colour itself.
    """
    if not color_threshold >= 0:
        raise ValueError("color_threshold must be a non-negative number")

    colors = unique_colors.astype(float)
    labels = np.full(len(colors), -1, dtype=np.intp)
    base_indices = []

    # One stable sort replaces the per-iteration copy + argmax.  Stability
    # keeps ties in ascending index order, i.e. the colour argmax would pick.
    order = np.argsort(-np.asarray(counts, dtype=np.int64), kind='stable')
    for idx in order:
        if labels[idx] != -1:
            continue  # already claimed by a more frequent base colour
        # Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
        distances = np.sqrt(np.sum((colors - colors[idx]) ** 2, axis=1))
        members = (distances <= color_threshold) & (labels == -1)
        labels[members] = len(base_indices)
        base_indices.append(int(idx))

    return base_indices, labels


def clear_output_folder(folder):
    """Delete ``folder`` with all its contents (if it exists) and recreate it."""
    if os.path.exists(folder):
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)


def _flatten_to_rgb(img):
    """Return a new RGB copy of ``img``, compositing transparency onto white.

    A plain ``convert('RGB')`` just drops the alpha channel, which usually
    turns fully transparent pixels black; the rest of the script assumes the
    background is white.
    """
    if img.mode in ('RGBA', 'LA', 'PA') or 'transparency' in img.info:
        rgba = img.convert('RGBA')
        background = Image.new('RGB', rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[-1])
        return background
    return img.convert('RGB')


def load_image(file_path, dpi=300):
    """Load an image from a PDF (first page only) or an image file.

    Parameters:
    - file_path: Path to a ``.pdf`` or any image format Pillow can open
    - dpi: Rendering resolution, used for PDF input only

    Returns a PIL image in RGB mode.

    Raises:
        ImportError: if the library needed for the input type is missing.
        ValueError: if the PDF produced no page.
    """
    _require_pillow()
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        if convert_from_path is None:
            raise ImportError("pdf2image is required to read PDF input")
        print(f"Converting PDF to image (DPI: {dpi})...")
        # Only the first page is used, so only the first page is rendered.
        images = convert_from_path(
            file_path, dpi=dpi, fmt='png', first_page=1, last_page=1
        )
        if not images:
            raise ValueError(f"No pages could be rendered from '{file_path}'")
        img = images[0]
        print(f" Converted: {img.size[0]}x{img.size[1]}")
        if img.mode != 'RGB':
            img = _flatten_to_rgb(img)
    else:
        print("Loading image...")
        # The context manager releases the file handle; _flatten_to_rgb
        # returns a fully loaded copy that outlives it.
        with Image.open(file_path) as source:
            print(f" Loaded: {source.size[0]}x{source.size[1]}")
            img = _flatten_to_rgb(source)

    return img


def find_main_colors(img, color_threshold=30, min_percentage=0.5):
    """
    Find main distinct colors in image.

    Algorithm:
    1. Find most common color
    2. Group all colors within distance threshold (Euclidean distance in RGB space)
    3. Remove those colors
    4. Find next most common color
    5. Repeat until no colors left

    Parameters:
    - img: RGB image (PIL image or (height, width, 3) array)
    - color_threshold: Maximum distance between colors to group them (0-441)
      Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
    - min_percentage: Minimum percentage to be a "main" color

    Returns a list of dicts, in greedy (frequency) order, with keys:
    - 'color': base color of the group (uint8 array of length 3)
    - 'count': number of pixels in the group
    - 'percentage': share of the non-white pixels, in percent
    - 'num_variants': number of distinct colors in the group
    The list is empty when the image has no non-white pixel.

    Raises:
        ValueError: if color_threshold is negative or NaN.
    """
    print("\nAnalyzing colors...")
    print(f" Color distance threshold: {color_threshold}")
    print(f" Minimum percentage: {min_percentage}%")

    # Get all pixels
    _, _, pixels = _rgb_pixels(img)
    total_pixels = len(pixels)
    if total_pixels == 0:
        print(" Error: Image has no pixels!")
        return []

    # Remove white background (>= 250 in all RGB channels)
    is_white = _white_mask(pixels)
    white_count = int(np.sum(is_white))
    pixels = pixels[~is_white]

    print(f" Total pixels: {total_pixels:,}")
    print(f" White background: {white_count:,} ({white_count/total_pixels*100:.1f}%) - IGNORED")
    print(f" Color pixels: {len(pixels):,} ({len(pixels)/total_pixels*100:.1f}%)")

    if len(pixels) == 0:
        print(" Error: Image is entirely white!")
        return []

    # Get unique colors and their counts
    unique_colors, counts = np.unique(pixels, axis=0, return_counts=True)
    print(f" Unique colors found: {len(unique_colors):,}")

    # Greedy grouping by frequency
    print("\n Grouping colors (greedy by frequency)...")
    base_indices, labels = _greedy_color_groups(
        unique_colors, counts, color_threshold
    )

    # Per-group totals in one pass.  The weighted bincount works in float64,
    # which is exact for any realistic pixel count (< 2**53).
    n_groups = len(base_indices)
    variant_counts = np.bincount(labels, minlength=n_groups)
    pixel_counts = np.rint(
        np.bincount(labels, weights=counts, minlength=n_groups)
    ).astype(np.int64)

    color_groups = []
    for number, base_idx in enumerate(base_indices, 1):
        base_color = unique_colors[base_idx]
        total_count = int(pixel_counts[number - 1])
        num_variants = int(variant_counts[number - 1])
        percentage = (total_count / len(pixels)) * 100
        color_groups.append({
            'color': base_color,
            'count': total_count,
            'percentage': percentage,
            'num_variants': num_variants,
        })
        # Plain ints keep the output free of NumPy 2 "np.uint8(...)" reprs.
        print(f" Group {number}: RGB{_rgb_ints(base_color)} -> {num_variants} variants, {percentage:.1f}%")

    print(f" Created {len(color_groups)} color groups")

    # Filter by minimum percentage
    color_groups = [g for g in color_groups if g['percentage'] >= min_percentage]
    print(f" Main colors (>= {min_percentage}%): {len(color_groups)}")

    # Verify percentages
    total_percentage = sum(g['percentage'] for g in color_groups)
    print(f" Total percentage: {total_percentage:.1f}%")

    # Display results
    print(f"\n{'='*60}")
    print("MAIN COLORS:")
    print(f"{'='*60}")
    for i, group in enumerate(color_groups, 1):
        r, g, b = _rgb_ints(group['color'])
        print(f"{i}. RGB({r:3d}, {g:3d}, {b:3d}) - {group['percentage']:5.1f}% ({group['count']:,} pixels, {group['num_variants']} variants)")

    return color_groups


def create_color_layers(img, color_groups, color_threshold, output_folder='output'):
    """Create one image per color group showing only that color.

    The greedy grouping of find_main_colors is recomputed with the same
    threshold, so each layer contains exactly the pixels counted for its
    group and no pixel appears in two layers.  A group whose base color is
    not a greedy base (e.g. a hand-built group) falls back to "every color
    within color_threshold of the base".

    Parameters:
    - img: RGB image (PIL image or (height, width, 3) array)
    - color_groups: Groups as returned by find_main_colors
    - color_threshold: Threshold that was used to build the groups
    - output_folder: Folder receiving the layer_*.png files
    """
    _require_pillow()
    print("\nCreating color layers...")

    # Get all pixels
    h, w, original_pixels = _rgb_pixels(img)

    # Remove white background for grouping
    is_white = _white_mask(original_pixels)

    # Get unique colors for matching
    unique_colors, inverse, counts = np.unique(
        original_pixels[~is_white], axis=0,
        return_inverse=True, return_counts=True,
    )
    # Some NumPy releases return the inverse with an extra axis when axis=0.
    inverse = inverse.ravel()

    base_indices, labels = _greedy_color_groups(
        unique_colors, counts, color_threshold
    )
    label_of_base = {
        _rgb_ints(unique_colors[base_idx]): label
        for label, base_idx in enumerate(base_indices)
    }

    os.makedirs(output_folder, exist_ok=True)

    # For each color group, create a layer
    for i, group in enumerate(color_groups, 1):
        base_color = np.asarray(group['color'])
        r, g, b = _rgb_ints(base_color)

        label = label_of_base.get((r, g, b))
        if label is not None:
            # Exactly the colors the greedy pass assigned to this group
            in_group = labels == label
        else:
            # Unknown base color: plain distance test around it
            diff = unique_colors.astype(float) - base_color.astype(float)
            in_group = np.sqrt(np.sum(diff ** 2, axis=1)) <= color_threshold

        # Create mask for pixels in this group
        pixel_mask = np.zeros(len(original_pixels), dtype=bool)
        pixel_mask[~is_white] = in_group[inverse]

        # Create layer image (white background) and copy the group's pixels
        layer_flat = np.full((h * w, 3), 255, dtype=np.uint8)
        layer_flat[pixel_mask] = original_pixels[pixel_mask]

        # Save layer
        filename = f'layer_{i}_rgb{r}_{g}_{b}.png'
        filepath = os.path.join(output_folder, filename)
        Image.fromarray(layer_flat.reshape(h, w, 3)).save(filepath)

        pixel_count = int(np.sum(pixel_mask))
        print(f" Layer {i}: {filename} ({pixel_count:,} pixels)")


def save_results(color_groups, output_folder='output'):
    """Save color palette to ``main_colors.txt`` inside ``output_folder``."""
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, 'main_colors.txt')

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("MAIN COLORS (by frequency)\n")
        f.write("="*60 + "\n")
        f.write("Note: White background ignored\n")
        f.write(" Similar colors grouped together\n\n")

        for i, group in enumerate(color_groups, 1):
            r, g, b = _rgb_ints(group['color'])
            f.write(f"{i}. RGB({r}, {g}, {b})\n")
            f.write(f" {group['percentage']:.2f}% ({group['count']:,} pixels)\n")
            f.write(f" {group['num_variants']} color variants\n")
            f.write(f" Hex: #{r:02X}{g:02X}{b:02X}\n\n")

    print(f"\nResults saved to: {output_path}")


def main(file_path, color_threshold=30, min_percentage=0.5, dpi=300, output_folder='output'):
    """Run the whole pipeline: load, analyse, write layers and palette.

    Parameters:
    - file_path: Input PDF or image
    - color_threshold: Grouping distance, see find_main_colors
    - min_percentage: Minimum share (in percent) for a "main" color
    - dpi: Resolution for PDF conversion
    - output_folder: Folder that is cleared and then filled with the results

    Returns the list of main color groups (empty when none was found).
    """
    print("="*60)
    print("COLOR EXTRACTOR - Find Main Colors")
    print("="*60)

    # Clear output
    clear_output_folder(output_folder)

    # Load image
    print(f"\nInput: {file_path}")
    img = load_image(file_path, dpi)

    # Save original
    original_path = os.path.join(output_folder, 'original.png')
    img.save(original_path)

    # Find main colors
    color_groups = find_main_colors(img, color_threshold, min_percentage)

    if len(color_groups) == 0:
        print("\nNo main colors found.")
        return color_groups

    # Create color layers
    create_color_layers(img, color_groups, color_threshold, output_folder)

    # Save results
    save_results(color_groups, output_folder)

    print(f"\n{'='*60}")
    print(f"✓ COMPLETE - Found {len(color_groups)} main colors")
    print(f" Created {len(color_groups)} color layer images")
    print(f"{'='*60}")
    return color_groups


if __name__ == "__main__":
    # Input file (PDF or image)
    file_path = "input.pdf"  # or "input.png", "input.jpg", etc.

    if not os.path.exists(file_path):
        print(f"Error: '{file_path}' not found!")
        print("Usage: Place your file as 'input.pdf' or 'input.png'")
    else:
        # Parameters:
        # color_threshold: Distance between colors to group them (0-441)
        #   Distance = sqrt((R1-R2)² + (G1-G2)² + (B1-B2)²)
        #   Examples:
        #     RGB(0,0,0) to RGB(0,0,1) = distance of 1
        #     RGB(0,0,0) to RGB(10,10,10) = distance of ~17
        #     RGB(0,0,0) to RGB(30,30,30) = distance of ~52
        #   Recommended values:
        #     10-20: Very strict - only very similar colors grouped
        #     30-50: Good for most diagrams (RECOMMENDED)
        #     60-100: Loose - more aggressive grouping
        #
        # min_percentage: Minimum % to be a "main" color
        #     0.5: Include colors that are at least 0.5% of image
        #     1.0: Only colors that are at least 1% of image
        #     0.1: Include even small but significant colors
        #
        # dpi: Resolution for PDF conversion (300 recommended)
        main(
            file_path=file_path,
            color_threshold=120,  # Very loose grouping (above the recommended range)
            min_percentage=3,  # Min 3% to be considered "main"
            dpi=300,  # PDF resolution
            output_folder='output'
        )