#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF file.
Extracts text and returns the detected content.
"""

import io
import sys
import time
from pathlib import Path
from typing import Optional

import boto3
from PyPDF2 import PdfReader

# File extensions that are always sent to the synchronous Textract API.
IMAGE_EXTENSIONS = frozenset({'.png', '.jpg', '.jpeg'})

# Seconds to sleep between status polls of an asynchronous Textract job.
POLL_INTERVAL_SECONDS = 2


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Get the number of pages in a PDF file.

    This is best-effort: if the PDF cannot be parsed, a warning is printed
    and 1 is returned, which makes the caller treat the document as a
    single-page PDF.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF (1 if it could not be determined)
    """
    try:
        pdf_reader = PdfReader(io.BytesIO(pdf_bytes))
        return len(pdf_reader.pages)
    except Exception as e:
        # Deliberately broad: any parse failure falls back to "one page".
        print(f"Warning: Could not determine page count: {str(e)}", file=sys.stderr)
        return 1


def _detect_text_sync(textract, document_key: str, bucket_name: Optional[str],
                      file_bytes: Optional[bytes] = None) -> dict:
    """Run the sync API (detect_document_text) on an S3 object or on local bytes."""
    if bucket_name:
        document = {
            'S3Object': {
                'Bucket': bucket_name,
                'Name': document_key
            }
        }
    else:
        # Reuse bytes the caller already read; otherwise load the local file.
        if file_bytes is None:
            file_bytes = Path(document_key).read_bytes()
        document = {'Bytes': file_bytes}
    return textract.detect_document_text(Document=document)


def _wait_for_text_detection(textract, job_id: str) -> dict:
    """Poll an async text-detection job until it finishes and return all of its blocks."""
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        print(f"Job status: {status}")
        # Any status other than IN_PROGRESS is terminal. Checking only for
        # SUCCEEDED/FAILED would poll forever on PARTIAL_SUCCESS.
        if status != 'IN_PROGRESS':
            break
        time.sleep(POLL_INTERVAL_SECONDS)

    if status == 'FAILED':
        # Hand the failed response back unchanged so callers can inspect it.
        message = result.get('StatusMessage', 'no status message provided')
        print(f"Warning: Textract job {job_id} failed: {message}", file=sys.stderr)
        return result

    # Results are paginated: keep following NextToken so the returned
    # response carries every block, not just the first page of them.
    blocks = list(result.get('Blocks', []))
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id, NextToken=next_token
        )
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')

    merged = dict(result)
    merged['Blocks'] = blocks
    merged.pop('NextToken', None)
    return merged


def process_pdf_with_textract(pdf_path: str, bucket_name: Optional[str] = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.

    Uses async API (start_document_text_detection) for multi-page PDFs,
    and sync API (detect_document_text) for single-page PDFs and images.
    For async jobs the function blocks until the job leaves IN_PROGRESS and
    merges all result pages into a single response. A job that ends in
    FAILED is returned as-is (check 'JobStatus' in the response).

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if document is in S3

    Returns:
        dict: Textract response containing detected text

    Raises:
        ValueError: If the file type is unsupported, or if a multi-page PDF
            is given as a local file (the async API reads from S3 only).
    """
    file_ext = Path(pdf_path).suffix.lower()

    # Reject unsupported types before creating any AWS client.
    if file_ext != '.pdf' and file_ext not in IMAGE_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")

    textract = boto3.client('textract')

    # For images (PNG, JPEG), always use sync API
    if file_ext in IMAGE_EXTENSIONS:
        print("Processing image file with sync API")
        return _detect_text_sync(textract, pdf_path, bucket_name)

    # For PDFs, check page count to decide which API to use
    if bucket_name:
        # Download PDF from S3 to check page count; the S3 client is only
        # needed on this path, so it is created here rather than up front.
        s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=pdf_path)
        pdf_bytes = s3_object['Body'].read()
    else:
        # Read local PDF
        pdf_bytes = Path(pdf_path).read_bytes()

    page_count = get_pdf_page_count(pdf_bytes)
    print(f"PDF has {page_count} page(s)")

    if page_count <= 1:
        # Use sync API for single-page PDFs
        print("Using sync API (detect_document_text) for single-page PDF")
        return _detect_text_sync(textract, pdf_path, bucket_name, pdf_bytes)

    # Use async API for multi-page PDFs
    print("Using async API (start_document_text_detection) for multi-page PDF")
    if not bucket_name:
        # Note: Textract async API requires S3
        raise ValueError(
            "Multi-page PDFs must be processed from S3. "
            "Please upload the file to S3 first."
        )

    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': pdf_path
            }
        }
    )
    job_id = response['JobId']
    print(f"Started Textract job: {job_id}")

    return _wait_for_text_detection(textract, job_id)


def extract_text_from_response(response: dict) -> str:
    """
    Extract plain text from Textract response.

    Only blocks whose 'BlockType' is 'LINE' contribute; their 'Text' values
    are joined with newlines in the order they appear in the response.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text (empty string if the response has no LINE blocks)
    """
    return '\n'.join(
        block['Text']
        for block in response.get('Blocks', [])
        if block.get('BlockType') == 'LINE'
    )


def main():
    """Command-line entry point: run Textract on the given document and print its text."""
    if len(sys.argv) < 2:
        print("Usage: python textract_pdf.py <pdf_path> [s3_bucket]")
        print("\nExamples:")
        print("  python textract_pdf.py document.pdf")
        print("  python textract_pdf.py path/to/doc.pdf my-bucket")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    # Without a bucket the path must be a local file that exists.
    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process PDF
    response = process_pdf_with_textract(pdf_path, bucket_name)

    # Only async responses carry 'JobStatus'; a failed job has no text to show.
    if response.get('JobStatus') == 'FAILED':
        print("Error: Textract job failed", file=sys.stderr)
        sys.exit(1)

    # Extract and display text
    text = extract_text_from_response(response)

    print("\n" + "=" * 80)
    print("EXTRACTED TEXT")
    print("=" * 80)
    print(text)
    print("=" * 80)

    # Print summary
    blocks = response.get('Blocks', [])
    num_blocks = len(blocks)
    # Blocks without a 'Page' key are counted as belonging to page 1.
    num_pages = len({b.get('Page', 1) for b in blocks})
    print("\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {num_blocks}")
    print(f"  Text length: {len(text)} characters")


if __name__ == '__main__':
    main()