Adds initial files

2026-01-20 13:48:13 -03:00
parent 12176c50c1
commit 6026870c5c
16 changed files with 1316 additions and 0 deletions
--- a/scripts/textract.py
+++ b/scripts/textract.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""
+Simple script to invoke AWS Textract on a PDF or image (PNG/JPEG) file.
+Extracts the detected text and prints it along with a short summary.
+"""
+
+import boto3
+import sys
+import io
+from pathlib import Path
+from PyPDF2 import PdfReader
+
+
+def get_pdf_page_count(pdf_bytes: bytes) -> int:
+    """
+    Count the pages of an in-memory PDF.
+
+    Any error raised while parsing is reported as a printed warning and the
+    function falls back to a page count of 1 instead of propagating it.
+
+    Args:
+        pdf_bytes: Raw bytes of the PDF document
+
+    Returns:
+        int: Page count, or 1 when it could not be determined
+    """
+    try:
+        page_total = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
+    except Exception as exc:
+        print(f"Warning: Could not determine page count: {exc}")
+        page_total = 1
+    return page_total
+
+
+def _s3_document(bucket_name, key):
+    """Build the S3Object locator dict accepted by both Textract APIs."""
+    return {'S3Object': {'Bucket': bucket_name, 'Name': key}}
+
+
+def _wait_for_text_detection(textract, job_id, poll_seconds=2):
+    """Poll an async text-detection job until it ends and return all its blocks."""
+    import time
+
+    # PARTIAL_SUCCESS is also a final state; leaving it out would poll forever.
+    terminal_statuses = ('SUCCEEDED', 'FAILED', 'PARTIAL_SUCCESS')
+
+    while True:
+        result = textract.get_document_text_detection(JobId=job_id)
+        status = result['JobStatus']
+        print(f"Job status: {status}")
+
+        if status in terminal_statuses:
+            break
+        time.sleep(poll_seconds)
+
+    if status == 'FAILED':
+        # A failed job carries no blocks; surface the reason instead of
+        # handing the caller an apparently empty document.
+        reason = result.get('StatusMessage', 'no status message returned')
+        raise RuntimeError(f"Textract job {job_id} failed: {reason}")
+
+    # Results are paginated: keep following NextToken so blocks beyond the
+    # first response are not silently dropped.
+    blocks = list(result.get('Blocks', []))
+    next_token = result.get('NextToken')
+    while next_token:
+        page = textract.get_document_text_detection(
+            JobId=job_id,
+            NextToken=next_token
+        )
+        blocks.extend(page.get('Blocks', []))
+        next_token = page.get('NextToken')
+
+    result['Blocks'] = blocks
+    result.pop('NextToken', None)
+    return result
+
+
+def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
+    """
+    Process a document file (PDF, PNG, JPEG) with AWS Textract.
+    Uses async API (start_document_text_detection) for multi-page PDFs,
+    and sync API (detect_document_text) for single-page PDFs and images.
+
+    For multi-page PDFs the call blocks until the job reaches a final state
+    (SUCCEEDED, PARTIAL_SUCCESS or FAILED) and the returned response holds
+    the blocks of every result page merged into a single 'Blocks' list.
+
+    Args:
+        pdf_path: Path to the document file (local path or S3 key)
+        bucket_name: Optional S3 bucket name if document is in S3
+
+    Returns:
+        dict: Textract response containing detected text
+
+    Raises:
+        ValueError: If the file extension is unsupported, or a multi-page
+            PDF is given as a local file (the async API only reads from S3)
+        RuntimeError: If the asynchronous Textract job ends as FAILED
+    """
+    file_ext = Path(pdf_path).suffix.lower()
+
+    # Reject unsupported types before creating any AWS client.
+    if file_ext not in ('.pdf', '.png', '.jpg', '.jpeg'):
+        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")
+
+    textract = boto3.client('textract')
+
+    # For images (PNG, JPEG), always use sync API
+    if file_ext != '.pdf':
+        print("Processing image file with sync API")
+
+        if bucket_name:
+            return textract.detect_document_text(
+                Document=_s3_document(bucket_name, pdf_path)
+            )
+
+        with open(pdf_path, 'rb') as file:
+            file_bytes = file.read()
+        return textract.detect_document_text(Document={'Bytes': file_bytes})
+
+    # For PDFs, the page count decides which API to use
+    if bucket_name:
+        # Download PDF from S3 to check page count; the S3 client is only
+        # created when it is actually needed.
+        s3 = boto3.client('s3')
+        pdf_bytes = s3.get_object(Bucket=bucket_name, Key=pdf_path)['Body'].read()
+    else:
+        with open(pdf_path, 'rb') as pdf_file:
+            pdf_bytes = pdf_file.read()
+
+    page_count = get_pdf_page_count(pdf_bytes)
+    print(f"PDF has {page_count} page(s)")
+
+    if page_count > 1:
+        print("Using async API (start_document_text_detection) for multi-page PDF")
+
+        if not bucket_name:
+            # Note: Textract async API requires S3
+            raise ValueError(
+                "Multi-page PDFs must be processed from S3. "
+                "Please upload the file to S3 first."
+            )
+
+        response = textract.start_document_text_detection(
+            DocumentLocation=_s3_document(bucket_name, pdf_path)
+        )
+        job_id = response['JobId']
+        print(f"Started Textract job: {job_id}")
+
+        return _wait_for_text_detection(textract, job_id)
+
+    # Use sync API for single-page PDFs
+    print("Using sync API (detect_document_text) for single-page PDF")
+
+    if bucket_name:
+        return textract.detect_document_text(
+            Document=_s3_document(bucket_name, pdf_path)
+        )
+
+    return textract.detect_document_text(Document={'Bytes': pdf_bytes})
+
+
+def extract_text_from_response(response: dict) -> str:
+    """
+    Join the text of every LINE block in a Textract response.
+
+    Blocks of any other type are skipped, and a response without a
+    'Blocks' entry produces an empty string.
+
+    Args:
+        response: Textract API response
+
+    Returns:
+        str: LINE texts in response order, separated by newlines
+    """
+    all_blocks = response.get('Blocks', [])
+    return '\n'.join(
+        entry['Text'] for entry in all_blocks if entry['BlockType'] == 'LINE'
+    )
+
+
+def main():
+    """
+    Command-line entry point: run Textract on one document and print its text.
+
+    Reads the document path from argv[1] and an optional S3 bucket name from
+    argv[2]. Prints the extracted text followed by a short summary (distinct
+    page numbers seen, block count, text length) to stdout. Usage and error
+    messages go to stderr so they do not mix with the extracted text.
+
+    Exits with status 1 when no path is given, when a local file does not
+    exist, or when processing reports a ValueError/RuntimeError.
+    """
+    if len(sys.argv) < 2:
+        # Take the name from argv[0] so the usage text matches the script
+        # that was actually invoked.
+        script = Path(sys.argv[0]).name or "textract.py"
+        print(f"Usage: python {script} <pdf_path> [s3_bucket]", file=sys.stderr)
+        print("\nExamples:", file=sys.stderr)
+        print(f"  python {script} document.pdf", file=sys.stderr)
+        print(f"  python {script} path/to/doc.pdf my-bucket", file=sys.stderr)
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None
+
+    # With a bucket the path is an S3 key, so only local paths are checked.
+    if not bucket_name and not Path(pdf_path).exists():
+        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Processing PDF: {pdf_path}")
+    if bucket_name:
+        print(f"Using S3 bucket: {bucket_name}")
+
+    # Process PDF; report expected failures as a message, not a traceback
+    try:
+        response = process_pdf_with_textract(pdf_path, bucket_name)
+    except (ValueError, RuntimeError) as exc:
+        print(f"Error: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+    # Extract and display text
+    text = extract_text_from_response(response)
+
+    print("\n" + "="*80)
+    print("EXTRACTED TEXT")
+    print("="*80)
+    print(text)
+    print("="*80)
+
+    # Print summary
+    blocks = response.get('Blocks', [])
+    num_blocks = len(blocks)
+    # Blocks without a 'Page' key are counted as belonging to page 1.
+    num_pages = len({b.get('Page', 1) for b in blocks})
+
+    print("\nSummary:")
+    print(f"  Pages processed: {num_pages}")
+    print(f"  Total blocks: {num_blocks}")
+    print(f"  Text length: {len(text)} characters")
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()