# AI-upflux-docprocessor/scripts/textract.py

#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF or image (PNG/JPEG) file.
Extracts the text and prints the detected content with a short summary.
"""

import boto3
import sys
import io
from pathlib import Path
from PyPDF2 import PdfReader


def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Count the pages of an in-memory PDF document.

    This is a best-effort helper: any failure while parsing the PDF is
    reported as a warning on stdout and the document is treated as having
    a single page.

    Args:
        pdf_bytes: Raw content of the PDF file

    Returns:
        int: Page count, or 1 when the count cannot be determined
    """
    try:
        page_total = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
    except Exception as err:
        # Deliberate fallback: callers still get a usable page count.
        print(f"Warning: Could not determine page count: {err!s}")
        return 1
    return page_total


def _sync_detect_text(textract, pdf_path: str, bucket_name: str = None,
                      file_bytes: bytes = None) -> dict:
    """Call the sync detect_document_text API for an S3 object or raw bytes."""
    if bucket_name:
        document = {'S3Object': {'Bucket': bucket_name, 'Name': pdf_path}}
    else:
        document = {'Bytes': file_bytes}
    return textract.detect_document_text(Document=document)


def _collect_async_result(textract, job_id: str, poll_seconds: float = 2) -> dict:
    """Poll an async text-detection job until it ends and merge all result pages."""
    import time

    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        print(f"Job status: {status}")

        # Any status other than IN_PROGRESS is terminal (SUCCEEDED, FAILED,
        # PARTIAL_SUCCESS); stopping only on a fixed list could loop forever.
        if status != 'IN_PROGRESS':
            break
        time.sleep(poll_seconds)

    # Results are paginated: keep following NextToken so the returned
    # 'Blocks' list covers the whole document, not just the first page.
    next_token = result.pop('NextToken', None)
    if next_token:
        blocks = list(result.get('Blocks', []))
        while next_token:
            page = textract.get_document_text_detection(
                JobId=job_id,
                NextToken=next_token,
            )
            blocks.extend(page.get('Blocks', []))
            next_token = page.get('NextToken')
        result['Blocks'] = blocks

    return result


def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.
    Uses async API (start_document_text_detection) for multi-page PDFs,
    and sync API (detect_document_text) for single-page PDFs and images.

    For async jobs the function blocks until the job leaves the IN_PROGRESS
    state and returns the first result page with the 'Blocks' of all
    following result pages merged into it. A failed job is returned as-is;
    callers can inspect its 'JobStatus'.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if document is in S3

    Returns:
        dict: Textract response containing detected text

    Raises:
        ValueError: If the file extension is not supported, or a local
            multi-page PDF is given (the async API only reads from S3)
    """
    file_ext = Path(pdf_path).suffix.lower()

    # Validate the file type first so a bad path fails fast, before any
    # AWS client is created.
    if file_ext not in ('.pdf', '.png', '.jpg', '.jpeg'):
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")

    textract = boto3.client('textract')

    # For images (PNG, JPEG), always use sync API
    if file_ext != '.pdf':
        print("Processing image file with sync API")

        file_bytes = None
        if not bucket_name:
            with open(pdf_path, 'rb') as file:
                file_bytes = file.read()
        return _sync_detect_text(textract, pdf_path, bucket_name, file_bytes)

    # For PDFs, the page count decides which API to use, so the content is
    # needed locally either way.
    if bucket_name:
        # Download PDF from S3 to check page count
        s3 = boto3.client('s3')
        s3_object = s3.get_object(Bucket=bucket_name, Key=pdf_path)
        pdf_bytes = s3_object['Body'].read()
    else:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()

    page_count = get_pdf_page_count(pdf_bytes)
    print(f"PDF has {page_count} page(s)")

    if page_count <= 1:
        print("Using sync API (detect_document_text) for single-page PDF")
        return _sync_detect_text(textract, pdf_path, bucket_name, pdf_bytes)

    print("Using async API (start_document_text_detection) for multi-page PDF")

    if not bucket_name:
        # Textract's async API only accepts documents stored in S3.
        raise ValueError(
            "Multi-page PDFs must be processed from S3. "
            "Please upload the file to S3 first."
        )

    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': pdf_path
            }
        }
    )

    job_id = response['JobId']
    print(f"Started Textract job: {job_id}")

    return _collect_async_result(textract, job_id)


def extract_text_from_response(response: dict) -> str:
    """
    Build the plain text of a document from a Textract response.

    Only blocks whose 'BlockType' is 'LINE' contribute; their 'Text' values
    are joined with newlines in the order they appear in the response.

    Args:
        response: Textract API response (a missing 'Blocks' key is treated
            as no blocks)

    Returns:
        str: Detected lines joined by newline characters
    """
    return '\n'.join(
        entry['Text']
        for entry in response.get('Blocks', [])
        if entry['BlockType'] == 'LINE'
    )


def main():
    """
    Command-line entry point.

    Reads the document path and an optional S3 bucket from sys.argv, runs
    Textract on the document, then prints the extracted text followed by a
    short summary. Exits with status 1 on a usage error, a missing local
    file, an input the processor rejects, or a failed Textract job.
    """
    # Show the name the script was actually invoked with in the usage text.
    script_name = Path(sys.argv[0]).name or "textract.py"

    if len(sys.argv) < 2:
        print(f"Usage: python {script_name} <pdf_path> [s3_bucket]")
        print("\nExamples:")
        print(f"  python {script_name} document.pdf")
        print(f"  python {script_name} path/to/doc.pdf my-bucket")
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    # With a bucket the path is an S3 key, so only local paths are checked.
    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process PDF; input problems reported by the processor (unsupported
    # type, local multi-page PDF) become a clean error, not a traceback.
    try:
        response = process_pdf_with_textract(pdf_path, bucket_name)
    except ValueError as err:
        print(f"Error: {err}")
        sys.exit(1)

    # A failed async job carries no text; report it instead of printing an
    # empty result and exiting successfully.
    if response.get('JobStatus') == 'FAILED':
        reason = response.get('StatusMessage', 'no status message')
        print(f"Error: Textract job failed: {reason}")
        sys.exit(1)

    # Extract and display text
    text = extract_text_from_response(response)

    separator = "=" * 80
    print("\n" + separator)
    print("EXTRACTED TEXT")
    print(separator)
    print(text)
    print(separator)

    # Print summary; blocks without a 'Page' key are counted as page 1.
    blocks = response.get('Blocks', [])
    num_pages = len({block.get('Page', 1) for block in blocks})

    print("\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {len(blocks)}")
    print(f"  Text length: {len(text)} characters")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()