#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a document file (PDF, PNG, or JPEG).
Extracts the detected text and prints it along with a short summary.
"""
|
|
|
|
import boto3
|
|
import sys
|
|
import io
|
|
from pathlib import Path
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
|
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Count the pages of a PDF held in memory.

    The count is best-effort: if the bytes cannot be parsed for any
    reason, a warning is printed and the document is treated as having
    a single page.

    Args:
        pdf_bytes: PDF file content as bytes

    Returns:
        int: Number of pages in the PDF, or 1 if it could not be determined
    """
    try:
        # Both the reader construction and the page access stay inside the
        # try so that any parsing failure falls back to the default below.
        stream = io.BytesIO(pdf_bytes)
        reader = PdfReader(stream)
        page_total = len(reader.pages)
        return page_total
    except Exception as exc:
        print(f"Warning: Could not determine page count: {exc!s}")
        return 1
|
|
|
|
|
|
def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
|
|
"""
|
|
Process a document file (PDF, PNG, JPEG) with AWS Textract.
|
|
Uses async API (start_document_text_detection) for multi-page PDFs,
|
|
and sync API (detect_document_text) for single-page PDFs and images.
|
|
|
|
Args:
|
|
pdf_path: Path to the document file (local path or S3 key)
|
|
bucket_name: Optional S3 bucket name if document is in S3
|
|
|
|
Returns:
|
|
dict: Textract response containing detected text
|
|
"""
|
|
textract = boto3.client('textract')
|
|
file_ext = Path(pdf_path).suffix.lower()
|
|
|
|
# For images (PNG, JPEG), always use sync API
|
|
if file_ext in ['.png', '.jpg', '.jpeg']:
|
|
print(f"Processing image file with sync API")
|
|
|
|
if bucket_name:
|
|
response = textract.detect_document_text(
|
|
Document={
|
|
'S3Object': {
|
|
'Bucket': bucket_name,
|
|
'Name': pdf_path
|
|
}
|
|
}
|
|
)
|
|
else:
|
|
with open(pdf_path, 'rb') as file:
|
|
file_bytes = file.read()
|
|
response = textract.detect_document_text(
|
|
Document={'Bytes': file_bytes}
|
|
)
|
|
|
|
return response
|
|
|
|
# For PDFs, check page count to decide which API to use
|
|
if file_ext == '.pdf':
|
|
s3 = boto3.client('s3')
|
|
|
|
# Determine number of pages
|
|
if bucket_name:
|
|
# Download PDF from S3 to check page count
|
|
response = s3.get_object(Bucket=bucket_name, Key=pdf_path)
|
|
pdf_bytes = response['Body'].read()
|
|
else:
|
|
# Read local PDF
|
|
with open(pdf_path, 'rb') as pdf_file:
|
|
pdf_bytes = pdf_file.read()
|
|
|
|
page_count = get_pdf_page_count(pdf_bytes)
|
|
print(f"PDF has {page_count} page(s)")
|
|
|
|
# Use async API for multi-page PDFs
|
|
if page_count > 1:
|
|
print("Using async API (start_document_text_detection) for multi-page PDF")
|
|
|
|
if bucket_name:
|
|
# Process from S3
|
|
response = textract.start_document_text_detection(
|
|
DocumentLocation={
|
|
'S3Object': {
|
|
'Bucket': bucket_name,
|
|
'Name': pdf_path
|
|
}
|
|
}
|
|
)
|
|
else:
|
|
# For local files with multiple pages, we need to use S3
|
|
# Note: Textract async API requires S3
|
|
raise ValueError(
|
|
"Multi-page PDFs must be processed from S3. "
|
|
"Please upload the file to S3 first."
|
|
)
|
|
|
|
job_id = response['JobId']
|
|
print(f"Started Textract job: {job_id}")
|
|
|
|
# Wait for job to complete
|
|
import time
|
|
while True:
|
|
result = textract.get_document_text_detection(JobId=job_id)
|
|
status = result['JobStatus']
|
|
print(f"Job status: {status}")
|
|
|
|
if status in ['SUCCEEDED', 'FAILED']:
|
|
break
|
|
time.sleep(2)
|
|
|
|
return result
|
|
else:
|
|
# Use sync API for single-page PDFs
|
|
print("Using sync API (detect_document_text) for single-page PDF")
|
|
|
|
if bucket_name:
|
|
response = textract.detect_document_text(
|
|
Document={
|
|
'S3Object': {
|
|
'Bucket': bucket_name,
|
|
'Name': pdf_path
|
|
}
|
|
}
|
|
)
|
|
else:
|
|
response = textract.detect_document_text(
|
|
Document={'Bytes': pdf_bytes}
|
|
)
|
|
|
|
return response
|
|
|
|
# Unsupported file type
|
|
raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")
|
|
|
|
|
|
def extract_text_from_response(response: dict) -> str:
    """
    Pull the plain text out of a Textract response.

    Only blocks whose 'BlockType' is 'LINE' contribute; their 'Text'
    values are joined with newlines in the order they appear.

    Args:
        response: Textract API response

    Returns:
        str: Extracted text (empty string if there are no LINE blocks)
    """
    all_blocks = response.get('Blocks', [])
    return '\n'.join(
        entry['Text'] for entry in all_blocks if entry['BlockType'] == 'LINE'
    )
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Reads the document path and an optional S3 bucket from sys.argv,
    runs the document through Textract, then prints the extracted text
    followed by a short summary. Exits with status 1 when no path is
    given, or when no bucket is given and the local file does not exist.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        usage_lines = (
            "Usage: python textract_pdf.py <pdf_path> [s3_bucket]",
            "\nExamples:",
            "  python textract_pdf.py document.pdf",
            "  python textract_pdf.py path/to/doc.pdf my-bucket",
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    pdf_path = cli_args[0]
    bucket_name = cli_args[1] if len(cli_args) > 1 else None

    # A local existence check only makes sense when no bucket was given;
    # with a bucket the path is an S3 key.
    if not (bucket_name or Path(pdf_path).exists()):
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process PDF
    response = process_pdf_with_textract(pdf_path, bucket_name)

    # Extract and display text
    text = extract_text_from_response(response)

    rule = "=" * 80
    print("\n" + rule)
    print("EXTRACTED TEXT")
    print(rule)
    print(text)
    print(rule)

    # Print summary
    blocks = response.get('Blocks', [])
    num_blocks = len(blocks)
    # Blocks without a 'Page' key are counted as belonging to page 1.
    num_pages = len({blk.get('Page', 1) for blk in blocks})

    print("\nSummary:")
    print(f"  Pages processed: {num_pages}")
    print(f"  Total blocks: {num_blocks}")
    print(f"  Text length: {len(text)} characters")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()