Adds initial files
This commit is contained in:
209
scripts/textract.py
Normal file
209
scripts/textract.py
Normal file
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple script to invoke AWS Textract on a PDF or image (PNG/JPEG) file.
|
||||
Extracts text and returns the detected content.
|
||||
"""
|
||||
|
||||
import boto3
|
||||
import sys
|
||||
import io
|
||||
from pathlib import Path
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
|
||||
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Count the pages of a PDF held in memory.

    Args:
        pdf_bytes: Raw content of the PDF file

    Returns:
        int: Page count reported by PdfReader, or 1 when the content
            cannot be parsed (a warning is printed in that case)
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_total = len(reader.pages)
    except Exception as exc:
        # Best effort: any parsing problem falls back to a single page.
        print(f"Warning: Could not determine page count: {str(exc)}")
        return 1
    else:
        return page_total
|
||||
|
||||
|
||||
def _detect_text_sync(textract, pdf_path, bucket_name=None, file_bytes=None):
    """Run the synchronous Textract call on an S3 object or on raw bytes."""
    if bucket_name:
        document = {
            'S3Object': {
                'Bucket': bucket_name,
                'Name': pdf_path
            }
        }
    else:
        if file_bytes is None:
            # Caller has not read the local file yet (image case).
            with open(pdf_path, 'rb') as file:
                file_bytes = file.read()
        document = {'Bytes': file_bytes}

    return textract.detect_document_text(Document=document)


def _wait_for_async_job(textract, job_id, poll_interval=2):
    """Poll an async Textract job to completion and merge all result pages."""
    import time

    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        print(f"Job status: {status}")

        # Treat every status other than IN_PROGRESS as terminal so that
        # PARTIAL_SUCCESS (or any future status) cannot loop forever.
        if status != 'IN_PROGRESS':
            break
        time.sleep(poll_interval)

    if status == 'FAILED':
        reason = result.get('StatusMessage', 'no status message provided')
        raise RuntimeError(f"Textract job {job_id} failed: {reason}")

    # Async results are paginated; follow NextToken so no blocks are dropped.
    blocks = list(result.get('Blocks', []))
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id,
            NextToken=next_token
        )
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')

    result['Blocks'] = blocks
    result.pop('NextToken', None)
    return result


def process_pdf_with_textract(pdf_path: str, bucket_name: str = None) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.

    Uses async API (start_document_text_detection) for multi-page PDFs,
    and sync API (detect_document_text) for single-page PDFs and images.
    For async jobs, all result pages are fetched and their blocks merged
    into the returned response.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if document is in S3

    Returns:
        dict: Textract response containing detected text

    Raises:
        ValueError: If the file extension is not supported, or if a
            multi-page PDF is given as a local file (the async API
            requires the document to be in S3)
        RuntimeError: If the async Textract job finishes with status FAILED
    """
    file_ext = Path(pdf_path).suffix.lower()

    # Validate the extension first so bad input fails before any AWS client
    # is created.
    if file_ext not in ('.pdf', '.png', '.jpg', '.jpeg'):
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")

    textract = boto3.client('textract')

    # For images (PNG, JPEG), always use sync API
    if file_ext != '.pdf':
        print("Processing image file with sync API")
        return _detect_text_sync(textract, pdf_path, bucket_name)

    # For PDFs, load the content to check the page count
    if bucket_name:
        # The S3 client is only needed when the document lives in S3.
        s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=pdf_path)
        pdf_bytes = s3_object['Body'].read()
    else:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()

    page_count = get_pdf_page_count(pdf_bytes)
    print(f"PDF has {page_count} page(s)")

    if page_count <= 1:
        print("Using sync API (detect_document_text) for single-page PDF")
        return _detect_text_sync(textract, pdf_path, bucket_name, pdf_bytes)

    print("Using async API (start_document_text_detection) for multi-page PDF")

    if not bucket_name:
        # Note: Textract async API requires S3
        raise ValueError(
            "Multi-page PDFs must be processed from S3. "
            "Please upload the file to S3 first."
        )

    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': pdf_path
            }
        }
    )

    job_id = response['JobId']
    print(f"Started Textract job: {job_id}")

    return _wait_for_async_job(textract, job_id)
|
||||
|
||||
|
||||
def extract_text_from_response(response: dict) -> str:
    """
    Join the text of every LINE block in a Textract response.

    Args:
        response: Textract API response

    Returns:
        str: Text of the LINE blocks, one per line, in response order
            (empty string when the response has no blocks)
    """
    all_blocks = response.get('Blocks', [])
    return '\n'.join(
        entry['Text'] for entry in all_blocks if entry['BlockType'] == 'LINE'
    )
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: run Textract on one document and print its text.

    Reads the document path and an optional S3 bucket name from sys.argv,
    processes the document, then prints the extracted text followed by a
    short summary (pages, blocks, text length).

    Exits with status 1 (message on stderr) when no path is given, when a
    local file does not exist, or when processing raises ValueError or
    RuntimeError.
    """
    if len(sys.argv) < 2:
        # Show the name the script was actually invoked with instead of a
        # hard-coded (and previously wrong) file name.
        script = Path(sys.argv[0]).name if sys.argv and sys.argv[0] else "textract.py"
        print(f"Usage: python {script} <pdf_path> [s3_bucket]", file=sys.stderr)
        print("\nExamples:", file=sys.stderr)
        print(f" python {script} document.pdf", file=sys.stderr)
        print(f" python {script} path/to/doc.pdf my-bucket", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    # Only local paths can be checked here; with a bucket the path is an S3 key.
    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process PDF; report expected failures as a message, not a traceback
    try:
        response = process_pdf_with_textract(pdf_path, bucket_name)
    except (ValueError, RuntimeError) as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    # Extract and display text
    text = extract_text_from_response(response)

    print("\n" + "="*80)
    print("EXTRACTED TEXT")
    print("="*80)
    print(text)
    print("="*80)

    # Print summary
    blocks = response.get('Blocks', [])
    num_blocks = len(blocks)
    # Blocks without a 'Page' key are counted as belonging to page 1.
    num_pages = len({b.get('Page', 1) for b in blocks})

    print("\nSummary:")
    print(f" Pages processed: {num_pages}")
    print(f" Total blocks: {num_blocks}")
    print(f" Text length: {len(text)} characters")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user