Adds initial files

This commit is contained in:
2026-01-20 13:48:13 -03:00
parent 12176c50c1
commit 6026870c5c
16 changed files with 1316 additions and 0 deletions

209
scripts/textract.py Normal file
View File

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Simple script to invoke AWS Textract on a PDF file.
Extracts text and returns the detected content.
"""
import boto3
import sys
import io
from pathlib import Path
from PyPDF2 import PdfReader
def get_pdf_page_count(pdf_bytes: bytes) -> int:
    """
    Count the pages of a PDF held in memory.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        int: The page count reported by PdfReader. If the bytes cannot be
        parsed for any reason, a warning is printed and 1 is returned.
    """
    try:
        # Wrap the bytes in a file-like object so PdfReader can seek in it.
        stream = io.BytesIO(pdf_bytes)
        page_total = len(PdfReader(stream).pages)
    except Exception as error:
        # Best-effort: fall back to a single page rather than aborting.
        print(f"Warning: Could not determine page count: {str(error)}")
        page_total = 1
    return page_total
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _detect_text_sync(textract, document_key, bucket_name=None, file_bytes=None):
    """Call the sync Textract API on an S3 object, or on local bytes when no bucket is given."""
    if bucket_name:
        document = {'S3Object': {'Bucket': bucket_name, 'Name': document_key}}
    else:
        if file_bytes is None:
            # Caller did not pre-read the file; load it from the local path.
            with open(document_key, 'rb') as file:
                file_bytes = file.read()
        document = {'Bytes': file_bytes}
    return textract.detect_document_text(Document=document)


def _collect_text_detection_result(textract, job_id, poll_interval):
    """Poll an async Textract job until it leaves IN_PROGRESS, then merge all result pages."""
    import time

    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']
        print(f"Job status: {status}")
        # Stop on ANY terminal status (SUCCEEDED, FAILED, PARTIAL_SUCCESS);
        # checking only two of them would loop forever on the third.
        if status != 'IN_PROGRESS':
            break
        time.sleep(poll_interval)

    if status == 'FAILED':
        status_message = result.get('StatusMessage')
        if status_message:
            print(f"Textract job failed: {status_message}")
        return result

    # Results are paginated: follow NextToken so no blocks are dropped.
    blocks = list(result.get('Blocks', []))
    next_token = result.get('NextToken')
    while next_token:
        page = textract.get_document_text_detection(
            JobId=job_id,
            NextToken=next_token,
        )
        blocks.extend(page.get('Blocks', []))
        next_token = page.get('NextToken')

    merged = dict(result)
    merged['Blocks'] = blocks
    merged.pop('NextToken', None)
    return merged


def process_pdf_with_textract(
    pdf_path: str,
    bucket_name: str = None,
    *,
    poll_interval: float = 2.0,
) -> dict:
    """
    Process a document file (PDF, PNG, JPEG) with AWS Textract.

    Images and PDFs with at most one page go through the sync API
    (detect_document_text). PDFs with more than one page go through the
    async API (start_document_text_detection), which is only attempted for
    documents stored in S3.

    Args:
        pdf_path: Path to the document file (local path or S3 key)
        bucket_name: Optional S3 bucket name if document is in S3
        poll_interval: Seconds to sleep between status checks of an async job

    Returns:
        dict: Textract response containing detected text. For async jobs the
        'Blocks' list holds the blocks of every result page. A job that ends
        with status FAILED is returned as-is so callers can inspect
        'JobStatus' and 'StatusMessage'.

    Raises:
        ValueError: If the file extension is not supported, or if a
            multi-page PDF is given as a local file instead of an S3 object.
    """
    file_ext = Path(pdf_path).suffix.lower()

    # Reject unsupported types before creating any AWS client.
    if file_ext != '.pdf' and file_ext not in _IMAGE_EXTENSIONS:
        raise ValueError(f"Unsupported file type: {file_ext}. Supported types: .pdf, .png, .jpg, .jpeg")

    textract = boto3.client('textract')

    # For images (PNG, JPEG), always use sync API
    if file_ext in _IMAGE_EXTENSIONS:
        print("Processing image file with sync API")
        return _detect_text_sync(textract, pdf_path, bucket_name)

    # For PDFs, the page count decides which API to use, so the content is
    # needed locally either way.
    if bucket_name:
        s3_object = boto3.client('s3').get_object(Bucket=bucket_name, Key=pdf_path)
        pdf_bytes = s3_object['Body'].read()
    else:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()

    page_count = get_pdf_page_count(pdf_bytes)
    print(f"PDF has {page_count} page(s)")

    if page_count <= 1:
        print("Using sync API (detect_document_text) for single-page PDF")
        return _detect_text_sync(textract, pdf_path, bucket_name, pdf_bytes)

    print("Using async API (start_document_text_detection) for multi-page PDF")
    if not bucket_name:
        # Note: Textract async API requires S3
        raise ValueError(
            "Multi-page PDFs must be processed from S3. "
            "Please upload the file to S3 first."
        )

    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': bucket_name,
                'Name': pdf_path,
            }
        }
    )
    job_id = response['JobId']
    print(f"Started Textract job: {job_id}")
    return _collect_text_detection_result(textract, job_id, poll_interval)
def extract_text_from_response(response: dict) -> str:
    """
    Join the text of every LINE block in a Textract response.

    Args:
        response: Textract API response; a missing 'Blocks' key is treated
            as an empty block list.

    Returns:
        str: The 'Text' of each LINE block, in response order, separated by
        newlines (empty string when there are no LINE blocks).
    """
    all_blocks = response.get('Blocks', [])
    return '\n'.join(
        entry['Text'] for entry in all_blocks if entry['BlockType'] == 'LINE'
    )
def main():
    """
    Command-line entry point: run Textract on one document and print its text.

    Reads the document path from the first command-line argument and an
    optional S3 bucket name from the second. Prints the extracted text and a
    short summary to stdout. Usage and error messages go to stderr, and the
    process exits with status 1 when arguments are missing, the local file
    does not exist, the document cannot be processed, or the Textract job
    reports FAILED.
    """
    if len(sys.argv) < 2:
        # Derive the script name from how it was invoked so the usage text
        # always matches the real file name.
        script = Path(sys.argv[0]).name if sys.argv and sys.argv[0] else "textract.py"
        print(f"Usage: python {script} <pdf_path> [s3_bucket]", file=sys.stderr)
        print("\nExamples:", file=sys.stderr)
        print(f" python {script} document.pdf", file=sys.stderr)
        print(f" python {script} path/to/doc.pdf my-bucket", file=sys.stderr)
        sys.exit(1)

    pdf_path = sys.argv[1]
    bucket_name = sys.argv[2] if len(sys.argv) > 2 else None

    # Only local paths can be checked up front; S3 keys are validated by AWS.
    if not bucket_name and not Path(pdf_path).exists():
        print(f"Error: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Processing PDF: {pdf_path}")
    if bucket_name:
        print(f"Using S3 bucket: {bucket_name}")

    # Process the document; report input problems as a clean error message
    # instead of a traceback.
    try:
        response = process_pdf_with_textract(pdf_path, bucket_name)
    except ValueError as error:
        print(f"Error: {error}", file=sys.stderr)
        sys.exit(1)

    # A failed async job comes back as a response without text; do not
    # present it as a successful (empty) extraction.
    if response.get('JobStatus') == 'FAILED':
        reason = response.get('StatusMessage', 'no status message returned')
        print(f"Error: Textract job failed: {reason}", file=sys.stderr)
        sys.exit(1)

    # Extract and display text
    text = extract_text_from_response(response)
    separator = "=" * 80
    print("\n" + separator)
    print("EXTRACTED TEXT")
    print(separator)
    print(text)
    print(separator)

    # Print summary. Prefer the page count Textract reports; fall back to
    # counting the distinct page numbers seen on the blocks.
    blocks = response.get('Blocks', [])
    pages_seen = {block.get('Page', 1) for block in blocks}
    metadata = response.get('DocumentMetadata') or {}
    num_pages = metadata.get('Pages', len(pages_seen))
    print("\nSummary:")
    print(f" Pages processed: {num_pages}")
    print(f" Total blocks: {len(blocks)}")
    print(f" Text length: {len(text)} characters")


if __name__ == '__main__':
    main()