Skip to content

Commit

Permalink
Demo for serverless integration with Textract
Browse files Browse the repository at this point in the history
  • Loading branch information
Vinod Kumar authored and Vinod Kumar committed Dec 12, 2024
1 parent 9331c49 commit 8c00f78
Show file tree
Hide file tree
Showing 12 changed files with 1,883 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
webapp/node_modules/
webapp/package-lock.json
iac/aws/terraform/creating-custom-vpc/.terraform/
iac/demo/textract/.terraform.lock.hcl
iac/demo/textract/.terraform/*

27 changes: 27 additions & 0 deletions iac/demo/textract/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Packages each Lambda handler source file into its own deployment zip.
#
# Usage:
#   make            build both archives (same as `make zip-lambdas`)
#   make clean      delete the generated archives

# Python Lambda files
LAMBDA1_FILE = lambda_function.py
LAMBDA2_FILE = sqs_to_csv_lambda.py

# Output zip files
LAMBDA1_ZIP = lambda_function.zip
LAMBDA2_ZIP = sqs_to_csv_lambda.zip

# These targets name commands, not files. Declaring them phony keeps them
# runnable even if a file called "all", "zip-lambdas" or "clean" ever
# appears in this directory.
.PHONY: all zip-lambdas clean

# Default target
all: zip-lambdas

# Zip the Lambda functions
zip-lambdas: $(LAMBDA1_ZIP) $(LAMBDA2_ZIP)

# Each archive depends on its source file, so it is only rebuilt when the
# source is newer than the existing zip.
$(LAMBDA1_ZIP): $(LAMBDA1_FILE)
	@echo "Zipping $(LAMBDA1_FILE) into $(LAMBDA1_ZIP)..."
	zip $(LAMBDA1_ZIP) $(LAMBDA1_FILE)

$(LAMBDA2_ZIP): $(LAMBDA2_FILE)
	@echo "Zipping $(LAMBDA2_FILE) into $(LAMBDA2_ZIP)..."
	zip $(LAMBDA2_ZIP) $(LAMBDA2_FILE)

# Clean the zip files
clean:
	rm -f $(LAMBDA1_ZIP) $(LAMBDA2_ZIP)
	@echo "Cleaned up old zip files!"

136 changes: 136 additions & 0 deletions iac/demo/textract/lambda_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import boto3
import json
import os

# AWS service clients. They are built once, when the module is imported,
# and shared by every call to the handler in this module.
s3_client = boto3.client("s3")
textract_client = boto3.client("textract")
sns_client = boto3.client("sns")

# ARN of the SNS topic that receives the extraction results. It is read from
# the environment at import time, so a missing variable raises KeyError as
# soon as the module loads.
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]

# Extensions that select a processing path in the handler; the handler
# lower-cases the object key before comparing against these.
SUPPORTED_IMAGE_EXTENSIONS = [
    ".png",
    ".jpg",
    ".jpeg",
]
SUPPORTED_PDF_EXTENSION = ".pdf"

def _lines_with_confidence(blocks):
    """Return ``(text, confidence)`` pairs for every LINE block, in order."""
    return [
        (block['Text'], block['Confidence'])
        for block in blocks
        if block['BlockType'] == 'LINE'
    ]


def _query_answers(blocks):
    """Map each query alias to the text and confidence of its answer.

    In an AnalyzeDocument response the QUERY block carries the ``ANSWER``
    relationship, and the ids in that relationship point at QUERY_RESULT
    blocks. QUERY_RESULT blocks hold the answer text but have no
    relationships of their own, so the lookup has to start from QUERY.

    Queries without an answer are omitted. When a query has several
    answers, the first one listed is kept. A query that was submitted
    without an alias is keyed by its question text.
    """
    results_by_id = {
        block['Id']: block
        for block in blocks
        if block['BlockType'] == 'QUERY_RESULT'
    }

    answers = {}
    for block in blocks:
        if block['BlockType'] != 'QUERY':
            continue
        query = block['Query']
        alias = query.get('Alias') or query['Text']
        # An unanswered query has no 'Relationships' key at all.
        for relationship in block.get('Relationships', []):
            if relationship['Type'] != 'ANSWER':
                continue
            for result_id in relationship['Ids']:
                result = results_by_id.get(result_id)
                if result is not None:
                    # setdefault keeps the first answer for this alias.
                    answers.setdefault(alias, {
                        'Text': result['Text'],
                        'Confidence': result['Confidence'],
                    })
    return answers


def lambda_handler(event, context):
    """Run Textract on the S3 object named in the event and publish to SNS.

    Image files (``SUPPORTED_IMAGE_EXTENSIONS``) go through
    ``detect_document_text`` and every detected line is published together
    with its confidence. PDF files (``SUPPORTED_PDF_EXTENSION``) go through
    ``analyze_document`` with two fixed queries and the answers are
    published keyed by query alias.

    Only the first record of the event is processed.

    Args:
        event: S3 event notification; ``Records[0].s3`` supplies the bucket
            name and the (URL-encoded) object key.
        context: Lambda context object; unused.

    Returns:
        A dict with ``statusCode`` and a JSON-encoded ``body``:
        200 on success, 400 when Textract rejects the document format,
        500 for any other failure (including an unsupported extension).
        Failures in the PDF path are reported the same way as failures in
        the image path; they are no longer swallowed and reported as 200.
    """
    # Function-scope import so this handler does not depend on a change to
    # the module's import block.
    from urllib.parse import unquote_plus

    try:
        print(f"Event: {json.dumps(event)}")

        # Get S3 bucket and object key from the S3 event
        s3_info = event['Records'][0]['s3']
        bucket_name = s3_info['bucket']['name']
        # S3 URL-encodes the key in event notifications ("my file.png"
        # arrives as "my+file.png"); Textract needs the real key.
        object_key = unquote_plus(s3_info['object']['key'])

        print('bucket_name::', bucket_name, ' - object_key::', object_key)

        document = {'S3Object': {'Bucket': bucket_name, 'Name': object_key}}
        lowered_key = object_key.lower()

        if lowered_key.endswith(tuple(SUPPORTED_IMAGE_EXTENSIONS)):
            # Process image files
            response = textract_client.detect_document_text(Document=document)

            print('Textract detect_document_text response::', response)

            # One output line per detected LINE block, with its confidence
            extracted_text_with_confidence = '\n'.join(
                f"{text} (Confidence: {confidence:.2f})"
                for text, confidence in _lines_with_confidence(response['Blocks'])
            )
            print('extracted_text_with_confidence::', extracted_text_with_confidence)

            payload = {
                'bucket': bucket_name,
                'key': object_key,
                'text': extracted_text_with_confidence,
            }
            subject = 'Textract Extracted Text from Image'

        elif lowered_key.endswith(SUPPORTED_PDF_EXTENSION):
            # Process PDF files
            response = textract_client.analyze_document(
                Document=document,
                FeatureTypes=['QUERIES'],
                QueriesConfig={
                    'Queries': [
                        {'Text': 'What is the event name?', 'Alias': 'EventName'},
                        {'Text': 'What is the Location?', 'Alias': 'Location'},
                    ]
                },
            )

            print('Textract analyze_document response::', response)

            query_results = _query_answers(response['Blocks'])
            print('query_results::', query_results)

            payload = {
                'bucket': bucket_name,
                'key': object_key,
                'key_value_pairs': query_results,
            }
            subject = 'Textract Extracted Key-Value Pairs from PDF'

        else:
            raise ValueError(f"Unsupported file extension for file: {object_key}")

        # Send the extraction result to SNS
        sns_client.publish(
            TopicArn=SNS_TOPIC_ARN,
            Message=json.dumps(payload),
            Subject=subject,
        )

        return {
            'statusCode': 200,
            'body': json.dumps('File processed successfully and data sent to SNS')
        }

    except textract_client.exceptions.UnsupportedDocumentException as e:
        print(f"Unsupported document format: {e}")
        return {
            'statusCode': 400,
            'body': json.dumps('Unsupported document format')
        }
    except Exception as e:
        # Top-level boundary: report the failure instead of raising.
        print(f"Error processing file: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps(f"Error processing file: {str(e)}")
        }
Binary file added iac/demo/textract/lambda_function.zip
Binary file not shown.
65 changes: 65 additions & 0 deletions iac/demo/textract/lambda_function_backup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import boto3
import json
import os

# AWS service clients. They are built once, when the module is imported,
# and shared by every call to the handler in this module.
s3_client = boto3.client("s3")
textract_client = boto3.client("textract")
sns_client = boto3.client("sns")

# ARN of the SNS topic that receives the extracted text. It is read from the
# environment at import time, so a missing variable raises KeyError as soon
# as the module loads.
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]

# Extensions the handler accepts; the handler lower-cases the object key
# before comparing. Add more formats if necessary.
SUPPORTED_EXTENSIONS = [
    ".png",
    ".jpg",
    ".jpeg",
    ".pdf",
]

def lambda_handler(event, context):
    """Extract the text lines of an uploaded S3 object and publish them to SNS.

    The object named by the first record of the S3 event is passed to
    Textract ``detect_document_text``; the text of every LINE block is
    joined with newlines and published to ``SNS_TOPIC_ARN``.

    Args:
        event: S3 event notification; ``Records[0].s3`` supplies the bucket
            name and the (URL-encoded) object key.
        context: Lambda context object; unused.

    Returns:
        A dict with ``statusCode`` and a JSON-encoded ``body``:
        200 on success, 400 when Textract rejects the document format,
        500 for any other failure (including an unsupported extension).
    """
    # Function-scope import so this handler does not depend on a change to
    # the module's import block.
    from urllib.parse import unquote_plus

    try:
        print(f"Event: {json.dumps(event)}")

        # Get S3 bucket and object key from the S3 event
        s3_info = event['Records'][0]['s3']
        bucket_name = s3_info['bucket']['name']
        # S3 URL-encodes the key in event notifications ("my file.png"
        # arrives as "my+file.png"); Textract needs the real key.
        object_key = unquote_plus(s3_info['object']['key'])

        print('bucket_name::', bucket_name, ' - object_key::', object_key)

        # Validate file extension (endswith accepts a tuple of suffixes)
        if not object_key.lower().endswith(tuple(SUPPORTED_EXTENSIONS)):
            raise ValueError(f"Unsupported file extension for file: {object_key}")

        # Call Textract to extract text
        response = textract_client.detect_document_text(
            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}
        )

        print('response::', response)

        # Keep only LINE blocks, in the order Textract returned them
        extracted_text = '\n'.join(
            block['Text']
            for block in response['Blocks']
            if block['BlockType'] == 'LINE'
        )

        # Send extracted text to SNS
        sns_client.publish(
            TopicArn=SNS_TOPIC_ARN,
            Message=json.dumps({
                'bucket': bucket_name,
                'key': object_key,
                'text': extracted_text
            }),
            Subject='Textract Extracted Text'
        )

        return {
            'statusCode': 200,
            'body': json.dumps('Text extracted and sent to SNS')
        }

    except textract_client.exceptions.UnsupportedDocumentException as e:
        print(f"Unsupported document format: {e}")
        return {
            'statusCode': 400,
            'body': json.dumps('Unsupported document format')
        }
    except Exception as e:
        # Top-level boundary: report the failure instead of raising.
        print(f"Error processing file: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps(f"Error processing file: {str(e)}")
        }
Loading

0 comments on commit 8c00f78

Please sign in to comment.