-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Demo for serverless integration with Textract
- Loading branch information
Vinod Kumar
authored and
Vinod Kumar
committed
Nov 22, 2024
1 parent
9331c49
commit 7a2fb03
Showing
11 changed files
with
1,684 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Python Lambda source files to package
LAMBDA1_FILE = lambda_function.py
LAMBDA2_FILE = sqs_to_csv_lambda.py

# Deployment zip archives (their hashes are consumed by Terraform's
# filebase64sha256 in main.tf, so names must match what main.tf expects)
LAMBDA1_ZIP = lambda_function.zip
LAMBDA2_ZIP = sqs_to_csv_lambda.zip

# Targets that do not create a file of the same name must be declared
# phony, otherwise a stray file called "all"/"clean" on disk would make
# make silently skip them.
.PHONY: all zip-lambdas clean

# Default target: build both Lambda deployment packages
all: zip-lambdas

# Zip the Lambda functions
zip-lambdas: $(LAMBDA1_ZIP) $(LAMBDA2_ZIP)

$(LAMBDA1_ZIP): $(LAMBDA1_FILE)
	@echo "Zipping $(LAMBDA1_FILE) into $(LAMBDA1_ZIP)..."
	zip $(LAMBDA1_ZIP) $(LAMBDA1_FILE)

$(LAMBDA2_ZIP): $(LAMBDA2_FILE)
	@echo "Zipping $(LAMBDA2_FILE) into $(LAMBDA2_ZIP)..."
	zip $(LAMBDA2_ZIP) $(LAMBDA2_FILE)

# Clean the zip files
clean:
	rm -f $(LAMBDA1_ZIP) $(LAMBDA2_ZIP)
	@echo "Cleaned up old zip files!"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import boto3
import json
import os
import urllib.parse

s3_client = boto3.client('s3')
textract_client = boto3.client('textract')
sns_client = boto3.client('sns')

# Environment variable for SNS topic ARN (set by Terraform on the function)
SNS_TOPIC_ARN = os.environ['SNS_TOPIC_ARN']

# File types accepted before calling Textract; add more formats if necessary
SUPPORTED_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.pdf']


def lambda_handler(event, context):
    """Extract text from a newly uploaded S3 document and publish it to SNS.

    Triggered by an S3 ObjectCreated event. Runs Textract's synchronous
    detect_document_text on the object, joins all LINE blocks, and publishes
    a JSON payload {bucket, key, text} to SNS_TOPIC_ARN.

    Returns an API-Gateway-style dict: statusCode 200 on success, 400 for a
    document format Textract rejects, 500 for any other error.
    """
    try:
        print(f"Event: {json.dumps(event)}")

        # Get S3 bucket and object key from the S3 event
        bucket_name = event['Records'][0]['s3']['bucket']['name']
        # BUG FIX: S3 event notifications URL-encode the object key (spaces
        # arrive as '+', special characters percent-encoded). Decode it, or
        # Textract is asked for an object name that does not exist.
        object_key = urllib.parse.unquote_plus(
            event['Records'][0]['s3']['object']['key']
        )

        print('bucket_name::', bucket_name, ' - object_key::', object_key)

        # Validate file extension before spending a Textract call on it
        if not any(object_key.lower().endswith(ext) for ext in SUPPORTED_EXTENSIONS):
            raise ValueError(f"Unsupported file extension for file: {object_key}")

        # Call Textract to extract text (synchronous API)
        response = textract_client.detect_document_text(
            Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}
        )

        print('response::', response)

        # Keep only LINE blocks: one entry per detected line of text
        text_blocks = [block['Text'] for block in response['Blocks'] if block['BlockType'] == 'LINE']
        extracted_text = '\n'.join(text_blocks)

        # Send extracted text to SNS (consumed downstream via SQS)
        sns_client.publish(
            TopicArn=SNS_TOPIC_ARN,
            Message=json.dumps({
                'bucket': bucket_name,
                'key': object_key,
                'text': extracted_text
            }),
            Subject='Textract Extracted Text'
        )

        return {
            'statusCode': 200,
            'body': json.dumps('Text extracted and sent to SNS')
        }

    except textract_client.exceptions.UnsupportedDocumentException as e:
        print(f"Unsupported document format: {e}")
        return {
            'statusCode': 400,
            'body': json.dumps('Unsupported document format')
        }
    except Exception as e:
        print(f"Error processing file: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps(f"Error processing file: {str(e)}")
        }
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,201 @@ | ||
provider "aws" {
  region  = var.region
  profile = var.aws_profile
}

# S3 Bucket that receives documents and stores CSV output
resource "aws_s3_bucket" "textract_bucket" {
  bucket_prefix = var.s3_bucket_name

  tags = {
    Name = "TextractBucket"
  }
  # Demo stack: allow terraform destroy even when the bucket is non-empty
  force_destroy = true
}

# SNS Topic carrying extracted text to the SQS queue
resource "aws_sns_topic" "textract_topic" {
  name = "textract-sns-topic"

  tags = {
    Name = "TextractSNSTopic"
  }
}

# Lambda Function: S3 trigger -> Textract -> SNS
resource "aws_lambda_function" "textract_lambda" {
  filename         = "lambda_function.zip"
  function_name    = var.lambda_function_name
  role             = aws_iam_role.lambda_role.arn
  handler          = "lambda_function.lambda_handler"
  runtime          = "python3.11"
  source_code_hash = filebase64sha256("lambda_function.zip")
  timeout          = 60

  environment {
    variables = {
      SNS_TOPIC_ARN = aws_sns_topic.textract_topic.arn # Pass SNS Topic ARN as an environment variable
    }
  }
}

# S3 Bucket Notification to Lambda.
# CONSISTENCY FIX: the Lambda supports .png/.jpg/.jpeg/.pdf
# (SUPPORTED_EXTENSIONS in lambda_function.py); the notification previously
# only fired for .pdf and .jpg, so .png/.jpeg uploads were never processed.
resource "aws_s3_bucket_notification" "bucket_notification" {
  bucket = aws_s3_bucket.textract_bucket.id

  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".pdf"
  }

  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".jpg"
  }

  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".jpeg"
  }

  lambda_function {
    lambda_function_arn = aws_lambda_function.textract_lambda.arn
    events              = ["s3:ObjectCreated:*"]
    filter_suffix       = ".png"
  }

  depends_on = [aws_lambda_permission.allow_s3]
}

# IAM Role for Lambda (shared by both functions)
resource "aws_iam_role" "lambda_role" {
  name = "lambda-textract-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "lambda.amazonaws.com"
        }
      }
    ]
  })
}

# IAM Policy for Lambda: S3 read, SNS publish, Textract, CloudWatch logs, SQS consume
resource "aws_iam_role_policy" "lambda_policy" {
  name = "lambda-textract-policy"
  role = aws_iam_role.lambda_role.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:GetObject", "s3:PutObject"]
        Resource = "${aws_s3_bucket.textract_bucket.arn}/*"
      },
      {
        Effect   = "Allow"
        Action   = ["sns:Publish"]
        Resource = aws_sns_topic.textract_topic.arn
      },
      {
        # Textract does not support resource-level permissions for this action
        Effect   = "Allow"
        Action   = ["textract:DetectDocumentText"]
        Resource = "*"
      },
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "*"
      },
      {
        Effect = "Allow"
        Action = [
          "sqs:ReceiveMessage",
          "sqs:DeleteMessage",
          "sqs:GetQueueAttributes"
        ]
        Resource = aws_sqs_queue.textract_queue.arn
      }
    ]
  })
}

# Allow S3 to invoke Lambda
resource "aws_lambda_permission" "allow_s3" {
  statement_id  = "AllowS3Invoke"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.textract_lambda.function_name
  principal     = "s3.amazonaws.com"
  source_arn    = aws_s3_bucket.textract_bucket.arn
}

# SQS Queue buffering SNS messages for the CSV Lambda
resource "aws_sqs_queue" "textract_queue" {
  name                       = var.sqs_queue_name
  visibility_timeout_seconds = 60 # must be >= consumer Lambda timeout (60s)

  tags = {
    Name = "TextractQueue"
  }
}

# SQS Queue Policy to allow SNS to publish messages
resource "aws_sqs_queue_policy" "sns_publish_policy" {
  queue_url = aws_sqs_queue.textract_queue.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Principal = {
          Service = "sns.amazonaws.com"
        }
        Action   = "sqs:SendMessage"
        Resource = aws_sqs_queue.textract_queue.arn
        Condition = {
          ArnEquals = {
            "aws:SourceArn" = aws_sns_topic.textract_topic.arn
          }
        }
      }
    ]
  })
}

# SNS Subscription for SQS.
# NOTE: raw_message_delivery is not enabled, so SQS receives the full SNS
# envelope; the consumer Lambda must unwrap the "Message" field.
resource "aws_sns_topic_subscription" "sns_to_sqs" {
  topic_arn = aws_sns_topic.textract_topic.arn
  protocol  = "sqs"
  endpoint  = aws_sqs_queue.textract_queue.arn

  # Queue policy must exist before SNS can deliver
  depends_on = [aws_sqs_queue_policy.sns_publish_policy]
}

# Lambda Function: SQS trigger -> CSV in S3
resource "aws_lambda_function" "sqs_to_csv_lambda" {
  filename         = "sqs_to_csv_lambda.zip"                      # Path to your Lambda zip file
  function_name    = var.lambda_function_name_2
  role             = aws_iam_role.lambda_role.arn                 # IAM role for Lambda
  handler          = "sqs_to_csv_lambda.lambda_handler"           # Lambda function handler
  runtime          = "python3.11"                                 # Lambda runtime
  source_code_hash = filebase64sha256("sqs_to_csv_lambda.zip")    # Source code hash for validation
  timeout          = 60                                           # Timeout in seconds

  environment {
    variables = {
      CSV_S3_BUCKET = aws_s3_bucket.textract_bucket.bucket # S3 bucket name
      CSV_S3_PREFIX = var.csv_s3_prefix                    # Prefix for CSV files
    }
  }
}

# Wire the SQS queue to the CSV Lambda
resource "aws_lambda_event_source_mapping" "sqs_trigger" {
  event_source_arn = aws_sqs_queue.textract_queue.arn
  function_name    = aws_lambda_function.sqs_to_csv_lambda.arn
  batch_size       = 1
  enabled          = true
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
output "s3_bucket_name" {
  description = "Name (ID) of the S3 bucket that receives documents and CSV results"
  value       = aws_s3_bucket.textract_bucket.id
}

output "lambda_function_arn" {
  description = "ARN of the S3-triggered Textract Lambda"
  value       = aws_lambda_function.textract_lambda.arn
}

output "sqs_queue_url" {
  description = "URL of the SQS queue subscribed to the Textract SNS topic"
  value       = aws_sqs_queue.textract_queue.id
}

output "csv_lambda_function_arn" {
  description = "ARN of the SQS-triggered Lambda that writes CSV files"
  value       = aws_lambda_function.sqs_to_csv_lambda.arn
}

output "csv_s3_prefix" {
  description = "Key prefix under which CSV files are written"
  value       = var.csv_s3_prefix
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import boto3
import csv
import os
import json
from io import StringIO
import datetime

s3_client = boto3.client('s3')
sqs_client = boto3.client('sqs')

# Target bucket/prefix for CSV output (set by Terraform on the function)
CSV_S3_BUCKET = os.environ['CSV_S3_BUCKET']
CSV_S3_PREFIX = os.environ['CSV_S3_PREFIX']


def lambda_handler(event, context):
    """Convert Textract results arriving via SNS->SQS into CSV files in S3.

    For each SQS record, writes a one-row CSV (Bucket, Key, ExtractedText)
    to CSV_S3_BUCKET under CSV_S3_PREFIX with a timestamped filename.
    Returns a statusCode/body dict for observability.
    """
    for record in event['Records']:
        # Parse the SQS message
        message_body = json.loads(record['body'])

        # BUG FIX: the SNS->SQS subscription in main.tf does not enable
        # raw_message_delivery, so the SQS body is an SNS envelope and the
        # publisher's payload is a JSON string in its 'Message' field.
        # Unwrap it; tolerate raw delivery too in case that is enabled later.
        if 'Message' in message_body:
            message_body = json.loads(message_body['Message'])

        bucket = message_body['bucket']
        key = message_body['key']
        extracted_text = message_body['text']

        # Prepare CSV data (csv module handles quoting/escaping of the text)
        csv_buffer = StringIO()
        csv_writer = csv.writer(csv_buffer)
        csv_writer.writerow(['Bucket', 'Key', 'ExtractedText'])
        csv_writer.writerow([bucket, key, extracted_text])

        # Define the CSV file path and name; timezone-aware UTC so filenames
        # are stable regardless of the Lambda runtime's locale
        timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d%H%M%S")
        csv_filename = f"{CSV_S3_PREFIX}{key.replace('/', '_')}_{timestamp}.csv"

        # Upload CSV to S3
        s3_client.put_object(
            Bucket=CSV_S3_BUCKET,
            Key=csv_filename,
            Body=csv_buffer.getvalue()
        )

        print(f"CSV file saved to S3: {csv_filename}")

    return {
        'statusCode': 200,
        'body': 'CSV files created and saved to S3'
    }
Binary file not shown.
Oops, something went wrong.