Skip to content

Commit cfae526

Browse files
Vinod KumarVinod Kumar
authored and committed
Demo for serverless integration with Textract
1 parent 9331c49 commit cfae526

File tree

10 files changed

+985
-0
lines changed

10 files changed

+985
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@
33
webapp/node_modules/
44
webapp/package-lock.json
55
iac/aws/terraform/creating-custom-vpc/.terraform/
6+
iac/demo/textract/.terraform.lock.hcl
7+
iac/demo/textract/.terraform/*
8+

iac/demo/textract/lambda_function.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import boto3
2+
import json
3+
import os
4+
5+
s3_client = boto3.client('s3')
6+
textract_client = boto3.client('textract')
7+
sqs_client = boto3.client('sqs')
8+
9+
SQS_QUEUE_URL = os.environ['SQS_QUEUE_URL']
10+
11+
def lambda_handler(event, context):
12+
# Get S3 bucket and object key from event
13+
bucket_name = event['Records'][0]['s3']['bucket']['name']
14+
object_key = event['Records'][0]['s3']['object']['key']
15+
16+
# Call Textract to extract text
17+
response = textract_client.detect_document_text(
18+
Document={'S3Object': {'Bucket': bucket_name, 'Name': object_key}}
19+
)
20+
21+
# Extract text blocks
22+
text_blocks = [block['Text'] for block in response['Blocks'] if block['BlockType'] == 'LINE']
23+
extracted_text = '\n'.join(text_blocks)
24+
25+
# Send extracted text to SQS
26+
sqs_client.send_message(
27+
QueueUrl=SQS_QUEUE_URL,
28+
MessageBody=json.dumps({
29+
'bucket': bucket_name,
30+
'key': object_key,
31+
'text': extracted_text
32+
})
33+
)
34+
35+
return {
36+
'statusCode': 200,
37+
'body': json.dumps('Text extracted and sent to SQS')
38+
}

iac/demo/textract/lambda_function.zip

680 Bytes
Binary file not shown.

iac/demo/textract/main.tf

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
provider "aws" {
2+
region = var.region
3+
profile = var.aws_profile
4+
}
5+
6+
# S3 Bucket
7+
resource "aws_s3_bucket" "textract_bucket" {
8+
bucket = var.s3_bucket_name
9+
acl = "private"
10+
11+
tags = {
12+
Name = "TextractBucket"
13+
}
14+
}
15+
16+
resource "aws_s3_bucket_notification" "bucket_notification" {
17+
bucket = aws_s3_bucket.textract_bucket.id
18+
19+
lambda_function {
20+
lambda_function_arn = aws_lambda_function.textract_lambda.arn
21+
events = ["s3:ObjectCreated:*"]
22+
filter_prefix = ""
23+
filter_suffix = ".pdf"
24+
}
25+
26+
lambda_function {
27+
lambda_function_arn = aws_lambda_function.textract_lambda.arn
28+
events = ["s3:ObjectCreated:*"]
29+
filter_suffix = ".jpg"
30+
}
31+
32+
depends_on = [aws_lambda_permission.allow_s3]
33+
}
34+
35+
# SQS Queue
36+
resource "aws_sqs_queue" "textract_queue" {
37+
name = var.sqs_queue_name
38+
39+
tags = {
40+
Name = "TextractQueue"
41+
}
42+
}
43+
44+
# IAM Role for Lambda
45+
resource "aws_iam_role" "lambda_role" {
46+
name = "lambda-textract-role"
47+
48+
assume_role_policy = jsonencode({
49+
Version = "2012-10-17"
50+
Statement = [
51+
{
52+
Action = "sts:AssumeRole"
53+
Effect = "Allow"
54+
Principal = {
55+
Service = "lambda.amazonaws.com"
56+
}
57+
}
58+
]
59+
})
60+
}
61+
62+
# IAM Policy for Lambda
63+
resource "aws_iam_role_policy" "lambda_policy" {
64+
name = "lambda-textract-policy"
65+
role = aws_iam_role.lambda_role.id
66+
policy = jsonencode({
67+
Version = "2012-10-17"
68+
Statement = [
69+
{
70+
Effect = "Allow"
71+
Action = ["s3:GetObject"]
72+
Resource = "${aws_s3_bucket.textract_bucket.arn}/*"
73+
},
74+
{
75+
Effect = "Allow"
76+
Action = ["sqs:SendMessage"]
77+
Resource = aws_sqs_queue.textract_queue.arn
78+
},
79+
{
80+
Effect = "Allow"
81+
Action = ["textract:DetectDocumentText", "textract:AnalyzeDocument"]
82+
Resource = "*"
83+
},
84+
{
85+
Effect = "Allow"
86+
Action = [
87+
"logs:CreateLogGroup",
88+
"logs:CreateLogStream",
89+
"logs:PutLogEvents"
90+
]
91+
Resource = "*"
92+
}
93+
]
94+
})
95+
}
96+
97+
# Lambda Function
98+
resource "aws_lambda_function" "textract_lambda" {
99+
filename = "lambda_function.zip"
100+
function_name = var.lambda_function_name
101+
role = aws_iam_role.lambda_role.arn
102+
handler = "lambda_function.lambda_handler"
103+
runtime = "python3.9"
104+
source_code_hash = filebase64sha256("lambda_function.zip")
105+
timeout = 60
106+
107+
environment {
108+
variables = {
109+
SQS_QUEUE_URL = aws_sqs_queue.textract_queue.id
110+
}
111+
}
112+
}
113+
114+
# Allow S3 to invoke Lambda
115+
resource "aws_lambda_permission" "allow_s3" {
116+
statement_id = "AllowS3Invoke"
117+
action = "lambda:InvokeFunction"
118+
function_name = aws_lambda_function.textract_lambda.function_name
119+
principal = "s3.amazonaws.com"
120+
source_arn = aws_s3_bucket.textract_bucket.arn
121+
}
122+
123+
124+
# Permissions for the CSV Lambda: consume queue messages, write CSV
# objects back to the bucket, and write CloudWatch logs. Attached to the
# same shared execution role.
resource "aws_iam_role_policy" "lambda_policy_csv" {
  name = "lambda-csv-policy"
  role = aws_iam_role.lambda_role.id
  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["s3:PutObject"]
        Resource = "${aws_s3_bucket.textract_bucket.arn}/*"
      },
      {
        # Minimum SQS actions required for a Lambda event source mapping.
        Effect   = "Allow"
        Action   = ["sqs:ReceiveMessage", "sqs:DeleteMessage", "sqs:GetQueueAttributes"]
        Resource = aws_sqs_queue.textract_queue.arn
      },
      {
        Effect = "Allow"
        Action = [
          "logs:CreateLogGroup",
          "logs:CreateLogStream",
          "logs:PutLogEvents"
        ]
        Resource = "*"
      }
    ]
  })
}
153+
154+
# Lambda that drains the queue and writes one CSV per message back to
# the bucket (see sqs_to_csv_lambda.py).
resource "aws_lambda_function" "sqs_to_csv_lambda" {
  filename         = "sqs_to_csv_lambda.zip"
  function_name    = var.lambda_function_name_2
  role             = aws_iam_role.lambda_role.arn
  handler          = "sqs_to_csv_lambda.lambda_handler"
  runtime          = "python3.9"
  source_code_hash = filebase64sha256("sqs_to_csv_lambda.zip")
  timeout          = 60

  environment {
    variables = {
      # CSVs land in the same bucket the documents are uploaded to,
      # under the configured prefix.
      CSV_S3_BUCKET = aws_s3_bucket.textract_bucket.id
      CSV_S3_PREFIX = var.csv_s3_prefix
    }
  }
}

# Deliver queue messages to the CSV Lambda in batches of up to 10.
resource "aws_lambda_event_source_mapping" "sqs_trigger" {
  event_source_arn = aws_sqs_queue.textract_queue.arn
  function_name    = aws_lambda_function.sqs_to_csv_lambda.arn
  batch_size       = 10
  enabled          = true
}
179+

iac/demo/textract/outputs.tf

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
output "s3_bucket_name" {
2+
value = aws_s3_bucket.textract_bucket.id
3+
}
4+
5+
output "lambda_function_arn" {
6+
value = aws_lambda_function.textract_lambda.arn
7+
}
8+
9+
output "sqs_queue_url" {
10+
value = aws_sqs_queue.textract_queue.id
11+
}
12+
13+
output "csv_lambda_function_arn" {
14+
value = aws_lambda_function.sqs_to_csv_lambda.arn
15+
}
16+
17+
output "csv_s3_prefix" {
18+
value = var.csv_s3_prefix
19+
}
iac/demo/textract/sqs_to_csv_lambda.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import boto3
2+
import csv
3+
import os
4+
import json
5+
from io import StringIO
6+
import datetime
7+
8+
s3_client = boto3.client('s3')
9+
sqs_client = boto3.client('sqs')
10+
11+
CSV_S3_BUCKET = os.environ['CSV_S3_BUCKET']
12+
CSV_S3_PREFIX = os.environ['CSV_S3_PREFIX']
13+
14+
def lambda_handler(event, context):
15+
for record in event['Records']:
16+
# Parse the SQS message
17+
message_body = json.loads(record['body'])
18+
bucket = message_body['bucket']
19+
key = message_body['key']
20+
extracted_text = message_body['text']
21+
22+
# Prepare CSV data
23+
csv_buffer = StringIO()
24+
csv_writer = csv.writer(csv_buffer)
25+
csv_writer.writerow(['Bucket', 'Key', 'ExtractedText'])
26+
csv_writer.writerow([bucket, key, extracted_text])
27+
28+
# Define the CSV file path and name
29+
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
30+
csv_filename = f"{CSV_S3_PREFIX}{key.replace('/', '_')}_{timestamp}.csv"
31+
32+
# Upload CSV to S3
33+
s3_client.put_object(
34+
Bucket=CSV_S3_BUCKET,
35+
Key=csv_filename,
36+
Body=csv_buffer.getvalue()
37+
)
38+
39+
print(f"CSV file saved to S3: {csv_filename}")
40+
41+
return {
42+
'statusCode': 200,
43+
'body': 'CSV files created and saved to S3'
44+
}
iac/demo/textract/sqs_to_csv_lambda.zip

760 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)