Step Functions Orchestration
AWS Step Functions lets you orchestrate multi-step serverless workflows with built-in retry logic, parallel execution, and visual debugging. Instead of chaining Lambda functions with messy callback code, Step Functions provides a state machine that coordinates your workflow declaratively. We design and implement Step Functions for order processing pipelines, data ETL jobs, saga transactions, and human approval workflows.
Need this done for your project?
We implement, you ship. Async, documented, done in days.
State Machine Design
We design state machines using Amazon States Language (ASL) with clear states for each business step, explicit error handling, and timeout configuration.
// Order processing workflow — Terraform + ASL
//
// Flow: ValidateOrder -> ProcessPayment -> ParallelFulfillment -> OrderCompleted.
// Validation and declined-payment errors go straight to OrderFailed; any error
// inside the parallel fulfillment step is routed through CompensatePayment
// (refund) before the execution is failed.
resource "aws_sfn_state_machine" "order_processing" {
  name     = "order-processing-${var.env}"
  role_arn = aws_iam_role.step_functions.arn

  definition = jsonencode({
    Comment = "Order processing pipeline"
    StartAt = "ValidateOrder"
    States = {
      ValidateOrder = {
        Type     = "Task"
        Resource = aws_lambda_function.validate_order.arn
        Next     = "ProcessPayment"
        Catch = [{
          ErrorEquals = ["ValidationError"]
          Next        = "OrderFailed"
          # Keep the original input and attach the error details next to it
          ResultPath = "$.error"
        }]
        TimeoutSeconds = 10
      }

      ProcessPayment = {
        Type     = "Task"
        Resource = aws_lambda_function.process_payment.arn
        Next     = "ParallelFulfillment"
        # Gateway timeouts are retried after 5s, 10s and 20s
        Retry = [{
          ErrorEquals     = ["PaymentGatewayTimeout"]
          IntervalSeconds = 5
          MaxAttempts     = 3
          BackoffRate     = 2.0
        }]
        Catch = [{
          ErrorEquals = ["PaymentDeclined"]
          Next        = "OrderFailed"
          ResultPath  = "$.error"
        }]
      }

      # Inventory reservation and customer confirmation run concurrently
      ParallelFulfillment = {
        Type = "Parallel"
        Next = "OrderCompleted"
        Branches = [
          {
            StartAt = "ReserveInventory"
            States = {
              ReserveInventory = {
                Type     = "Task"
                Resource = aws_lambda_function.reserve_inventory.arn
                End      = true
              }
            }
          },
          {
            StartAt = "SendConfirmation"
            States = {
              SendConfirmation = {
                Type     = "Task"
                Resource = aws_lambda_function.send_confirmation.arn
                End      = true
              }
            }
          }
        ]
        # The payment has already been taken at this point, so every failure
        # has to go through the refund step.
        # NOTE(review): only the payment is compensated here — if
        # SendConfirmation fails after ReserveInventory succeeded, the
        # reservation is not released by this state machine.
        Catch = [{
          ErrorEquals = ["States.ALL"]
          Next        = "CompensatePayment"
          ResultPath  = "$.error"
        }]
      }

      CompensatePayment = {
        Type     = "Task"
        Resource = aws_lambda_function.refund_payment.arn
        Next     = "OrderFailed"
        # A transient Lambda service error during the refund must not end the
        # execution with the customer still charged, so retry those errors
        # with backoff (2s, 4s, 8s, 16s, 32s).
        # NOTE(review): assumes the refund Lambda is idempotent — confirm.
        Retry = [{
          ErrorEquals = [
            "Lambda.ServiceException",
            "Lambda.AWSLambdaException",
            "Lambda.SdkClientException",
            "Lambda.TooManyRequestsException"
          ]
          IntervalSeconds = 2
          MaxAttempts     = 5
          BackoffRate     = 2.0
        }]
      }

      OrderCompleted = {
        Type = "Succeed"
      }

      OrderFailed = {
        Type  = "Fail"
        Cause = "Order processing failed"
      }
    }
  })
}
Each state has explicit error handling with Catch blocks and retry policies. The Parallel state runs inventory reservation and email confirmation concurrently. If either fails, the CompensatePayment state refunds the charge — implementing the saga pattern for distributed transactions.
Saga Pattern for Distributed Transactions
The saga pattern coordinates multi-service transactions where a traditional database transaction is impossible. Each step has a compensating action that undoes its work on failure.
// Saga definition — every forward action is paired with the action that undoes it
const sagaSteps = (() => {
  // Expands a Lambda function name into its (elided) ARN.
  const toArn = (fn: string) => `arn:aws:lambda:...:${fn}`;

  // [step name, forward Lambda, compensating Lambda], in execution order
  const table: ReadonlyArray<readonly [string, string, string]> = [
    ['ReserveInventory', 'reserve-inventory', 'release-inventory'],
    ['ChargePayment', 'charge-payment', 'refund-payment'],
    ['CreateShipment', 'create-shipment', 'cancel-shipment'],
  ];

  return table.map(([name, forward, undo]) => ({
    name,
    execute: toArn(forward),
    compensate: toArn(undo),
  }));
})();
// Step Functions ASL for saga with compensation chain
// On failure at step N, run compensate for steps N-1, N-2, ..., 1
// The state machine handles this via Catch blocks that chain
// to compensation states in reverse order:
// CreateShipment fails →
// CompensatePayment → CompensateInventory → SagaFailed
// ChargePayment fails →
// CompensateInventory → SagaFailed
// ReserveInventory fails →
// SagaFailed (nothing to compensate)
We generate the ASL state machine from the saga definition, ensuring every execution path eventually reaches either Success or a fully compensated Fail state. Each compensating action is idempotent — it can be safely retried if it fails during compensation.
Human Approval Workflows
Step Functions supports human-in-the-loop workflows using callback tasks that pause execution until an external signal resumes them.
// State machine with human approval
//
// Callback task: the Lambda is invoked with the task token and the state then
// waits for that token to be returned, or for the timeout to elapse.
WaitForApproval = {
  Type     = "Task"
  Resource = "arn:aws:states:::lambda:invoke.waitForTaskToken"

  # 24 hour approval window; on expiry divert to the timeout handler
  TimeoutSeconds = 24 * 60 * 60
  Catch = [{
    Next        = "ApprovalTimedOut"
    ErrorEquals = ["States.Timeout"]
  }]

  Parameters = {
    FunctionName = aws_lambda_function.request_approval.arn
    Payload = {
      # "$$." reads from the execution context, "$." from the state input
      "taskToken.$"   = "$$.Task.Token"
      "requestedBy.$" = "$.requestedBy"
      "orderId.$"     = "$.orderId"
      "amount.$"      = "$.amount"
    }
  }
}
// Lambda sends approval request with task token
/**
 * Posts an approval request with Approve / Reject buttons to the `#approvals`
 * Slack channel.
 *
 * The Step Functions task token is stored as the `value` of both buttons so
 * that whichever button is pressed carries the token needed to resume the
 * paused execution.
 *
 * @param event - Payload supplied by the state machine: the callback task
 *   token, the order being approved, its amount, and (optionally) who
 *   requested it.
 * @throws Whatever `slack.chat.postMessage` rejects with; the error is not
 *   caught here, so a failed post fails the Lambda invocation.
 */
export const requestApproval = async (event: {
  taskToken: string;
  orderId: string;
  amount: number;
  requestedBy?: string;
}) => {
  // The state machine passes `requestedBy`; show it to the approver when present.
  const requester = event.requestedBy ? `\nRequested by: ${event.requestedBy}` : '';

  // Send Slack message with approve/reject buttons
  await slack.chat.postMessage({
    channel: '#approvals',
    // Plain-text fallback used for notifications and by clients that cannot
    // render blocks.
    text: `Approval required for order ${event.orderId} ($${event.amount})`,
    blocks: [
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: `*Approval Required*\nOrder: ${event.orderId}\nAmount: $${event.amount}${requester}`,
        },
      },
      {
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: { type: 'plain_text', text: 'Approve' },
            action_id: 'approve',
            value: event.taskToken,
            style: 'primary',
          },
          {
            type: 'button',
            text: { type: 'plain_text', text: 'Reject' },
            action_id: 'reject',
            value: event.taskToken,
            style: 'danger',
          },
        ],
      },
    ],
  });
};
// Slack webhook handler — resumes the state machine
/**
 * Resumes the paused execution according to the button that was pressed.
 *
 * Reads the first entry of `event.actions`: its `value` is the Step Functions
 * task token and its `action_id` selects the outcome.
 *  - `approve` sends task success with output `{ approved: true }`
 *  - `reject` sends task failure with error `ApprovalRejected`
 *
 * NOTE(review): assumes the Slack interaction payload has already been
 * decoded into an object with an `actions` array before it reaches this
 * handler — confirm against the caller.
 *
 * @param event - Decoded Slack interaction payload.
 * @throws Error when no action with a task token is present, or when the
 *   action is neither `approve` nor `reject`. In both cases nothing is sent
 *   to Step Functions, so the execution stays paused.
 */
export const handleApproval = async (event: unknown): Promise<void> => {
  const payload = event as
    | { actions?: ReadonlyArray<{ action_id?: unknown; value?: unknown } | undefined> }
    | null
    | undefined;
  const action = payload?.actions?.[0];
  const actionId = action?.action_id;
  const taskToken = action?.value;

  if (typeof taskToken !== 'string' || taskToken === '') {
    throw new Error('Slack interaction payload has no action carrying a task token');
  }

  // Validate the action before touching Step Functions: an unknown button
  // must not be interpreted as a rejection.
  if (actionId !== 'approve' && actionId !== 'reject') {
    throw new Error(`Unsupported approval action: ${String(actionId)}`);
  }

  const sfn = new SFNClient({});
  if (actionId === 'approve') {
    await sfn.send(new SendTaskSuccessCommand({
      taskToken,
      output: JSON.stringify({ approved: true }),
    }));
    return;
  }

  await sfn.send(new SendTaskFailureCommand({
    taskToken,
    // Named error so the state machine can match it with ErrorEquals
    error: 'ApprovalRejected',
    cause: 'Rejected by approver',
  }));
};
The workflow pauses for up to 24 hours waiting for human approval via Slack buttons, email links, or a custom dashboard. The task token uniquely identifies the paused execution and is passed through the approval channel.
Express Workflows for High-Volume Processing
For high-throughput, short-duration workflows (data transformation, IoT event processing), we use Express Workflows, which support up to 100,000 executions per second at a fraction of the cost of Standard Workflows.
# Express state machine: parse -> enrich -> route by record type -> store.
resource "aws_sfn_state_machine" "data_transform" {
  name     = "data-transform-${var.env}"
  role_arn = aws_iam_role.step_functions.arn
  type     = "EXPRESS" # High-throughput mode

  definition = jsonencode({
    StartAt = "ParseInput"
    States = {
      ParseInput = {
        Type     = "Task"
        Resource = aws_lambda_function.parse.arn
        Next     = "EnrichData"
        # Keep the raw input and attach the parser output under $.parsed
        ResultPath = "$.parsed"
      }

      EnrichData = {
        Type     = "Task"
        Resource = aws_lambda_function.enrich.arn
        Next     = "RouteByType"
        # Without a ResultPath the Lambda output would replace the whole
        # state and drop $.parsed, which RouteByType reads next.
        ResultPath = "$.enriched"
      }

      # Dispatch on the record type produced by ParseInput
      RouteByType = {
        Type = "Choice"
        Choices = [
          {
            Variable     = "$.parsed.type"
            StringEquals = "metric"
            Next         = "StoreInTimeSeries"
          },
          {
            Variable     = "$.parsed.type"
            StringEquals = "event"
            Next         = "PublishToEventBridge"
          }
        ]
        # Anything that is neither a metric nor an event is archived
        Default = "StoreInS3"
      }

      # NOTE(review): the three service-integration tasks below define no
      # Parameters, so the state input is passed to the API call as-is —
      # confirm the request shape each call expects.
      StoreInTimeSeries = {
        Type     = "Task"
        Resource = "arn:aws:states:::aws-sdk:timestreamwrite:writeRecords"
        End      = true
      }

      PublishToEventBridge = {
        Type     = "Task"
        Resource = "arn:aws:states:::events:putEvents"
        End      = true
      }

      StoreInS3 = {
        Type = "Task"
        # S3 is reached through the AWS SDK integration, hence "aws-sdk:s3"
        Resource = "arn:aws:states:::aws-sdk:s3:putObject"
        End      = true
      }
    }
  })

  logging_configuration {
    log_destination        = "${aws_cloudwatch_log_group.sfn.arn}:*"
    include_execution_data = true
    level                  = "ERROR"
  }
}
Express Workflows use AWS SDK integrations to call services directly without Lambda — reducing latency and cost. Choice states route data based on content, enabling complex branching logic without code. Logging is configured at ERROR level for production to minimize CloudWatch costs while capturing failures.
Why Anubiz Engineering
Ready to get started?
Skip the research. Tell us what you need, and we'll scope it, implement it, and hand it back — fully documented and production-ready.