Terraform Data Engineering IaC

Skill by ara.so — Data Skills collection.

This project demonstrates Infrastructure-as-Code (IaC) fundamentals for data engineering using Terraform. It provisions AWS resources commonly used in data pipelines including S3 buckets for data storage and EC2 instances for data processing workloads.

What It Does

Provisions AWS S3 buckets for data lake storage
Creates EC2 instances for data processing and pipeline execution
Manages IAM policies for secure resource access
Uses Terraform state to track and manage infrastructure changes
Provides reproducible infrastructure for data engineering environments

Prerequisites

Before using this project, ensure you have:

AWS Account with appropriate permissions
Terraform CLI installed
AWS CLI installed and configured
IAM user with S3, EC2, and IAM permissions

Installation

1. Install Terraform

# macOS
brew install terraform

# Linux
wget https://releases.hashicorp.com/terraform/1.5.0/terraform_1.5.0_linux_amd64.zip
unzip terraform_1.5.0_linux_amd64.zip
sudo mv terraform /usr/local/bin/

# Verify installation
terraform version

2. Install AWS CLI

# macOS
brew install awscli

# Linux
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
sudo ./aws/install

# Configure AWS credentials
aws configure

3. Set Up IAM Permissions

Create an IAM user with the following managed policies:

AmazonS3FullAccess
AmazonEC2FullAccess
IAMFullAccess

Note: For production, use fine-grained permissions instead of full access.

Project Structure

terraform/
├── main.tf           # Main infrastructure definitions
├── variables.tf      # Input variables
├── outputs.tf        # Output values
└── terraform.tfstate # State file (generated)

Key Terraform Commands

Initialize Terraform

# Initialize backend and download providers
terraform -chdir=terraform init

Validate Configuration

# Check syntax and validate configuration
terraform -chdir=terraform validate

Format Code

# Auto-format HCL files
terraform -chdir=terraform fmt

Plan Infrastructure Changes

# Preview what will be created/changed
terraform -chdir=terraform plan

Apply Infrastructure

# Create or update infrastructure
terraform -chdir=terraform apply

# Auto-approve without confirmation (use carefully)
terraform -chdir=terraform apply -auto-approve

Destroy Infrastructure

# Remove all managed infrastructure
terraform -chdir=terraform destroy

# Auto-approve destruction (use carefully)
terraform -chdir=terraform destroy -auto-approve

State Management

# List all resources in state
terraform -chdir=terraform state list

# Show detailed resource information (address must match a resource in the
# configuration; the S3 bucket in this guide is aws_s3_bucket.data_lake)
terraform -chdir=terraform state show aws_s3_bucket.data_lake

# Print "type,name" for every resource recorded in the local state file
jq -r '.resources[] | [.type, .name] | join(",")' terraform/terraform.tfstate

Configuration Examples

Basic S3 Bucket for Data Storage

# terraform/main.tf
terraform {
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }
}

provider "aws" {
  region = var.aws_region
}

resource "aws_s3_bucket" "data_lake" {
  bucket = "my-unique-data-lake-bucket-${var.environment}"
  
  tags = {
    Name        = "Data Lake Bucket"
    Environment = var.environment
    Project     = "data-engineering"
  }
}

resource "aws_s3_bucket_versioning" "data_lake_versioning" {
  bucket = aws_s3_bucket.data_lake.id
  
  versioning_configuration {
    status = "Enabled"
  }
}

# Lifecycle policy for the data lake bucket: archive objects to Glacier after
# 90 days and delete them after 365 days.
resource "aws_s3_bucket_lifecycle_configuration" "data_lake_lifecycle" {
  bucket = aws_s3_bucket.data_lake.id

  rule {
    id     = "archive_old_data"
    status = "Enabled"

    # An empty filter applies the rule to every object in the bucket. Recent
    # 5.x releases of the AWS provider warn when a rule declares neither
    # `filter` nor `prefix`, so the scope is stated explicitly.
    filter {}

    # Move objects to the GLACIER storage class 90 days after creation.
    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    # Expire objects 365 days after creation.
    expiration {
      days = 365
    }
  }
}

EC2 Instance for Data Processing

# terraform/main.tf (continued)
data "aws_ami" "ubuntu" {
  most_recent = true
  owners      = ["099720109477"] # Canonical

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }
}

resource "aws_instance" "data_processor" {
  ami           = data.aws_ami.ubuntu.id
  instance_type = var.instance_type

  tags = {
    Name        = "Data Processing Server"
    Environment = var.environment
  }

  user_data = <<-EOF
              #!/bin/bash
              sudo apt-get update
              sudo apt-get install -y python3-pip
              pip3 install pandas boto3 apache-airflow
              EOF
}

resource "aws_eip" "data_processor_eip" {
  instance = aws_instance.data_processor.id
  domain   = "vpc"
}

Variables Configuration

# terraform/variables.tf
variable "aws_region" {
  description = "AWS region for resources"
  type        = string
  default     = "us-east-1"
}

variable "environment" {
  description = "Environment name"
  type        = string
  default     = "dev"
}

variable "instance_type" {
  description = "EC2 instance type"
  type        = string
  default     = "t3.medium"
}

Outputs Configuration

# terraform/outputs.tf
output "s3_bucket_name" {
  description = "Name of the S3 data lake bucket"
  value       = aws_s3_bucket.data_lake.id
}

output "ec2_public_ip" {
  description = "Public IP of data processing EC2 instance"
  value       = aws_eip.data_processor_eip.public_ip
}

output "ec2_instance_id" {
  description = "Instance ID of data processor"
  value       = aws_instance.data_processor.id
}

Common Patterns

Multi-Environment Setup

# Option 1: one workspace (and therefore one state) per environment
terraform -chdir=terraform workspace new staging
terraform -chdir=terraform workspace new production

# Option 2: per-environment variable files
# (with -chdir, the -var-file path is resolved inside the terraform/ directory)
terraform -chdir=terraform apply -var-file="environments/dev.tfvars"
terraform -chdir=terraform apply -var-file="environments/prod.tfvars"

Remote State with S3 Backend

# terraform/backend.tf
terraform {
  backend "s3" {
    bucket         = "my-terraform-state-bucket"
    key            = "data-engineering/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

IAM Role for EC2 with S3 Access

resource "aws_iam_role" "data_processor_role" {
  name = "data-processor-role"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Action = "sts:AssumeRole"
      Effect = "Allow"
      Principal = {
        Service = "ec2.amazonaws.com"
      }
    }]
  })
}

resource "aws_iam_role_policy_attachment" "s3_access" {
  role       = aws_iam_role.data_processor_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess"
}

resource "aws_iam_instance_profile" "data_processor_profile" {
  name = "data-processor-profile"
  role = aws_iam_role.data_processor_role.name
}

# NOTE: this is the same resource address (aws_instance.data_processor) as in
# the "EC2 Instance for Data Processing" example. Do not declare it a second
# time; add the iam_instance_profile argument to the existing block instead.
resource "aws_instance" "data_processor" {
  ami                  = data.aws_ami.ubuntu.id
  instance_type        = var.instance_type
  # Attach the instance profile so the instance runs with data_processor_role.
  iam_instance_profile = aws_iam_instance_profile.data_processor_profile.name
}

Verification Commands

Verify S3 Buckets

# List all S3 buckets
aws s3 ls

# Look up the bucket name Terraform actually created (s3_bucket_name output)
bucket_name="$(terraform -chdir=terraform output -raw s3_bucket_name)"

# Get bucket details
aws s3api get-bucket-location --bucket "$bucket_name"

# List bucket contents
aws s3 ls "s3://${bucket_name}/"

Verify EC2 Instances

# List running instances
aws ec2 describe-instances \
  --filters "Name=instance-state-name,Values=running" \
  --query 'Reservations[].Instances[].{ID:InstanceId, Name:Tags[?Key==`Name`].Value, Type:InstanceType, State:State.Name, PublicIP:PublicIpAddress, PrivateIP:PrivateIpAddress}' \
  --output table

# Get specific instance details
aws ec2 describe-instances --instance-ids i-1234567890abcdef0

Connect to EC2 Instance

# SSH into instance (requires key pair)
# NOTE: the aws_instance example in this guide sets neither key_name nor a
# security group, so this will likely not connect until a key pair and an
# inbound rule for TCP 22 are added to that resource.
ssh -i ~/.ssh/my-key.pem ubuntu@$(terraform -chdir=terraform output -raw ec2_public_ip)

Troubleshooting

Issue: Terraform Init Fails

# Clear the local provider/module cache and reinitialize
rm -rf terraform/.terraform
# -f: do not fail when the lock file was never created. Removing it discards
# the recorded provider selections, so init will choose provider versions again.
rm -f terraform/.terraform.lock.hcl
terraform -chdir=terraform init

Issue: State Lock Error

# Force unlock (use with caution)
terraform -chdir=terraform force-unlock LOCK_ID

Issue: AWS Credentials Not Found

# Verify AWS configuration
aws configure list
aws sts get-caller-identity

# Set credentials explicitly (replace the placeholders with your own values;
# never commit real keys to version control)
export AWS_ACCESS_KEY_ID="<your-access-key-id>"
export AWS_SECRET_ACCESS_KEY="<your-secret-access-key>"
export AWS_DEFAULT_REGION="us-east-1"

Issue: Resource Already Exists

# Import existing resource into state
terraform -chdir=terraform import aws_s3_bucket.data_lake my-existing-bucket

# Or recreate with a unique name. The example bucket name is built from
# var.environment, so override that variable; passing an undeclared variable
# such as bucket_suffix with -var is rejected by Terraform.
terraform -chdir=terraform apply -var="environment=dev-$(date +%s)"

Issue: Permission Denied

Check IAM policies and ensure your user has required permissions:

# Test S3 permissions
aws s3 ls

# Test EC2 permissions
aws ec2 describe-instances

# Test IAM permissions
aws iam list-users

Debugging Terraform

# Enable debug logging
export TF_LOG=DEBUG
terraform -chdir=terraform apply

# Show detailed plan
terraform -chdir=terraform plan -out=tfplan
terraform -chdir=terraform show tfplan

# Refresh state from actual infrastructure
# (`terraform refresh` is deprecated; -refresh-only shows the detected changes
# and asks for confirmation before writing them to state)
terraform -chdir=terraform apply -refresh-only

Best Practices

Always use unique bucket names: S3 bucket names must be globally unique
Version your state files: Enable S3 versioning for state file backups
Use remote state: Store state in S3 with locking via DynamoDB
Tag all resources: Apply consistent tagging for cost tracking and organization
Use variables: Parameterize configurations for reusability
Plan before apply: Run terraform plan before terraform apply to review changes
Destroy dev resources: Don't leave test infrastructure running to avoid costs

terraform-data-engineering-iac