Building a Serverless Data Lake on AWS: S3, Athena, and Glue
Complete guide to architecting a cost-effective, scalable data lake using AWS services, with automated ETL pipelines and interactive SQL analytics.
The Modern Data Lake Architecture
A data lake is a centralized repository that stores structured and unstructured data at any scale. Unlike data warehouses, data lakes store raw data in its native format until needed.
Why go serverless:
- No infrastructure management: AWS handles scaling, patching, and backups
- Pay per use: you pay only for the data you store and the data each query scans
- Effectively unlimited scale: petabyte-scale storage and queries
- Fast time-to-value: set up in hours, not weeks
The core services:
- S3: object storage for every layer of the lake (~$0.023/GB/month for S3 Standard)
- AWS Glue: serverless ETL and the central data catalog
- Amazon Athena: SQL queries directly against data in S3
- Lake Formation: centralized governance and access control
- QuickSight: BI dashboards and visualizations
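To make the moving parts concrete before the infrastructure code, here is a minimal boto3 sketch of the flow from a client's point of view: drop raw files into S3, let a Glue crawler register the schema, then query with Athena. The bucket, crawler, database, and workgroup names come from the Terraform in the next section; the local sample file and the table name the crawler assigns are illustrative assumptions.

# Minimal sketch of the S3 -> Glue -> Athena flow (resource names come from
# the Terraform below; the sample file and table name are assumptions).
import boto3

s3 = boto3.client("s3")
glue = boto3.client("glue")
athena = boto3.client("athena")

# 1. Land a raw file in the bronze layer of the lake
s3.upload_file(
    "events-sample.json",  # hypothetical local file
    "company-data-lake-prod",
    "bronze/events/events-sample.json",
)

# 2. Crawl the bronze prefix so the schema shows up in the Glue Data Catalog
glue.start_crawler(Name="bronze-data-crawler")

# 3. Query the cataloged data with Athena; results land in the workgroup's
#    configured S3 output location
response = athena.start_query_execution(
    QueryString="SELECT COUNT(*) FROM events",  # table name assigned by the crawler
    QueryExecutionContext={"Database": "data_lake_prod"},
    WorkGroup="data-lake-prod",
)
print("Query submitted:", response["QueryExecutionId"])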
Setting Up the Data Lake with Terraform
# terraform/data-lake.tf
# Serverless data lake setup. The IAM roles and the helper buckets for Glue
# scripts, Glue logs, and Athena results referenced below are assumed to be
# defined elsewhere in the module.
# S3 bucket for the data lake layers (bronze/silver/gold prefixes)
resource "aws_s3_bucket" "data_lake" {
  bucket = "company-data-lake-prod"

  tags = {
    Environment = "production"
    Purpose     = "data-lake"
  }
}

# Enable versioning for data protection
resource "aws_s3_bucket_versioning" "data_lake" {
  bucket = aws_s3_bucket.data_lake.id

  versioning_configuration {
    status = "Enabled"
  }
}
# Encryption at rest
resource "aws_s3_bucket_server_side_encryption_configuration" "data_lake" {
  bucket = aws_s3_bucket.data_lake.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}
# Lifecycle policies for cost optimization
resource "aws_s3_bucket_lifecycle_configuration" "data_lake" {
  bucket = aws_s3_bucket.data_lake.id

  # Bronze layer: raw data transitions to cheaper storage
  rule {
    id     = "bronze-layer-lifecycle"
    status = "Enabled"

    filter {
      prefix = "bronze/"
    }

    transition {
      days          = 30
      storage_class = "STANDARD_IA" # Infrequent Access
    }

    transition {
      days          = 90
      storage_class = "GLACIER_IR" # Glacier Instant Retrieval
    }

    transition {
      days          = 180
      storage_class = "DEEP_ARCHIVE"
    }
  }

  # Silver layer: processed data
  rule {
    id     = "silver-layer-lifecycle"
    status = "Enabled"

    filter {
      prefix = "silver/"
    }

    transition {
      days          = 60
      storage_class = "STANDARD_IA"
    }
  }

  # Gold layer: keep hot for queries
  rule {
    id     = "gold-layer-lifecycle"
    status = "Enabled"

    filter {
      prefix = "gold/"
    }

    transition {
      days          = 90
      storage_class = "STANDARD_IA"
    }
  }
}
# Glue database for the Data Catalog
resource "aws_glue_catalog_database" "data_lake" {
  name         = "data_lake_prod"
  description  = "Production data lake catalog"
  location_uri = "s3://${aws_s3_bucket.data_lake.bucket}/gold/"
}

# Glue crawler for automatic schema discovery on the bronze layer
resource "aws_glue_crawler" "bronze_crawler" {
  name          = "bronze-data-crawler"
  role          = aws_iam_role.glue_crawler.arn
  database_name = aws_glue_catalog_database.data_lake.name

  s3_target {
    path = "s3://${aws_s3_bucket.data_lake.bucket}/bronze/"
  }

  schedule = "cron(0 */6 * * ? *)" # every 6 hours

  schema_change_policy {
    delete_behavior = "LOG"
    update_behavior = "UPDATE_IN_DATABASE"
  }
}
# Glue ETL job: bronze-to-silver transformation
resource "aws_glue_job" "bronze_to_silver" {
  name     = "bronze-to-silver-etl"
  role_arn = aws_iam_role.glue_job.arn

  glue_version = "4.0"

  command {
    name            = "glueetl"
    script_location = "s3://${aws_s3_bucket.glue_scripts.bucket}/bronze_to_silver.py"
    python_version  = "3"
  }

  default_arguments = {
    "--job-language"                     = "python"
    "--enable-continuous-cloudwatch-log" = "true"
    "--enable-metrics"                   = "true"
    "--enable-spark-ui"                  = "true"
    "--spark-event-logs-path"            = "s3://${aws_s3_bucket.glue_logs.bucket}/sparkui/"
  }

  # Glue 2.0+ jobs size capacity via workers; 10 G.1X workers = 10 DPUs
  worker_type       = "G.1X"
  number_of_workers = 10
  timeout           = 60 # minutes
}
# Athena Workgroup for query optimization
resource "aws_athena_workgroup" "data_lake" {
  name = "data-lake-prod"

  configuration {
    enforce_workgroup_configuration    = true
    publish_cloudwatch_metrics_enabled = true

    result_configuration {
      output_location = "s3://${aws_s3_bucket.athena_results.bucket}/results/"

      encryption_configuration {
        encryption_option = "SSE_S3"
      }
    }

    engine_version {
      selected_engine_version = "Athena engine version 3"
    }
  }
}
# Lake Formation settings for governance
resource "aws_lakeformation_resource" "data_lake" {
  arn = aws_s3_bucket.data_lake.arn
}

resource "aws_lakeformation_permissions" "data_analysts" {
  principal   = aws_iam_role.data_analysts.arn
  permissions = ["SELECT"]

  table {
    database_name = aws_glue_catalog_database.data_lake.name
    wildcard      = true
  }
}
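After terraform apply, a quick sanity check is to confirm the catalog database exists and to see which tables the bronze crawler has discovered so far. A small boto3 sketch (the names match the Terraform above):

# Sanity-check the Glue Data Catalog created by the Terraform above.
import boto3

glue = boto3.client("glue")

# The catalog database defined in aws_glue_catalog_database.data_lake
database = glue.get_database(Name="data_lake_prod")["Database"]
print("Database:", database["Name"], database.get("LocationUri", ""))

# Tables registered so far by the bronze crawler (empty until its first run)
for table in glue.get_tables(DatabaseName="data_lake_prod")["TableList"]:
    columns = [col["Name"] for col in table["StorageDescriptor"]["Columns"]]
    print(table["Name"], "->", columns)

You can also trigger the first crawl immediately with glue.start_crawler(Name="bronze-data-crawler") rather than waiting for the six-hour schedule.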
Glue ETL Job: Data Transformation
# glue_jobs/bronze_to_silver.py
# AWS Glue ETL job to transform raw data to processed data
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import functions as F
from pyspark.sql.types import *
# Initialize Glue context
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Read from Bronze layer (raw data)
bronze_df = spark.read.json("s3://company-data-lake-prod/bronze/events/")
# Data quality checks
def validate_data(df):
    """Validate and clean data"""
    # Remove duplicates
    df = df.dropDuplicates(['event_id'])

    # Remove nulls in critical fields
    df = df.filter(
        F.col('event_id').isNotNull() &
        F.col('timestamp').isNotNull() &
        F.col('user_id').isNotNull()
    )

    # Filter out invalid timestamps
    df = df.filter(
        (F.col('timestamp') > '2020-01-01') &
        (F.col('timestamp') < F.current_timestamp())
    )

    return df
# Transform data
silver_df = (
    validate_data(bronze_df)
    # Parse timestamp into partition columns
    .withColumn('event_date', F.to_date('timestamp'))
    .withColumn('event_hour', F.hour('timestamp'))
    # Normalize user agent into a coarse device type
    .withColumn('device_type',
                F.when(F.col('user_agent').contains('Mobile'), 'mobile')
                 .when(F.col('user_agent').contains('Tablet'), 'tablet')
                 .otherwise('desktop'))
    # Geo enrichment
    .withColumn('country', F.coalesce(F.col('geo.country'), F.lit('unknown')))
    .withColumn('city', F.coalesce(F.col('geo.city'), F.lit('unknown')))
    # Add processing metadata
    .withColumn('processed_at', F.current_timestamp())
    .withColumn('processing_job', F.lit(args['JOB_NAME']))
)
# Write to Silver layer (Parquet format, partitioned by date and hour)
(silver_df.write
    .mode('append')
    .partitionBy('event_date', 'event_hour')
    .parquet("s3://company-data-lake-prod/silver/events/"))
# Note: the Parquet write above does not register or update the silver_events
# table in the Glue Data Catalog. Either crawl the silver/ prefix as well, or
# write through a Glue sink with enableUpdateCatalog (see the sketch after
# this script).
# Commit job
job.commit()
# Job metrics (representative run):
# - Input: 100GB JSON (bronze)
# - Output: 25GB Parquet (silver) - roughly 75% smaller
# - Processing time: 15 minutes on 10 DPUs
# - Cost: ~$1.10 per run (10 DPUs * 0.25 hours * $0.44/DPU-hour)
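If you would rather have the job keep the Data Catalog current itself instead of waiting for a crawler, Glue's catalog-updating sink can write the Parquet output and register the table and new partitions in one step. A sketch of that variant, reusing glueContext and silver_df from the script above (same database and table names as before):

# Variant of the Silver write step: write Parquet through a Glue sink that
# also creates/updates the silver_events table in the Data Catalog.
# (glueContext and silver_df come from the job script above.)
from awsglue.dynamicframe import DynamicFrame

silver_dyf = DynamicFrame.fromDF(silver_df, glueContext, "silver_events")

sink = glueContext.getSink(
    connection_type="s3",
    path="s3://company-data-lake-prod/silver/events/",
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=["event_date", "event_hour"],
)
sink.setCatalogInfo(catalogDatabase="data_lake_prod", catalogTableName="silver_events")
sink.setFormat("glueparquet")  # Parquet writer that supports catalog updates
sink.writeFrame(silver_dyf)

With this approach the job owns the silver table definition, so the crawler only needs to cover the bronze prefix.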
Querying with Amazon Athena
Athena Query Best Practices:
1. Use Parquet format: often around 10x faster queries and ~90% less data scanned than raw JSON/CSV
2. Partition data: partition by date/region so Athena can prune partitions
3. Use column projection: query only the columns you need
4. Compress data: Snappy or ZSTD compression
5. Use CTAS: CREATE TABLE AS SELECT to materialize expensive query results (see the sketch after this list)
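For point 5, a CTAS statement materializes a query's result as a new partitioned Parquet table, which is handy for aggregations that dashboards hit repeatedly. A sketch of issuing one through boto3; the daily_device_stats table and its gold-layer location are illustrative, not part of the Terraform above:

# Materialize an aggregate as a partitioned Parquet table via CTAS
# (table name and external_location are illustrative).
import boto3

CTAS = """
CREATE TABLE daily_device_stats
WITH (
    format = 'PARQUET',
    write_compression = 'SNAPPY',
    external_location = 's3://company-data-lake-prod/gold/daily_device_stats/',
    partitioned_by = ARRAY['event_date']
) AS
SELECT device_type,
       COUNT(DISTINCT user_id) AS dau,
       COUNT(*)                AS total_events,
       event_date
FROM silver_events
GROUP BY device_type, event_date
"""

athena = boto3.client("athena")
athena.start_query_execution(
    QueryString=CTAS,
    QueryExecutionContext={"Database": "data_lake_prod"},
    WorkGroup="data-lake-prod",
)

Note that CTAS requires partition columns to come last in the SELECT list, which is why event_date appears after the aggregates.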
Typical query latency in this setup:
- Simple aggregation: 1-3 seconds (10GB)
- Complex joins: 5-15 seconds (100GB)
- Full table scan: 30-60 seconds (1TB)
Cost model:
- Athena charges $5 per TB of data scanned
- Partitioning reduces the data scanned by 90%+
- Parquet reduces it by a further 80-90%
- Example: scanning 1TB of raw data costs $5.00; the same query on partitioned Parquet touches roughly a tenth of the data, about $0.50
Production Query Example (Daily active users by device type):
SELECT
    event_date,
    device_type,
    COUNT(DISTINCT user_id) AS dau,
    COUNT(*) AS total_events,
    AVG(session_duration_seconds) AS avg_session
FROM silver_events
WHERE event_date >= DATE '2025-01-01'
  AND event_date < CURRENT_DATE
  AND event_type = 'page_view'
GROUP BY 1, 2
ORDER BY 1 DESC, 3 DESC;
Cost: ~$0.10 for 20GB scan | Time: 2-3 seconds
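The same query can be run programmatically, and Athena reports how many bytes it scanned, which makes the cost directly observable. A small boto3 sketch; it assumes the workgroup and database from the Terraform above and that the DAU query above is stored in a Python string named dau_query:

# Run the DAU query above and report what it cost at $5 per TB scanned.
import time
import boto3

athena = boto3.client("athena")

execution = athena.start_query_execution(
    QueryString=dau_query,  # the SQL shown above, held in a Python string (assumed)
    QueryExecutionContext={"Database": "data_lake_prod"},
    WorkGroup="data-lake-prod",
)
query_id = execution["QueryExecutionId"]

# Poll until Athena finishes the query
while True:
    result = athena.get_query_execution(QueryExecutionId=query_id)["QueryExecution"]
    if result["Status"]["State"] in ("SUCCEEDED", "FAILED", "CANCELLED"):
        break
    time.sleep(1)

stats = result["Statistics"]
scanned_tb = stats["DataScannedInBytes"] / 1e12
print(f"Scanned {scanned_tb:.3f} TB, approx cost ${scanned_tb * 5:.2f}, "
      f"runtime {stats['EngineExecutionTimeInMillis']} ms")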
The workgroup configuration above also gives you:
- Query result caching (saves cost on repeated queries)
- Query limits to prevent runaway costs
- Per-query data-scanned limits (see the sketch below)
- Encryption at rest and in transit
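The per-query scan limit is worth setting explicitly: it turns a runaway SELECT * into a cancelled query instead of a surprise bill. A boto3 sketch, with a 10 GB cutoff chosen purely as an example:

# Cap any single query in the workgroup at ~10 GB scanned (example value).
import boto3

athena = boto3.client("athena")
athena.update_work_group(
    WorkGroup="data-lake-prod",
    ConfigurationUpdates={
        "BytesScannedCutoffPerQuery": 10 * 1024**3,  # queries scanning more are cancelled
        "EnforceWorkGroupConfiguration": True,
    },
)

The same limit can also be baked into the Terraform workgroup via bytes_scanned_cutoff_per_query in its configuration block.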