Performance Tuning
Comprehensive guide to optimizing system performance across applications, databases, infrastructure, and networks for maximum efficiency.
Performance Tuning Methodology
Systematic approach to identifying and resolving performance bottlenecks through measurement, analysis, and optimization.
Performance Analysis Process
- Baseline: Establish current performance metrics (see the measurement sketch after this list)
- Identify Bottlenecks: Find the limiting factors
- Analyze Root Cause: Understand why bottlenecks exist
- Implement Changes: Apply targeted optimizations
- Measure Impact: Verify improvements
- Iterate: Repeat for next bottleneck
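As a concrete anchor for the baseline and measurement steps, here is a minimal sketch in Python; handle_request is a hypothetical stand-in for whatever code path is under study:

# Sketch: capture a latency baseline, then re-measure after each change
import statistics
import time

def measure(func, iterations=1000):
    samples = []
    for _ in range(iterations):
        start = time.perf_counter()
        func()
        samples.append(time.perf_counter() - start)
    samples.sort()
    return {'p50': statistics.median(samples),
            'p95': samples[int(len(samples) * 0.95)]}

baseline = measure(handle_request)  # handle_request: hypothetical target
# ...apply one optimization, re-run measure(), and compare against baseline...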
Application Performance
Code Profiling
# Python profiling with cProfile
import cProfile
import functools
import pstats
from pstats import SortKey

def profile_function(func):
    """Decorator to profile function execution"""
    @functools.wraps(func)  # preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        result = func(*args, **kwargs)
        profiler.disable()
        stats = pstats.Stats(profiler)
        stats.sort_stats(SortKey.TIME)
        stats.print_stats(10)  # Top 10 time-consuming functions
        return result
    return wrapper
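A quick usage sketch of the decorator on a hypothetical hot path:

# Usage sketch: decorate a hot function, then call it normally
@profile_function
def slow_path():
    return sum(i * i for i in range(10 ** 6))

slow_path()  # prints the ten most expensive calls by internal time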
# Memory profiling (requires the memory_profiler package)
from memory_profiler import profile

@profile
def memory_intensive_function():
    # Large data structure
    data = [i for i in range(1000000)]
    processed = process_data(data)  # process_data: application-specific
    return processed
// Async profiling for Node.js
const async_hooks = require('async_hooks');
const fs = require('fs');

// Track async operations
const asyncHook = async_hooks.createHook({
  init(asyncId, type, triggerAsyncId) {
    fs.writeSync(1, `Init: ${type} (${asyncId}) triggered by ${triggerAsyncId}\n`);
  },
  before(asyncId) {
    fs.writeSync(1, `Before: ${asyncId}\n`);
  },
  after(asyncId) {
    fs.writeSync(1, `After: ${asyncId}\n`);
  },
  destroy(asyncId) {
    fs.writeSync(1, `Destroy: ${asyncId}\n`);
  }
});
asyncHook.enable();
Caching Strategies
# Multi-level caching implementation (Redis() and CDN() are illustrative clients)
class CacheManager:
    def __init__(self):
        self.l1_cache = {}        # In-memory cache
        self.l2_cache = Redis()   # Redis cache
        self.l3_cache = CDN()     # CDN cache

    def get(self, key):
        # Check L1 cache
        if key in self.l1_cache:
            return self.l1_cache[key]
        # Check L2 cache
        value = self.l2_cache.get(key)
        if value:
            self.l1_cache[key] = value
            return value
        # Check L3 cache
        value = self.l3_cache.get(key)
        if value:
            self.l2_cache.set(key, value, ttl=3600)
            self.l1_cache[key] = value
            return value
        # Cache miss - fetch from source
        value = self.fetch_from_source(key)
        self.set_all_levels(key, value)
        return value

    def set_all_levels(self, key, value):
        self.l1_cache[key] = value
        self.l2_cache.set(key, value, ttl=3600)
        self.l3_cache.set(key, value, ttl=86400)
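One caveat with the sketch above: the L1 dict grows without bound. A hedged variant that bounds it with size and TTL limits, assuming the cachetools package:

# Sketch: bound the in-process L1 cache (assumes cachetools is installed)
from cachetools import TTLCache

class BoundedCacheManager(CacheManager):
    def __init__(self):
        super().__init__()
        # TTLCache is a dict-like mapping, so get()/set_all_levels() work unchanged
        self.l1_cache = TTLCache(maxsize=10_000, ttl=300)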
# Cache-aside pattern with proper invalidation
@app.route('/api/user/<int:user_id>')
def get_user(user_id):
    cache_key = f'user:{user_id}'
    # Try cache first
    cached_user = cache.get(cache_key)
    if cached_user:
        return cached_user
    # Cache miss - fetch from database
    user = db.query('SELECT * FROM users WHERE id = ?', user_id)
    # Cache for future requests
    cache.set(cache_key, user, ttl=300)
    return user

# Invalidate cache on updates
@app.route('/api/user/<int:user_id>', methods=['PUT'])
def update_user(user_id):
    # Update database
    db.execute('UPDATE users SET ... WHERE id = ?', user_id)
    # Invalidate cache
    cache.delete(f'user:{user_id}')
    cache.delete('user_list:*')  # Invalidate list caches (needs pattern-delete support)
    return {'status': 'updated'}
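Note that most cache backends, Redis included, have no wildcard delete; the user_list:* invalidation above needs an explicit key scan. A minimal sketch with redis-py, where the client name r is an assumption:

# Sketch: pattern invalidation via SCAN (incremental, unlike a blocking KEYS call)
import redis

r = redis.Redis()

def delete_by_pattern(pattern):
    for key in r.scan_iter(match=pattern, count=500):
        r.delete(key)

delete_by_pattern('user_list:*')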
Async and Parallel Processing
# Python async optimization
import asyncio
import aiohttp

async def fetch_data_async(session, url):
    async with session.get(url) as response:
        return await response.json()

async def process_urls_concurrently(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_data_async(session, url) for url in urls]
        results = await asyncio.gather(*tasks)
        return results

# CPU-bound tasks with multiprocessing
from multiprocessing import Pool
import numpy as np

def process_chunk(data_chunk):
    # CPU-intensive processing
    return np.mean(data_chunk ** 2)

def parallel_processing(data, num_processes=4):
    # array_split yields evenly sized chunks even when the data
    # doesn't divide exactly by num_processes
    chunks = np.array_split(data, num_processes)
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_chunk, chunks)
    # Weight by chunk length so the combined mean is exact
    return np.average(results, weights=[len(c) for c in chunks])
// Go goroutines for concurrent processing
func processItems(items []Item) []Result {
    results := make([]Result, len(items))
    var wg sync.WaitGroup

    // Worker pool sized to the machine
    workers := runtime.NumCPU()
    itemChan := make(chan int, len(items))

    // Start workers
    for w := 0; w < workers; w++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for idx := range itemChan {
                results[idx] = processItem(items[idx])
            }
        }()
    }

    // Queue work
    for i := range items {
        itemChan <- i
    }
    close(itemChan)
    wg.Wait()
    return results
}
Database Optimization
Query Optimization
-- Analyze slow queries
EXPLAIN (ANALYZE, BUFFERS)
SELECT u.id, u.name, COUNT(o.id) as order_count, SUM(o.total) as total_spent
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2023-01-01'
GROUP BY u.id, u.name
HAVING COUNT(o.id) > 5
ORDER BY total_spent DESC
LIMIT 100;
-- Create appropriate indexes
CREATE INDEX idx_users_created_at ON users(created_at);
CREATE INDEX idx_orders_user_id_total ON orders(user_id, total);
-- Optimize with materialized view
CREATE MATERIALIZED VIEW user_order_summary AS
SELECT
u.id,
u.name,
COUNT(o.id) as order_count,
SUM(o.total) as total_spent,
MAX(o.created_at) as last_order_date
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;
CREATE UNIQUE INDEX idx_user_order_summary_id ON user_order_summary(id);
-- Refresh strategy
CREATE OR REPLACE FUNCTION refresh_user_order_summary()
RETURNS void AS $$
BEGIN
REFRESH MATERIALIZED VIEW CONCURRENTLY user_order_summary;
END;
$$ LANGUAGE plpgsql;
-- Schedule refresh
SELECT cron.schedule('refresh-summary', '0 */2 * * *', 'SELECT refresh_user_order_summary();');
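The cron.schedule call assumes the pg_cron extension is installed. Where it isn't, the refresh can be triggered from the application side; a sketch assuming psycopg2 and a DSN in a hypothetical DATABASE_URL environment variable:

# Sketch: application-side refresh when pg_cron isn't available
import os
import psycopg2

def refresh_summary():
    conn = psycopg2.connect(os.environ['DATABASE_URL'])  # hypothetical DSN env var
    conn.autocommit = True  # REFRESH ... CONCURRENTLY can't run inside a transaction block
    try:
        with conn.cursor() as cur:
            cur.execute('REFRESH MATERIALIZED VIEW CONCURRENTLY user_order_summary;')
    finally:
        conn.close()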
Connection Pooling
# Database connection pool configuration
from sqlalchemy import create_engine
from sqlalchemy.pool import QueuePool

# Create engine with connection pooling
engine = create_engine(
    'postgresql://user:password@localhost/db',
    poolclass=QueuePool,
    pool_size=20,         # Number of connections to maintain
    max_overflow=10,      # Maximum overflow connections
    pool_pre_ping=True,   # Test connections before using
    pool_recycle=3600,    # Recycle connections after 1 hour
    echo_pool=True        # Log pool checkouts/checkins
)
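To confirm the pool settings behave as expected under load, SQLAlchemy exposes a status string on the pool itself; a quick sketch using the engine defined above:

# Sketch: inspect live pool state (checked-out vs. idle connections)
print(engine.pool.status())
# Example output shape: "Pool size: 20  Connections in pool: 3
#   Current Overflow: -17  Current Checked out connections: 0"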
# PgBouncer configuration for PostgreSQL
[databases]
mydb = host=localhost port=5432 dbname=mydb
[pgbouncer]
listen_port = 6432
listen_addr = *
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt
pool_mode = transaction
max_client_conn = 1000
default_pool_size = 25
reserve_pool_size = 5
reserve_pool_timeout = 3
server_lifetime = 3600
server_idle_timeout = 600
# Monitor pool performance (run against the PgBouncer admin console, e.g. psql -p 6432 pgbouncer)
SHOW POOLS;
SHOW STATS;
SHOW SERVERS;
Database Partitioning
-- Time-based partitioning for large tables
CREATE TABLE events (
id BIGSERIAL,
created_at TIMESTAMP NOT NULL,
event_type VARCHAR(50),
data JSONB
) PARTITION BY RANGE (created_at);
-- Create monthly partitions
CREATE TABLE events_2024_01 PARTITION OF events
FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
CREATE TABLE events_2024_02 PARTITION OF events
FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');
-- Automated partition management
CREATE OR REPLACE FUNCTION create_monthly_partition()
RETURNS void AS $$
DECLARE
start_date date;
end_date date;
partition_name text;
BEGIN
start_date := date_trunc('month', CURRENT_DATE + interval '1 month');
end_date := start_date + interval '1 month';
partition_name := 'events_' || to_char(start_date, 'YYYY_MM');
EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF events FOR VALUES FROM (%L) TO (%L)',
partition_name, start_date, end_date);
END;
$$ LANGUAGE plpgsql;
-- Schedule partition creation
SELECT cron.schedule('create-partitions', '0 0 25 * *', 'SELECT create_monthly_partition();');
Infrastructure Optimization
Linux Kernel Tuning
# /etc/sysctl.conf - Network performance tuning
# Increase TCP buffer sizes
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728
# Enable TCP Fast Open
net.ipv4.tcp_fastopen = 3
# Increase connection backlog
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535
# Optimize for low latency (tcp_low_latency is a no-op on recent kernels;
# TCP_NODELAY is a per-socket option set in application code, not a sysctl)
net.ipv4.tcp_low_latency = 1
# File system tuning
fs.file-max = 2097152
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
# Apply settings
sysctl -p
# CPU governor for performance
for cpu in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
echo performance > $cpu
done
# Disable CPU frequency scaling
systemctl disable ondemand
# NUMA optimization
numactl --hardware # Check NUMA topology
# Run process on specific NUMA node
numactl --cpunodebind=0 --membind=0 ./myapp
Container Resource Optimization
# Kubernetes resource tuning
apiVersion: v1
kind: Pod
metadata:
  name: optimized-app
spec:
  containers:
  - name: app
    image: myapp:latest
    resources:
      requests:
        memory: "256Mi"
        cpu: "250m"
        ephemeral-storage: "1Gi"
      limits:
        memory: "512Mi"
        cpu: "500m"
        ephemeral-storage: "2Gi"
    # JVM optimization for containers
    env:
    - name: JAVA_OPTS
      value: "-XX:MaxRAMPercentage=75.0 -XX:+UseG1GC -XX:+UseStringDeduplication"
    # Vertical pod autoscaling is enabled by the VerticalPodAutoscaler object below,
    # not by a pod-level setting
---
# Vertical Pod Autoscaler
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: app-vpa
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: Deployment
    name: myapp
  updatePolicy:
    updateMode: "Auto"
  resourcePolicy:
    containerPolicies:
    - containerName: app
      minAllowed:
        cpu: 100m
        memory: 128Mi
      maxAllowed:
        cpu: 2
        memory: 2Gi
Storage Performance
# Disk I/O optimization
# Check current I/O scheduler
cat /sys/block/sda/queue/scheduler
# Set to deadline for databases (mq-deadline on modern multi-queue kernels)
echo deadline > /sys/block/sda/queue/scheduler
# Tune for SSDs ("none" replaces "noop" on multi-queue kernels)
echo 0 > /sys/block/sda/queue/rotational
echo 0 > /sys/block/sda/queue/rq_affinity
echo noop > /sys/block/sda/queue/scheduler
# RAID optimization
# RAID 10 for databases (performance + redundancy)
mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/sd[b-e]
# Tune RAID parameters (stripe_cache_size applies to RAID 4/5/6 arrays)
echo 65536 > /sys/block/md0/md/stripe_cache_size
echo 8192 > /sys/block/md0/queue/read_ahead_kb
# File system optimization
# XFS for large files
mkfs.xfs -f -d agcount=32 -l size=512m /dev/md0
# Mount options for performance (nobarrier is deprecated on recent kernels
# and only safe with a battery-backed write cache)
mount -o noatime,nodiratime,nobarrier,logbufs=8,logbsize=256k /dev/md0 /data
# Monitor I/O performance
iostat -x 1
iotop -o
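For programmatic sampling alongside iostat and iotop, a sketch using the psutil package; the device name sda is an assumption:

# Sketch: sample per-device disk throughput with psutil (assumes psutil is installed)
import time
import psutil

before = psutil.disk_io_counters(perdisk=True)['sda']
time.sleep(1)
after = psutil.disk_io_counters(perdisk=True)['sda']
print(f"read {(after.read_bytes - before.read_bytes) / 1e6:.1f} MB/s, "
      f"write {(after.write_bytes - before.write_bytes) / 1e6:.1f} MB/s")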
Network Optimization
TCP Tuning
# TCP optimization for high-throughput
# /etc/sysctl.d/99-tcp-tuning.conf
# TCP congestion control
net.ipv4.tcp_congestion_control = bbr
net.core.default_qdisc = fq
# Connection tracking
net.netfilter.nf_conntrack_max = 1048576
net.netfilter.nf_conntrack_tcp_timeout_established = 3600
# TCP keepalive
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 60
net.ipv4.tcp_keepalive_probes = 3
# Enable window scaling
net.ipv4.tcp_window_scaling = 1
# Increase port range
net.ipv4.ip_local_port_range = 10000 65535
# Reuse TIME_WAIT sockets
net.ipv4.tcp_tw_reuse = 1
# NIC optimization
# Enable receive packet steering
echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus
# Increase ring buffer
ethtool -G eth0 rx 4096 tx 4096
# Enable offloading
ethtool -K eth0 gro on gso on tso on
# CPU affinity for network interrupts
# Find IRQs
grep eth0 /proc/interrupts
# Set CPU affinity
echo 1 > /proc/irq/24/smp_affinity # Bind to CPU 0
Load Balancer Optimization
# NGINX performance tuning
user nginx;
worker_processes auto;
worker_cpu_affinity auto;
worker_rlimit_nofile 65535;

events {
    worker_connections 65535;
    use epoll;
    multi_accept on;
}

http {
    # Basic settings
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    keepalive_requests 1000;

    # Buffer settings
    client_body_buffer_size 16K;
    client_header_buffer_size 1k;
    client_max_body_size 8m;
    large_client_header_buffers 4 16k;

    # Caching
    open_file_cache max=10000 inactive=60s;
    open_file_cache_valid 30s;
    open_file_cache_min_uses 2;
    open_file_cache_errors on;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_proxied any;
    gzip_comp_level 6;
    gzip_types text/plain text/css text/xml application/json application/javascript application/xml+rss application/atom+xml image/svg+xml;

    # Upstream configuration
    upstream backend {
        least_conn;
        keepalive 300;
        server backend1:8080 weight=5 max_fails=3 fail_timeout=30s;
        server backend2:8080 weight=5 max_fails=3 fail_timeout=30s;
        server backend3:8080 weight=5 max_fails=3 fail_timeout=30s;
    }

    server {
        listen 80 default_server reuseport;
        listen [::]:80 default_server reuseport;

        location / {
            proxy_pass http://backend;
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;

            # Caching (proxy_cache_valid takes effect once a cache zone is
            # defined via proxy_cache_path and enabled with proxy_cache)
            proxy_cache_bypass $http_upgrade;
            proxy_cache_valid 200 302 10m;
            proxy_cache_valid 404 1m;
        }
    }
}
Application-Specific Tuning
JVM Tuning
# Modern JVM optimization
JAVA_OPTS="-server \
-Xms4g -Xmx4g \
-XX:+UseG1GC \
-XX:MaxGCPauseMillis=200 \
-XX:ParallelGCThreads=20 \
-XX:ConcGCThreads=5 \
-XX:InitiatingHeapOccupancyPercent=70 \
-XX:+HeapDumpOnOutOfMemoryError \
-XX:HeapDumpPath=/var/log/app/heap-dump.hprof \
-XX:+UseStringDeduplication \
-XX:+OptimizeStringConcat \
-XX:+UseCompressedOops \
-Djava.net.preferIPv4Stack=true \
-Dfile.encoding=UTF-8"
# GraalVM native image for startup performance
native-image \
--no-server \
--no-fallback \
--initialize-at-build-time \
-H:+ReportExceptionStackTraces \
-H:+PrintClassInitialization \
--enable-http --enable-https \
-jar myapp.jar
Node.js Optimization
// Cluster mode for multi-core utilization
const cluster = require('cluster');
const numCPUs = require('os').cpus().length;

if (cluster.isMaster) { // cluster.isPrimary in Node.js 16+
  console.log(`Master ${process.pid} is running`);
  // Fork workers
  for (let i = 0; i < numCPUs; i++) {
    cluster.fork();
  }
  cluster.on('exit', (worker, code, signal) => {
    console.log(`Worker ${worker.process.pid} died`);
    cluster.fork(); // Replace dead worker
  });
} else {
  // Worker process
  const app = require('./app');
  app.listen(3000);
}

// V8 optimization flags
// node --max-old-space-size=4096 --optimize-for-size --gc-interval=100 app.js

// Memory leak detection
const v8 = require('v8');
const heapSnapshot = v8.writeHeapSnapshot();
console.log(`Heap snapshot written to ${heapSnapshot}`);
Performance Testing
Load Testing
// k6 load test script
import http from 'k6/http';
import { check, sleep } from 'k6';

export let options = {
  stages: [
    { duration: '5m', target: 100 },  // Ramp up
    { duration: '10m', target: 100 }, // Stay at 100 users
    { duration: '5m', target: 200 },  // Ramp up to 200
    { duration: '10m', target: 200 }, // Stay at 200
    { duration: '5m', target: 0 },    // Ramp down
  ],
  thresholds: {
    http_req_duration: ['p(95)<500'], // 95% of requests under 500ms
    http_req_failed: ['rate<0.1'],    // Error rate under 10%
  },
};

export default function () {
  let response = http.get('https://api.example.com/endpoint');
  check(response, {
    'status is 200': (r) => r.status === 200,
    'response time < 500ms': (r) => r.timings.duration < 500,
  });
  sleep(1);
}

# Run test
k6 run --out influxdb=http://localhost:8086/k6 load-test.js
Profiling in Production
# Continuous profiling with async-profiler
java -agentpath:/path/to/libasyncProfiler.so=start,event=cpu,file=profile.html -jar app.jar

# Go pprof (add to application code)
import _ "net/http/pprof"

go func() {
    log.Println(http.ListenAndServe("localhost:6060", nil))
}()

# Profile CPU
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'

# Profile memory
go tool pprof http://localhost:6060/debug/pprof/heap
Performance Monitoring
Application Metrics
# Custom performance metrics
from prometheus_client import Histogram, Counter, Gauge
import time

# Define metrics
request_duration = Histogram('request_duration_seconds',
                             'Request duration',
                             ['method', 'endpoint'],
                             buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5])
db_query_duration = Histogram('db_query_duration_seconds',
                              'Database query duration',
                              ['query_type'])
cache_hits = Counter('cache_hits_total', 'Cache hit count')
cache_misses = Counter('cache_misses_total', 'Cache miss count')
active_connections = Gauge('active_connections', 'Active DB connections')

# Use in application
@app.route('/api/<endpoint>')
def api_handler(endpoint):
    start_time = time.time()
    try:
        return process_request(endpoint)
    finally:
        duration = time.time() - start_time
        request_duration.labels(method='GET', endpoint=endpoint).observe(duration)
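The remaining metrics wire up the same way; a short sketch, where query_db and cache stand in for the application's own helpers:

# Sketch: instrumenting queries and cache lookups with the metrics above
def timed_query(query_type, sql, *args):
    # Histogram.labels(...).time() records the block's duration on exit
    with db_query_duration.labels(query_type=query_type).time():
        return query_db(sql, *args)  # query_db: hypothetical DB helper

def instrumented_get(key):
    value = cache.get(key)  # cache: the application's cache client
    if value is not None:
        cache_hits.inc()
    else:
        cache_misses.inc()
    return value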
Best Practices
- Measure First: Profile before optimizing
- One Change at a Time: Isolate optimization impact
- Production-Like Testing: Test with realistic data and load
- Monitor Continuously: Track performance over time
- Document Changes: Record what worked and why
- Avoid Premature Optimization: Focus on actual bottlenecks
- Consider Trade-offs: Balance performance vs. complexity
Performance Checklist
- □ Application profiling completed
- □ Database queries optimized
- □ Caching strategy implemented
- □ Connection pooling configured
- □ Infrastructure resources right-sized
- □ Network latency minimized
- □ Load testing performed
- □ Monitoring dashboards created
- □ Performance SLOs defined
- □ Runbooks documented