Performance Tuning

15 min read
Updated Jun 19, 2025

A comprehensive guide to optimizing system performance across applications, databases, infrastructure, and networks.

Performance Tuning Methodology

A systematic approach to identifying and resolving performance bottlenecks through measurement, analysis, and optimization; a minimal timing sketch follows the list below.

Performance Analysis Process

  1. Baseline: Establish current performance metrics
  2. Identify Bottlenecks: Find the limiting factors
  3. Analyze Root Cause: Understand why bottlenecks exist
  4. Implement Changes: Apply targeted optimizations
  5. Measure Impact: Verify improvements
  6. Iterate: Repeat for next bottleneck
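
A simple way to make steps 1 and 5 repeatable is a small timing harness. The sketch below is illustrative and framework-agnostic; the measured function and the run count are placeholders:

# Minimal baseline harness: collect wall-clock samples, report median and p95
import statistics
import time

def measure_baseline(func, *args, runs=100, **kwargs):
    samples = []
    for _ in range(runs):
        start = time.perf_counter()
        func(*args, **kwargs)
        samples.append(time.perf_counter() - start)
    samples.sort()
    return {
        'median_s': statistics.median(samples),
        'p95_s': samples[int(len(samples) * 0.95) - 1],
    }

# Example: record the baseline before touching the code, and again after
print(measure_baseline(lambda: sum(i * i for i in range(10_000))))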

Application Performance

Code Profiling

# Python profiling with cProfile
import cProfile
import functools
import pstats
from pstats import SortKey

def profile_function(func):
    """Decorator to profile function execution"""
    @functools.wraps(func)  # preserve the wrapped function's metadata
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        result = func(*args, **kwargs)
        profiler.disable()
        
        stats = pstats.Stats(profiler)
        stats.sort_stats(SortKey.TIME)
        stats.print_stats(10)  # Top 10 time-consuming functions
        
        return result
    return wrapper

# Memory profiling (run with: python -m memory_profiler script.py)
from memory_profiler import profile

@profile
def memory_intensive_function():
    # Large data structure
    data = [i for i in range(1000000)]
    processed = process_data(data)  # process_data: application-specific placeholder
    return processed
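
If installing memory_profiler is not an option, the standard-library tracemalloc module gives a similar view of allocation hot spots; a minimal sketch:

# Track allocation sites with the stdlib tracemalloc
import tracemalloc

tracemalloc.start()
data = [i for i in range(1_000_000)]
snapshot = tracemalloc.take_snapshot()
for stat in snapshot.statistics('lineno')[:5]:
    print(stat)  # top five allocation sites by line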

// Async profiling for Node.js (JavaScript, separate file from the Python above)
const async_hooks = require('async_hooks');
const fs = require('fs');

// Track async operations
const asyncHook = async_hooks.createHook({
    init(asyncId, type, triggerAsyncId) {
        fs.writeSync(1, `Init: ${type} (${asyncId}) triggered by ${triggerAsyncId}\n`);
    },
    before(asyncId) {
        fs.writeSync(1, `Before: ${asyncId}\n`);
    },
    after(asyncId) {
        fs.writeSync(1, `After: ${asyncId}\n`);
    },
    destroy(asyncId) {
        fs.writeSync(1, `Destroy: ${asyncId}\n`);
    }
});

asyncHook.enable();

Caching Strategies

# Multi-level caching implementation
class CacheManager:
    def __init__(self):
        self.l1_cache = {}       # L1: in-process dict (bound it with an LRU in production)
        self.l2_cache = Redis()  # L2: shared Redis cache (placeholder client)
        self.l3_cache = CDN()    # L3: CDN edge cache (placeholder client)

    def get(self, key):
        # Check L1 cache
        if key in self.l1_cache:
            return self.l1_cache[key]

        # Check L2 cache
        value = self.l2_cache.get(key)
        if value is not None:  # explicit None check so falsy values still count as hits
            self.l1_cache[key] = value
            return value

        # Check L3 cache
        value = self.l3_cache.get(key)
        if value is not None:
            self.l2_cache.set(key, value, ttl=3600)
            self.l1_cache[key] = value
            return value

        # Cache miss - fetch from the system of record
        value = self.fetch_from_source(key)  # application-specific loader
        self.set_all_levels(key, value)
        return value

    def set_all_levels(self, key, value):
        self.l1_cache[key] = value
        self.l2_cache.set(key, value, ttl=3600)   # 1 hour in Redis
        self.l3_cache.set(key, value, ttl=86400)  # 24 hours at the CDN

# Cache-aside pattern with proper invalidation
@app.route('/api/user/<int:user_id>')
def get_user(user_id):
    cache_key = f'user:{user_id}'
    
    # Try cache first
    cached_user = cache.get(cache_key)
    if cached_user is not None:
        return cached_user
    
    # Cache miss - fetch from database
    user = db.query('SELECT * FROM users WHERE id = ?', user_id)
    
    # Cache for future requests
    cache.set(cache_key, user, ttl=300)
    
    return user

# Invalidate cache on updates
@app.route('/api/user/<int:user_id>', methods=['PUT'])
def update_user(user_id):
    # Update database
    db.execute('UPDATE users SET ... WHERE id = ?', user_id)
    
    # Invalidate cache
    cache.delete(f'user:{user_id}')
    # NB: a plain delete does not expand wildcards; invalidating
    # 'user_list:*' requires pattern support (e.g., SCAN + DEL in Redis)
    cache.delete(f'user_list:*')
    
    return {'status': 'updated'}
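
A related hazard with cache-aside is the stampede: when a hot key expires, many concurrent requests all miss and hit the database at once. A minimal single-process mitigation is a per-key lock so that only one caller reloads. The sketch below is illustrative; the cache handle matches the examples above, and the loader callable is an assumption:

# Single-flight reload: one thread per key repopulates on a miss
import threading

_key_locks = {}
_key_locks_guard = threading.Lock()

def get_with_single_flight(cache_key, loader, ttl=300):
    value = cache.get(cache_key)
    if value is not None:
        return value

    with _key_locks_guard:
        lock = _key_locks.setdefault(cache_key, threading.Lock())

    with lock:
        # Re-check: another thread may have repopulated while we waited
        value = cache.get(cache_key)
        if value is None:
            value = loader()
            cache.set(cache_key, value, ttl=ttl)
    return value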

Async and Parallel Processing

# Python async optimization
import asyncio
import aiohttp

async def fetch_data_async(session, url):
    async with session.get(url) as response:
        return await response.json()

async def process_urls_concurrently(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_data_async(session, url) for url in urls]
        return await asyncio.gather(*tasks)

# Usage: results = asyncio.run(process_urls_concurrently(urls))

# CPU-bound tasks with multiprocessing
from multiprocessing import Pool
import numpy as np

def process_chunk(data_chunk):
    # CPU-intensive processing
    return np.mean(data_chunk ** 2)

def parallel_processing(data, num_processes=4):
    chunk_size = len(data) // num_processes
    chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]
    
    with Pool(processes=num_processes) as pool:
        results = pool.map(process_chunk, chunks)
    
    # NB: averaging chunk means is only exact when all chunks are equal-sized
    return np.mean(results)
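
On platforms that spawn rather than fork worker processes (Windows, and macOS by default since Python 3.8), multiprocessing code must be invoked behind a main guard; a usage sketch:

# Entry-point guard required by spawn-based multiprocessing
if __name__ == '__main__':
    data = np.arange(1_000_000, dtype=np.float64)
    print(parallel_processing(data))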

// Go goroutines for concurrent processing (assumes "runtime" and "sync" imports)
func processItems(items []Item) []Result {
    results := make([]Result, len(items))
    var wg sync.WaitGroup
    
    // Worker pool
    workers := runtime.NumCPU()
    itemChan := make(chan int, len(items))
    
    // Start workers
    for w := 0; w < workers; w++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            for idx := range itemChan {
                results[idx] = processItem(items[idx])
            }
        }()
    }
    
    // Queue work
    for i := range items {
        itemChan <- i
    }
    close(itemChan)
    
    wg.Wait()
    return results
}

Database Optimization

Query Optimization

-- Analyze slow queries
EXPLAIN (ANALYZE, BUFFERS) 
SELECT u.id, u.name, COUNT(o.id) as order_count, SUM(o.total) as total_spent
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
WHERE u.created_at > '2023-01-01'
GROUP BY u.id, u.name
HAVING COUNT(o.id) > 5
ORDER BY total_spent DESC
LIMIT 100;

-- Create appropriate indexes
CREATE INDEX idx_users_created_at ON users(created_at);
CREATE INDEX idx_orders_user_id_total ON orders(user_id, total);

-- Optimize with materialized view
CREATE MATERIALIZED VIEW user_order_summary AS
SELECT 
    u.id,
    u.name,
    COUNT(o.id) as order_count,
    SUM(o.total) as total_spent,
    MAX(o.created_at) as last_order_date
FROM users u
LEFT JOIN orders o ON u.id = o.user_id
GROUP BY u.id, u.name;

CREATE UNIQUE INDEX idx_user_order_summary_id ON user_order_summary(id);

-- Refresh strategy (CONCURRENTLY requires the unique index created above)
CREATE OR REPLACE FUNCTION refresh_user_order_summary()
RETURNS void AS $$
BEGIN
    REFRESH MATERIALIZED VIEW CONCURRENTLY user_order_summary;
END;
$$ LANGUAGE plpgsql;

-- Schedule refresh every 2 hours (requires the pg_cron extension)
SELECT cron.schedule('refresh-summary', '0 */2 * * *', 'SELECT refresh_user_order_summary();');

Connection Pooling

# Database connection pool configuration
from sqlalchemy import create_engine
from sqlalchemy.pool import QueuePool

# Create engine with connection pooling
engine = create_engine(
    'postgresql://user:password@localhost/db',
    poolclass=QueuePool,
    pool_size=20,          # Number of connections to maintain
    max_overflow=10,       # Maximum overflow connections
    pool_pre_ping=True,    # Test connections before using
    pool_recycle=3600,     # Recycle connections after 1 hour
    echo_pool=True         # Log pool checkouts/checkins
)

# PgBouncer configuration for PostgreSQL (/etc/pgbouncer/pgbouncer.ini)
[databases]
mydb = host=localhost port=5432 dbname=mydb

[pgbouncer]
listen_port = 6432
listen_addr = *
auth_type = md5
auth_file = /etc/pgbouncer/userlist.txt
pool_mode = transaction
max_client_conn = 1000
default_pool_size = 25
reserve_pool_size = 5
reserve_pool_timeout = 3
server_lifetime = 3600
server_idle_timeout = 600

# Monitor pool performance (from the PgBouncer admin console: psql -p 6432 pgbouncer)
SHOW POOLS;
SHOW STATS;
SHOW SERVERS;
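
On the application side, pool health can be spot-checked from SQLAlchemy itself. A small sketch (the connection URL is a placeholder):

# Check out a connection and print the pool's current state
from sqlalchemy import create_engine, text

engine = create_engine('postgresql://user:password@localhost/db',
                       pool_size=20, max_overflow=10, pool_pre_ping=True)

with engine.connect() as conn:
    conn.execute(text('SELECT 1'))  # forces a checkout and a liveness check
    print(engine.pool.status())     # checked-in / checked-out / overflow counts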

Database Partitioning

-- Time-based partitioning for large tables
CREATE TABLE events (
    id BIGSERIAL,
    created_at TIMESTAMP NOT NULL,
    event_type VARCHAR(50),
    data JSONB
) PARTITION BY RANGE (created_at);

-- Create monthly partitions
CREATE TABLE events_2024_01 PARTITION OF events
    FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');

CREATE TABLE events_2024_02 PARTITION OF events
    FOR VALUES FROM ('2024-02-01') TO ('2024-03-01');

-- Automated partition management
CREATE OR REPLACE FUNCTION create_monthly_partition()
RETURNS void AS $$
DECLARE
    start_date date;
    end_date date;
    partition_name text;
BEGIN
    start_date := date_trunc('month', CURRENT_DATE + interval '1 month');
    end_date := start_date + interval '1 month';
    partition_name := 'events_' || to_char(start_date, 'YYYY_MM');
    
    EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF events FOR VALUES FROM (%L) TO (%L)',
        partition_name, start_date, end_date);
END;
$$ LANGUAGE plpgsql;

-- Schedule partition creation (pg_cron; the 25th gives lead time before month end)
SELECT cron.schedule('create-partitions', '0 0 25 * *', 'SELECT create_monthly_partition();');

Infrastructure Optimization

Linux Kernel Tuning

# /etc/sysctl.conf - Network performance tuning
# Increase TCP buffer sizes
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728

# Enable TCP Fast Open
net.ipv4.tcp_fastopen = 3

# Increase connection backlog
net.core.somaxconn = 65535
net.ipv4.tcp_max_syn_backlog = 65535

# Low latency: TCP_NODELAY is a per-socket option set by the application, not a
# sysctl; net.ipv4.tcp_low_latency was a no-op and is removed in modern kernels

# File system tuning
fs.file-max = 2097152
vm.swappiness = 10
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5

# Apply settings
sysctl -p

# CPU governor for performance
for cpu in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    echo performance > $cpu
done

# Keep the performance governor across reboots (Ubuntu's ondemand.service resets it)
systemctl disable ondemand

# NUMA optimization
numactl --hardware  # Check NUMA topology
# Run process on specific NUMA node
numactl --cpunodebind=0 --membind=0 ./myapp

Container Resource Optimization

# Kubernetes resource tuning
apiVersion: v1
kind: Pod
metadata:
  name: optimized-app
spec:
  containers:
  - name: app
    image: myapp:latest
    resources:
      requests:
        memory: "256Mi"
        cpu: "250m"
        ephemeral-storage: "1Gi"
      limits:
        memory: "512Mi"
        cpu: "500m"
        ephemeral-storage: "2Gi"
    
    # JVM sizing that respects the container's cgroup memory limit
    env:
    - name: JAVA_OPTS
      value: "-XX:MaxRAMPercentage=75.0 -XX:+UseG1GC -XX:+UseStringDeduplication"
    # Vertical scaling is configured by the VerticalPodAutoscaler object
    # below, not by an environment variable

---
# Vertical Pod Autoscaler
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: app-vpa
spec:
  targetRef:
    apiVersion: "apps/v1"
    kind: Deployment
    name: myapp
  updatePolicy:
    updateMode: "Auto"
  resourcePolicy:
    containerPolicies:
    - containerName: app
      minAllowed:
        cpu: 100m
        memory: 128Mi
      maxAllowed:
        cpu: 2
        memory: 2Gi

Storage Performance

# Disk I/O optimization
# Check current I/O scheduler
cat /sys/block/sda/queue/scheduler

# Set a deadline-style scheduler for databases (mq-deadline on modern multi-queue kernels)
echo mq-deadline > /sys/block/sda/queue/scheduler

# Tune for SSDs: skip request reordering entirely ('none' on modern kernels, 'noop' on legacy)
echo 0 > /sys/block/sda/queue/rotational
echo 0 > /sys/block/sda/queue/rq_affinity
echo none > /sys/block/sda/queue/scheduler

# RAID optimization
# RAID 10 for databases (performance + redundancy)
mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/sd[b-e]

# Tune RAID parameters
echo 65536 > /sys/block/md0/md/stripe_cache_size
echo 8192 > /sys/block/md0/queue/read_ahead_kb

# File system optimization
# XFS for large files
mkfs.xfs -f -d agcount=32 -l size=512m /dev/md0

# Mount options for performance (the XFS nobarrier option was removed in
# modern kernels; omit it there)
mount -o noatime,nodiratime,logbufs=8,logbsize=256k /dev/md0 /data

# Monitor I/O performance
iostat -x 1
iotop -o

Network Optimization

TCP Tuning

# TCP optimization for high-throughput
# /etc/sysctl.d/99-tcp-tuning.conf

# TCP congestion control (BBR requires the tcp_bbr kernel module)
net.ipv4.tcp_congestion_control = bbr
net.core.default_qdisc = fq

# Connection tracking
net.netfilter.nf_conntrack_max = 1048576
net.netfilter.nf_conntrack_tcp_timeout_established = 3600

# TCP keepalive
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 60
net.ipv4.tcp_keepalive_probes = 3

# Enable window scaling
net.ipv4.tcp_window_scaling = 1

# Increase port range
net.ipv4.ip_local_port_range = 10000 65535

# Reuse TIME_WAIT sockets
net.ipv4.tcp_tw_reuse = 1

# NIC optimization
# Enable receive packet steering
echo f > /sys/class/net/eth0/queues/rx-0/rps_cpus

# Increase ring buffer
ethtool -G eth0 rx 4096 tx 4096

# Enable offloading
ethtool -K eth0 gro on gso on tso on

# CPU affinity for network interrupts
# Find IRQs
grep eth0 /proc/interrupts

# Set CPU affinity (IRQ number taken from the grep above; mask 1 = CPU 0)
echo 1 > /proc/irq/24/smp_affinity

Load Balancer Optimization

# NGINX performance tuning
user nginx;
worker_processes auto;
worker_cpu_affinity auto;
worker_rlimit_nofile 65535;

events {
    worker_connections 65535;
    use epoll;
    multi_accept on;
}

http {
    # Basic settings
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    keepalive_requests 1000;
    
    # Buffer settings
    client_body_buffer_size 16K;
    client_header_buffer_size 1k;
    client_max_body_size 8m;
    large_client_header_buffers 4 16k;
    
    # Caching
    open_file_cache max=10000 inactive=60s;
    open_file_cache_valid 30s;
    open_file_cache_min_uses 2;
    open_file_cache_errors on;
    
    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_proxied any;
    gzip_comp_level 6;
    gzip_types text/plain text/css text/xml application/json application/javascript application/xml+rss application/atom+xml image/svg+xml;
    
    # Upstream configuration
    upstream backend {
        least_conn;
        keepalive 300;
        
        server backend1:8080 weight=5 max_fails=3 fail_timeout=30s;
        server backend2:8080 weight=5 max_fails=3 fail_timeout=30s;
        server backend3:8080 weight=5 max_fails=3 fail_timeout=30s;
    }
    
    server {
        listen 80 default_server reuseport;
        listen [::]:80 default_server reuseport;
        
        location / {
            proxy_pass http://backend;
            proxy_http_version 1.1;
            proxy_set_header Connection "";
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            
            # Caching (requires a proxy_cache_path zone and a proxy_cache directive)
            proxy_cache_bypass $http_upgrade;
            proxy_cache_valid 200 302 10m;
            proxy_cache_valid 404 1m;
        }
    }
}

Application-Specific Tuning

JVM Tuning

# Modern JVM optimization
JAVA_OPTS="-server \
  -Xms4g -Xmx4g \
  -XX:+UseG1GC \
  -XX:MaxGCPauseMillis=200 \
  -XX:ParallelGCThreads=20 \
  -XX:ConcGCThreads=5 \
  -XX:InitiatingHeapOccupancyPercent=70 \
  -XX:+HeapDumpOnOutOfMemoryError \
  -XX:HeapDumpPath=/var/log/app/heap-dump.hprof \
  -XX:+UseStringDeduplication \
  -XX:+OptimizeStringConcat \
  -XX:+UseCompressedOops \
  -Djava.net.preferIPv4Stack=true \
  -Dfile.encoding=UTF-8"

# GraalVM native image for startup performance
# (--no-server was removed in newer GraalVM releases, and a blanket
# --initialize-at-build-time is deprecated in favor of per-class arguments)
native-image \
  --no-fallback \
  --initialize-at-build-time \
  -H:+ReportExceptionStackTraces \
  -H:+PrintClassInitialization \
  --enable-http --enable-https \
  -jar myapp.jar

Node.js Optimization

// Cluster mode for multi-core utilization
const cluster = require('cluster');
const numCPUs = require('os').cpus().length;

if (cluster.isPrimary) {  // cluster.isMaster before Node.js 16
    console.log(`Primary ${process.pid} is running`);
    
    // Fork workers
    for (let i = 0; i < numCPUs; i++) {
        cluster.fork();
    }
    
    cluster.on('exit', (worker, code, signal) => {
        console.log(`Worker ${worker.process.pid} died`);
        cluster.fork(); // Replace dead worker
    });
} else {
    // Worker process
    const app = require('./app');
    app.listen(3000);
}

// V8 optimization flags
// node --max-old-space-size=4096 --optimize-for-size --gc-interval=100 app.js

// Memory leak detection
const v8 = require('v8');
const heapSnapshot = v8.writeHeapSnapshot();
console.log(`Heap snapshot written to ${heapSnapshot}`);

Performance Testing

Load Testing

// k6 load test script (save as load-test.js)
import http from 'k6/http';
import { check, sleep } from 'k6';

export let options = {
    stages: [
        { duration: '5m', target: 100 },   // Ramp up
        { duration: '10m', target: 100 },  // Stay at 100 users
        { duration: '5m', target: 200 },   // Ramp up to 200
        { duration: '10m', target: 200 },  // Stay at 200
        { duration: '5m', target: 0 },     // Ramp down
    ],
    thresholds: {
        http_req_duration: ['p(95)<500'], // 95% of requests under 500ms
        http_req_failed: ['rate<0.1'],    // Error rate under 10%
    },
};

export default function () {
    let response = http.get('https://api.example.com/endpoint');
    
    check(response, {
        'status is 200': (r) => r.status === 200,
        'response time < 500ms': (r) => r.timings.duration < 500,
    });
    
    sleep(1);
}

# Run the test (shell, not part of the k6 script)
k6 run --out influxdb=http://localhost:8086/k6 load-test.js

Profiling in Production

# Continuous profiling with async-profiler
java -agentpath:/path/to/libasyncProfiler.so=start,event=cpu,file=profile.html -jar app.jar

// Go pprof (add to the application's main package)
import _ "net/http/pprof"

// ...then inside main():
go func() {
    log.Println(http.ListenAndServe("localhost:6060", nil))
}()

# Profile CPU (quote the URL so the shell does not interpret '?')
go tool pprof 'http://localhost:6060/debug/pprof/profile?seconds=30'

# Profile memory
go tool pprof http://localhost:6060/debug/pprof/heap

Performance Monitoring

Application Metrics

# Custom performance metrics
from prometheus_client import Histogram, Counter, Gauge
import time

# Define metrics
request_duration = Histogram('request_duration_seconds', 
                           'Request duration',
                           ['method', 'endpoint'],
                           buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5])

db_query_duration = Histogram('db_query_duration_seconds',
                             'Database query duration',
                             ['query_type'])

cache_hits = Counter('cache_hits_total', 'Cache hit count')
cache_misses = Counter('cache_misses_total', 'Cache miss count')

active_connections = Gauge('active_connections', 'Active DB connections')

# Use in application
@app.route('/api/<endpoint>')
def api_handler(endpoint):
    start_time = time.time()
    
    try:
        result = process_request(endpoint)
        return result
    finally:
        duration = time.time() - start_time
        request_duration.labels(method='GET', endpoint=endpoint).observe(duration)
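
The remaining metrics pair naturally with small helpers around the data layer. A sketch, assuming the db and cache handles from the earlier examples (the query string is a placeholder):

# Wire the query histogram and cache counters into the data path
from contextlib import contextmanager

@contextmanager
def timed_query(query_type):
    with db_query_duration.labels(query_type=query_type).time():
        yield

def cached_lookup(key):
    value = cache.get(key)
    if value is not None:
        cache_hits.inc()
        return value
    cache_misses.inc()
    with timed_query('select'):
        value = db.query('SELECT ...')  # placeholder query
    cache.set(key, value, ttl=300)
    return value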

Best Practices

  • Measure First: Profile before optimizing
  • One Change at a Time: Isolate optimization impact
  • Production-Like Testing: Test with realistic data and load
  • Monitor Continuously: Track performance over time
  • Document Changes: Record what worked and why
  • Avoid Premature Optimization: Focus on actual bottlenecks
  • Consider Trade-offs: Balance performance vs. complexity

Performance Checklist

  • □ Application profiling completed
  • □ Database queries optimized
  • □ Caching strategy implemented
  • □ Connection pooling configured
  • □ Infrastructure resources right-sized
  • □ Network latency minimized
  • □ Load testing performed
  • □ Monitoring dashboards created
  • □ Performance SLOs defined
  • □ Runbooks documented