Deploy Kubernetes Cluster
Complete guide to deploying production-ready Kubernetes clusters with high availability, security, and observability.
Kubernetes Overview
Kubernetes is a container orchestration platform that automates deployment, scaling, and management of containerized applications.
Architecture Components
- Control Plane: API server, scheduler, controller manager, etcd
- Worker Nodes: Kubelet, kube-proxy, container runtime
- Add-ons: DNS, dashboard, monitoring, ingress controller
- Networking: CNI plugins for pod networking
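On a running cluster, the control-plane components listed above appear as pods in the kube-system namespace. A quick way to confirm they are healthy (assuming kubectl is already configured against the cluster) is:
# List control-plane and add-on pods (API server, scheduler, controller manager, etcd, DNS)
kubectl get pods -n kube-system -o wide
# Confirm node-level components (kubelet, container runtime) are reporting Ready
kubectl get nodes -o wide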
Deployment Options
Managed Kubernetes Services
| Provider | Service | Key Features | Best For |
|---|---|---|---|
| AWS | EKS | Fargate support, IAM integration | AWS-native workloads |
| Google Cloud | GKE | Autopilot, workload identity | Advanced features |
| Azure | AKS | Azure AD integration, Arc | Microsoft workloads |
| Self-managed | kubeadm | Full control, customizable | On-premises/edge |
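As a rough illustration of how little bootstrapping a managed service requires, an EKS cluster can be created with eksctl in a single command; the cluster name, region, node count, and instance type below are placeholder values:
# Example only: create a small EKS cluster with eksctl (names and sizes are placeholders)
eksctl create cluster \
  --name demo-cluster \
  --region us-east-1 \
  --nodes 3 \
  --node-type m5.large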
Production Cluster Deployment
Prerequisites
# System requirements per node
CPU: 2+ cores
Memory: 4GB+ RAM
Disk: 30GB+ available
OS: Ubuntu 20.04 LTS or RHEL 8+
# Network requirements
- Unique hostname, MAC address, and product_uuid for each node
- Full network connectivity between all nodes
- Open ports:
  - Control plane: 6443, 2379-2380, 10250, 10257, 10259
  - Worker nodes: 10250, 30000-32767
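A quick pre-flight check, assuming Ubuntu hosts with netcat installed, confirms the uniqueness and connectivity requirements before running kubeadm:
# Verify each node has a unique hostname, MAC address, and product_uuid
hostname
ip link show                                  # compare MAC addresses across nodes
sudo cat /sys/class/dmi/id/product_uuid
# Spot-check that the API server port is reachable (replace the IP with your control-plane node)
nc -zv 10.0.1.10 6443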
Install Kubernetes with kubeadm
#!/bin/bash
# Run on all nodes
# Disable swap
sudo swapoff -a
sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab
# Install container runtime (containerd)
sudo apt-get update
sudo apt-get install -y ca-certificates curl gnupg lsb-release
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt-get update
sudo apt-get install -y containerd.io
# Configure containerd (kubelet defaults to the systemd cgroup driver, so enable it in containerd)
sudo mkdir -p /etc/containerd
containerd config default | sudo tee /etc/containerd/config.toml
sudo sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' /etc/containerd/config.toml
sudo systemctl restart containerd
# Install Kubernetes components
sudo apt-get update
sudo apt-get install -y apt-transport-https ca-certificates curl
# (the legacy apt.kubernetes.io repository is deprecated; use the community-owned pkgs.k8s.io repository)
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.26/deb/Release.key | sudo gpg --dearmor -o /usr/share/keyrings/kubernetes-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/kubernetes-archive-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.26/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-get install -y kubelet='1.26.0-*' kubeadm='1.26.0-*' kubectl='1.26.0-*'
sudo apt-mark hold kubelet kubeadm kubectl
# Enable kernel modules
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
sudo modprobe overlay
sudo modprobe br_netfilter
# Configure sysctl
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
sudo sysctl --system
Initialize Control Plane
# On first control plane node
sudo kubeadm init \
--control-plane-endpoint="k8s-api.example.com:6443" \
--upload-certs \
--pod-network-cidr=10.244.0.0/16 \
--service-cidr=10.96.0.0/12
# Configure kubectl for admin user
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
# Save join commands from output
# For additional control plane nodes:
sudo kubeadm join k8s-api.example.com:6443 --token [token] \
--discovery-token-ca-cert-hash sha256:[hash] \
--control-plane --certificate-key [key]
# For worker nodes:
sudo kubeadm join k8s-api.example.com:6443 --token [token] \
--discovery-token-ca-cert-hash sha256:[hash]
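Join tokens expire after 24 hours by default. If nodes are added later, a fresh join command can be printed from an existing control-plane node, and membership confirmed with kubectl:
# Generate a new worker join command (run on an existing control-plane node)
kubeadm token create --print-join-command
# Verify that all nodes have registered and become Ready
kubectl get nodes -o wide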
High Availability Setup
HA Control Plane
# HAProxy configuration for API server load balancing
global
    log /dev/log local0
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy
    daemon

defaults
    log global
    mode tcp
    option tcplog
    option dontlognull
    timeout connect 5000
    timeout client 50000
    timeout server 50000

frontend kubernetes-api
    bind *:6443
    mode tcp
    option tcplog
    default_backend kubernetes-masters

backend kubernetes-masters
    mode tcp
    balance roundrobin
    option tcp-check
    server master1 10.0.1.10:6443 check
    server master2 10.0.1.11:6443 check
    server master3 10.0.1.12:6443 check
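With HAProxy in front of the API servers, a quick check from any machine that can reach the load balancer is to hit the API server's health endpoints through the balanced address. This assumes k8s-api.example.com resolves to the HAProxy frontend and that anonymous access to /healthz has not been disabled (it is allowed by default):
# The API server should answer "ok" through the load-balanced endpoint
curl -k https://k8s-api.example.com:6443/healthz
curl -k https://k8s-api.example.com:6443/version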
etcd Backup and Restore
# Backup etcd
ETCDCTL_API=3 etcdctl \
--endpoints=https://127.0.0.1:2379 \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
snapshot save /backup/etcd-snapshot-$(date +%Y%m%d_%H%M%S).db
# Restore etcd
ETCDCTL_API=3 etcdctl \
--data-dir=/var/lib/etcd-restore \
snapshot restore /backup/etcd-snapshot.db
# Update etcd manifest
sudo vim /etc/kubernetes/manifests/etcd.yaml
# Change --data-dir to /var/lib/etcd-restore
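Before relying on a snapshot, it is worth verifying it: etcdctl can print the snapshot's hash, revision, and key count. The path below is a placeholder for whichever snapshot file you produced:
# Inspect a snapshot before (or after) restoring it
ETCDCTL_API=3 etcdctl snapshot status /backup/etcd-snapshot.db --write-out=table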
Networking Configuration
Install CNI Plugin (Calico)
# Install Calico
kubectl create -f https://raw.githubusercontent.com/projectcalico/calico/v3.25.0/manifests/tigera-operator.yaml
# Configure Calico
cat <<EOF | kubectl create -f -
apiVersion: operator.tigera.io/v1
kind: Installation
metadata:
  name: default
spec:
  calicoNetwork:
    ipPools:
    - blockSize: 26
      cidr: 10.244.0.0/16
      encapsulation: VXLANCrossSubnet
      natOutgoing: Enabled
      nodeSelector: all()
EOF
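The operator rolls Calico out asynchronously. Assuming the default tigera-operator installation above, the CNI can be confirmed healthy before scheduling workloads:
# Calico components are created in the calico-system namespace by the operator
kubectl get pods -n calico-system
# Nodes should move to Ready once pod networking is available
kubectl get nodes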
Network Policies
# Default deny all ingress traffic
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: default-deny-ingress
  namespace: production
spec:
  podSelector: {}
  policyTypes:
  - Ingress
---
# Allow specific traffic
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: allow-frontend-to-backend
  namespace: production
spec:
  podSelector:
    matchLabels:
      tier: backend
  policyTypes:
  - Ingress
  ingress:
  - from:
    - podSelector:
        matchLabels:
          tier: frontend
    ports:
    - protocol: TCP
      port: 8080
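After applying the policies, a throwaway pod can confirm that unlabeled pods are blocked while pods labeled tier=frontend still reach the backend. The service name backend-service is illustrative and assumes a backend Service exists on port 8080:
# Should time out: a pod without the frontend label is denied by default-deny-ingress
kubectl run test-denied --rm -it --image=busybox -n production -- \
  wget -qO- --timeout=5 http://backend-service:8080
# Should succeed: a pod labeled tier=frontend matches the allow rule
kubectl run test-allowed --rm -it --image=busybox -n production --labels=tier=frontend -- \
  wget -qO- --timeout=5 http://backend-service:8080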
Storage Configuration
Storage Classes
# Local storage class
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: local-storage
provisioner: kubernetes.io/no-provisioner
volumeBindingMode: WaitForFirstConsumer
---
# AWS EBS storage class
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: fast-ssd
provisioner: ebs.csi.aws.com
parameters:
  type: gp3
  encrypted: "true"
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
---
# NFS storage class
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: nfs-storage
provisioner: nfs.csi.k8s.io
parameters:
  server: nfs-server.example.com
  share: /exported/path
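After creating the classes, confirm they exist and, optionally, mark one as the cluster default so PVCs without an explicit storageClassName still bind. fast-ssd here is just the example class defined above:
kubectl get storageclass
# Optionally make fast-ssd the default class
kubectl patch storageclass fast-ssd -p '{"metadata": {"annotations": {"storageclass.kubernetes.io/is-default-class": "true"}}}'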
Persistent Volume Example
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: app-data
spec:
  accessModes:
  - ReadWriteOnce
  storageClassName: fast-ssd
  resources:
    requests:
      storage: 100Gi
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: database
spec:
  serviceName: database
  replicas: 3
  selector:
    matchLabels:
      app: database
  template:
    metadata:
      labels:
        app: database
    spec:
      containers:
      - name: postgres
        image: postgres:14
        volumeMounts:
        - name: data
          mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: fast-ssd
      resources:
        requests:
          storage: 100Gi
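Each StatefulSet replica gets its own PVC from the volumeClaimTemplates (data-database-0, data-database-1, and so on). Binding and rollout can be checked with:
# PVCs should reach Bound once their pods are scheduled
kubectl get pvc
kubectl get statefulset database
kubectl rollout status statefulset/database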
Security Hardening
RBAC Configuration
# Create service account
apiVersion: v1
kind: ServiceAccount
metadata:
  name: app-service-account
  namespace: production
---
# Define role
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: app-role
  namespace: production
rules:
- apiGroups: [""]
  resources: ["pods", "services"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
  resources: ["deployments"]
  verbs: ["get", "list"]
---
# Bind role to service account
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: app-role-binding
  namespace: production
subjects:
- kind: ServiceAccount
  name: app-service-account
  namespace: production
roleRef:
  kind: Role
  name: app-role
  apiGroup: rbac.authorization.k8s.io
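kubectl auth can-i can impersonate the service account to confirm the binding grants exactly what was intended and nothing more:
# Should return "yes"
kubectl auth can-i list pods \
  --as=system:serviceaccount:production:app-service-account -n production
# Should return "no" (the role does not grant delete)
kubectl auth can-i delete deployments \
  --as=system:serviceaccount:production:app-service-account -n production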
Pod Security Standards
# Enforce pod security standards
apiVersion: v1
kind: Namespace
metadata:
  name: production
  labels:
    pod-security.kubernetes.io/enforce: restricted
    pod-security.kubernetes.io/audit: restricted
    pod-security.kubernetes.io/warn: restricted
---
# Security context for pods
apiVersion: v1
kind: Pod
metadata:
  name: secure-pod
spec:
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    fsGroup: 2000
    seccompProfile:
      type: RuntimeDefault
  containers:
  - name: app
    image: myapp:latest
    securityContext:
      allowPrivilegeEscalation: false
      readOnlyRootFilesystem: true
      capabilities:
        drop:
        - ALL
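With the restricted level enforced on the namespace, a non-compliant pod should be rejected at admission. A server-side dry run makes the enforcement visible without creating anything; the bare nginx image is used only as an example of a pod spec that violates the restricted profile:
# Expect an admission error listing the restricted-profile violations
kubectl run pss-test --image=nginx -n production --dry-run=server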
Ingress and Load Balancing
Install NGINX Ingress Controller
# Install NGINX ingress
kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.5.1/deploy/static/provider/cloud/deploy.yaml
# Configure ingress resource
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: app-ingress
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
spec:
  ingressClassName: nginx
  tls:
  - hosts:
    - app.example.com
    secretName: app-tls
  rules:
  - host: app.example.com
    http:
      paths:
      - path: /api
        pathType: Prefix
        backend:
          service:
            name: api-service
            port:
              number: 8080
      - path: /
        pathType: Prefix
        backend:
          service:
            name: frontend-service
            port:
              number: 80
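Once the controller has published an external address, the ingress can be exercised directly. The curl below assumes DNS for app.example.com is not set up yet and fakes the Host header against the load balancer address reported by kubectl (the IP is a placeholder):
kubectl get ingress app-ingress
# Test routing against the controller's external address
curl -k -H "Host: app.example.com" https://203.0.113.10/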
Monitoring and Observability
Prometheus and Grafana
# Install Prometheus stack using Helm
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
helm install monitoring prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--create-namespace \
--set prometheus.prometheusSpec.retention=30d \
--set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=100Gi
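The chart names its services after the Helm release, so with the release name monitoring used above, Grafana is typically exposed as monitoring-grafana (the exact names can differ if values are overridden). A local port-forward is enough to reach the UI:
# List the services the chart created
kubectl get svc -n monitoring
# Grafana UI on http://localhost:3000 (default chart credentials unless overridden in values)
kubectl port-forward svc/monitoring-grafana 3000:80 -n monitoring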
Application Metrics
# ServiceMonitor for Prometheus
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: app-metrics
  labels:
    release: monitoring
spec:
  selector:
    matchLabels:
      app: myapp
  endpoints:
  - port: metrics
    interval: 30s
    path: /metrics
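The ServiceMonitor only scrapes anything if a Service carries the app: myapp label and exposes a port literally named metrics. A minimal example of such a Service, with port numbers that are assumptions about the workload, looks like:
apiVersion: v1
kind: Service
metadata:
  name: myapp-metrics
  labels:
    app: myapp
spec:
  selector:
    app: myapp
  ports:
  - name: metrics
    port: 9090
    targetPort: 9090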
CI/CD Integration
GitOps with ArgoCD
# Install ArgoCD
kubectl create namespace argocd
kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml
# Create application
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: myapp
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/myorg/myapp
    targetRevision: HEAD
    path: k8s
  destination:
    server: https://kubernetes.default.svc
    namespace: production
  syncPolicy:
    automated:
      prune: true
      selfHeal: true
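After the install manifest settles, ArgoCD stores an auto-generated admin password in a secret; retrieving it and port-forwarding the API server gives access to the UI:
# Fetch the initial admin password
kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d
# Expose the ArgoCD UI locally on https://localhost:8080
kubectl port-forward svc/argocd-server -n argocd 8080:443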
Disaster Recovery
Cluster Backup with Velero
# Install Velero
velero install \
--provider aws \
--plugins velero/velero-plugin-for-aws:v1.6.0 \
--bucket velero-backups \
--backup-location-config region=us-east-1 \
--snapshot-location-config region=us-east-1 \
--secret-file ./credentials-velero
# Create backup schedule
velero schedule create daily-backup \
--schedule="0 2 * * *" \
--ttl 720h0m0s
# Restore from backup
velero restore create --from-backup daily-backup-20231201020015
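Scheduled backups can be listed and inspected before you ever need them; the backup name used in the restore example above follows Velero's <schedule>-<timestamp> convention:
# Confirm the schedule and the backups it has produced
velero schedule get
velero backup get
# Inspect a specific backup, including any errors or partial failures
velero backup describe daily-backup-20231201020015 --details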
Troubleshooting
Common Issues
- Pods not starting: Check events, logs, resource limits
- Network issues: Verify CNI, network policies, DNS
- Storage problems: Check PV/PVC status, storage class
- Performance issues: Review resource requests/limits, node capacity
Useful Commands
# Debug pod issues
kubectl describe pod <pod-name>
kubectl logs <pod-name> -p
kubectl exec -it <pod-name> -- /bin/sh
# Check cluster health
kubectl get nodes
kubectl get componentstatuses   # deprecated; kubectl get --raw='/readyz?verbose' is the modern equivalent
kubectl get pods --all-namespaces
# Resource usage
kubectl top nodes
kubectl top pods
# Network debugging
kubectl run debug --image=nicolaka/netshoot -it --rm
Best Practices
- Resource Management: Always set resource requests and limits (see the sketch after this list)
- Health Checks: Configure liveness and readiness probes
- Namespaces: Use namespaces to separate teams and environments
- Labels: Apply a consistent labeling strategy (for example app, tier, environment)
- Updates: Use rolling updates with explicit maxSurge/maxUnavailable settings
- Monitoring: Collect metrics, logs, and alerts for both the cluster and its workloads
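A minimal sketch that ties the first bullets together: a Deployment with resource requests and limits, liveness and readiness probes, consistent labels, and an explicit rolling-update strategy. The image, port, and thresholds are placeholders:
apiVersion: apps/v1
kind: Deployment
metadata:
  name: myapp
  namespace: production
  labels:
    app: myapp
spec:
  replicas: 3
  selector:
    matchLabels:
      app: myapp
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: myapp
    spec:
      containers:
      - name: app
        image: myapp:1.0.0
        ports:
        - containerPort: 8080
        resources:
          requests:
            cpu: 250m
            memory: 256Mi
          limits:
            cpu: 500m
            memory: 512Mi
        readinessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 15
          periodSeconds: 20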