Effective monitoring is essential for maintaining healthy virtual clusters. This guide covers monitoring strategies, metrics collection, and observability best practices for vCluster deployments.
# List all vClusters
vcluster list

# Check pods in host namespace
kubectl get pods -n production -l release=my-vcluster

# Check pod health details
kubectl describe pod -n production -l app=vcluster,release=my-vcluster
# Connect and check nodes
vcluster connect my-vcluster --namespace production
kubectl get nodes
kubectl get pods --all-namespaces

# Check API server responsiveness
kubectl get --raw /healthz
kubectl get --raw /readyz
apiserver_storage_objects - Number of stored objects
Prometheus query examples:
# API server request rate
rate(apiserver_request_total[5m])

# API server error rate
rate(apiserver_request_errors_total[5m])

# P95 API latency
histogram_quantile(0.95, rate(apiserver_request_duration_seconds_bucket[5m]))
# Panel: vCluster Health
up{job="vcluster", namespace="production"}
# Use single stat visualization
# Thresholds: 1 = green, 0 = red
# Panel: Synced Resources by Type
sum by (resource_type) (
  vcluster_syncer_resources_synced{namespace="production"}
)
# Use bar chart or table visualization
# Panel: API Latency P95
histogram_quantile(0.95,
  rate(apiserver_request_duration_seconds_bucket{
    namespace="production"
  }[5m])
)
# Use time series graph
# Alert threshold: > 1s
# Panel: Error Rate
sum(rate(apiserver_request_errors_total{
  namespace="production"
}[5m]))
# Use time series with alert threshold
# fluentbit-config.yaml
# ConfigMap carrying a Fluent Bit pipeline: tail the vCluster container logs,
# pass them through the kubernetes filter, and send them to Elasticsearch.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluent-bit-config
  namespace: logging
data:
  fluent-bit.conf: |
    [INPUT]
        Name              tail
        Path              /var/log/containers/my-vcluster-*.log
        Parser            docker
        Tag               vcluster.*
        Refresh_Interval  5

    [FILTER]
        Name      kubernetes
        Match     vcluster.*
        Kube_URL  https://kubernetes.default.svc:443

    [OUTPUT]
        Name   es
        Match  vcluster.*
        Host   elasticsearch.logging.svc
        Port   9200
        Index  vcluster-logs
#!/bin/bash
# monitor-vcluster.sh
#
# Health check for a single vCluster. Verifies that at least one pod of the
# release is in the Running phase on the host cluster and that the virtual
# API server answers /healthz. On failure a message is posted to
# ALERT_WEBHOOK and the script exits non-zero.
#
# Requires: kubectl, vcluster, curl.

VCLUSTER_NAME="my-vcluster"
NAMESPACE="production"
ALERT_WEBHOOK="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"

#######################################
# Run the pod and API server checks for the configured vCluster.
# Globals:   VCLUSTER_NAME, NAMESPACE (read)
# Outputs:   one status line on stdout (OK / WARNING / CRITICAL)
# Returns:   0 when both checks pass, 1 otherwise
#######################################
check_health() {
  local running

  # Check if pods are running.
  # grep -c counts matching *lines*; the phases are expected to arrive on a
  # single line, so this is effectively "is any pod Running" (0 or 1) rather
  # than a pod count. It also prints 0 when kubectl produces no output.
  running=$(kubectl get pods -n "$NAMESPACE" -l "release=$VCLUSTER_NAME" \
    -o jsonpath='{.items[*].status.phase}' | grep -c "Running")

  if [ "$running" -eq 0 ]; then
    echo "CRITICAL: No running pods found for $VCLUSTER_NAME"
    send_alert "vCluster $VCLUSTER_NAME is down!"
    return 1
  fi

  # Check API server: run a one-off kubectl against the virtual cluster.
  # Output is discarded; only the exit status matters.
  if ! vcluster connect "$VCLUSTER_NAME" --namespace "$NAMESPACE" \
    -- kubectl get --raw /healthz &>/dev/null; then
    echo "WARNING: API server not responding"
    send_alert "vCluster $VCLUSTER_NAME API server not responding"
    return 1
  fi

  echo "OK: vCluster $VCLUSTER_NAME is healthy"
  return 0
}

#######################################
# Post a text message to the alert webhook as a JSON payload.
# Globals:   ALERT_WEBHOOK (read)
# Arguments: $1 - message text; it is interpolated into the JSON body
#            without escaping, so it must not contain double quotes or
#            backslashes.
#######################################
send_alert() {
  local message=$1

  curl -X POST "$ALERT_WEBHOOK" \
    -H 'Content-Type: application/json' \
    -d "{\"text\": \"$message\"}"
}

check_health
exit $?
Schedule with cron:
# Run every 5 minutes
*/5 * * * * /usr/local/bin/monitor-vcluster.sh
# Monitor CPU and memory
watch -n 5 'kubectl top pods -n production -l release=my-vcluster'

# Export metrics for analysis
kubectl top pods -n production -l release=my-vcluster --containers \
  > "metrics-$(date +%Y%m%d-%H%M%S).txt"