This article is still a work in progress…

# ETCD Corruption

# Symptoms

kubectl is unavailable.
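In practice every kubectl call fails with a connection error against the apiserver endpoint (a minimal check; the 10.0.0.10:6443 address follows from the cluster layout shown in the logs below):

```bash
kubectl get nodes
# Typical output while the apiserver is down:
# The connection to the server 10.0.0.10:6443 was refused - did you specify the right host or port?
```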

# Troubleshooting Steps

The control-plane containers are not coming up: etcd panics on startup with `failed to recover v3 backend from snapshot` plus `snapshot file doesn't exist` (etcd's data snapshot file is missing or corrupted, so the etcd process crashes immediately and cannot start), and the apiserver fails along with it.

```
[root@kubernetes-master snap]# crictl ps -a
CONTAINER           IMAGE               CREATED              STATE               NAME                      ATTEMPT             POD ID              POD
afafe5064e4c7       8fa62c12256df       About a minute ago   Exited              kube-apiserver            29                  147ef48d67e5f       kube-apiserver-kubernetes-master
3521bf7b6c885       25f8c7f3da61c       4 minutes ago        Exited              etcd                      29                  e17f5053bbbce       etcd-kubernetes-master
a8329167383b2       df7b72818ad2e       30 minutes ago       Running             kube-controller-manager   9                   2c11b6455c19b       kube-controller-manager-kubernetes-master
75e81ab06da64       595f327f224a4       30 minutes ago       Running             kube-scheduler            9                   b17e89783be58       kube-scheduler-kubernetes-master
be508f61e4339       595f327f224a4       2 hours ago          Exited              kube-scheduler            8                   2e3abf940027d       kube-scheduler-kubernetes-master
62e0e2adeb86a       df7b72818ad2e       2 hours ago          Exited              kube-controller-manager   8                   dc5d4fcec259b       kube-controller-manager-kubernetes-master
7530c5ac098d0       a4ca41631cc7a       5 hours ago          Exited              coredns                   7                   63c274cd8f789       coredns-6d8c4cb4d-vwvh2
731462f291166       08616d26b8e74       5 hours ago          Exited              calico-node               7                   989cfe7beeafc       calico-node-s2pxt
bc11a5eef8486       08616d26b8e74       5 hours ago          Exited              mount-bpffs               0                   989cfe7beeafc       calico-node-s2pxt
8a7ebf0767d4c       d70a5947d57e5       5 hours ago          Exited              install-cni               0                   989cfe7beeafc       calico-node-s2pxt
14fd49d74ea09       d70a5947d57e5       5 hours ago          Exited              upgrade-ipam              7                   989cfe7beeafc       calico-node-s2pxt
e7fd07f886ba5       4c03754524064       5 hours ago          Exited              kube-proxy                7                   cb372e716fb28       kube-proxy-7bdqc
[root@kubernetes-master snap]#
[root@kubernetes-master snap]# crictl logs 3521bf7b6c885
{"level":"info","ts":"2026-03-07T08:07:30.213Z","caller":"etcdmain/etcd.go:72","msg":"Running:","args":["etcd","--advertise-client-urls=https://10.0.0.10:2379","--cert-file=/etc/kubernetes/pki/etcd/server.crt","--client-cert-auth=true","--data-dir=/var/lib/etcd","--initial-advertise-peer-urls=https://10.0.0.10:2380","--initial-cluster=kubernetes-master=https://10.0.0.10:2380","--key-file=/etc/kubernetes/pki/etcd/server.key","--listen-client-urls=https://127.0.0.1:2379,https://10.0.0.10:2379","--listen-metrics-urls=http://127.0.0.1:2381","--listen-peer-urls=https://10.0.0.10:2380","--name=kubernetes-master","--peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt","--peer-client-cert-auth=true","--peer-key-file=/etc/kubernetes/pki/etcd/peer.key","--peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt","--snapshot-count=10000","--trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt"]}
{"level":"info","ts":"2026-03-07T08:07:30.214Z","caller":"etcdmain/etcd.go:115","msg":"server has been already initialized","data-dir":"/var/lib/etcd","dir-type":"member"}
{"level":"info","ts":"2026-03-07T08:07:30.214Z","caller":"embed/etcd.go:131","msg":"configuring peer listeners","listen-peer-urls":["https://10.0.0.10:2380"]}
{"level":"info","ts":"2026-03-07T08:07:30.214Z","caller":"embed/etcd.go:478","msg":"starting with peer TLS","tls-info":"cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, client-cert=, client-key=, trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file =","cipher-suites":[]}
{"level":"info","ts":"2026-03-07T08:07:30.214Z","caller":"embed/etcd.go:139","msg":"configuring client listeners","listen-client-urls":["https://10.0.0.10:2379","https://127.0.0.1:2379"]}
{"level":"info","ts":"2026-03-07T08:07:30.214Z","caller":"embed/etcd.go:307","msg":"starting an etcd server","etcd-version":"3.5.1","git-sha":"e8732fb5f","go-version":"go1.16.3","go-os":"linux","go-arch":"amd64","max-cpu-set":4,"max-cpu-available":4,"member-initialized":true,"name":"kubernetes-master","data-dir":"/var/lib/etcd","wal-dir":"","wal-dir-dedicated":"","member-dir":"/var/lib/etcd/member","force-new-cluster":false,"heartbeat-interval":"100ms","election-timeout":"1s","initial-election-tick-advance":true,"snapshot-count":10000,"snapshot-catchup-entries":5000,"initial-advertise-peer-urls":["https://10.0.0.10:2380"],"listen-peer-urls":["https://10.0.0.10:2380"],"advertise-client-urls":["https://10.0.0.10:2379"],"listen-client-urls":["https://10.0.0.10:2379","https://127.0.0.1:2379"],"listen-metrics-urls":["http://127.0.0.1:2381"],"cors":["*"],"host-whitelist":["*"],"initial-cluster":"","initial-cluster-state":"new","initial-cluster-token":"","quota-size-bytes":2147483648,"pre-vote":true,"initial-corrupt-check":false,"corrupt-check-time-interval":"0s","auto-compaction-mode":"periodic","auto-compaction-retention":"0s","auto-compaction-interval":"0s","discovery-url":"","discovery-proxy":"","downgrade-check-interval":"5s"}
{"level":"info","ts":"2026-03-07T08:07:30.215Z","caller":"etcdserver/backend.go:81","msg":"opened backend db","path":"/var/lib/etcd/member/snap/db","took":"579.74µs"}
{"level":"info","ts":"2026-03-07T08:07:30.682Z","caller":"etcdserver/server.go:508","msg":"recovered v2 store from snapshot","snapshot-index":360038,"snapshot-size":"9.7 kB"}
{"level":"warn","ts":"2026-03-07T08:07:30.682Z","caller":"snap/db.go:88","msg":"failed to find [SNAPSHOT-INDEX].snap.db","snapshot-index":360038,"snapshot-file-path":"/var/lib/etcd/member/snap/0000000000057e66.snap.db","error":"snap: snapshot file doesn't exist"}
{"level":"panic","ts":"2026-03-07T08:07:30.682Z","caller":"etcdserver/server.go:515","msg":"failed to recover v3 backend from snapshot","error":"failed to find database snapshot file (snap: snapshot file doesn't exist)","stacktrace":"go.etcd.io/etcd/server/v3/etcdserver.NewServer\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdserver/server.go:515\ngo.etcd.io/etcd/server/v3/embed.StartEtcd\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/embed/etcd.go:244\ngo.etcd.io/etcd/server/v3/etcdmain.startEtcd\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/etcd.go:227\ngo.etcd.io/etcd/server/v3/etcdmain.startEtcdOrProxyV2\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/etcd.go:122\ngo.etcd.io/etcd/server/v3/etcdmain.Main\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/main.go:40\nmain.main\n\t/tmp/etcd-release-3.5.1/etcd/release/etcd/server/main.go:32\nruntime.main\n\t/home/remote/sbatsche/.gvm/gos/go1.16.3/src/runtime/proc.go:225"}
panic: failed to recover v3 backend from snapshot
goroutine 1 [running]:
go.uber.org/zap/zapcore.(*CheckedEntry).Write (0xc00050a300, 0xc000108240, 0x1, 0x1)
        /home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/go.uber.org/zap@v1.17.0/zapcore/entry.go:234 +0x58d
go.uber.org/zap.(*Logger).Panic (0xc0004c43c0, 0x122f41e, 0x2a, 0xc000108240, 0x1, 0x1)
        /home/remote/sbatsche/.gvm/pkgsets/go1.16.3/global/pkg/mod/go.uber.org/zap@v1.17.0/logger.go:227 +0x85
go.etcd.io/etcd/server/v3/etcdserver.NewServer (0x7ffcf5cd8e37, 0x11, 0x0, 0x0, 0x0, 0x0, 0xc00001f7a0, 0x1, 0x1, 0xc00001f9e0, ...)
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdserver/server.go:515 +0x1656
go.etcd.io/etcd/server/v3/embed.StartEtcd (0xc000020000, 0xc000020600, 0x0, 0x0)
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/embed/etcd.go:244 +0xef8
go.etcd.io/etcd/server/v3/etcdmain.startEtcd (0xc000020000, 0x1203b54, 0x6, 0xc000186301, 0x2)
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/etcd.go:227 +0x32
go.etcd.io/etcd/server/v3/etcdmain.startEtcdOrProxyV2 (0xc0000a6000, 0x12, 0x12)
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/etcd.go:122 +0x257a
go.etcd.io/etcd/server/v3/etcdmain.Main (0xc0000a6000, 0x12, 0x12)
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/etcdmain/main.go:40 +0x11f
main.main ()
        /tmp/etcd-release-3.5.1/etcd/release/etcd/server/main.go:32 +0x45
[root@kubernetes-master snap]# crictl logs afafe5064e4c7
I0307 08:10:30.264211       1 server.go:565] external host was not specified, using 10.0.0.10
I0307 08:10:30.264684       1 server.go:172] Version: v1.23.6
I0307 08:10:30.674407       1 shared_informer.go:240] Waiting for caches to sync for node_authorizer
I0307 08:10:30.674851       1 plugins.go:158] Loaded 12 mutating admission controller (s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,RuntimeClass,DefaultIngressClass,MutatingAdmissionWebhook.
I0307 08:10:30.674876       1 plugins.go:161] Loaded 11 validating admission controller (s) successfully in the following order: LimitRanger,ServiceAccount,PodSecurity,Priority,PersistentVolumeClaimResize,RuntimeClass,CertificateApproval,CertificateSigning,CertificateSubjectRestriction,ValidatingAdmissionWebhook,ResourceQuota.
I0307 08:10:30.675632       1 plugins.go:158] Loaded 12 mutating admission controller (s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,RuntimeClass,DefaultIngressClass,MutatingAdmissionWebhook.
I0307 08:10:30.675657       1 plugins.go:161] Loaded 11 validating admission controller (s) successfully in the following order: LimitRanger,ServiceAccount,PodSecurity,Priority,PersistentVolumeClaimResize,RuntimeClass,CertificateApproval,CertificateSigning,CertificateSubjectRestriction,ValidatingAdmissionWebhook,ResourceQuota.
W0307 08:10:30.677541       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:31.677942       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:31.678090       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:32.678325       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:33.510008       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:34.584837       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:36.516738       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:37.162914       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:40.978996       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:42.068748       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:48.033615       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0307 08:10:48.749224       1 clientconn.go:1331] [core] grpc: addrConn.createTransport failed to connect to {127.0.0.1:2379 127.0.0.1 <nil> 0 <nil>}. Err: connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
E0307 08:10:50.677359       1 run.go:74] "command failed" err="context deadline exceeded"
[root@kubernetes-master snap]#
```
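The snapshot index in the panic (360038, i.e. 0x57e66) matches the missing `0000000000057e66.snap.db` exactly: the WAL's v2 snapshot references a v3 database snapshot file that no longer exists on disk. Recovery means giving etcd a consistent data directory again. Below is a minimal sketch, assuming a single-member kubeadm cluster and that a recent snapshot backup exists; the `/backup/etcd-snapshot.db` path is hypothetical:

```bash
# Confirm the file from the panic really is missing
# (snapshot-index 360038 == 0x57e66, matching the log above):
ls -l /var/lib/etcd/member/snap/

# Move the broken data directory aside instead of deleting it:
mv /var/lib/etcd /var/lib/etcd.broken

# Restore from the most recent snapshot backup (hypothetical path).
# The --name and peer-URL values match the flags etcd was started with above.
etcdutl snapshot restore /backup/etcd-snapshot.db \
  --name kubernetes-master \
  --initial-cluster kubernetes-master=https://10.0.0.10:2380 \
  --initial-advertise-peer-urls https://10.0.0.10:2380 \
  --data-dir /var/lib/etcd

# kubelet restarts the static etcd pod on its own once the data dir is back;
# the apiserver should follow shortly after.
crictl ps | grep etcd
```

If no backup exists at all, a commonly reported last resort for this exact panic is to copy `/var/lib/etcd/member/snap/db` to the missing `.snap.db` path so startup can proceed; that can silently drop recent writes, so archive the whole directory first and treat it strictly as a last resort.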

# kube-proxy Fails to Start

The kube-proxy ConfigMap is misconfigured: kube-proxy tries to connect to 192.168.0.102:6443 (the IP of the master's ens36 interface), but the API Server certificate only includes 10.96.0.1 and 10.0.0.10.

This leads to a chain of failures:

  1. kube-proxy cannot connect to the API Server → no iptables rules are created
  2. without iptables rules, the ClusterIP (10.96.0.1) is unroutable
  3. calico-node cannot fetch a token → its initialization fails
  4. coredns and the other Pods cannot get their networking set up
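To confirm the mismatch, inspect the SANs baked into the apiserver's serving certificate (a quick check; the path is the kubeadm default):

```bash
# 192.168.0.102 should be absent from the list, while 10.0.0.10 and
# 10.96.0.1 (the kubernetes Service ClusterIP) are present.
openssl x509 -in /etc/kubernetes/pki/apiserver.crt -noout -text \
  | grep -A1 'Subject Alternative Name'
```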

Fix: change the server address in the kube-proxy ConfigMap from 192.168.0.102 to 10.0.0.10:

```bash
kubectl get configmap kube-proxy -n kube-system -o yaml | sed 's/192.168.0.102/10.0.0.10/g' | kubectl apply -f -
kubectl delete pods -n kube-system -l k8s-app=kube-proxy
kubectl delete pods -n kube-system -l k8s-app=calico-node
```
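Once the pods have been recreated, a quick sanity check that kube-proxy came back and actually programmed the rules (run on the master; the chain names are kube-proxy's iptables-mode defaults):

```bash
# kube-proxy and calico-node should return to Running
kubectl get pods -n kube-system -l k8s-app=kube-proxy
kubectl get pods -n kube-system -l k8s-app=calico-node

# kube-proxy in iptables mode writes KUBE-SERVICES / KUBE-SVC-* chains
iptables-save | grep -c 'KUBE-SVC'

# the logs should no longer show x509 / connection errors
kubectl logs -n kube-system -l k8s-app=kube-proxy --tail=20
```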