From e05105d2634cc46da3e3bd9deae7c147fdcef515 Mon Sep 17 00:00:00 2001 From: redscholar Date: Tue, 23 Sep 2025 17:26:29 +0800 Subject: [PATCH] fix: scaling down etcd Signed-off-by: redscholar --- .../core/defaults/inventory/localhost.yaml | 1 + builtin/core/playbooks/add_nodes.yaml | 43 ++- builtin/core/playbooks/create_cluster.yaml | 5 +- builtin/core/playbooks/delete_cluster.yaml | 4 +- builtin/core/playbooks/delete_nodes.yaml | 42 ++- builtin/core/roles/certs/init/tasks/main.yaml | 43 ++- .../roles/certs/renew/etcd/tasks/main.yaml | 6 +- .../certs/renew/kubernetes/tasks/etcd.yaml | 4 +- .../defaults/defaults/main/02-certs.yaml | 2 + .../roles/defaults/defaults/main/04-etcd.yaml | 15 +- .../core/roles/etcd/backup/tasks/main.yaml | 29 ++ .../etcd/{ => install}/files/backup.service | 0 .../etcd/{ => install}/files/etcd.service | 1 + .../{ => install}/tasks/backup_service.yaml | 0 .../etcd/{ => install}/tasks/install.yaml | 13 +- .../core/roles/etcd/install/tasks/main.yaml | 14 + .../etcd/{ => install}/templates/backup.sh | 17 +- .../etcd/{ => install}/templates/backup.timer | 0 .../roles/etcd/install/templates/etcd.env | 100 ++++++ .../roles/etcd/postprocess/tasks/main.yaml | 74 ++++ .../core/roles/etcd/prepare/tasks/main.yaml | 113 ++++++ .../roles/etcd/scaling_down/tasks/main.yaml | 147 ++++++++ .../etcd/scaling_up/learner/tasks/main.yaml | 42 +++ .../etcd/scaling_up/promote/tasks/main.yaml | 41 +++ builtin/core/roles/etcd/tasks/expansion.yaml | 37 -- builtin/core/roles/etcd/tasks/main.yaml | 20 -- builtin/core/roles/etcd/tasks/prepare.yaml | 82 ----- builtin/core/roles/etcd/templates/etcd.env | 75 ---- .../upgrade.yaml => upgrade/tasks/main.yaml} | 7 +- .../templates/kubeadm/kubeadm-init.v1beta3 | 11 +- .../templates/kubeadm/kubeadm-init.v1beta4 | 11 +- .../kubernetes/pre-kubernetes/tasks/main.yaml | 4 +- .../sync-etcd-config/tasks/main.yaml | 64 ++++ builtin/core/roles/security/tasks/main.yaml | 2 +- .../core/roles/uninstall/etcd/tasks/main.yaml | 35 -- .../kubernetes/tasks/kubernetes.yaml | 4 +- cmd/kk/app/builtin/add.go | 20 +- cmd/kk/app/builtin/delete.go | 7 +- cmd/kk/app/options/builtin/add.go | 277 +++++++++++++-- cmd/kk/app/options/builtin/add_test.go | 336 ++++++++++++++++++ cmd/kk/app/options/builtin/builtin.go | 3 +- cmd/kk/app/options/builtin/delete.go | 159 +++++++++ cmd/kk/app/options/builtin/delete_test.go | 309 ++++++++++++++++ pkg/executor/task_executor.go | 10 +- plugins/roles/etcd/backup/tasks/main.yaml | 4 + plugins/roles/etcd/restore/tasks/main.yaml | 4 + 46 files changed, 1896 insertions(+), 341 deletions(-) create mode 100644 builtin/core/roles/etcd/backup/tasks/main.yaml rename builtin/core/roles/etcd/{ => install}/files/backup.service (100%) rename builtin/core/roles/etcd/{ => install}/files/etcd.service (92%) rename builtin/core/roles/etcd/{ => install}/tasks/backup_service.yaml (100%) rename builtin/core/roles/etcd/{ => install}/tasks/install.yaml (84%) create mode 100644 builtin/core/roles/etcd/install/tasks/main.yaml rename builtin/core/roles/etcd/{ => install}/templates/backup.sh (60%) rename builtin/core/roles/etcd/{ => install}/templates/backup.timer (100%) create mode 100644 builtin/core/roles/etcd/install/templates/etcd.env create mode 100644 builtin/core/roles/etcd/postprocess/tasks/main.yaml create mode 100644 builtin/core/roles/etcd/prepare/tasks/main.yaml create mode 100644 builtin/core/roles/etcd/scaling_down/tasks/main.yaml create mode 100644 builtin/core/roles/etcd/scaling_up/learner/tasks/main.yaml create mode 100644 
builtin/core/roles/etcd/scaling_up/promote/tasks/main.yaml delete mode 100644 builtin/core/roles/etcd/tasks/expansion.yaml delete mode 100644 builtin/core/roles/etcd/tasks/main.yaml delete mode 100644 builtin/core/roles/etcd/tasks/prepare.yaml delete mode 100644 builtin/core/roles/etcd/templates/etcd.env rename builtin/core/roles/etcd/{tasks/upgrade.yaml => upgrade/tasks/main.yaml} (66%) create mode 100644 builtin/core/roles/kubernetes/sync-etcd-config/tasks/main.yaml delete mode 100644 builtin/core/roles/uninstall/etcd/tasks/main.yaml create mode 100644 cmd/kk/app/options/builtin/add_test.go create mode 100644 cmd/kk/app/options/builtin/delete_test.go diff --git a/builtin/core/defaults/inventory/localhost.yaml b/builtin/core/defaults/inventory/localhost.yaml index 4c3e1c693..19d84298d 100644 --- a/builtin/core/defaults/inventory/localhost.yaml +++ b/builtin/core/defaults/inventory/localhost.yaml @@ -14,6 +14,7 @@ spec: # port: 22 # user: root # password: 123456 + # internal_ipv4: 1.1.1.1 groups: # all kubernetes nodes. k8s_cluster: diff --git a/builtin/core/playbooks/add_nodes.yaml b/builtin/core/playbooks/add_nodes.yaml index 24b5be49e..ab69937f3 100644 --- a/builtin/core/playbooks/add_nodes.yaml +++ b/builtin/core/playbooks/add_nodes.yaml @@ -36,8 +36,48 @@ - etcd gather_facts: true roles: - - role: etcd + - role: etcd/prepare when: .etcd.deployment_type | eq "external" + - role: etcd/backup + when: .etcd.deployment_type | eq "external" +- hosts: + - etcd + serial: 1 + roles: + - role: etcd/scaling_up/learner + when: + - .etcd.deployment_type | eq "external" + - .installed_etcd | empty | not + - .need_installed_etcd | default list | has .inventory_hostname + - role: etcd/install + when: + - .etcd.deployment_type | eq "external" + - .installed_etcd | empty | not + - .need_installed_etcd | default list | has .inventory_hostname + - role: etcd/scaling_up/promote + when: + - .etcd.deployment_type | eq "external" + - .installed_etcd | empty | not + - .need_installed_etcd | default list | has .inventory_hostname +- hosts: + - etcd + gather_facts: true + roles: + - role: etcd/postprocess + when: + - .etcd.deployment_type | eq "external" + - .installed_etcd | empty | not + - .need_installed_etcd | default list | has .inventory_hostname + +- hosts: + - kube_control_plane + serial: 1 + roles: + - role: kubernetes/sync-etcd-config + when: + - .need_installed_etcd | empty | not + - .etcd.deployment_type | eq "external" + - .kubernetes_install_ActiveState.stdout | eq "active" - hosts: - k8s_cluster @@ -47,7 +87,6 @@ - role: kubernetes/pre-kubernetes when: or (.add_nodes | default list | empty) (.add_nodes | default list | has .inventory_hostname) - role: kubernetes/init-kubernetes - when: or (.add_nodes | default list | empty) (.add_nodes | default list | has .inventory_hostname) - role: kubernetes/join-kubernetes when: - or (.add_nodes | default list | empty) (.add_nodes | default list | has .inventory_hostname) diff --git a/builtin/core/playbooks/create_cluster.yaml b/builtin/core/playbooks/create_cluster.yaml index a63e87f09..788b0c174 100644 --- a/builtin/core/playbooks/create_cluster.yaml +++ b/builtin/core/playbooks/create_cluster.yaml @@ -35,8 +35,11 @@ - hosts: - etcd roles: - - role: etcd + - role: etcd/prepare when: .etcd.deployment_type | eq "external" + - role: etcd/install + when: .etcd.deployment_type | eq "external" + # Install the private image registry - hosts: diff --git a/builtin/core/playbooks/delete_cluster.yaml b/builtin/core/playbooks/delete_cluster.yaml index 
e121f9bbf..46bdefc03 100644
--- a/builtin/core/playbooks/delete_cluster.yaml
+++ b/builtin/core/playbooks/delete_cluster.yaml
@@ -32,8 +32,8 @@
 - hosts:
   - etcd
   roles:
-  - role: uninstall/etcd
-    when:
+  - role: etcd/scaling_down
+    when:
     - .delete.etcd
     - .etcd.deployment_type | eq "external"

diff --git a/builtin/core/playbooks/delete_nodes.yaml b/builtin/core/playbooks/delete_nodes.yaml
index 43270460a..335cf54b5 100644
--- a/builtin/core/playbooks/delete_nodes.yaml
+++ b/builtin/core/playbooks/delete_nodes.yaml
@@ -10,14 +10,39 @@
   gather_facts: true
   roles:
   - defaults
-  - precheck

 - hosts:
-  - kube_control_plane
+  - etcd
+  roles:
+  - role: etcd/prepare
+    when:
+    - .delete.etcd
+    - .etcd.deployment_type | eq "external"
+- hosts:
+  - etcd
+  serial: 1
+  roles:
+  - role: etcd/scaling_down
+    when:
+    - .delete.etcd
+    - .etcd.deployment_type | eq "external"
+    - .need_uninstall_etcd | has .inventory_hostname
+- hosts:
+  - etcd
   gather_facts: true
-  tasks:
+  roles:
+  - role: etcd/postprocess
+    when:
+    - .delete.etcd
+    - .etcd.deployment_type | eq "external"
+    - .need_uninstall_etcd | empty | not
+    - .need_uninstall_etcd | has .inventory_hostname | not
+
+- hosts:
+  - kube_control_plane
+  serial: 1
+  pre_tasks:
   - name: DeleteNode | Ensure at least one control plane node remains in the cluster
-    run_once: true
     command: |
       {{- $cpNodes := list -}}
       {{- range .groups.kube_control_plane -}}
@@ -29,6 +54,13 @@
       echo "At least one control plane node must be retained in the cluster." >&2
       exit 1
       {{- end }}
+  roles:
+  - role: kubernetes/sync-etcd-config
+    when:
+    - .need_installed_etcd | empty | not
+    - .etcd.deployment_type | eq "external"
+    - .kubernetes_install_ActiveState.stdout | eq "active"
+    - .delete_nodes | default list | has .inventory_hostname | not

 - hosts:
   - k8s_cluster
@@ -71,7 +103,7 @@
 - hosts:
   - etcd
   roles:
-  - role: uninstall/etcd
+  - role: etcd/scaling_down
     when:
     - .delete.etcd
     - .etcd.deployment_type | eq "external"
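A note on the `serial: 1` etcd plays above: membership changes go through raft one commit at a time, and the safe-removal window is a pure function of member count. A standalone sketch of that arithmetic (the member counts are illustrative, not read from any inventory):

```bash
#!/usr/bin/env bash
# Quorum for an n-member etcd cluster is floor(n/2) + 1. Scaling down one
# node per play (serial: 1) keeps the survivors above quorum at every step;
# removing two members of a three-node cluster at once would not.
for n in 5 4 3 2 1; do
  quorum=$(( n / 2 + 1 ))
  echo "members=$n quorum=$quorum tolerated_failures=$(( n - quorum ))"
done
```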
"internal_ipv6" | default "" -}} - {{- if $internalIPv4 | empty | not -}} - {{- $ips = append $ips $internalIPv4 -}} - {{- end -}} - {{- if $internalIPv6 | empty | not -}} - {{- $ips = append $ips $internalIPv6 -}} - {{- end -}} + {{- $hostname := index .hostvars .item "hostname" | default "" -}} + {{- if .native.set_hostname -}} + {{- $hostname = .item -}} + {{- end -}} + {{- if $hostname | empty | not -}} + {{- $ips = append $ips $hostname -}} + {{- end -}} + {{- $internalIPv4 := index .hostvars .item "internal_ipv4" | default "" -}} + {{- if $internalIPv4 | empty | not -}} + {{- $ips = append $ips $internalIPv4 -}} + {{- end -}} + {{- $internalIPv6 := index .hostvars .item "internal_ipv6" | default "" -}} + {{- if $internalIPv6 | empty | not -}} + {{- $ips = append $ips $internalIPv6 -}} {{- end -}} {{ $ips | toJson }} date: "{{ .certs.etcd.date }}" policy: "{{ .certs.etcd.gen_cert_policy }}" out_key: >- - {{ .binary_dir }}/pki/etcd.key + {{ .binary_dir }}/pki/etcd-{{ .item }}.key out_cert: >- - {{ .binary_dir }}/pki/etcd.crt + {{ .binary_dir }}/pki/etcd-{{ .item }}.crt + +- name: Cert | Generate the etcd client certificate file when: .groups.etcd | default list | empty | not + gen_cert: + root_key: >- + {{ .binary_dir }}/pki/root.key + root_cert: >- + {{ .binary_dir }}/pki/root.crt + cn: etcd + date: "{{ .certs.etcd.date }}" + policy: "{{ .certs.etcd.gen_cert_policy }}" + out_key: >- + {{ .binary_dir }}/pki/etcd-client.key + out_cert: >- + {{ .binary_dir }}/pki/etcd-client.crt - name: Cert | Generate the image registry certificate file tags: ["image_registry"] diff --git a/builtin/core/roles/certs/renew/etcd/tasks/main.yaml b/builtin/core/roles/certs/renew/etcd/tasks/main.yaml index 838e12b55..7a2515be3 100644 --- a/builtin/core/roles/certs/renew/etcd/tasks/main.yaml +++ b/builtin/core/roles/certs/renew/etcd/tasks/main.yaml @@ -2,19 +2,19 @@ - name: ETCD | Copy CA certificate to remote host copy: src: >- - {{ ..etcd.ca_file }} + {{ .etcd.ca_file }} dest: /etc/ssl/etcd/ssl/ca.crt - name: ETCD | Copy server certificate to remote host copy: src: >- - {{ .etcd.cert_file }} + {{ tpl .etcd.server_cert_file .inventory_hostname }} dest: /etc/ssl/etcd/ssl/server.crt - name: ETCD | Copy server private key to remote host copy: src: >- - {{ .etcd.key_file }} + {{ tpl .etcd.server_key_file .inventory_hostname }} dest: /etc/ssl/etcd/ssl/server.key - name: ETCD | Restart etcd service to apply new certificates diff --git a/builtin/core/roles/certs/renew/kubernetes/tasks/etcd.yaml b/builtin/core/roles/certs/renew/kubernetes/tasks/etcd.yaml index dfd71d810..475b0927f 100644 --- a/builtin/core/roles/certs/renew/kubernetes/tasks/etcd.yaml +++ b/builtin/core/roles/certs/renew/kubernetes/tasks/etcd.yaml @@ -9,13 +9,13 @@ - name: ETCD | Copy client certificate to remote host copy: src: >- - {{ .etcd.cert_file }} + {{ .etcd.client_cert_file }} dest: /etc/kubernetes/pki/etcd/client.crt mode: 0755 - name: ETCD | Copy client key to remote host copy: src: >- - {{ .etcd.key_file }} + {{ .etcd.client_key_file }} dest: /etc/kubernetes/pki/etcd/client.key mode: 0755 diff --git a/builtin/core/roles/defaults/defaults/main/02-certs.yaml b/builtin/core/roles/defaults/defaults/main/02-certs.yaml index a7fca7f48..3757d354e 100644 --- a/builtin/core/roles/defaults/defaults/main/02-certs.yaml +++ b/builtin/core/roles/defaults/defaults/main/02-certs.yaml @@ -8,6 +8,8 @@ # CA (self-signed or provided) # |- etcd.cert # |- etcd.key +# |- etcd-client.cert +# |- etcd-client.key # | # |- image_registry.cert # |- image_registry.key 
diff --git a/builtin/core/roles/defaults/defaults/main/04-etcd.yaml b/builtin/core/roles/defaults/defaults/main/04-etcd.yaml
index 29c33ef56..eb7dfc670 100644
--- a/builtin/core/roles/defaults/defaults/main/04-etcd.yaml
+++ b/builtin/core/roles/defaults/defaults/main/04-etcd.yaml
@@ -9,7 +9,8 @@
       {{ .image_registry.dockerio_registry }}
     repository: kubesphere/etcd
     tag: "{{ .etcd.etcd_version }}"
-  # endpoints: ["https://127.1.1.1:2379"]
+  port: 2379
+  peer_port: 2380
   # Environment variables for etcd service
   env:
     election_timeout: 5000
@@ -37,7 +38,11 @@
   traffic_priority: false
   ca_file: >-
     {{ .binary_dir }}/pki/root.crt
-  cert_file: >-
-    {{ .binary_dir }}/pki/etcd.crt
-  key_file: >-
-    {{ .binary_dir }}/pki/etcd.key
\ No newline at end of file
+  server_cert_file: >-
+    {{ .binary_dir }}/pki/etcd-{{ "{{ . }}" }}.crt
+  server_key_file: >-
+    {{ .binary_dir }}/pki/etcd-{{ "{{ . }}" }}.key
+  client_cert_file: >-
+    {{ .binary_dir }}/pki/etcd-client.crt
+  client_key_file: >-
+    {{ .binary_dir }}/pki/etcd-client.key
\ No newline at end of file
diff --git a/builtin/core/roles/etcd/backup/tasks/main.yaml b/builtin/core/roles/etcd/backup/tasks/main.yaml
new file mode 100644
index 000000000..5c197fcd5
--- /dev/null
+++ b/builtin/core/roles/etcd/backup/tasks/main.yaml
@@ -0,0 +1,29 @@
+- name: Backup | Get leader node name
+  run_once: true
+  delegate_to: "{{ .installed_etcd }}"
+  command: |
+    unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+    # Get the leader's member ID (decimal) from endpoint status JSON output;
+    # the plain text format only carries a true/false IS LEADER column in
+    # field 5, not the leader's ID.
+    LEADER_ID=$(ETCDCTL_API=3 etcdctl \
+      --endpoints=https://localhost:{{ .etcd.port }} \
+      --cacert=/etc/ssl/etcd/ssl/ca.crt \
+      --cert=/etc/ssl/etcd/ssl/server.crt \
+      --key=/etc/ssl/etcd/ssl/server.key \
+      endpoint status -w json | grep -o '"leader":[0-9]*' | head -n1 | cut -d: -f2)
+
+    # Convert decimal ID to hex (member list shows hex ID)
+    LEADER_ID_HEX=$(printf "%x" "$LEADER_ID")
+
+    # Get leader name from member list (simple format: ID, status, name, peerURLs, clientURLs, isLearner)
+    ETCDCTL_API=3 etcdctl \
+      --endpoints=https://localhost:{{ .etcd.port }} \
+      --cacert=/etc/ssl/etcd/ssl/ca.crt \
+      --cert=/etc/ssl/etcd/ssl/server.crt \
+      --key=/etc/ssl/etcd/ssl/server.key \
+      member list | grep "^${LEADER_ID_HEX}" | awk -F',' '{gsub(/^[ \t]+|[ \t]+$/, "", $3); print $3}'
+  register: etcd_backup_leader_name
+
+- name: Backup | Backup on leader node
+  when: .inventory_hostname | eq .etcd_backup_leader_name.stdout
+  command: BACKUP_DIR="{{ .etcd.backup.backup_dir }}/install/etcd-v{{ index .etcd_install_version "stdout" "etcd Version" }}-$(date +%Y-%m-%d-%H-%M-%S)" /usr/local/bin/kube-scripts/backup_etcd.sh
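backup_etcd.sh itself is not part of this hunk, so the following is only a sketch of what the leader-side backup presumably amounts to; the directory layout mirrors the BACKUP_DIR convention above, and the flags are standard etcdctl ones, not taken from the script:

```bash
BACKUP_DIR="${BACKUP_DIR:-/var/backups/etcd/manual-$(date +%Y-%m-%d-%H-%M-%S)}"
mkdir -p "$BACKUP_DIR"
ETCDCTL_API=3 etcdctl \
  --endpoints=https://localhost:2379 \
  --cacert=/etc/ssl/etcd/ssl/ca.crt \
  --cert=/etc/ssl/etcd/ssl/server.crt \
  --key=/etc/ssl/etcd/ssl/server.key \
  snapshot save "$BACKUP_DIR/snapshot.db"
# Sanity-check the artifact (hash, revision, key count, size):
ETCDCTL_API=3 etcdctl snapshot status "$BACKUP_DIR/snapshot.db" -w table
```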
diff --git a/builtin/core/roles/etcd/files/backup.service b/builtin/core/roles/etcd/install/files/backup.service
similarity index 100%
rename from builtin/core/roles/etcd/files/backup.service
rename to builtin/core/roles/etcd/install/files/backup.service
diff --git a/builtin/core/roles/etcd/files/etcd.service b/builtin/core/roles/etcd/install/files/etcd.service
similarity index 92%
rename from builtin/core/roles/etcd/files/etcd.service
rename to builtin/core/roles/etcd/install/files/etcd.service
index 178c3e246..eb7d6e797 100644
--- a/builtin/core/roles/etcd/files/etcd.service
+++ b/builtin/core/roles/etcd/install/files/etcd.service
@@ -13,6 +13,7 @@
 NotifyAccess=all
 RestartSec=10s
 LimitNOFILE=40000
 Restart=always
+TimeoutStartSec=10min

 [Install]
 WantedBy=multi-user.target
diff --git a/builtin/core/roles/etcd/tasks/backup_service.yaml b/builtin/core/roles/etcd/install/tasks/backup_service.yaml
similarity index 100%
rename from builtin/core/roles/etcd/tasks/backup_service.yaml
rename to builtin/core/roles/etcd/install/tasks/backup_service.yaml
diff --git a/builtin/core/roles/etcd/tasks/install.yaml b/builtin/core/roles/etcd/install/tasks/install.yaml
similarity index 84%
rename from builtin/core/roles/etcd/tasks/install.yaml
rename to builtin/core/roles/etcd/install/tasks/install.yaml
index f562ceb52..fee208441 100644
--- a/builtin/core/roles/etcd/tasks/install.yaml
+++ b/builtin/core/roles/etcd/install/tasks/install.yaml
@@ -12,11 +12,6 @@
   loop:
   - "{{ .etcd.env.data_dir }}"

-- name: Install | Generate etcd environment configuration file
-  template:
-    src: etcd.env
-    dest: /etc/etcd.env
-
 - name: Install | Deploy etcd systemd service file
   copy:
     src: etcd.service
@@ -31,10 +26,10 @@
 - name: Install | Configure network traffic priority for etcd
   command: |
     tc qdisc add dev eth0 root handle 1: prio bands 3
-    tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2380 0xffff flowid 1:1
-    tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dport 2380 0xffff flowid 1:1
-    tc filter add dev eth0 parent 1: protocol ip prio 2 u32 match ip sport 2379 0xffff flowid 1:1
-    tc filter add dev eth0 parent 1: protocol ip prio 2 u32 match ip dport 2379 0xffff flowid 1:1
+    tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip sport {{ .etcd.peer_port }} 0xffff flowid 1:1
+    tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dport {{ .etcd.peer_port }} 0xffff flowid 1:1
+    tc filter add dev eth0 parent 1: protocol ip prio 2 u32 match ip sport {{ .etcd.port }} 0xffff flowid 1:1
+    tc filter add dev eth0 parent 1: protocol ip prio 2 u32 match ip dport {{ .etcd.port }} 0xffff flowid 1:1
   when: .etcd.traffic_priority

 - name: Install | Start and enable etcd systemd service
diff --git a/builtin/core/roles/etcd/install/tasks/main.yaml b/builtin/core/roles/etcd/install/tasks/main.yaml
new file mode 100644
index 000000000..6902e27b1
--- /dev/null
+++ b/builtin/core/roles/etcd/install/tasks/main.yaml
@@ -0,0 +1,14 @@
+---
+- name: Install | Install etcd when it is not yet present
+  when:
+  - .etcd_install_LoadState.stdout | eq "not-found"
+  - .need_installed_etcd | has .inventory_hostname
+  block:
+  - name: Install | Render /etc/etcd.env configuration file
+    template:
+      src: etcd.env
+      dest: /etc/etcd.env
+
+  - include_tasks: install.yaml
+
+  - include_tasks: backup_service.yaml
\ No newline at end of file
diff --git a/builtin/core/roles/etcd/templates/backup.sh b/builtin/core/roles/etcd/install/templates/backup.sh
similarity index 60%
rename from builtin/core/roles/etcd/templates/backup.sh
rename to builtin/core/roles/etcd/install/templates/backup.sh
index dbec7ae50..6961dd202 100644
--- a/builtin/core/roles/etcd/templates/backup.sh
+++ b/builtin/core/roles/etcd/install/templates/backup.sh
@@ -5,11 +5,20 @@
 set -o nounset
 set -o pipefail

 ETCDCTL_PATH='/usr/local/bin/etcdctl'
-{{- if .internal_ipv4 | empty | not }}
-ENDPOINTS='https://{{ .internal_ipv4 }}:2379'
-{{- else if .internal_ipv6 | empty | not }}
-ENDPOINTS='https://{{ .internal_ipv6 }}:2379'
+{{- $endpoints := list }}
+{{- range .groups.etcd | default list }}
+  {{- if $.need_uninstall_etcd | default list | has . | not }}
+    {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" }}
+    {{- $internalIPv6 := index $.hostvars .
"internal_ipv6" | default "" }} + {{- if $internalIPv4 | empty | not }} + {{- $endpoints = append $endpoints (printf "https://%s:%d" $internalIPv4 $.etcd.port) }} + {{- end }} + {{- if $internalIPv6 | empty | not }} + {{- $endpoints = append $endpoints (printf "https://[%s]:%d" $internalIPv6 $.etcd.port) }} + {{- end }} + {{- end }} {{- end }} +ETCD_ENDPOINTS="{{ join "," $endpoints }}" ETCD_DATA_DIR="{{ .etcd.env.data_dir }}" BACKUP_DIR="${BACKUP_DIR:-{{ .etcd.backup.backup_dir }}/timer/etcd-$(date +%Y-%m-%d-%H-%M-%S)}" KEEPBACKUPNUMBER='{{ .etcd.backup.keep_backup_number }}' diff --git a/builtin/core/roles/etcd/templates/backup.timer b/builtin/core/roles/etcd/install/templates/backup.timer similarity index 100% rename from builtin/core/roles/etcd/templates/backup.timer rename to builtin/core/roles/etcd/install/templates/backup.timer diff --git a/builtin/core/roles/etcd/install/templates/etcd.env b/builtin/core/roles/etcd/install/templates/etcd.env new file mode 100644 index 000000000..25bf258f0 --- /dev/null +++ b/builtin/core/roles/etcd/install/templates/etcd.env @@ -0,0 +1,100 @@ +{{- $initialCluster := list -}} +{{- $state := "new" -}} +{{- range .groups.etcd | default list -}} + {{- if $.need_uninstall_etcd | default list | has . | not -}} + {{- $hostname := index $.hostvars . "hostname" -}} + {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" -}} + {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" -}} + {{- if $internalIPv4 | empty | not -}} + {{- $initialCluster = append $initialCluster (printf "%s=https://%s:%d" $hostname $internalIPv4 $.etcd.peer_port) -}} + {{- end -}} + {{- if $internalIPv6 | empty | not -}} + {{- $initialCluster = append $initialCluster (printf "%s=https://[%s]:%d" $hostname $internalIPv6 $.etcd.peer_port) -}} + {{- end -}} + {{ if index $.hostvars . 
"etcd_install_LoadState" "stdout" | eq "loaded" -}} + {{- $state = "existing" -}} + {{- end -}} + {{- end -}} +{{- end -}} +ETCD_ENABLE_SYSTEMD_NOTIFY=true +ETCD_DATA_DIR={{ .etcd.env.data_dir }} +ETCD_INITIAL_CLUSTER_STATE={{ $state }} +ETCD_INITIAL_CLUSTER_TOKEN={{ .etcd.env.token }} +{{- $advertiseClientURLs := list }} +{{- $advertisePeerURLs := list }} +{{- $listenClientURLs := list (printf "https://localhost:%d" .etcd.port) }} +{{- $listenPeerURLs := list }} +{{- if .internal_ipv4 | empty | not }} + {{- $advertiseClientURLs = append $advertiseClientURLs (printf "https://%s:%d" .internal_ipv4 .etcd.port) }} + {{- $advertisePeerURLs = append $advertisePeerURLs (printf "https://%s:%d" .internal_ipv4 .etcd.peer_port) }} + {{- $listenClientURLs = append $listenClientURLs (printf "https://%s:%d" .internal_ipv4 .etcd.port) }} + {{- $listenPeerURLs = append $listenPeerURLs (printf "https://%s:%d" .internal_ipv4 .etcd.peer_port) }} +{{- end }} +{{- if .internal_ipv6 | empty | not }} + {{- $advertiseClientURLs = append $advertiseClientURLs (printf "https://[%s]:%d" .internal_ipv6 .etcd.port) }} + {{- $advertisePeerURLs = append $advertisePeerURLs (printf "https://[%s]:%d" .internal_ipv6 .etcd.peer_port) }} + {{- $listenClientURLs = append $listenClientURLs (printf "https://[%s]:%d" .internal_ipv6 .etcd.port) }} + {{- $listenPeerURLs = append $listenPeerURLs (printf "https://[%s]:%d" .internal_ipv6 .etcd.peer_port) }} +{{- end }} +ETCD_ADVERTISE_CLIENT_URLS={{ $advertiseClientURLs | join "," }} +ETCD_INITIAL_ADVERTISE_PEER_URLS={{ $advertisePeerURLs | join "," }} +ETCD_LISTEN_CLIENT_URLS={{ $listenClientURLs | join "," }} +ETCD_LISTEN_PEER_URLS={{ $listenPeerURLs | join "," }} + +ETCD_NAME={{ .hostname }} +ETCD_PROXY=off +ETCD_ENABLE_V2=true +{{- if index (.etcd_member_list | default dict) "stdout" | kindIs "map" }} + {{- $existingInitialCluster := list }} + {{- range .etcd_member_list.stdout.members }} + {{- $name := .name }} + {{- range .peerURLs }} + {{- $existingInitialCluster = append $existingInitialCluster (printf "%s=%s" ($name | default $.inventory_hostname) .) 
}} + {{- end }} + {{- end }} +ETCD_INITIAL_CLUSTER={{ $existingInitialCluster | join "," }} +{{- else }} +ETCD_INITIAL_CLUSTER={{ $initialCluster | join "," }} +{{- end }} +ETCD_ELECTION_TIMEOUT={{ .etcd.env.election_timeout }} +ETCD_HEARTBEAT_INTERVAL={{ .etcd.env.heartbeat_interval }} +ETCD_AUTO_COMPACTION_RETENTION={{ .etcd.env.compaction_retention }} +ETCD_SNAPSHOT_COUNT={{ .etcd.env.snapshot_count }} +{{- if .etcd.metrics }} +ETCD_METRICS={{ .etcd.env.metrics }} +{{- end }} +{{- if .etcd.env.quota_backend_bytes }} +ETCD_QUOTA_BACKEND_BYTES={{ .etcd.env.quota_backend_bytes }} +{{- end }} +{{- if .etcd.env.max_request_bytes }} +ETCD_MAX_REQUEST_BYTES={{ .etcd.env.max_request_bytes }} +{{- end }} +{{- if .etcd.env.max_snapshots }} +ETCD_MAX_SNAPSHOTS={{ .etcd.env.max_snapshots }} +{{- end }} +{{- if .etcd.env.max_wals }} +ETCD_MAX_WALS={{ .etcd.env.max_wals }} +{{- end }} +{{- if .etcd.env.log_level }} +ETCD_LOG_LEVEL={{ .etcd.env.log_level }} +{{- end }} +{{- if .etcd.env.unsupported_arch }} +ETCD_UNSUPPORTED_ARCH={{ .etcd.env.unsupported_arch }} +{{- end }} + +# TLS settings +ETCD_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.crt +ETCD_CERT_FILE=/etc/ssl/etcd/ssl/server.crt +ETCD_KEY_FILE=/etc/ssl/etcd/ssl/server.key +ETCD_CLIENT_CERT_AUTH=true + +ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.crt +ETCD_PEER_CERT_FILE=/etc/ssl/etcd/ssl/server.crt +ETCD_PEER_KEY_FILE=/etc/ssl/etcd/ssl/server.key +ETCD_PEER_CLIENT_CERT_AUTH=true + +# CLI settings +ETCDCTL_ENDPOINTS=https://localhost:{{ .etcd.port }} +ETCDCTL_CACERT=/etc/ssl/etcd/ssl/ca.crt +ETCDCTL_CERT=/etc/ssl/etcd/ssl/server.crt +ETCDCTL_KEY=/etc/ssl/etcd/ssl/server.key diff --git a/builtin/core/roles/etcd/postprocess/tasks/main.yaml b/builtin/core/roles/etcd/postprocess/tasks/main.yaml new file mode 100644 index 000000000..6c89ce2fa --- /dev/null +++ b/builtin/core/roles/etcd/postprocess/tasks/main.yaml @@ -0,0 +1,74 @@ +- name: Postprocess | Refresh etcd.env + command: | + # set ETCD_INITIAL_CLUSTER_STATE to existing + sed -i 's/^ETCD_INITIAL_CLUSTER_STATE=new$/ETCD_INITIAL_CLUSTER_STATE=existing/' /etc/etcd.env + # set ETCD_INITIAL_CLUSTER to all etcd nodes + {{- $initialCluster := list }} + {{- range .groups.etcd | default list }} + {{- if $.need_uninstall_etcd | default list | has . | not }} + {{- $hostname := index $.hostvars . "hostname" }} + {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" }} + {{- if $internalIPv4 | empty | not }} + {{- $initialCluster = append $initialCluster (printf "%s=https://%s:%d" $hostname $internalIPv4 $.etcd.peer_port) }} + {{- end }} + {{- $internalIPv6 := index $.hostvars . 
"internal_ipv6" | default "" }} + {{- if $internalIPv6 | empty | not }} + {{- $initialCluster = append $initialCluster (printf "%s=https://[%s]:%d" $hostname $internalIPv6 $.etcd.peer_port) }} + {{- end }} + {{- end }} + {{- end -}} + sed -i 's/^ETCD_INITIAL_CLUSTER=.*/ETCD_INITIAL_CLUSTER={{ $initialCluster | join "," }}/' /etc/etcd.env + + +- name: ScalingDown | Restart etcd if only one member left after removal + when: + - .installed_etcd | eq .inventory_hostname + # Calculate: original etcd count - nodes to uninstall = remaining count + # Restart only when remaining count is 1 (single node mode) + - sub (.groups.etcd | len) (.need_uninstall_etcd | len) | eq 1 + block: + - name: ScalingDown | Restart etcd to apply new configuration for single node + command: | + unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT + + # Force new cluster when scaling down to single node + if grep -q "^ETCD_FORCE_NEW_CLUSTER=" /etc/etcd.env; then + sed -i 's/^ETCD_FORCE_NEW_CLUSTER=.*/ETCD_FORCE_NEW_CLUSTER=true/' /etc/etcd.env + else + sed -i '1i ETCD_FORCE_NEW_CLUSTER=true' /etc/etcd.env + fi + + systemctl restart etcd + + + for ((i=1; i<=12; i++)); do + if ETCDCTL_API=3 etcdctl \ + --endpoints=https://localhost:{{ .etcd.port }} \ + --cacert=/etc/ssl/etcd/ssl/ca.crt \ + --cert=/etc/ssl/etcd/ssl/server.crt \ + --key=/etc/ssl/etcd/ssl/server.key \ + endpoint health >/dev/null 2>&1; then + echo "✅ etcd is healthy with FORCE_NEW_CLUSTER" + break + fi + sleep 5 + done + + # Remove ETCD_FORCE_NEW_CLUSTER and restart again for clean single node + sed -i '/^ETCD_FORCE_NEW_CLUSTER=/d' /etc/etcd.env + systemctl restart etcd + + for ((i=1; i<=12; i++)); do + if ETCDCTL_API=3 etcdctl \ + --endpoints=https://localhost:{{ .etcd.port }} \ + --cacert=/etc/ssl/etcd/ssl/ca.crt \ + --cert=/etc/ssl/etcd/ssl/server.crt \ + --key=/etc/ssl/etcd/ssl/server.key \ + endpoint health >/dev/null 2>&1; then + echo "✅ etcd is healthy as clean single node" + exit 0 + fi + sleep 5 + done + echo "❌ etcd did not become healthy within 1 minute" + exit 1 diff --git a/builtin/core/roles/etcd/prepare/tasks/main.yaml b/builtin/core/roles/etcd/prepare/tasks/main.yaml new file mode 100644 index 000000000..a8abae28c --- /dev/null +++ b/builtin/core/roles/etcd/prepare/tasks/main.yaml @@ -0,0 +1,113 @@ +--- +- name: Prepare | Gather etcd node state and membership + block: + - name: Prepare | Detect installed, to-install, and to-remove etcd nodes + run_once: true + add_hostvars: + hosts: all + vars: + installed_etcd: >- + {{- $installed := list -}} + {{- range .groups.etcd -}} + {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "loaded") ($.delete_nodes | default list | has . | not) -}} + {{- $installed = append $installed . -}} + {{- end -}} + {{- end -}} + {{ $installed | first | default "" }} + need_installed_etcd: >- + {{- $needInstalled := list -}} + {{- range .groups.etcd -}} + {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found") ($.delete_nodes | default list | has . | not) -}} + {{- $needInstalled = append $needInstalled . -}} + {{- end -}} + {{- end -}} + {{ $needInstalled | toJson }} + need_uninstall_etcd: >- + {{- $needUnInstalled := list -}} + {{- range .groups.etcd -}} + {{- if $.delete_nodes | default list | has . -}} + {{- $needUnInstalled = append $needUnInstalled . 
diff --git a/builtin/core/roles/etcd/prepare/tasks/main.yaml b/builtin/core/roles/etcd/prepare/tasks/main.yaml
new file mode 100644
index 000000000..a8abae28c
--- /dev/null
+++ b/builtin/core/roles/etcd/prepare/tasks/main.yaml
@@ -0,0 +1,113 @@
+---
+- name: Prepare | Gather etcd node state and membership
+  block:
+  - name: Prepare | Detect installed, to-install, and to-remove etcd nodes
+    run_once: true
+    add_hostvars:
+      hosts: all
+      vars:
+        installed_etcd: >-
+          {{- $installed := list -}}
+          {{- range .groups.etcd -}}
+            {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "loaded") ($.delete_nodes | default list | has . | not) -}}
+              {{- $installed = append $installed . -}}
+            {{- end -}}
+          {{- end -}}
+          {{ $installed | first | default "" }}
+        need_installed_etcd: >-
+          {{- $needInstalled := list -}}
+          {{- range .groups.etcd -}}
+            {{- if and ((index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found") ($.delete_nodes | default list | has . | not) -}}
+              {{- $needInstalled = append $needInstalled . -}}
+            {{- end -}}
+          {{- end -}}
+          {{ $needInstalled | toJson }}
+        need_uninstall_etcd: >-
+          {{- $needUnInstalled := list -}}
+          {{- range .groups.etcd -}}
+            {{- if $.delete_nodes | default list | has . -}}
+              {{- $needUnInstalled = append $needUnInstalled . -}}
+            {{- end -}}
+          {{- end -}}
+          {{ $needUnInstalled | toJson }}
+
+- name: Prepare | Validate installed etcd version
+  when: .etcd_install_LoadState.stdout | eq "loaded"
+  block:
+  - name: Prepare | Ensure target etcd version is not lower than installed version
+    when: .etcd_install_LoadState.stdout | eq "loaded"
+    assert:
+      that: .etcd.etcd_version | semverCompare (printf ">=v%s" (index .etcd_install_version "stdout" "etcd Version"))
+      fail_msg: >-
+        Installed etcd version: {{ index .etcd_install_version "stdout" "etcd Version" }} is lower than target etcd version: {{ .etcd.etcd_version }}
+
+- name: Prepare | Distribute etcd package for install or upgrade
+  when: >-
+    or
+    (.etcd_install_version.error | empty | not)
+    (.etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version")))
+  block:
+  - name: Prepare | Copy etcd binary package to node
+    copy:
+      src: >-
+        {{ .binary_dir }}/etcd/{{ .etcd.etcd_version }}/{{ .binary_type }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz
+      dest: >-
+        {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz
+  - name: Prepare | Extract etcd binaries to /usr/local/bin/
+    command: |
+      tar --strip-components=1 -C /usr/local/bin/ -xvf {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz \
+        --wildcards 'etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}/etcd*'
+
+- name: Prepare | Remove ETCD_FORCE_NEW_CLUSTER if exists and restart etcd
+  when:
+  - .installed_etcd | eq .inventory_hostname
+  - .need_installed_etcd | empty | not
+  - .groups.etcd | len | eq 1
+  block:
+  - name: Prepare | Check if ETCD_FORCE_NEW_CLUSTER exists in etcd.env
+    command: grep "^ETCD_FORCE_NEW_CLUSTER=true" /etc/etcd.env
+    ignore_errors: true
+    register: force_new_cluster_check
+  - name: Prepare | Remove ETCD_FORCE_NEW_CLUSTER and restart etcd
+    when: .force_new_cluster_check.stdout | eq "ETCD_FORCE_NEW_CLUSTER=true"
+    command: |
+      sed -i '/^ETCD_FORCE_NEW_CLUSTER=/d' /etc/etcd.env
+      systemctl restart etcd
+
+      for ((i=1; i<=12; i++)); do
+        if ETCDCTL_API=3 etcdctl \
+          --endpoints=https://localhost:{{ .etcd.port }} \
+          --cacert=/etc/ssl/etcd/ssl/ca.crt \
+          --cert=/etc/ssl/etcd/ssl/server.crt \
+          --key=/etc/ssl/etcd/ssl/server.key \
+          endpoint health >/dev/null 2>&1; then
+          echo "✅ etcd restarted successfully without FORCE_NEW_CLUSTER"
+          exit 0
+        fi
+        sleep 5
+      done
+      echo "❌ etcd did not become healthy within 1 minute"
+      exit 1
+
+- name: Prepare | Synchronize certificates to nodes pending installation
+  when:
+  - .etcd_install_LoadState.stdout | eq "not-found"
+  - .need_installed_etcd | has .inventory_hostname
+  block:
+  - name: Prepare | Copy CA and server certificate files
+    block:
+    - name: Prepare | Copy CA certificate to etcd node
+      copy:
+        src: >-
+          {{ .etcd.ca_file }}
+        dest: /etc/ssl/etcd/ssl/ca.crt
+    - name: Prepare | Copy server certificate to etcd node
+      copy:
+        src: >-
+          {{ tpl .etcd.server_cert_file .inventory_hostname }}
+        dest: /etc/ssl/etcd/ssl/server.crt
+    - name: Prepare | Copy server key to etcd node
+      copy:
+        src: >-
+          {{ tpl .etcd.server_key_file .inventory_hostname }}
+        dest: /etc/ssl/etcd/ssl/server.key
\ No newline at end of file
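The gating facts used throughout (etcd_install_LoadState, etcd_install_ActiveState, etcd_install_version) read like straight systemd and binary probes, presumably something equivalent to the following; shown here only to make the "loaded" / "not-found" comparisons above concrete:

```bash
systemctl show etcd.service --property=LoadState --value    # "loaded" or "not-found"
systemctl show etcd.service --property=ActiveState --value  # "active", "inactive", ...
/usr/local/bin/etcd --version | head -n1                    # e.g. "etcd Version: 3.5.13"
```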
diff --git a/builtin/core/roles/etcd/scaling_down/tasks/main.yaml b/builtin/core/roles/etcd/scaling_down/tasks/main.yaml
new file mode 100644
index 000000000..5a086ef4b
--- /dev/null
+++ b/builtin/core/roles/etcd/scaling_down/tasks/main.yaml
@@ -0,0 +1,147 @@
+---
+- name: ScalingDown | Execute actions on etcd nodes scheduled for removal
+  block:
+  - name: ScalingDown | Remove etcd member from cluster
+    run_once: true
+    delegate_to: "{{ .installed_etcd }}"
+    when:
+    - .installed_etcd | empty | not
+    - .need_uninstall_etcd | has .inventory_hostname
+    command: |
+      unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+      # Get the member ID of the node to be removed; the member name matches
+      # ETCD_NAME, which is rendered from the removed node's hostname.
+      MEMBER_ID=$(ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        member list | grep "{{ .hostname }}" | awk -F',' '{print $1}')
+      if [ -z "$MEMBER_ID" ]; then
+        echo "Member does not exist, skipping removal."
+        exit 0
+      fi
+      echo "Removing member $MEMBER_ID"
+      # Remove the member from the etcd cluster
+      ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        member remove "$MEMBER_ID"
+
+      ############################################
+      # Wait for the member ID to disappear from the list (ensure removal has been committed)
+      ############################################
+      for i in $(seq 1 60); do
+        STILL_PRESENT=$(ETCDCTL_API=3 etcdctl \
+          --endpoints=https://localhost:{{ .etcd.port }} \
+          --cacert=/etc/ssl/etcd/ssl/ca.crt \
+          --cert=/etc/ssl/etcd/ssl/server.crt \
+          --key=/etc/ssl/etcd/ssl/server.key \
+          member list | awk -F',' '{print $1}' | grep -w "$MEMBER_ID" || true)
+        if [ -z "$STILL_PRESENT" ]; then
+          echo "Member successfully removed from the cluster."
+          break
+        fi
+
+        sleep 2
+      done
+
+      if [ -n "$STILL_PRESENT" ]; then
+        echo "ERROR: Timeout waiting for member $MEMBER_ID to be removed."
+        exit 1
+      fi
+
+      ############################################
+      # Wait for an etcd leader to exist (ensure quorum has recovered)
+      # Note: If cluster has only 1 member left, it cannot elect a leader (no quorum)
+      ############################################
+      echo "Waiting for etcd leader to be present..."
+      MEMBER_COUNT=$(ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        member list | wc -l)
+      # If only 1 member left, skip leader check (single node has no quorum)
+      if [ "$MEMBER_COUNT" -eq 1 ]; then
+        echo "Only 1 member left in cluster, skipping leader check (single node mode)"
+        echo "ETCD member $MEMBER_ID removed. Cluster is now single-node."
+        exit 0
+      fi
+
+      ALL_ENDPOINTS=$(ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        member list | awk -F',' '{gsub(/^ +| +$/,"",$5); print $5}' | tr '\n' ',' | sed 's/,$//')
+      if [ -z "$ALL_ENDPOINTS" ]; then
+        echo "ERROR: Cannot get endpoints from etcd member list"
+        exit 1
+      fi
+
+      LEADER=""
+      for i in $(seq 1 60); do
+        # endpoint status text format fields:
+        # endpoint, ID, VERSION, DB SIZE, IS LEADER, RAFT TERM, INDEX
+        # IS LEADER is field 5; trim surrounding whitespace before comparing
+        LEADER=$(ETCDCTL_API=3 etcdctl \
+          --endpoints="$ALL_ENDPOINTS" \
+          --cacert=/etc/ssl/etcd/ssl/ca.crt \
+          --cert=/etc/ssl/etcd/ssl/server.crt \
+          --key=/etc/ssl/etcd/ssl/server.key \
+          endpoint status 2>/dev/null | awk -F',' '{gsub(/^[ \t]+|[ \t]+$/, "", $5)} $5 == "true" {print $1}' | head -n1)
+        if [ -n "$LEADER" ]; then
+          echo "Leader is present: $LEADER"
+          break
+        fi
+        sleep 2
+      done
+      if [ -z "$LEADER" ]; then
+        echo "ERROR: No leader found after member removal."
+        exit 1
+      fi
+      echo "ETCD member $MEMBER_ID removed and quorum is stable."
+  - name: ScalingDown | Delete etcd
+    # If need_uninstall_etcd is empty, remove the entire cluster.
+    # If need_uninstall_etcd is not empty, remove only the specified node.
+    when: >-
+      or
+      (.need_uninstall_etcd | empty)
+      (.need_uninstall_etcd | has .inventory_hostname)
+    block:
+    - name: ScalingDown | Stop and disable the etcd systemd service
+      ignore_errors: true
+      command: |
+        systemctl stop etcd.service
+        systemctl disable etcd.service
+        rm -rf /etc/systemd/system/etcd.service*
+        systemctl daemon-reload
+        systemctl reset-failed etcd.service
+    - name: ScalingDown | Remove traffic prioritization rules for etcd ports
+      when: .etcd.traffic_priority
+      command: |
+        tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport {{ .etcd.port }} 0xffff
+        tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport {{ .etcd.peer_port }} 0xffff
+    - name: ScalingDown | Delete all etcd data, configuration, and binaries
+      command: |
+        {{- if .delete.data }}
+        rm -rf {{ .etcd.env.data_dir }}
+        {{- end }}
+        rm -rf /etc/ssl/etcd/
+        rm -rf /etc/etcd.env
+        for bin in etcd etcdctl; do
+          if path=$(command -v $bin 2>/dev/null); then
+            rm -f "$path"
+          fi
+        done
+    - name: ScalingDown | Remove backup-etcd timer, service, and backup scripts
+      ignore_errors: true
+      command: |
+        systemctl disable --now backup-etcd.timer
+        rm /etc/systemd/system/backup-etcd.timer
+        rm -rf /etc/systemd/system/backup-etcd.service*
+        rm /usr/local/bin/kube-scripts/backup_etcd.sh
+        systemctl daemon-reexec && systemctl daemon-reload
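The next two roles implement etcd's recommended scale-up sequence: join as a non-voting learner first, then promote once the learner has caught up. Condensed to the two etcdctl calls involved (NEW_NAME, NEW_PEER_URL, and MEMBER_ID are placeholders; TLS flags as elsewhere in this patch):

```bash
# Phase 1: register the new node as a learner; it replicates but cannot
# vote, so a slow initial sync cannot cost the cluster quorum.
ETCDCTL_API=3 etcdctl member add "$NEW_NAME" \
  --peer-urls="$NEW_PEER_URL" --learner
# ...start etcd on the new node with ETCD_INITIAL_CLUSTER_STATE=existing...
# Phase 2: promotion is refused until the learner's log is close enough to
# the leader's, which is why the promote role retries in a loop.
ETCDCTL_API=3 etcdctl member promote "$MEMBER_ID"
```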
diff --git a/builtin/core/roles/etcd/scaling_up/learner/tasks/main.yaml b/builtin/core/roles/etcd/scaling_up/learner/tasks/main.yaml
new file mode 100644
index 000000000..aabfb06dd
--- /dev/null
+++ b/builtin/core/roles/etcd/scaling_up/learner/tasks/main.yaml
@@ -0,0 +1,42 @@
+- name: ScalingUp | Wait for the installed etcd node to become healthy
+  delegate_to: "{{ .installed_etcd }}"
+  command: |
+    unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+    for ((i=1; i<=12; i++)); do
+      if ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        endpoint health >/dev/null 2>&1; then
+        echo "✅ etcd is healthy"
+        exit 0
+      fi
+      sleep 5
+    done
+    echo "❌ etcd did not become healthy within 60 seconds"
+    exit 1
+
+- name: ScalingUp | Add new etcd member as learner from existing node
+  delegate_to: "{{ .installed_etcd }}"
+  command: |
+    unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+    {{- $listenPeerURLs := list }}
+    {{- if .internal_ipv4 | empty | not }}
+      {{- $listenPeerURLs = append $listenPeerURLs (printf "https://%s:%d" .internal_ipv4 .etcd.peer_port) }}
+    {{- end }}
+    {{- if .internal_ipv6 | empty | not }}
+      {{- $listenPeerURLs = append $listenPeerURLs (printf "https://[%s]:%d" .internal_ipv6 .etcd.peer_port) }}
+    {{- end }}
+    ETCDCTL_API=3 etcdctl \
+      --endpoints=https://localhost:{{ .etcd.port }} \
+      --cacert=/etc/ssl/etcd/ssl/ca.crt \
+      --cert=/etc/ssl/etcd/ssl/server.crt \
+      --key=/etc/ssl/etcd/ssl/server.key \
+      member add {{ .inventory_hostname }} \
+      --peer-urls={{ $listenPeerURLs | join "," }} --learner -w json
+  register: etcd_member_list
+  register_type: json
diff --git a/builtin/core/roles/etcd/scaling_up/promote/tasks/main.yaml b/builtin/core/roles/etcd/scaling_up/promote/tasks/main.yaml
new file mode 100644
index 000000000..9fa814032
--- /dev/null
+++ b/builtin/core/roles/etcd/scaling_up/promote/tasks/main.yaml
@@ -0,0 +1,41 @@
+
+- name: ScalingUp | Promote learner member to voting member
+  delegate_to: "{{ .installed_etcd }}"
+  command: |
+    unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+    MEMBER_ID=$(printf "%x" {{ .etcd_member_list.stdout.member.ID }})
+    for i in $(seq 1 30); do
+      # try promote member
+      if ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        member promote "$MEMBER_ID"; then
+        echo "✅ promote success"
+        exit 0
+      fi
+      sleep 10
+    done
+    echo "❌ timeout after 5 minutes"
+    exit 1
+
+- name: ScalingUp | Wait until the etcd service becomes healthy
+  command: |
+    unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT
+
+    for ((i=1; i<=12; i++)); do
+      if ETCDCTL_API=3 etcdctl \
+        --endpoints=https://localhost:{{ .etcd.port }} \
+        --cacert=/etc/ssl/etcd/ssl/ca.crt \
+        --cert=/etc/ssl/etcd/ssl/server.crt \
+        --key=/etc/ssl/etcd/ssl/server.key \
+        endpoint health >/dev/null 2>&1; then
+        echo "✅ etcd is healthy"
+        exit 0
+      fi
+      sleep 5
+    done
+    echo "❌ etcd did not become healthy within 60 seconds"
+    exit 1
\ No newline at end of file
diff --git a/builtin/core/roles/etcd/tasks/expansion.yaml b/builtin/core/roles/etcd/tasks/expansion.yaml
deleted file mode 100644
index e2fbea33f..000000000
--- a/builtin/core/roles/etcd/tasks/expansion.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-- name: Expansion | Expand cluster on existing etcd nodes
-  when: .etcd_install_LoadState.stdout | eq "loaded"
-  block:
-  - name: Expansion | Update /etc/etcd.env configuration file
-    template:
-      src: etcd.env
-      dest: /etc/etcd.env
-  - name: Expansion | Restart etcd service
-    command: |
-      systemctl restart etcd.service
-  - name: Expansion | Verify etcd service becomes healthy within 1 minute
-    command: |
-      for ((i=1; i<=12; i++)); do
-        if ETCDCTL_API=3 etcdctl \
-          --endpoints=https://localhost:2379 \
-          --cacert=/etc/ssl/etcd/ssl/ca.crt \
-          --cert=/etc/ssl/etcd/ssl/server.crt \
-          --key=/etc/ssl/etcd/ssl/server.key \
-          endpoint health >/dev/null 2>&1; then
-          echo "✅ etcd is health"
-          exit 0
-        fi
-        sleep 5
-      done
-      echo "❌ etcd etcd is not health in 1 minute"
-      exit 1
-
-- name: Expansion | Add new etcd member from non-installed node
-  when: .etcd_install_LoadState.stdout | eq "not-found"
-  delegate_to: "{{ .installed_etcd }}"
- command: | - ETCDCTL_API=3 etcdctl \ - --endpoints=https://localhost:2379 \ - --cacert=/etc/ssl/etcd/ssl/ca.crt \ - --cert=/etc/ssl/etcd/ssl/server.crt \ - --key=/etc/ssl/etcd/ssl/server.key \ - member add {{ .inventory_hostname }} \ No newline at end of file diff --git a/builtin/core/roles/etcd/tasks/main.yaml b/builtin/core/roles/etcd/tasks/main.yaml deleted file mode 100644 index f65d84cd6..000000000 --- a/builtin/core/roles/etcd/tasks/main.yaml +++ /dev/null @@ -1,20 +0,0 @@ ---- -- include_tasks: prepare.yaml - -- name: ETCD | Upgrade etcd if a newer version is available - when: - - .etcd_install_LoadState.stdout | eq "loaded" - - .etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version")) - include_tasks: upgrade.yaml - -- name: ETCD | Expand the etcd cluster by adding new nodes if required - when: - - .installed_etcd | empty | not - - .need_installed_etcd | empty | not - include_tasks: expansion.yaml - -- name: ETCD | Install etcd and set up the backup service if not already present - when: .etcd_install_LoadState.stdout | eq "not-found" - block: - - include_tasks: install.yaml - - include_tasks: backup_service.yaml diff --git a/builtin/core/roles/etcd/tasks/prepare.yaml b/builtin/core/roles/etcd/tasks/prepare.yaml deleted file mode 100644 index 4e790672d..000000000 --- a/builtin/core/roles/etcd/tasks/prepare.yaml +++ /dev/null @@ -1,82 +0,0 @@ ---- -- name: Prepare | Ensure installed etcd is running and healthy - when: .etcd_install_LoadState.stdout | eq "loaded" - assert: - that: .etcd_install_ActiveState.stdout | eq "active" - fail_msg: >- - etcd service is installed but not running - -- name: Prepare | Set etcd node parameters - block: - - name: Prepare | Identify nodes with installed or missing etcd - run_once: true - add_hostvars: - hosts: etcd - vars: - installed_etcd: >- - {{- $needInstalled := list -}} - {{- range .groups.etcd -}} - {{- if (index $.hostvars . "etcd_install_LoadState" "stdout") | eq "loaded" -}} - {{- $needInstalled = append $needInstalled . -}} - {{- end -}} - {{- end -}} - {{ $needInstalled | first | default "" }} - need_installed_etcd: >- - {{- $needInstalled := list -}} - {{- range .groups.etcd -}} - {{- if (index $.hostvars . "etcd_install_LoadState" "stdout") | eq "not-found" -}} - {{- $needInstalled = append $needInstalled . 
-}} - {{- end -}} - {{- end -}} - {{/* Ensure value is a string to prevent YAML type coercion */}} - {{ $needInstalled | toJson | quote }} - -- name: Prepare | Check installed etcd version - when: .etcd_install_LoadState.stdout | eq "loaded" - block: - - name: Prepare | Ensure target etcd version is not lower than installed version - when: .etcd_install_LoadState.stdout | eq "loaded" - assert: - that: .etcd.etcd_version | semverCompare (printf ">=v%s" (index .etcd_install_version "stdout" "etcd Version")) - fail_msg: >- - Installed etcd version: {{ index .etcd_install_version "stdout" "etcd Version" }} is lower than target etcd version: {{ .etcd.etcd_version }} - -- name: Prepare | Synchronize etcd package to node if new install or upgrade - when: - - or (.etcd_install_version.error | empty | not) (.etcd.etcd_version | semverCompare (printf ">v%s" (index .etcd_install_version "stdout" "etcd Version"))) - block: - - name: Prepare | Copy etcd binary package to remote node - copy: - src: >- - {{ .binary_dir }}/etcd/{{ .etcd.etcd_version }}/{{ .binary_type }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz - dest: >- - {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz - - name: Prepare | Extract etcd binary package to /usr/local/bin/ - command: | - tar --strip-components=1 -C /usr/local/bin/ -xvf {{ .tmp_dir }}/etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}.tar.gz \ - --wildcards 'etcd-{{ .etcd.etcd_version }}-linux-{{ .binary_type }}/etcd*' - -- name: Prepare | Synchronize certificates to node for new install or expansion - when: >- - or - (.etcd_install_version.error | empty | not) - (and - (.installed_etcd | empty | not) - (.need_installed_etcd | empty | not) - ) - block: - - name: Prepare | Copy CA certificate to etcd node - copy: - src: >- - {{ .etcd.ca_file }} - dest: /etc/ssl/etcd/ssl/ca.crt - - name: Prepare | Copy server certificate to etcd node - copy: - src: >- - {{ .etcd.cert_file }} - dest: /etc/ssl/etcd/ssl/server.crt - - name: Prepare | Copy server key to etcd node - copy: - src: >- - {{ .etcd.key_file }} - dest: /etc/ssl/etcd/ssl/server.key diff --git a/builtin/core/roles/etcd/templates/etcd.env b/builtin/core/roles/etcd/templates/etcd.env deleted file mode 100644 index a9a74707d..000000000 --- a/builtin/core/roles/etcd/templates/etcd.env +++ /dev/null @@ -1,75 +0,0 @@ -{{- $ips := list -}} -{{- $state := "new" -}} -{{- range .groups.etcd | default list -}} - {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" -}} - {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" -}} - {{- if $internalIPv4 | empty | not -}} - {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv4) -}} - {{- else if $internalIPv6 | empty | not }} - {{- $ips = append $ips (printf "%s=https://%s:2380" (index $.hostvars . "hostname") $internalIPv6) -}} - {{- end -}} - {{ if index $.hostvars . 
"etcd_install_LoadState" "stdout" | eq "loaded" -}} - {{- $state := "existing" -}} - {{- end -}} -{{- end -}} -ETCD_DATA_DIR={{ .etcd.env.data_dir }} -ETCD_INITIAL_CLUSTER_STATE={{ $state }} -ETCD_INITIAL_CLUSTER_TOKEN={{ .etcd.env.token }} -{{- if .internal_ipv4 | empty | not }} -ETCD_ADVERTISE_CLIENT_URLS={{ printf "https://%s:2379" .internal_ipv4 }} -ETCD_INITIAL_ADVERTISE_PEER_URLS={{ printf "https://%s:2380" .internal_ipv4 }} -ETCD_LISTEN_CLIENT_URLS={{ printf "https://%s:2379" .internal_ipv4 }},https://127.0.0.1:2379 -ETCD_LISTEN_PEER_URLS={{ printf "https://%s:2380" .internal_ipv4 }} -{{- else if .internal_ipv6 | empty | not }} -ETCD_ADVERTISE_CLIENT_URLS={{ printf "https://%s:2379" .internal_ipv6 }} -ETCD_INITIAL_ADVERTISE_PEER_URLS={{ printf "https://%s:2380" .internal_ipv6 }} -ETCD_LISTEN_CLIENT_URLS={{ printf "https://%s:2379" .internal_ipv6 }},https://::1:2379 -ETCD_LISTEN_PEER_URLS={{ printf "https://%s:2380" .internal_ipv6 }} -{{- end }} - -ETCD_NAME={{ .hostname }} -ETCD_PROXY=off -ETCD_ENABLE_V2=true -ETCD_INITIAL_CLUSTER={{ $ips | join "," }} -ETCD_ELECTION_TIMEOUT={{ .etcd.env.election_timeout }} -ETCD_HEARTBEAT_INTERVAL={{ .etcd.env.heartbeat_interval }} -ETCD_AUTO_COMPACTION_RETENTION={{ .etcd.env.compaction_retention }} -ETCD_SNAPSHOT_COUNT={{ .etcd.env.snapshot_count }} -{{- if .etcd.metrics }} -ETCD_METRICS={{ .etcd.env.metrics }} -{{- end }} -{{- if .etcd.env.quota_backend_bytes }} -ETCD_QUOTA_BACKEND_BYTES={{ .etcd.env.quota_backend_bytes }} -{{- end }} -{{- if .etcd.env.max_request_bytes }} -ETCD_MAX_REQUEST_BYTES={{ .etcd.env.max_request_bytes }} -{{- end }} -{{- if .etcd.env.max_snapshots }} -ETCD_MAX_SNAPSHOTS={{ .etcd.env.max_snapshots }} -{{- end }} -{{- if .etcd.env.max_wals }} -ETCD_MAX_WALS={{ .etcd.env.max_wals }} -{{- end }} -{{- if .etcd.env.log_level }} -ETCD_LOG_LEVEL={{ .etcd.env.log_level }} -{{- end }} -{{- if .etcd.env.unsupported_arch }} -ETCD_UNSUPPORTED_ARCH={{ .etcd.env.unsupported_arch }} -{{- end }} - -# TLS settings -ETCD_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.crt -ETCD_CERT_FILE=/etc/ssl/etcd/ssl/server.crt -ETCD_KEY_FILE=/etc/ssl/etcd/ssl/server.key -ETCD_CLIENT_CERT_AUTH=true - -ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.crt -ETCD_PEER_CERT_FILE=/etc/ssl/etcd/ssl/server.crt -ETCD_PEER_KEY_FILE=/etc/ssl/etcd/ssl/server.key -ETCD_PEER_CLIENT_CERT_AUTH=true - -# CLI settings -ETCDCTL_ENDPOINTS=https://127.0.0.1:2379 -ETCDCTL_CACERT=/etc/ssl/etcd/ssl/ca.crt -ETCDCTL_CERT=/etc/ssl/etcd/ssl/server.crt -ETCDCTL_KEY=/etc/ssl/etcd/ssl/server.key diff --git a/builtin/core/roles/etcd/tasks/upgrade.yaml b/builtin/core/roles/etcd/upgrade/tasks/main.yaml similarity index 66% rename from builtin/core/roles/etcd/tasks/upgrade.yaml rename to builtin/core/roles/etcd/upgrade/tasks/main.yaml index e969b3bdd..f5b3bfe4d 100644 --- a/builtin/core/roles/etcd/tasks/upgrade.yaml +++ b/builtin/core/roles/etcd/upgrade/tasks/main.yaml @@ -1,15 +1,14 @@ -- name: Upgrade | Backup etcd data before upgrade - command: BACKUP_DIR="{{ .etcd.backup.backup_dir }}/install/etcd-v{{ index .etcd_install_version "stdout" "etcd Version" }}-$(date +%Y-%m-%d-%H-%M-%S)" /usr/local/bin/kube-scripts/backup_etcd.sh - - name: Upgrade | Restart etcd service after upgrade command: | systemctl restart etcd.service - name: Upgrade | Ensure etcd service becomes healthy within 1 minute command: | + unset ETCDCTL_ENDPOINTS ETCDCTL_KEY ETCDCTL_CERT ETCDCTL_CACERT + for ((i=1; i<=12; i++)); do if ETCDCTL_API=3 etcdctl \ - --endpoints=https://localhost:2379 \ + --endpoints=https://localhost:{{ 
.etcd.port }} \
         --cacert=/etc/ssl/etcd/ssl/ca.crt \
         --cert=/etc/ssl/etcd/ssl/server.crt \
         --key=/etc/ssl/etcd/ssl/server.key \
diff --git a/builtin/core/roles/kubernetes/init-kubernetes/templates/kubeadm/kubeadm-init.v1beta3 b/builtin/core/roles/kubernetes/init-kubernetes/templates/kubeadm/kubeadm-init.v1beta3
index 43b76506c..3cfcbbca9 100644
--- a/builtin/core/roles/kubernetes/init-kubernetes/templates/kubeadm/kubeadm-init.v1beta3
+++ b/builtin/core/roles/kubernetes/init-kubernetes/templates/kubeadm/kubeadm-init.v1beta3
@@ -16,7 +16,16 @@
 etcd:
   external:
     endpoints:
 {{- range .groups.etcd | default list }}
-    - https://{{ index $.hostvars . "internal_ipv4" }}:2379
+  {{- if $.need_uninstall_etcd | default list | has . | not }}
+    {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" }}
+    {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" }}
+    {{- if $internalIPv4 | empty | not }}
+    - {{ printf "https://%s:%d" $internalIPv4 $.etcd.port }}
+    {{- end }}
+    {{- if $internalIPv6 | empty | not }}
+    - {{ printf "https://[%s]:%d" $internalIPv6 $.etcd.port }}
+    {{- end }}
+  {{- end }}
 {{- end }}
     caFile: /etc/kubernetes/pki/etcd/ca.crt
     certFile: /etc/kubernetes/pki/etcd/client.crt
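Both kubeadm templates now emit IPv6 endpoints in bracketed URL form, which is what RFC 2732 requires and what the apiserver's etcd client expects. A two-line check of the formatting difference the printf calls produce:

```bash
printf 'https://%s:%d\n' 10.0.0.11 2379   # IPv4: https://10.0.0.11:2379
printf 'https://[%s]:%d\n' fd00::11 2379  # IPv6: https://[fd00::11]:2379
```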
"internal_ipv6" | default "" }} + {{- if $internalIPv4 | empty | not }} + - {{ printf "https://%s:%d" $internalIPv4 $.etcd.port }} + {{- end }} + {{- if $internalIPv6 | empty | not }} + - https://{{ printf "https://[%s]:%d" $internalIPv6 $.etcd.port }} + {{- end }} + {{- end }} {{- end }} caFile: /etc/kubernetes/pki/etcd/ca.crt certFile: /etc/kubernetes/pki/etcd/client.crt diff --git a/builtin/core/roles/kubernetes/pre-kubernetes/tasks/main.yaml b/builtin/core/roles/kubernetes/pre-kubernetes/tasks/main.yaml index d18b02730..362cdd570 100644 --- a/builtin/core/roles/kubernetes/pre-kubernetes/tasks/main.yaml +++ b/builtin/core/roles/kubernetes/pre-kubernetes/tasks/main.yaml @@ -58,12 +58,12 @@ - name: PreKubernetes | Copy etcd client certificate to control plane node copy: src: >- - {{ .etcd.cert_file }} + {{ .etcd.client_cert_file }} dest: /etc/kubernetes/pki/etcd/client.crt - name: PreKubernetes | Copy etcd client key to control plane node copy: src: >- - {{ .etcd.key_file }} + {{ .etcd.client_key_file }} dest: /etc/kubernetes/pki/etcd/client.key - name: PreKubernetes | Synchronize front-proxy CA files to control plane nodes diff --git a/builtin/core/roles/kubernetes/sync-etcd-config/tasks/main.yaml b/builtin/core/roles/kubernetes/sync-etcd-config/tasks/main.yaml new file mode 100644 index 000000000..bfe5c9fe7 --- /dev/null +++ b/builtin/core/roles/kubernetes/sync-etcd-config/tasks/main.yaml @@ -0,0 +1,64 @@ +- name: AddNodes | Update kube-apiserver configuration + command: | + {{- $endpoints := list -}} + {{- range .groups.etcd | default list -}} + {{- if $.need_uninstall_etcd | default list | has . | not -}} + {{- $internalIPv4 := index $.hostvars . "internal_ipv4" | default "" -}} + {{- $internalIPv6 := index $.hostvars . "internal_ipv6" | default "" -}} + {{- if $internalIPv4 | empty | not -}} + {{- $endpoints = append $endpoints (printf "https://%s:%d" $internalIPv4 $.etcd.port) -}} + {{- end -}} + {{- if $internalIPv6 | empty | not -}} + {{- $endpoints = append $endpoints (printf "https://[%s]:%d" $internalIPv6 $.etcd.port) -}} + {{- end -}} + {{- end -}} + {{- end -}} + ETCD_ENDPOINTS="{{ join "," $endpoints }}" + + if ! 
grep -q 'ClusterConfiguration' /etc/kubernetes/kubeadm-config.yaml 2>/dev/null; then + kubectl get cm kubeadm-config -n kube-system -o=jsonpath='{.data.ClusterConfiguration}' > /etc/kubernetes/kubeadm-config.yaml + fi + + awk -v ep="$ETCD_ENDPOINTS" ' + BEGIN { + n = split(ep, arr, ",") + for (i = 1; i <= n; i++) { + print " - " arr[i] + } + } + ' > /etc/kubernetes/kubeadm_new_endpoints.yaml + # delete old endpoint + sed -i '/^[[:space:]]*endpoints:/{ + :loop + N + s/\n[[:space:]]\+-.*//; t loop + s/\n[[:space:]]*\n/\n/g + P + D + }' /etc/kubernetes/kubeadm-config.yaml + # insert new endpoint + sed -i "/^[[:space:]]*endpoints:/r /etc/kubernetes/kubeadm_new_endpoints.yaml" /etc/kubernetes/kubeadm-config.yaml + rm /etc/kubernetes/kubeadm_new_endpoints.yaml + # update kubeadm-config + {{- if .kubernetes.kube_version | semverCompare "/dev/null || \ + curl -sk https://localhost:6443/healthz 2>/dev/null | grep -q "ok"; then + echo "✅ kube-apiserver is ready" + exit 0 + fi + sleep 10 + done + echo "❌ timeout after 5 minutes" + exit 1 + diff --git a/builtin/core/roles/security/tasks/main.yaml b/builtin/core/roles/security/tasks/main.yaml index e549883e5..59179c926 100644 --- a/builtin/core/roles/security/tasks/main.yaml +++ b/builtin/core/roles/security/tasks/main.yaml @@ -3,7 +3,7 @@ command: | chmod 700 /etc/ssl/etcd/ssl && chown root:root /etc/ssl/etcd/ssl chmod 600 /etc/ssl/etcd/ssl/* && chown root:root /etc/ssl/etcd/ssl/* - chmod 700 /var/lib/etcd && chown etcd:etcd /var/lib/etcd + chmod 700 {{ .etcd.env.data_dir }} && chown etcd:etcd {{ .etcd.env.data_dir }} chmod 550 /usr/local/bin/etcd* && chown root:root /usr/local/bin/etcd* when: .groups.etcd | default list | has .inventory_hostname diff --git a/builtin/core/roles/uninstall/etcd/tasks/main.yaml b/builtin/core/roles/uninstall/etcd/tasks/main.yaml deleted file mode 100644 index b0055c372..000000000 --- a/builtin/core/roles/uninstall/etcd/tasks/main.yaml +++ /dev/null @@ -1,35 +0,0 @@ ---- -- name: ETCD | Completely uninstall the etcd service and remove all related files - block: - - name: ETCD | Stop and disable the etcd systemd service - ignore_errors: true - command: | - systemctl stop etcd.service - systemctl disable etcd.service - rm -rf /etc/systemd/system/etcd.service* - systemctl daemon-reload - systemctl reset-failed etcd.service - - name: ETCD | Remove traffic priority rules for etcd ports - command: | - tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2379 0xffff - tc filter del dev eth0 parent 1: protocol ip prio 1 u32 match ip sport 2380 0xffff - when: .etcd.traffic_priority - - name: ETCD | Delete all etcd data, configuration, and binaries - command: | - rm -rf {{ .etcd.env.data_dir }} - rm -rf /etc/ssl/etcd/ - rm -rf /etc/etcd.env - for bin in etcd etcdctl; do - if path=$(command -v $bin 2>/dev/null); then - rm -f "$path" - fi - done - -- name: ETCD | Uninstall backup-etcd timer and service, and remove backup scripts - ignore_errors: true - command: | - systemctl disable --now backup-etcd.timer - rm /etc/systemd/system/backup-etcd.timer - rm -rf /etc/systemd/system/backup-etcd.service* - rm /usr/local/bin/kube-scripts/backup_etcd.sh - systemctl daemon-reexec && systemctl daemon-reload diff --git a/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml b/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml index 71ac6a2c0..604e1a5c6 100644 --- a/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml +++ b/builtin/core/roles/uninstall/kubernetes/tasks/kubernetes.yaml @@ -2,7 +2,8 @@ 
 - name: Kubernetes | Completely reset the node using kubeadm
   ignore_errors: true
   command: |
-    kubeadm reset -f
+    # After Kubernetes v1.27.0, the remove-etcd-member phase will automatically clean up /var/lib/etcd
+    kubeadm reset -f {{ if .etcd.deployment_type | eq "external" }}--skip-phases remove-etcd-member{{ end }}
 
 - name: Kubernetes | Gracefully stop and disable the kubelet service
   ignore_errors: true
@@ -23,7 +24,6 @@
     if [ "$SUDO_USER" != "root" ]; then
       rm -rf /home/$SUDO_USER/.kube/config
     fi
-    rm -rf /var/lib/etcd
     for bin in kubeadm kubelet kubectl; do
       if path=$(command -v $bin 2>/dev/null); then
         rm -f "$path"
diff --git a/cmd/kk/app/builtin/add.go b/cmd/kk/app/builtin/add.go
index 60f3c4638..3078e844d 100644
--- a/cmd/kk/app/builtin/add.go
+++ b/cmd/kk/app/builtin/add.go
@@ -46,14 +46,17 @@ func newAddNodeCommand() *cobra.Command {
 		Use:     "nodes",
 		Aliases: []string{"node"},
 		Short:   "Add nodes to the cluster according to the new nodes information from the specified configuration file",
-		Long: `There are two executors available for adding nodes:
+		Long: `There are two ways to add nodes to the cluster:
 
 1. kk add nodes
-   This will add all nodes listed in the inventory that are not yet installed in the cluster.
+   Requires all nodes to be pre-defined in inventory.yaml with their assigned groups
+   (kube_control_plane, kube_worker, etcd). This will add all nodes listed in the
+   inventory that are not yet installed in the cluster.
 
-2. kk add nodes --control-plane node1,node2 --worker node1,node2
-   This will only add the specified nodes to the cluster as control-plane or worker nodes.
-   Ensure their connection details are provided in the inventory's hosts file.`,
+2. kk add nodes --control-plane node1,node2 --worker node1,node2 --etcd node1,node2
+   Only requires node connection details (hostname/IP) to be defined in inventory.yaml.
+   The nodes will be automatically assigned to the specified groups via command-line flags.
+ The --override flag will update the inventory.yaml with the new group assignments.`, RunE: func(cmd *cobra.Command, args []string) error { // Complete the configuration and create a playbook for adding nodes playbook, err := o.Complete(cmd, []string{"playbooks/add_nodes.yaml"}) @@ -62,7 +65,12 @@ func newAddNodeCommand() *cobra.Command { } // Execute the playbook to add the nodes - return o.Run(cmd.Context(), playbook) + if err := o.Run(cmd.Context(), playbook); err != nil { + return err + } + + // Update inventory file only after successful execution + return o.OverrideInventory() }, } flags := cmd.Flags() diff --git a/cmd/kk/app/builtin/delete.go b/cmd/kk/app/builtin/delete.go index a4411570b..cf54c4ab4 100644 --- a/cmd/kk/app/builtin/delete.go +++ b/cmd/kk/app/builtin/delete.go @@ -96,7 +96,12 @@ func newDeleteNodesCommand() *cobra.Command { } // Execute the playbook to delete the specified nodes - return o.Run(cmd.Context(), playbook) + if err := o.Run(cmd.Context(), playbook); err != nil { + return err + } + + // Update inventory file only after successful execution + return o.OverrideInventory() }, } // Add all relevant flag sets to the command diff --git a/cmd/kk/app/options/builtin/add.go b/cmd/kk/app/options/builtin/add.go index d31f760a7..db330e01c 100644 --- a/cmd/kk/app/options/builtin/add.go +++ b/cmd/kk/app/options/builtin/add.go @@ -21,12 +21,14 @@ package builtin import ( "fmt" + "os" "slices" "strings" "github.com/cockroachdb/errors" kkcorev1 "github.com/kubesphere/kubekey/api/core/v1" "github.com/spf13/cobra" + "gopkg.in/yaml.v3" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" cliflag "k8s.io/component-base/cli/flag" @@ -57,6 +59,13 @@ type AddNodeOptions struct { ControlPlane string // Worker nodes which will to be added. Worker string + // Etcd nodes which will be added. + Etcd string + // Override indicates whether to override the inventory file after successful execution. + // When set to true, the inventory.yaml file will be updated. + Override bool + // addGroupHosts stores the nodes to be added to each group for later inventory update + addGroupHosts map[string][]string } // Flags adds flags for configuring AddNodeOptions to the specified FlagSet @@ -66,6 +75,8 @@ func (o *AddNodeOptions) Flags() cliflag.NamedFlagSets { kfs.StringVar(&o.Kubernetes, "with-kubernetes", o.Kubernetes, fmt.Sprintf("Specify a supported version of kubernetes. default is %s", o.Kubernetes)) kfs.StringVar(&o.ControlPlane, "control-plane", o.ControlPlane, "Which nodes will be installed as control-plane. Multiple nodes are supported, separated by commas (e.g., node1, node2, ...)") kfs.StringVar(&o.Worker, "worker", o.Worker, "Which nodes will be installed as workers. Multiple nodes are supported, separated by commas (e.g., node1, node2, ...)") + kfs.StringVar(&o.Etcd, "etcd", o.Etcd, "Which nodes will be installed as etcd. Multiple nodes are supported, separated by commas (e.g., node1, node2, ...)") + kfs.BoolVar(&o.Override, "override", o.Override, "Override the inventory file after successful execution") return fss } @@ -100,6 +111,212 @@ func (o *AddNodeOptions) Complete(cmd *cobra.Command, args []string) (*kkcorev1. 
 	return playbook, o.complete()
 }
 
+// addNodesToGroup adds the given nodes to a specific inventory group and
+// records them in addGroupHosts for the later inventory-file update
+func (o *AddNodeOptions) addNodesToGroup(nodeList string, groupName string, groups map[string][]string, addGroupHosts map[string][]string) error {
+	if nodeList == "" {
+		return nil
+	}
+
+	var nodes []string
+	for _, node := range strings.Split(nodeList, ",") {
+		if !slices.Contains(groups[_const.VariableGroupsAll], node) {
+			return errors.Errorf("%q is not defined in inventory.", node)
+		}
+		if !slices.Contains(groups[groupName], node) {
+			group := o.Inventory.Spec.Groups[groupName]
+			group.Hosts = append(group.Hosts, node)
+			o.Inventory.Spec.Groups[groupName] = group
+		}
+		nodes = append(nodes, node)
+	}
+	if len(nodes) > 0 {
+		addGroupHosts[groupName] = nodes
+	}
+	return nil
+}
+
+// updateInventoryFile updates the inventory file with new nodes while preserving comments and formatting
+func (o *AddNodeOptions) updateInventoryFile(addGroupHosts map[string][]string, existingGroups map[string][]string) error {
+	// Read original file content
+	content, err := os.ReadFile(o.InventoryFile)
+	if err != nil {
+		return errors.Wrapf(err, "failed to read inventory file %s", o.InventoryFile)
+	}
+	lines := strings.Split(string(content), "\n")
+
+	// Parse to get line numbers for each hosts list
+	var root yaml.Node
+	if err := yaml.Unmarshal(content, &root); err != nil {
+		return errors.Wrap(err, "failed to unmarshal inventory file")
+	}
+
+	// Find insert positions for each group
+	insertions := o.findInsertPositions(&root, addGroupHosts, lines)
+	if len(insertions) == 0 {
+		return nil
+	}
+
+	// Sort insertions by line number in descending order to insert from bottom to top
+	sortInsertions(insertions)
+
+	// Insert new lines
+	for _, ins := range insertions {
+		newLines := make([]string, len(ins.nodes))
+		for i, node := range ins.nodes {
+			newLines[i] = ins.indent + "- " + node
+		}
+		// Insert after the anchor line; insertion.lineNum is already 0-based
+		pos := ins.lineNum
+		if pos < len(lines) {
+			lines = append(lines[:pos+1], append(newLines, lines[pos+1:]...)...)
+		} else {
+			lines = append(lines, newLines...)
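+			// (the anchor index from the YAML parse pointed past the last
+			// line, e.g. a file without a trailing newline, so the new
+			// entries are simply appended at EOF)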
+ } + } + + // Write back + output := strings.Join(lines, "\n") + return errors.Wrapf(os.WriteFile(o.InventoryFile, []byte(output), _const.PermFilePublic), + "failed to write inventory file %s", o.InventoryFile) +} + +// insertion holds information about where to insert new nodes +type insertion struct { + groupName string + lineNum int // 0-based line number where to insert (after this line) + indent string + nodes []string +} + +// findInsertPositions finds the line numbers where new nodes should be inserted +func (o *AddNodeOptions) findInsertPositions(root *yaml.Node, addGroupHosts map[string][]string, lines []string) []insertion { + var insertions []insertion + if len(root.Content) == 0 { + return insertions + } + o.findInsertionsInNode(root.Content[0], addGroupHosts, lines, &insertions) + return insertions +} + +// findInsertionsInNode recursively searches for groups and their hosts +func (o *AddNodeOptions) findInsertionsInNode(node *yaml.Node, addGroupHosts map[string][]string, lines []string, insertions *[]insertion) { + if node.Kind != yaml.MappingNode { + return + } + + for i := 0; i < len(node.Content); i += 2 { + keyNode := node.Content[i] + valueNode := node.Content[i+1] + + if keyNode.Value == "spec" && valueNode.Kind == yaml.MappingNode { + for j := 0; j < len(valueNode.Content); j += 2 { + specKey := valueNode.Content[j] + specValue := valueNode.Content[j+1] + if specKey.Value == "groups" && specValue.Kind == yaml.MappingNode { + o.findHostsInGroups(specValue, addGroupHosts, lines, insertions) + return + } + } + } + } +} + +// getIndentFromLine extracts the leading whitespace from a line +func getIndentFromLine(line string) string { + for i, ch := range line { + if ch != ' ' && ch != '\t' { + return line[:i] + } + } + return line +} + +// findHostsInGroups finds hosts lists within groups +func (o *AddNodeOptions) findHostsInGroups(groupsNode *yaml.Node, addGroupHosts map[string][]string, lines []string, insertions *[]insertion) { + for i := 0; i < len(groupsNode.Content); i += 2 { + groupKey := groupsNode.Content[i] + groupValue := groupsNode.Content[i+1] + + groupName := groupKey.Value + nodesToAdd, ok := addGroupHosts[groupName] + if !ok || len(nodesToAdd) == 0 { + continue + } + + if groupValue.Kind != yaml.MappingNode { + continue + } + + // Find hosts key in group + for j := 0; j < len(groupValue.Content); j += 2 { + hostKey := groupValue.Content[j] + hostValue := groupValue.Content[j+1] + + if hostKey.Value == "hosts" && hostValue.Kind == yaml.SequenceNode { + // Read existing hosts from file content + existingNodes := make(map[string]struct{}) + for _, node := range hostValue.Content { + if node.Kind == yaml.ScalarNode { + existingNodes[node.Value] = struct{}{} + } + } + + var newNodes []string + for _, node := range nodesToAdd { + if _, exists := existingNodes[node]; !exists { + newNodes = append(newNodes, node) + } + } + + if len(newNodes) > 0 { + var lineIdx int + var indent string + + if len(hostValue.Content) > 0 { + // Get the last node in the hosts list + lastNode := hostValue.Content[len(hostValue.Content)-1] + lineIdx = lastNode.Line - 1 // Convert to 0-based + } else { + // Empty hosts list, insert after hosts: line + lineIdx = hostKey.Line - 1 // Convert to 0-based + } + + // Get indent from the actual line in the file + if lineIdx >= 0 && lineIdx < len(lines) { + baseIndent := getIndentFromLine(lines[lineIdx]) + if len(hostValue.Content) > 0 { + indent = baseIndent // Use same indent as existing items + } else { + // Add extra indentation for new list 
items (2 more spaces) + indent = baseIndent + " " + } + } else { + indent = " " // fallback: 8 spaces + } + + *insertions = append(*insertions, insertion{ + groupName: groupName, + lineNum: lineIdx, + indent: indent, + nodes: newNodes, + }) + } + } + } + } +} + +// sortInsertions sorts insertions by line number in descending order +func sortInsertions(insertions []insertion) { + for i := 0; i < len(insertions)-1; i++ { + for j := i + 1; j < len(insertions); j++ { + if insertions[i].lineNum < insertions[j].lineNum { + insertions[i], insertions[j] = insertions[j], insertions[i] + } + } + } +} + // complete updates the configuration with container manager and kubernetes version settings func (o *AddNodeOptions) complete() error { if _, ok, _ := unstructured.NestedFieldNoCopy(o.Config.Value(), "kubernetes", "kube_version"); !ok { @@ -108,39 +325,47 @@ func (o *AddNodeOptions) complete() error { } } - var addNodes []string + o.addGroupHosts = make(map[string][]string) groups := variable.ConvertGroup(*o.Inventory) - // add nodes to control_plane group - if o.ControlPlane != "" { - for _, node := range strings.Split(o.ControlPlane, ",") { - if !slices.Contains(groups[_const.VariableGroupsAll], node) { - return errors.Errorf("%q is not defined in inventory.", node) - } - if !slices.Contains(groups[defaultGroupControlPlane], node) { - group := o.Inventory.Spec.Groups[defaultGroupControlPlane] - group.Hosts = append(group.Hosts, node) - o.Inventory.Spec.Groups[defaultGroupControlPlane] = group - } - addNodes = append(addNodes, node) - } + + // add nodes to groups + if err := o.addNodesToGroup(o.ControlPlane, defaultGroupControlPlane, groups, o.addGroupHosts); err != nil { + return err } - // add nodes to worker group - if o.Worker != "" { - for _, node := range strings.Split(o.Worker, ",") { - if !slices.Contains(groups[_const.VariableGroupsAll], node) { - return errors.Errorf("%q is not defined in inventory.", node) - } - if !slices.Contains(groups[defaultGroupWorker], node) { - group := o.Inventory.Spec.Groups[defaultGroupWorker] - group.Hosts = append(group.Hosts, node) - o.Inventory.Spec.Groups[defaultGroupWorker] = group - } - addNodes = append(addNodes, node) + if err := o.addNodesToGroup(o.Worker, defaultGroupWorker, groups, o.addGroupHosts); err != nil { + return err + } + if err := o.addNodesToGroup(o.Etcd, defaultGroupEtcd, groups, o.addGroupHosts); err != nil { + return err + } + + // collect unique addNodes + addNodesSet := make(map[string]struct{}) + for _, nodes := range o.addGroupHosts { + for _, node := range nodes { + addNodesSet[node] = struct{}{} } } + addNodes := make([]string, 0, len(addNodesSet)) + for node := range addNodesSet { + addNodes = append(addNodes, node) + } + if err := unstructured.SetNestedStringSlice(o.Config.Value(), addNodes, "add_nodes"); err != nil { return errors.Wrapf(err, "failed to set %q to config", "add_nodes") } return nil } + +// OverrideInventory updates the inventory.yaml file after successful execution. +// This should be called only when the Run method succeeds and Override flag is set. 
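+// It is safe to call unconditionally: without --override, or with no recorded
+// group changes, it returns nil without touching the file.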
+func (o *AddNodeOptions) OverrideInventory() error { + // Only update inventory file when --override flag is set + if !o.Override || o.InventoryFile == "" || len(o.addGroupHosts) == 0 { + return nil + } + + groups := variable.ConvertGroup(*o.Inventory) + return o.updateInventoryFile(o.addGroupHosts, groups) +} diff --git a/cmd/kk/app/options/builtin/add_test.go b/cmd/kk/app/options/builtin/add_test.go new file mode 100644 index 000000000..46a2aae36 --- /dev/null +++ b/cmd/kk/app/options/builtin/add_test.go @@ -0,0 +1,336 @@ +//go:build builtin +// +build builtin + +/* +Copyright 2025 The KubeSphere Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package builtin + +import ( + "os" + "path/filepath" + "strings" + "testing" + + kkcorev1 "github.com/kubesphere/kubekey/api/core/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/kubesphere/kubekey/v4/cmd/kk/app/options" +) + +func TestUpdateInventoryFile(t *testing.T) { + tests := []struct { + name string + inventory string + addGroupHosts map[string][]string + expectedOutput string + wantErr bool + }{ + { + name: "add node to group with standard indentation (6 spaces)", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + etcd: + hosts: + - node1 +`, + addGroupHosts: map[string][]string{ + "kube_worker": {"node2"}, + }, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + - node2 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + { + name: "add node to group with 4-space indentation (no space after hosts)", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + etcd: + hosts: + - node1 +`, + addGroupHosts: map[string][]string{ + "kube_worker": {"node2"}, + }, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + - node2 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + + { + name: "add node to empty group with standard indentation", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: [] + etcd: + hosts: + - node1 +`, + addGroupHosts: map[string][]string{ + "kube_worker": {"node2"}, + }, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: [] + - 
node2 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + { + name: "add multiple nodes to different groups", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + etcd: + hosts: + - node1 +`, + addGroupHosts: map[string][]string{ + "kube_worker": {"node2", "node3"}, + "etcd": {"node2"}, + }, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node1 + - node2 + - node3 + etcd: + hosts: + - node1 + - node2 +`, + wantErr: false, + }, + { + name: "add node to group with 4-space indentation in empty group", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: [] + etcd: + hosts: + - node1 +`, + addGroupHosts: map[string][]string{ + "kube_worker": {"node2"}, + }, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: [] + - node2 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create temporary directory + tmpDir := t.TempDir() + inventoryFile := filepath.Join(tmpDir, "inventory.yaml") + + // Write initial inventory + err := os.WriteFile(inventoryFile, []byte(tt.inventory), 0644) + require.NoError(t, err) + + // Create AddNodeOptions with inventory + o := &AddNodeOptions{ + CommonOptions: options.CommonOptions{ + InventoryFile: inventoryFile, + Inventory: &kkcorev1.Inventory{ + Spec: kkcorev1.InventorySpec{ + Hosts: kkcorev1.InventoryHost{ + "node1": {Raw: []byte("{}")}, + "node2": {Raw: []byte("{}")}, + "node3": {Raw: []byte("{}")}, + }, + Groups: map[string]kkcorev1.InventoryGroup{ + "kube_control_plane": {Hosts: []string{"node1"}}, + "kube_worker": {Hosts: []string{"node1"}}, + "etcd": {Hosts: []string{"node1"}}, + }, + }, + }, + }, + } + + // Prepare existing groups + existingGroups := map[string][]string{ + "all": {"node1", "node2", "node3"}, + "kube_control_plane": {"node1"}, + "kube_worker": {"node1"}, + "etcd": {"node1"}, + } + + // Call the function + err = o.updateInventoryFile(tt.addGroupHosts, existingGroups) + + if tt.wantErr { + assert.Error(t, err) + return + } + + require.NoError(t, err) + + // Read the result + content, err := os.ReadFile(inventoryFile) + require.NoError(t, err) + + // Compare (normalize line endings) + actual := strings.TrimSpace(string(content)) + expected := strings.TrimSpace(tt.expectedOutput) + assert.Equal(t, expected, actual) + }) + } +} diff --git a/cmd/kk/app/options/builtin/builtin.go b/cmd/kk/app/options/builtin/builtin.go index 389ccc311..bec56ce9a 100644 --- a/cmd/kk/app/options/builtin/builtin.go +++ b/cmd/kk/app/options/builtin/builtin.go @@ -33,12 +33,13 @@ import ( ) const ( - defaultKubeVersion = "v1.33.1" + defaultKubeVersion = "v1.34.3" ) const ( defaultGroupControlPlane = "kube_control_plane" defaultGroupWorker = "kube_worker" + defaultGroupEtcd = "etcd" ) var getInventory options.InventoryFunc = func() (*kkcorev1.Inventory, error) { diff --git a/cmd/kk/app/options/builtin/delete.go b/cmd/kk/app/options/builtin/delete.go 
index 50096f068..0556b379c 100644 --- a/cmd/kk/app/options/builtin/delete.go +++ b/cmd/kk/app/options/builtin/delete.go @@ -21,6 +21,9 @@ package builtin import ( "fmt" + "os" + "slices" + "strings" "github.com/cockroachdb/errors" kkcorev1 "github.com/kubesphere/kubekey/api/core/v1" @@ -30,6 +33,8 @@ import ( cliflag "k8s.io/component-base/cli/flag" "github.com/kubesphere/kubekey/v4/cmd/kk/app/options" + _const "github.com/kubesphere/kubekey/v4/pkg/const" + "github.com/kubesphere/kubekey/v4/pkg/variable" ) // ====================================================================================== @@ -159,6 +164,11 @@ type DeleteNodesOptions struct { Kubernetes string DeleteAllComponents bool DeleteData bool + // Override indicates whether to override the inventory file after successful execution. + // When set to true, the inventory.yaml file will be updated. + Override bool + // deleteNodes stores the nodes to be deleted for later inventory update + deleteNodes []string } // Flags returns the flag sets for DeleteNodesOptions @@ -169,6 +179,7 @@ func (o *DeleteNodesOptions) Flags() cliflag.NamedFlagSets { kfs.StringVar(&o.Kubernetes, "with-kubernetes", o.Kubernetes, fmt.Sprintf("Specify a supported version of kubernetes. default is %s", o.Kubernetes)) kfs.BoolVar(&o.DeleteAllComponents, "all", o.DeleteAllComponents, "Delete all cluster components, including cri, etcd, dns, and the image registry.") kfs.BoolVar(&o.DeleteData, "with-data", o.DeleteData, "Also delete data directories (harbor data, registry data, etc.). Use with caution.") + kfs.BoolVar(&o.Override, "override", o.Override, "Override the inventory file after successful execution") return fss } @@ -210,6 +221,9 @@ func (o *DeleteNodesOptions) Complete(cmd *cobra.Command, args []string) (*kkcor // completeConfig updates the configuration with container manager settings func (o *DeleteNodesOptions) completeConfig(nodes []string) error { + // Store nodes for later inventory update + o.deleteNodes = nodes + // If kube_version is not set in config, set it to the specified Kubernetes version if _, ok, _ := unstructured.NestedFieldNoCopy(o.Config.Value(), "kubernetes", "kube_version"); !ok { if err := unstructured.SetNestedField(o.Config.Value(), o.Kubernetes, "kubernetes", "kube_version"); err != nil { @@ -220,6 +234,7 @@ func (o *DeleteNodesOptions) completeConfig(nodes []string) error { if err := unstructured.SetNestedStringSlice(o.Config.Value(), nodes, "delete_nodes"); err != nil { return errors.Wrapf(err, "failed to set %q to config", "delete_nodes") } + if o.DeleteAllComponents { if err := unstructured.SetNestedField(o.Config.Value(), true, "delete", "cri"); err != nil { return errors.Wrapf(err, "failed to set %q to config", "delete_cri") @@ -243,6 +258,150 @@ func (o *DeleteNodesOptions) completeConfig(nodes []string) error { return nil } +// OverrideInventory updates the inventory.yaml file after successful execution. +// This should be called only when the Run method succeeds and Override flag is set. 
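+// As with adding nodes, this is a no-op unless --override was passed and at
+// least one node was recorded for deletion.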
+func (o *DeleteNodesOptions) OverrideInventory() error {
+	// Only update inventory file when --override flag is set
+	if !o.Override || o.InventoryFile == "" || len(o.deleteNodes) == 0 {
+		return nil
+	}
+
+	return o.removeNodesFromInventoryFile(o.deleteNodes)
+}
+
+// removeNodesFromInventoryFile removes nodes from inventory groups (kube_control_plane, kube_worker, etcd)
+// and updates the inventory.yaml file while preserving comments and formatting
+func (o *DeleteNodesOptions) removeNodesFromInventoryFile(nodes []string) error {
+	// Read original file content
+	content, err := os.ReadFile(o.InventoryFile)
+	if err != nil {
+		return errors.Wrapf(err, "failed to read inventory file %s", o.InventoryFile)
+	}
+	lines := strings.Split(string(content), "\n")
+
+	// Get groups from inventory
+	groups := variable.ConvertGroup(*o.Inventory)
+
+	// Find which groups contain the nodes to be deleted
+	deleteGroupHosts := make(map[string][]string)
+
+	// Check if etcd should be deleted from config
+	deleteEtcd, _, _ := unstructured.NestedBool(o.Config.Value(), "delete", "etcd")
+
+	for _, node := range nodes {
+		// Check if node exists in inventory
+		if !slices.Contains(groups[_const.VariableGroupsAll], node) {
+			return errors.Errorf("%q is not defined in inventory.", node)
+		}
+		// Check each group for the node
+		for _, groupName := range []string{defaultGroupControlPlane, defaultGroupWorker, defaultGroupEtcd} {
+			// Only delete from etcd group if delete.etcd is true in config
+			if groupName == defaultGroupEtcd && !deleteEtcd {
+				continue
+			}
+			if slices.Contains(groups[groupName], node) {
+				if _, ok := deleteGroupHosts[groupName]; !ok {
+					deleteGroupHosts[groupName] = []string{}
+				}
+				deleteGroupHosts[groupName] = append(deleteGroupHosts[groupName], node)
+				// Remove node from the group in memory
+				group := o.Inventory.Spec.Groups[groupName]
+				group.Hosts = slices.DeleteFunc(group.Hosts, func(h string) bool {
+					return h == node
+				})
+				o.Inventory.Spec.Groups[groupName] = group
+			}
+		}
+	}
+
+	if len(deleteGroupHosts) == 0 {
+		return nil
+	}
+
+	// Find and remove lines containing the nodes from groups
+	linesToRemove := o.findNodesToRemove(lines, deleteGroupHosts)
+	if len(linesToRemove) == 0 {
+		return nil
+	}
+
+	// Sort lines to remove in descending order to avoid index shifting
+	slices.SortFunc(linesToRemove, func(a, b int) int {
+		return b - a
+	})
+
+	// Remove lines
+	for _, lineNum := range linesToRemove {
+		if lineNum >= 0 && lineNum < len(lines) {
+			lines = append(lines[:lineNum], lines[lineNum+1:]...)
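+			// each removal is a whole-line splice, so surrounding comments,
+			// blank lines, and indentation in inventory.yaml are preserved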
+ } + } + + // Write back + output := strings.Join(lines, "\n") + return errors.Wrapf(os.WriteFile(o.InventoryFile, []byte(output), _const.PermFilePublic), + "failed to write inventory file %s", o.InventoryFile) +} + +// findNodesToRemove finds the line numbers of nodes that should be removed from groups +func (o *DeleteNodesOptions) findNodesToRemove(lines []string, deleteGroupHosts map[string][]string) []int { + var linesToRemove []int + currentGroup := "" + inHosts := false + + for i, line := range lines { + trimmed := strings.TrimSpace(line) + + // Check if we're entering a group + for groupName := range deleteGroupHosts { + if trimmed == groupName+":" { + currentGroup = groupName + inHosts = false + break + } + } + + // Check if we're in the hosts section of a group + if currentGroup != "" && strings.TrimSpace(trimmed) == "hosts:" { + inHosts = true + continue + } + + // Check if this line contains a node to remove + if inHosts && currentGroup != "" { + if nodes, ok := deleteGroupHosts[currentGroup]; ok { + // Check if this line is a list item with a node to remove + for _, node := range nodes { + // Match patterns like "- node1" or " - node1" + if strings.TrimSpace(trimmed) == "- "+node { + linesToRemove = append(linesToRemove, i) + break + } + } + } + } + + // If we encounter a new top-level key, reset the group context + if !strings.HasPrefix(line, " ") && !strings.HasPrefix(line, "\t") && trimmed != "" && !strings.HasPrefix(trimmed, "#") { + if currentGroup != "" && !strings.HasPrefix(trimmed, currentGroup) { + // Check if this is a new group or section + isGroup := false + for groupName := range deleteGroupHosts { + if strings.HasPrefix(trimmed, groupName) { + isGroup = true + break + } + } + if !isGroup && !strings.HasPrefix(trimmed, "hosts") { + currentGroup = "" + inHosts = false + } + } + } + } + + return linesToRemove +} + // ====================================================================================== // delete registry // ====================================================================================== diff --git a/cmd/kk/app/options/builtin/delete_test.go b/cmd/kk/app/options/builtin/delete_test.go new file mode 100644 index 000000000..663c0c143 --- /dev/null +++ b/cmd/kk/app/options/builtin/delete_test.go @@ -0,0 +1,309 @@ +//go:build builtin +// +build builtin + +/* +Copyright 2025 The KubeSphere Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package builtin + +import ( + "os" + "path/filepath" + "strings" + "testing" + + kkcorev1 "github.com/kubesphere/kubekey/api/core/v1" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/kubesphere/kubekey/v4/cmd/kk/app/options" +) + +func TestRemoveNodesFromInventoryFile(t *testing.T) { + tests := []struct { + name string + inventory string + nodesToRemove []string + expectedOutput string + wantErr bool + errMsg string + }{ + { + name: "remove node from single group", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + - node2 + kube_worker: + hosts: + - node2 + - node3 + etcd: + hosts: + - node1 + - node2 +`, + nodesToRemove: []string{"node2"}, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node3 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + { + name: "remove multiple nodes from different groups", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + - node2 + kube_worker: + hosts: + - node2 + - node3 + etcd: + hosts: + - node1 + - node2 +`, + nodesToRemove: []string{"node2", "node3"}, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + { + name: "remove non-existent node", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + groups: + kube_control_plane: + hosts: + - node1 +`, + nodesToRemove: []string{"node999"}, + wantErr: true, + errMsg: "not defined in inventory", + }, + { + name: "remove node with 4-space indentation (no space after hosts)", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + - node2 + kube_worker: + hosts: + - node2 + - node3 + etcd: + hosts: + - node1 + - node2 +`, + nodesToRemove: []string{"node2"}, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node3 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + { + name: "remove node with 5-space indentation (1 space after hosts)", + inventory: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + - node2 + kube_worker: + hosts: + - node2 + - node3 + etcd: + hosts: + - node1 + - node2 +`, + nodesToRemove: []string{"node2"}, + expectedOutput: `apiVersion: kubekey.kubesphere.io/v1 +kind: Inventory +metadata: + name: default +spec: + hosts: + node1: + node2: + node3: + groups: + kube_control_plane: + hosts: + - node1 + kube_worker: + hosts: + - node3 + etcd: + hosts: + - node1 +`, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Create temporary 
directory + tmpDir := t.TempDir() + inventoryFile := filepath.Join(tmpDir, "inventory.yaml") + + // Write initial inventory + err := os.WriteFile(inventoryFile, []byte(tt.inventory), 0644) + require.NoError(t, err) + + // Create DeleteNodesOptions with inventory + o := &DeleteNodesOptions{ + CommonOptions: options.CommonOptions{ + InventoryFile: inventoryFile, + Inventory: &kkcorev1.Inventory{ + Spec: kkcorev1.InventorySpec{ + Hosts: kkcorev1.InventoryHost{ + "node1": {Raw: []byte("{}")}, + "node2": {Raw: []byte("{}")}, + "node3": {Raw: []byte("{}")}, + }, + Groups: map[string]kkcorev1.InventoryGroup{ + "kube_control_plane": {Hosts: []string{"node1", "node2"}}, + "kube_worker": {Hosts: []string{"node2", "node3"}}, + "etcd": {Hosts: []string{"node1", "node2"}}, + }, + }, + }, + }, + } + + // Call the function + err = o.removeNodesFromInventoryFile(tt.nodesToRemove) + + if tt.wantErr { + assert.Error(t, err) + if tt.errMsg != "" { + assert.Contains(t, err.Error(), tt.errMsg) + } + return + } + + require.NoError(t, err) + + // Read the result + content, err := os.ReadFile(inventoryFile) + require.NoError(t, err) + + // Compare (normalize line endings) + actual := strings.TrimSpace(string(content)) + expected := strings.TrimSpace(tt.expectedOutput) + assert.Equal(t, expected, actual) + }) + } +} diff --git a/pkg/executor/task_executor.go b/pkg/executor/task_executor.go index 6092c5b93..d02ac5223 100644 --- a/pkg/executor/task_executor.go +++ b/pkg/executor/task_executor.go @@ -2,6 +2,7 @@ package executor import ( "context" + "encoding/json" "fmt" "os" "strings" @@ -12,7 +13,7 @@ import ( "github.com/schollz/progressbar/v3" "gopkg.in/yaml.v3" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/util/json" + k8sjson "k8s.io/apimachinery/pkg/util/json" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/klog/v2" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" @@ -210,7 +211,7 @@ func (e *taskExecutor) execTaskHost(i int, h string) func(ctx context.Context) { var rawItem runtime.RawExtension if rendered != nil { - if bs, err := json.Marshal(rendered); err == nil { + if bs, err := k8sjson.Marshal(rendered); err == nil { rawItem = runtime.RawExtension{Raw: bs} } } @@ -456,7 +457,10 @@ func (e *taskExecutor) dealRegister(host string, loopResults []kkcorev1alpha1.Lo switch e.task.Spec.RegisterType { case "json": // Attempt to unmarshal as JSON. - if err := json.Unmarshal([]byte(s), &out); err != nil { + // Use Decoder with UseNumber() to preserve large integers precision. + decoder := json.NewDecoder(strings.NewReader(s)) + decoder.UseNumber() + if err := decoder.Decode(&out); err != nil { klog.V(5).ErrorS(err, "failed to register json value") return s } diff --git a/plugins/roles/etcd/backup/tasks/main.yaml b/plugins/roles/etcd/backup/tasks/main.yaml index a5a3bd714..839f5faec 100644 --- a/plugins/roles/etcd/backup/tasks/main.yaml +++ b/plugins/roles/etcd/backup/tasks/main.yaml @@ -1,6 +1,10 @@ --- - name: Generate backup from etcd command: | + unset ETCDCTL_ENDPOINTS + unset ETCDCTL_KEY + unset ETCDCTL_CERT + unset ETCDCTL_CACERT if [ ! 
-d /tmp/kubekey/etcd/ ]; then
       mkdir -p /tmp/kubekey/etcd/
     fi
diff --git a/plugins/roles/etcd/restore/tasks/main.yaml b/plugins/roles/etcd/restore/tasks/main.yaml
index 01a4a7c0d..3cb616267 100644
--- a/plugins/roles/etcd/restore/tasks/main.yaml
+++ b/plugins/roles/etcd/restore/tasks/main.yaml
@@ -13,6 +13,10 @@
 
 - name: Restore etcd by snapshot
   command: |
+    unset ETCDCTL_ENDPOINTS
+    unset ETCDCTL_KEY
+    unset ETCDCTL_CERT
+    unset ETCDCTL_CACERT
     export $(cat /etc/etcd.env | grep ETCDCTL_CACERT)
     export $(cat /etc/etcd.env | grep ETCDCTL_CERT)
     export $(cat /etc/etcd.env | grep ETCDCTL_KEY)
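+    # Note: ETCDCTL_ENDPOINTS is left unset here; the restore reads a local
+    # snapshot file, so only the TLS material from /etc/etcd.env is re-exported.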