From cec7c54fd70628ab5c9d99bf036c5c222fa2f6de Mon Sep 17 00:00:00 2001 From: Marian Krcmarik Date: Mon, 9 Dec 2024 13:16:21 +0100 Subject: [PATCH] DNM ci_dcn_site: Add scaling down of a DCN site --- playbooks/dcn.yml | 21 ++ roles/ci_dcn_site/README.md | 1 + roles/ci_dcn_site/defaults/main.yml | 1 + roles/ci_dcn_site/tasks/deploy_site.yml | 34 ++ roles/ci_dcn_site/tasks/main.yml | 23 +- roles/ci_dcn_site/tasks/scaledown_site.yml | 308 ++++++++++++++++++ .../templates/service-values.yaml.j2 | 22 +- 7 files changed, 391 insertions(+), 19 deletions(-) create mode 100644 roles/ci_dcn_site/tasks/deploy_site.yml create mode 100644 roles/ci_dcn_site/tasks/scaledown_site.yml diff --git a/playbooks/dcn.yml b/playbooks/dcn.yml index 862b81da93..1f2220c324 100644 --- a/playbooks/dcn.yml +++ b/playbooks/dcn.yml @@ -63,6 +63,27 @@ - _ceph_bootstrap_node != '' + # Appended to the existing condition list: a second `when` key would replace it. + - cifmw_ci_dcn_site_scaledown_az | default("", true) == "" ansible.builtin.include_role: name: ci_dcn_site + + # Static AZ -> inventory group map; its key order also yields the subnet index below. + - name: Map each availability zone to its inventory group of compute nodes + ansible.builtin.set_fact: + az_to_group_map: + az0: computes + az1: dcn1-computes + az2: dcn2-computes + + - name: Scaledown the {{ cifmw_ci_dcn_site_scaledown_az }} site + vars: + _az_to_scaledown: "{{ cifmw_ci_dcn_site_scaledown_az }}" + _subnet: "subnet{{ (az_to_group_map | dict2items | map(attribute='key') | list) + | ansible.utils.index_of('eq', cifmw_ci_dcn_site_scaledown_az) + 1 }}" + _group_name: "{{ az_to_group_map[cifmw_ci_dcn_site_scaledown_az] }}" + _group_hosts: "{{ groups[az_to_group_map[cifmw_ci_dcn_site_scaledown_az]] }}" + ansible.builtin.include_role: + name: ci_dcn_site + when: cifmw_ci_dcn_site_scaledown_az | default("", true) != "" - name: Find all created CRs ansible.builtin.find: diff --git a/roles/ci_dcn_site/README.md b/roles/ci_dcn_site/README.md index 562474432c..a6ac97c9a3 100644 --- a/roles/ci_dcn_site/README.md +++ b/roles/ci_dcn_site/README.md @@ -12,6 +12,7 @@ with a 
collocated Ceph cluster. ## Parameters * `_az`: The name of the availability zone for the AZ, e.g. `az1` +* `_az_to_scaledown`: The name of the already deployed availability zone to scale down, e.g. `az1`. Leave it empty (the default) to deploy a site instead. * `_group_name`: The name of the group of nodes to be deployed, e.g. `dcn1-computes` * `_subnet`: The name of the subnet the DCN site will use, e.g. `subnet2` * `_subnet_network_range`: The range of the subnet the DCN site will use, e.g. `192.168.133.0/24` diff --git a/roles/ci_dcn_site/defaults/main.yml b/roles/ci_dcn_site/defaults/main.yml index ea30f552b7..857c256f9a 100644 --- a/roles/ci_dcn_site/defaults/main.yml +++ b/roles/ci_dcn_site/defaults/main.yml @@ -26,3 +26,4 @@ ci_dcn_site_search_storagemgmt_network_names: - "storagemgmtdcn1" - "storagemgmtdcn2" cifmw_ci_dcn_site_enable_network_az: false +_az_to_scaledown: "" diff --git a/roles/ci_dcn_site/tasks/deploy_site.yml b/roles/ci_dcn_site/tasks/deploy_site.yml new file mode 100644 index 0000000000..e4bcd93c09 --- /dev/null +++ b/roles/ci_dcn_site/tasks/deploy_site.yml @@ -0,0 +1,34 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ +- name: Render and apply pre-ceph CRs in DCN context + ansible.builtin.include_tasks: pre-ceph.yml + +- name: Deploy Ceph in DCN context + ansible.builtin.include_tasks: ceph.yml + +- name: Render and apply post-ceph CRs in DCN context + ansible.builtin.include_tasks: post-ceph.yml + +- name: Run Nova cell discovery for new DCN hosts + kubernetes.core.k8s_exec: + api_key: "{{ _auth_results.openshift_auth.api_key }}" + namespace: openstack + pod: nova-cell0-conductor-0 + command: nova-manage cell_v2 discover_hosts --verbose + +- name: Create new AZ and add new hosts to it + ansible.builtin.include_tasks: az.yml diff --git a/roles/ci_dcn_site/tasks/main.yml b/roles/ci_dcn_site/tasks/main.yml index d449ba9838..074aae0713 100644 --- a/roles/ci_dcn_site/tasks/main.yml +++ b/roles/ci_dcn_site/tasks/main.yml @@ -22,21 +22,10 @@ - name: Set Network related facts ansible.builtin.include_tasks: set_network_facts.yml -- name: Render and apply pre-ceph CRs in DCN context - ansible.builtin.include_tasks: pre-ceph.yml +- name: Deploy a DCN site + ansible.builtin.include_tasks: deploy_site.yml + when: _az_to_scaledown | default("", true) == "" -- name: Deploy Ceph in DCN context - ansible.builtin.include_tasks: ceph.yml - -- name: Render and apply post-ceph CRs in DCN context - ansible.builtin.include_tasks: post-ceph.yml - -- name: Run Nova cell discovery for new DCN hosts - kubernetes.core.k8s_exec: - api_key: "{{ _auth_results.openshift_auth.api_key }}" - namespace: openstack - pod: nova-cell0-conductor-0 - command: nova-manage cell_v2 discover_hosts --verbose - -- name: Create new AZ and add new hosts to it - ansible.builtin.include_tasks: az.yml +- name: Scale a DCN site down + ansible.builtin.include_tasks: scaledown_site.yml + when: _az_to_scaledown | default("", true) != "" diff --git a/roles/ci_dcn_site/tasks/scaledown_site.yml b/roles/ci_dcn_site/tasks/scaledown_site.yml new file mode 100644 index 0000000000..d7d2d00cfd --- /dev/null +++ 
b/roles/ci_dcn_site/tasks/scaledown_site.yml
@@ -0,0 +1,308 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Scale down the DCN site of availability zone `_az_to_scaledown`: remove it
+# from Nova, re-render the control plane without it, rebuild the Ceph secret
+# and finally delete the site's EDPM NodeSet.
+
+- name: Get compute nodes from the host aggregate
+  register: az_hosts
+  ignore_errors: true  # tolerated: the aggregate tasks below are skipped on failure
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate show {{ _az_to_scaledown }} -c hosts -f value
+
+- name: Get compute nodes from the scale-downed AZ
+  register: az_compute_hosts
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      sh -c "openstack compute service list -c Host -c Zone -f value | grep {{ _az_to_scaledown }} | awk '{print $1}'"
+
+- name: Convert az_hosts string to list and remove extra text
+  ansible.builtin.set_fact:
+    # An empty reply parses to null, so fall back to an empty list.
+    az_hosts_list: >-
+      {{ az_hosts.stdout | default('', true) | from_yaml
+      | default([], true) | list }}
+  when: az_hosts is not failed
+
+- name: Delete the compute nodes from the aggregate
+  # `loop` is templated before `when`, so it needs a default for the skipped case.
+  loop: "{{ az_hosts_list | default([]) }}"
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate remove host {{ _az_to_scaledown }} {{ item }}
+  when: az_hosts is not failed
+
+- name: Delete the host aggregate
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack aggregate delete {{ _az_to_scaledown }}
+  when: az_hosts is not failed
+
+- name: Disable the compute service on scale-downed compute nodes
+  loop: "{{ az_compute_hosts.stdout_lines }}"
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack compute service set {{ item }} nova-compute --disable
+
+- name: Find all ceph variable files
+  register: _ceph_vars_files
+  ansible.builtin.find:
+    paths: "/tmp"
+    patterns: "ceph_client_az*.yml"
+    recurse: false
+
+- name: Load all ceph vars from files
+  loop: "{{ _ceph_vars_files.files | map(attribute='path') | list }}"
+  register: _ceph_vars
+  ansible.builtin.include_vars:
+    file: "{{ item }}"
+
+- name: Combine ceph variables into a list of dictionaries
+  loop: "{{ _ceph_vars.results }}"
+  ansible.builtin.set_fact:
+    _ceph_vars_list: "{{ _ceph_vars_list | default([]) | union([item.ansible_facts]) }}"
+
+- name: Define _all_azs list for all Ceph backends
+  ansible.builtin.set_fact:
+    # Rebuilt from scratch so that a re-run cannot append duplicate AZ names.
+    _all_azs: "{{ _ceph_vars_list | map(attribute='cifmw_ceph_client_cluster') | list }}"
+
+- name: The map for az0 contains all AZ backends
+  ansible.builtin.set_fact:
+    ci_dcn_site_glance_map: "{{ { 'az0': _all_azs } }}"
+
+- name: The map for AZs other than az0 contains backends for az0 and itself
+  loop: "{{ _all_azs }}"
+  when: item != "az0"
+  ansible.builtin.set_fact:
+    ci_dcn_site_glance_map: "{{ ci_dcn_site_glance_map | combine( { item: ['az0', item ] } ) }}"
+
+- name: List instances which are running on the scale-downed AZ
+  register: osp_instances
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack server list --availability-zone {{ _az_to_scaledown }} --all-projects -f value -c ID
+
+- name: Clean the running instances from the AZ up before deleting the hosts from Cell
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    command: >-
+      openstack server delete --force {{ item }}
+  loop: "{{ osp_instances.stdout_lines }}"
+
+- name: Get the Cell UUID
+  register: cell_uuid
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: nova-cell0-conductor-0
+    command: >-
+      sh -c "nova-manage cell_v2 list_hosts | grep {{ az_compute_hosts.stdout_lines[0] }} | awk '{print $4}'"
+
+- name: Remove the compute hosts from the cell
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: nova-cell0-conductor-0
+    command: >-
+      nova-manage cell_v2 delete_host --cell_uuid {{ cell_uuid.stdout | trim }} --host {{ item }}
+  loop: "{{ az_compute_hosts.stdout_lines }}"
+
+- name: Remove the compute cell
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: nova-cell0-conductor-0
+    command: >-
+      nova-manage cell_v2 delete_cell --cell_uuid {{ cell_uuid.stdout | trim }}
+
+- name: Render the scale-downed control plane service-values.yaml
+  ansible.builtin.template:
+    mode: "0644"
+    backup: true
+    src: "templates/service-values.yaml.j2"
+    dest: "{{ ci_dcn_site_arch_path }}/control-plane/scaledown/service-values.yaml"
+
+- name: Kustomize scale-downed OpenStackControlPlane
+  ansible.builtin.set_fact:
+    scaledown_controlplane_cr: >-
+      {{ lookup('kubernetes.core.kustomize',
+      dir=ci_dcn_site_arch_path + '/control-plane/scaledown') }}
+
+- name: Save the scale-downed OpenStackControlPlane CR
+  ansible.builtin.copy:
+    mode: "0644"
+    dest: "{{ ci_dcn_site_arch_path }}/control-plane-scale-downed_{{ _az_to_scaledown }}.yaml"
+    content: "{{ scaledown_controlplane_cr }}"
+    backup: true
+
+- name: Apply the scale-downed OpenStackControlPlane CR
+  register: result
+  retries: 5
+  delay: 10
+  until: result is not failed
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    state: present
+    apply: true
+    src: "{{ ci_dcn_site_arch_path }}/control-plane-scale-downed_{{ _az_to_scaledown }}.yaml"
+
+- name: Delete rabbitmqcluster
+  vars:
+    az_to_cell_map:
+      az0: cell1
+      az1: cell2
+      az2: cell3
+  ansible.builtin.command:
+    cmd: oc delete rabbitmqclusters -n openstack --ignore-not-found rabbitmq-{{ az_to_cell_map[_az_to_scaledown] }}
+
+- name: Delete the cinder-volume service
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: cinder-scheduler-0
+    command: >-
+      cinder-manage service remove cinder-volume cinder-volume-{{ _az_to_scaledown }}-0@ceph
+
+- name: Fetch ceph-conf-files secret
+  register: secret_info
+  kubernetes.core.k8s_info:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    kind: Secret
+    name: ceph-conf-files
+    namespace: openstack
+
+- name: Save secret data to files
+  ansible.builtin.copy:
+    content: "{{ secret_info.resources[0].data[key] | b64decode | regex_replace('(?m)^\\s*\\n', '') }}"
+    dest: "/tmp/{{ key }}"
+    mode: "0600"  # decoded Secret data: readable by the owner only
+  loop: "{{ secret_info.resources[0].data.keys() | list }}"
+  loop_control:
+    loop_var: key
+
+- name: Delete the Ceph cluster's secrets of removed cluster and default site cluster
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    kind: Secret
+    name: "{{ item }}"
+    namespace: openstack
+    state: absent
+  loop:
+    - "ceph-conf-files-{{ _az_to_scaledown }}"
+    - "ceph-conf-files"
+
+- name: Find the Ceph files saved from the ceph-conf-files secret
+  register: all_ceph_conf_files
+  ansible.builtin.find:
+    paths: "/tmp"
+    patterns: "az*.c*"
+    recurse: false
+
+- name: Build the --from-file arguments for the remaining Ceph clusters
+  vars:
+    file_list: "{{ all_ceph_conf_files.files | map(attribute='path') | reject('search', _az_to_scaledown) | list }}"
+  ansible.builtin.set_fact:
+    ceph_conf_cmdline: "{{ file_list | map('regex_replace', '^', '--from-file=') | join(' ') }}"
+
+- name: Recreate the secret while omitting deleted ceph cluster
+  ansible.builtin.command:
+    cmd: oc create secret generic ceph-conf-files -n openstack {{ ceph_conf_cmdline }}
+
+- name: Stop the ovn and nova-compute services
+  ansible.builtin.service:
+    name: "{{ item.0 }}"
+    state: stopped
+  become: true
+  delegate_to: "{{ item.1 }}"
+  # Service-major order: each service is stopped on every node before the next one.
+  loop: >-
+    {{ ['edpm_ovn_controller', 'edpm_ovn_metadata_agent', 'edpm_nova_compute']
+    | product(groups[_group_name]) | list }}
+
+- name: Remove the systemd unit files of the ovn and nova-compute containers
+  ansible.builtin.file:
+    path: "/etc/systemd/system/{{ item.0 }}.service"
+    state: absent
+  become: true
+  delegate_to: "{{ item.1 }}"
+  loop: >-
+    {{ ['edpm_ovn_controller', 'edpm_ovn_metadata_agent', 'edpm_nova_compute']
+    | product(groups[_group_name]) | list }}
+
+- name: Delete the network agents on scale-downed compute nodes
+  kubernetes.core.k8s_exec:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    namespace: openstack
+    pod: openstackclient
+    # xargs -r: run nothing when the host has no agent left to delete.
+    command: >-
+      sh -c "openstack network agent list --host {{ item }} -c ID -f value | xargs -r openstack network agent delete"
+  loop: "{{ az_compute_hosts.stdout_lines }}"
+
+- name: Fetch OpenStackDataPlaneNodeSet resource
+  register: osdpns_info
+  kubernetes.core.k8s_info:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    api_version: dataplane.openstack.org/v1beta1
+    kind: OpenStackDataPlaneNodeSet
+    name: "{{ _group_name }}-edpm"
+    namespace: openstack
+
+- name: Delete OpenStackDataPlaneNodeSet
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    api_version: dataplane.openstack.org/v1beta1
+    state: absent
+    kind: OpenStackDataPlaneNodeSet
+    name: "{{ _group_name }}-edpm"
+    namespace: openstack
+
+- name: Delete each Secret which contains TLS certificate for the NodeSet nodes
+  kubernetes.core.k8s:
+    api_key: "{{ _auth_results.openshift_auth.api_key }}"
+    kind: Secret
+    name: "{{ item }}"
+    namespace: openstack
+    state: absent
+  # Yields no items, instead of failing, when the NodeSet was not found.
+  loop: >-
+    {{ osdpns_info.resources
+    | map(attribute='status.secretHashes', default={})
+    | map('list') | flatten
+    | select('search', 'cert') | list }}
diff --git a/roles/ci_dcn_site/templates/service-values.yaml.j2 b/roles/ci_dcn_site/templates/service-values.yaml.j2 index 4c4d684c37..953fcac428 100644 --- a/roles/ci_dcn_site/templates/service-values.yaml.j2 +++ b/roles/ci_dcn_site/templates/service-values.yaml.j2 @@ -26,6 +26,7 @@ data: storage_availability_zone = az0 cinderVolumes: {% for _ceph in _ceph_vars_list %} +{% if _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %} {{ _ceph.cifmw_ceph_client_cluster }}: customServiceConfig: | [DEFAULT] @@ -41,6 +42,7 @@ data: rbd_secret_uuid = {{ _ceph.cifmw_ceph_client_fsid }} rbd_cluster_name = {{ _ceph.cifmw_ceph_client_cluster }} backend_availability_zone = {{ _ceph.cifmw_ceph_client_cluster }} +{% endif %} {% endfor %} galera: templates: @@ -50,7 +52,11 @@ data: storageRequest: 5G {% for index in range(1, _all_azs | length + 1) %} openstack-cell{{ index }}: +{% if "az" ~ (index - 1) != _az_to_scaledown %} replicas: 1 +{% else %} + replicas: 0 +{% endif %} secret: osp-secret storageRequest: 5G {% endfor %} @@ -58,20 +64,25 @@ data: keystoneEndpoint: az0 glanceAPIs: {% for _ceph in _ceph_vars_list %} +{% if _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %} {{ _ceph.cifmw_ceph_client_cluster }}: customServiceConfig: | [DEFAULT] enabled_import_methods = [web-download,copy-image,glance-direct] - enabled_backends = {{ ci_dcn_site_glance_map[_ceph.cifmw_ceph_client_cluster] | join(':rbd,') + ':rbd'}} + enabled_backends = {{ ci_dcn_site_glance_map[_ceph.cifmw_ceph_client_cluster] + | reject('equalto', _az_to_scaledown) + | join(':rbd,') + ':rbd' }} [glance_store] default_backend = {{ _ceph.cifmw_ceph_client_cluster }} {% for _ceph_az in ci_dcn_site_glance_map[_ceph.cifmw_ceph_client_cluster] %} +{% 
if _ceph_az != _az_to_scaledown %} [{{ _ceph_az }}] rbd_store_ceph_conf = /etc/ceph/{{ _ceph_az }}.conf store_description = "{{ _ceph_az }} RBD backend" rbd_store_pool = images rbd_store_user = openstack rbd_thin_provisioning = True +{% endif %} {% endfor %} networkAttachments: - storage @@ -92,6 +103,7 @@ data: replicas: 1 type: edge {% endif %} +{% endif %} {% endfor %} manila: enabled: false @@ -152,6 +164,7 @@ data: cellDatabaseAccount: nova-cell0 hasAPIAccess: true {% for index in range(1, _all_azs | length + 1) %} +{% if "az" ~ (index - 1) != _az_to_scaledown %} cell{{ index }}: cellDatabaseInstance: openstack-cell{{ index }} cellDatabaseAccount: nova-cell{{ index }} @@ -171,6 +184,7 @@ data: spec: type: LoadBalancer replicas: 3 +{% endif %} {% endfor %} rabbitmq: templates: @@ -194,7 +208,11 @@ data: metallb.universe.tf/loadBalancerIPs: 172.17.0.8{{ 5 + index }} spec: type: LoadBalancer +{% if "az" ~ (index - 1) != _az_to_scaledown %} replicas: 3 +{% else %} + replicas: 0 +{% endif %} {% endfor %} extraMounts: - name: v1 @@ -215,7 +233,7 @@ data: mountPath: /etc/ceph readOnly: true {% for _ceph in _ceph_vars_list %} -{% if _ceph.cifmw_ceph_client_cluster != 'az0' %} +{% if _ceph.cifmw_ceph_client_cluster != 'az0' and _ceph.cifmw_ceph_client_cluster != _az_to_scaledown %} - propagation: - {{ _ceph.cifmw_ceph_client_cluster }} extraVolType: Ceph