Skip to content

Commit

Permalink
Add preflight OS, NIC, and other checks
Browse files Browse the repository at this point in the history
- Implemented OS preflight checks to validate system requirements before Ceph cluster creation.
- Checks include:
  - OS version (RHEL 9+ required)
  - SELinux enforcing mode
  - Firewalld installation and status
  - Required package availability (rpcbind, podman, firewalld)
  - Podman version check (>= 3.3)
  - RHEL software profile validation
  - Tuned profile check
  - CPU, RAM, swap, and filesystem checks (part of the other checks)
  - Check whether jumbo frames are enabled
  - Check whether the NIC is configured with DHCP or a static IP
  - Check whether the NIC bandwidth is sufficient
  - Collect and output the NIC options currently set (e.g. bonding, not bridged or virtual)
  - Check and report network latency (ping) to all hosts provided in the inventory file
  - List all NICs

Signed-off-by: Kushal Deb <Kushal.Deb@ibm.com>
  • Loading branch information
Kushal-deb committed Feb 19, 2025
1 parent 1d3efbc commit 11764b3
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 0 deletions.
2 changes: 2 additions & 0 deletions ceph_defaults/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ infra_pkgs:
- podman
- lvm2
- sos
- rpcbind
- firewalld
client_group: clients
12 changes: 12 additions & 0 deletions cephadm-preflight.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@
# Import the variable validation playbook.
- name: "variables validations"
  ansible.builtin.import_playbook: "validate/preflight.yml"

# Import the preflight checks playbook (preflight-checks.yml).
- name: "Run Preflight Checks"
  ansible.builtin.import_playbook: "preflight-checks.yml"

- hosts: all
become: true
gather_facts: true
vars:
preflight_results: []
repos_4_to_disable:
- rhceph-4-tools-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
- rhceph-4-mon-for-rhel-{{ ansible_facts['distribution_major_version'] }}-{{ ansible_facts['architecture'] }}-rpms
Expand Down Expand Up @@ -214,6 +218,14 @@
state: started
enabled: true

# Best-effort: failed_when is disabled, so a failure to start/enable firewalld
# does not stop the play; the module result is kept in firewall_status.
- name: Ensure firewalld is enabled and running
  ansible.builtin.systemd:
    name: firewalld
    enabled: true
    state: started
  failed_when: false
  register: firewall_status

- name: Ubuntu related tasks
when: ansible_facts['distribution'] == 'Ubuntu'
block:
Expand Down
213 changes: 213 additions & 0 deletions preflight-checks.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
---
# Preflight checks for a Ceph deployment.
#
# Each check contributes a {'Check', 'Result', 'Reason'} entry to
# preflight_results and, when it fails, a label to preflight_failures.
# A report is rendered from preflight_report.j2 and the play fails at the end
# on every host whose preflight_failures list is not empty.
- name: Preflight Checks for Ceph Deployment
  hosts: all
  become: true
  gather_facts: true

  tasks:
    - name: Initialize preflight results list
      ansible.builtin.set_fact:
        preflight_results: []
        preflight_failures: []

    # Provides infra_pkgs, the list of packages the package check expects.
    - name: import_role ceph_defaults
      ansible.builtin.import_role:
        name: ceph_defaults

    - name: Collect installed package facts
      ansible.builtin.package_facts:
        manager: auto

    # The reason is derived from the same condition as the result, so a
    # non-RedHat distribution with a major version >= 9 no longer reports
    # FAIL together with an 'N/A' reason.
    - name: Check if OS is RHEL 9+
      vars:
        os_supported: >-
          {{ ansible_facts['distribution'] == 'RedHat'
             and ansible_facts['distribution_major_version'] | int >= 9 }}
      ansible.builtin.set_fact:
        os_check: "{{ 'PASS' if os_supported | bool else 'FAIL' }}"
        os_reason: >-
          {{ 'N/A' if os_supported | bool
             else 'Ceph requires RHEL 9+. Detected: ' ~ ansible_facts['distribution'] ~ ' ' ~ ansible_facts['distribution_version'] }}

    # failed_when is disabled so that an enforcement failure ends up in the
    # report instead of aborting the play before the report is produced.
    - name: Ensure SELinux is set to Enforcing mode
      ansible.posix.selinux:
        policy: targeted
        state: enforcing
      register: selinux_status
      changed_when: false
      failed_when: false

    # Judge by the module result rather than ansible_facts['selinux'], which
    # was gathered before the task above ran. A pending reboot means the host
    # is not enforcing yet.
    - name: Determine SELinux Check Result
      vars:
        selinux_enforcing: >-
          {{ selinux_status.state | default('') == 'enforcing'
             and not (selinux_status.reboot_required | default(false) | bool) }}
      ansible.builtin.set_fact:
        selinux_check: "{{ 'PASS' if selinux_enforcing | bool else 'FAIL' }}"
        selinux_reason: >-
          {{ 'N/A' if selinux_enforcing | bool
             else 'SELinux was not in enforcing mode and could not be enforced automatically' }}

    - name: Determine Package Installation Check Result
      vars:
        missing_pkgs: "{{ infra_pkgs | difference(ansible_facts.packages.keys() | list) }}"
      ansible.builtin.set_fact:
        package_check: "{{ 'PASS' if missing_pkgs | length == 0 else 'FAIL' }}"
        package_reason: >-
          {{ 'N/A' if missing_pkgs | length == 0
             else 'Missing packages: ' ~ (missing_pkgs | join(', ')) }}

    # Best-effort start; the outcome is judged from the service facts below.
    - name: Fetch Firewalld status
      ansible.builtin.systemd:
        name: firewalld
        state: started
      register: firewall_status
      changed_when: false
      failed_when: false

    # Read the service state after the start attempt. firewall_status cannot be
    # used for this: with failed_when disabled its 'failed' flag is always
    # false, and it has no 'status' key when the unit does not exist.
    - name: Collect service facts to confirm Firewalld state
      ansible.builtin.service_facts: {}

    - name: Determine Firewalld Check Result
      vars:
        firewalld_state: >-
          {{ (ansible_facts.services | default({})).get('firewalld.service', {}).get('state', 'missing') }}
      ansible.builtin.set_fact:
        firewalld_check: "{{ 'PASS' if firewalld_state == 'running' else 'FAIL' }}"
        firewalld_reason: >-
          {{ 'N/A' if firewalld_state == 'running'
             else 'Firewalld was not running and could not be started' }}

    - name: Extract Podman version if installed
      ansible.builtin.set_fact:
        podman_version: >-
          {{ ansible_facts.packages['podman'][0].version if 'podman' in ansible_facts.packages else '0.0' }}

    - name: Determine if Podman meets version requirement (>=3.3)
      vars:
        podman_ok: >-
          {{ 'podman' in ansible_facts.packages
             and podman_version | string is version('3.3', '>=') }}
      ansible.builtin.set_fact:
        podman_check: "{{ 'PASS' if podman_ok | bool else 'FAIL' }}"
        podman_reason: >-
          {{ 'Podman is not installed, required for Ceph' if 'podman' not in ansible_facts.packages
             else ('N/A' if podman_ok | bool
                   else 'Podman version is ' ~ podman_version ~ ', required: >= 3.3') }}

    # NOTE(review): 'Server' / 'File and Storage Server' look like installation
    # environment group names; confirm that `subscription-manager list
    # --consumed` is the right source for them.
    - name: Validate RHEL software profile
      ansible.builtin.command: subscription-manager list --consumed
      register: rhel_profile
      changed_when: false
      failed_when: false

    - name: Define RHEL Profile Check Result
      vars:
        profile_out: "{{ rhel_profile.stdout | default('') }}"
      ansible.builtin.set_fact:
        rhel_profile_check: >-
          {{ 'PASS' if ('Server' in profile_out and 'File and Storage Server' in profile_out) else 'FAIL' }}

    - name: Define RHEL Profile Check Reason
      ansible.builtin.set_fact:
        rhel_profile_reason: >-
          {{ 'Incorrect RHEL software profile. Expected: Server with File and Storage Server.'
             if rhel_profile_check == 'FAIL' else 'N/A' }}

    - name: Get current tuned profile
      ansible.builtin.command: tuned-adm active
      register: tuned_profile
      changed_when: false
      failed_when: false

    - name: Define Tuned Profile Check Result
      ansible.builtin.set_fact:
        tuned_profile_check: >-
          {{ 'PASS' if 'throughput-performance' in (tuned_profile.stdout | default('')) else 'FAIL' }}

    - name: Define Tuned Profile Check Reason
      ansible.builtin.set_fact:
        tuned_profile_reason: >-
          {{ 'Incorrect tuned profile. Expected: throughput-performance'
             if tuned_profile_check == 'FAIL' else 'N/A' }}

    # x86-64-v2 is defined by the feature flags listed in the next task; AVX2
    # belongs to x86-64-v3, so testing for it rejects valid v2 CPUs.
    - name: Check CPU x86-64-v2 support
      ansible.builtin.command: grep -m1 '^flags' /proc/cpuinfo
      register: cpu_supports_x86_64_v2
      changed_when: false
      failed_when: false

    - name: Define CPU, RAM, Swap, and Filesystem Check Variables
      vars:
        # /proc/cpuinfo names of the x86-64-v2 features (pni is SSE3).
        x86_64_v2_flags: [cx16, lahf_lm, pni, popcnt, sse4_1, sse4_2, ssse3]
        # The microarchitecture level only applies to x86_64 hosts.
        missing_cpu_flags: >-
          {{ [] if ansible_facts['architecture'] != 'x86_64'
             else (x86_64_v2_flags | difference((cpu_supports_x86_64_v2.stdout | default('')).split())) }}
        required_swap_mb: "{{ ((ansible_facts['memtotal_mb'] | int * 1.5) | round) | int }}"
        has_var_mount: >-
          {{ ansible_facts['mounts'] | selectattr('mount', 'equalto', '/var') | list | length > 0 }}
        root_fs_gb: >-
          {{ (ansible_facts['mounts'] | selectattr('mount', 'equalto', '/')
              | map(attribute='size_total') | first | default(0) | int) // 1024**3 }}
      ansible.builtin.set_fact:
        cpu_checks:
          x86_64_v2:
            result: "{{ 'PASS' if missing_cpu_flags | length == 0 else 'FAIL' }}"
            reason: >-
              {{ 'N/A' if missing_cpu_flags | length == 0
                 else 'CPU lacks x86-64-v2 features required by RHEL 9: ' ~ (missing_cpu_flags | join(', ')) }}
          cores:
            result: "{{ 'PASS' if ansible_facts['processor_vcpus'] | int >= 4 else 'FAIL' }}"
            reason: >-
              {{ 'N/A' if ansible_facts['processor_vcpus'] | int >= 4
                 else 'System has only ' ~ ansible_facts['processor_vcpus'] ~ ' cores, required: 4' }}

        memory_checks:
          ram:
            result: "{{ 'PASS' if ansible_facts['memtotal_mb'] | int >= 8192 else 'FAIL' }}"
            reason: >-
              {{ 'N/A' if ansible_facts['memtotal_mb'] | int >= 8192
                 else 'System has only ' ~ ansible_facts['memtotal_mb'] ~ ' MB RAM, required: 8192MB' }}
          swap:
            required: "{{ required_swap_mb | int }}"
            actual: "{{ ansible_facts['swaptotal_mb'] | int }}"
            result: >-
              {{ 'PASS' if ansible_facts['swaptotal_mb'] | int >= required_swap_mb | int else 'FAIL' }}
            reason: >-
              {{ 'N/A' if ansible_facts['swaptotal_mb'] | int >= required_swap_mb | int
                 else 'System has only ' ~ ansible_facts['swaptotal_mb'] ~ ' MB Swap, required: ' ~ required_swap_mb ~ ' MB' }}

        filesystem_checks:
          var_partition:
            result: "{{ 'PASS' if has_var_mount | bool else 'FAIL' }}"
            reason: "{{ 'N/A' if has_var_mount | bool else '/var is not a separate partition' }}"
          root_fs:
            size_gb: "{{ root_fs_gb | int }}"
            result: "{{ 'PASS' if root_fs_gb | int >= 100 else 'FAIL' }}"
            reason: >-
              {{ 'N/A' if root_fs_gb | int >= 100
                 else 'Root FS is only ' ~ root_fs_gb ~ 'GB, required: 100GB' }}

    # Runs on the current host (no delegation) so that the value is the latency
    # from this host to each inventory host, not each host pinging itself.
    - name: Ping all hosts in inventory to measure latency
      ansible.builtin.shell: >-
        ping -c 4 {{ hostvars[item]['ansible_host'] | default(item) | quote }}
        | grep 'rtt min/avg/max/mdev' | awk -F'/' '{print $5}'
      register: ping_results
      changed_when: false
      failed_when: false
      loop: "{{ groups['all'] }}"

    # `ip` marks addresses that have a finite lifetime (as handed out by DHCP)
    # with the 'dynamic' flag; a default gateway alone says nothing about DHCP.
    - name: Inspect IPv4 addresses on the primary NIC
      ansible.builtin.command:
        argv:
          - ip
          - "-4"
          - "-o"
          - addr
          - show
          - dev
          - "{{ ansible_facts['default_ipv4']['interface'] }}"
      register: primary_nic_addrs
      changed_when: false
      failed_when: false
      when: ansible_facts['default_ipv4']['interface'] is defined

    - name: Define networking facts
      vars:
        nic_name: "{{ ansible_facts['default_ipv4']['interface'] | default('') }}"
        # Per-interface fact keys use '_' where the interface name has '-'.
        nic_fact_key: "{{ nic_name | replace('-', '_') }}"
      ansible.builtin.set_fact:
        primary_nic: "{{ nic_name | default('Unknown', true) }}"
        primary_mtu: "{{ ansible_facts.get(nic_fact_key, {}).get('mtu', 0) | int }}"
        primary_speed: "{{ ansible_facts.get(nic_fact_key, {}).get('speed', -1) | int }}"
        primary_dhcp: >-
          {{ 'dhcp' if ' dynamic ' in (primary_nic_addrs.stdout | default('')) else 'manual' }}
        network_interfaces: "{{ ansible_facts['interfaces'] | difference(['lo']) }}"

    - name: Store all preflight check results
      vars:
        # One speed per interface, 'n/a' where the NIC exposes no speed fact.
        nic_speed_summary: >-
          {%- set speeds = [] -%}
          {%- for nic in network_interfaces | default([]) -%}
          {%- set _ = speeds.append(ansible_facts.get(nic | replace('-', '_'), {}).get('speed', 'n/a') | string) -%}
          {%- endfor -%}
          {{ speeds | join(', ') }}
        # 'host=avg_rtt' per target; 'unreachable' when ping produced no rtt line.
        latency_summary: >-
          {%- set rows = [] -%}
          {%- for res in ping_results.results | default([]) -%}
          {%- set _ = rows.append(res['item'] ~ '=' ~ (res.stdout | default('') | trim | default('unreachable', true))) -%}
          {%- endfor -%}
          {{ rows | join(', ') }}
      ansible.builtin.set_fact:
        preflight_results: >-
          {{ preflight_results + [
            {'Check': 'OS Version', 'Result': os_check, 'Reason': os_reason},
            {'Check': 'Tuned Profile', 'Result': tuned_profile_check, 'Reason': tuned_profile_reason},
            {'Check': 'RHEL Profile', 'Result': rhel_profile_check, 'Reason': rhel_profile_reason},
            {'Check': 'Firewalld Running', 'Result': firewalld_check, 'Reason': firewalld_reason},
            {'Check': 'Podman Installed', 'Result': podman_check, 'Reason': podman_reason},
            {'Check': 'SELinux', 'Result': selinux_check, 'Reason': selinux_reason},
            {'Check': 'Required Packages Installed', 'Result': package_check, 'Reason': package_reason},
            {'Check': 'Minimum RAM (8GB)', 'Result': memory_checks['ram']['result'],
             'Reason': memory_checks['ram']['reason']},
            {'Check': 'Swap Space (1.5x RAM)', 'Result': memory_checks['swap']['result'],
             'Reason': memory_checks['swap']['reason']},
            {'Check': 'CPU x86-64-v2', 'Result': cpu_checks['x86_64_v2']['result'],
             'Reason': cpu_checks['x86_64_v2']['reason']},
            {'Check': 'CPU Cores >= 4', 'Result': cpu_checks['cores']['result'],
             'Reason': cpu_checks['cores']['reason']},
            {'Check': '/var is a separate partition', 'Result': filesystem_checks['var_partition']['result'],
             'Reason': filesystem_checks['var_partition']['reason']},
            {'Check': 'Root Filesystem >= 100GB', 'Result': filesystem_checks['root_fs']['result'],
             'Reason': filesystem_checks['root_fs']['reason']},
            {'Check': 'NIC Configuration', 'Result': 'INFO',
             'Reason': 'Available network interfaces: ' ~ (network_interfaces | default([]) | join(', ')) ~
                       ' | Speeds (Mbps): ' ~ nic_speed_summary},
            {'Check': 'Jumbo Frames Enabled', 'Result': ('PASS' if (primary_mtu | int) > 1500 else 'FAIL'),
             'Reason': ('MTU is ' ~ (primary_mtu | int) ~ ', recommended > 1500'
                        if (primary_mtu | int) <= 1500 else 'N/A')},
            {'Check': 'NIC Static IP Configuration', 'Result': ('PASS' if primary_dhcp == 'manual' else 'FAIL'),
             'Reason': ('NIC is using DHCP, static IP is recommended' if primary_dhcp != 'manual' else 'N/A')},
            {'Check': 'NIC Bandwidth (10GbE Recommended)',
             'Result': ('PASS' if (primary_speed | int) >= 10000 else 'FAIL'),
             'Reason': ('NIC speed is ' ~ primary_speed ~ ' Mbps, recommended is 10GbE'
                        if (primary_speed | int) < 10000 else 'N/A')},
            {'Check': 'Network Latency', 'Result': 'INFO',
             'Reason': 'Average latency (ms): ' ~ latency_summary}
          ] }}

        # Every branch contributes either its label or an empty list.
        preflight_failures: >-
          {{ preflight_failures +
            (['OS Version'] if os_check == 'FAIL' else []) +
            (['Tuned Profile'] if tuned_profile_check == 'FAIL' else []) +
            (['RHEL Profile'] if rhel_profile_check == 'FAIL' else []) +
            (['SELinux'] if selinux_check == 'FAIL' else []) +
            (['Required Packages'] if package_check == 'FAIL' else []) +
            (['Firewalld Running'] if firewalld_check == 'FAIL' else []) +
            (['Podman Installed'] if podman_check == 'FAIL' else []) +
            (['Minimum RAM'] if memory_checks['ram']['result'] == 'FAIL' else []) +
            (['Swap Space'] if memory_checks['swap']['result'] == 'FAIL' else []) +
            (['CPU x86-64-v2'] if cpu_checks['x86_64_v2']['result'] == 'FAIL' else []) +
            (['CPU Cores'] if cpu_checks['cores']['result'] == 'FAIL' else []) +
            (['/var Partition'] if filesystem_checks['var_partition']['result'] == 'FAIL' else []) +
            (['Root Filesystem'] if filesystem_checks['root_fs']['result'] == 'FAIL' else []) +
            (['Jumbo Frames Enabled'] if primary_mtu | int <= 1500 else []) +
            (['NIC Static IP Configuration'] if primary_dhcp != 'manual' else []) +
            (['NIC Bandwidth'] if primary_speed | int < 10000 else [])
          }}

    # NOTE(review): with run_once the template is rendered a single time, so
    # the file presumably reflects only one host's results - confirm whether a
    # per-host report file is wanted.
    - name: Generate preflight check report file
      ansible.builtin.template:
        src: preflight_report.j2
        dest: ./preflight_report.txt
      delegate_to: localhost
      run_once: true
      become: false

    - name: Load the preflight check report
      ansible.builtin.set_fact:
        report_content: "{{ lookup('template', 'preflight_report.j2') }}"

    # Reuses the already rendered report instead of rendering the template again.
    - name: Show Preflight Check Report
      ansible.builtin.debug:
        msg: "{{ report_content }}"

    - name: Final Check - Fail if any critical checks failed
      ansible.builtin.fail:
        msg: >-
          Preflight checks failed for the following: {{ preflight_failures | join(', ') }}.
          Please resolve these issues before proceeding.
      when: preflight_failures | length > 0

25 changes: 25 additions & 0 deletions templates/preflight_report.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{# Plain-text report: one entry per element of preflight_results, then a summary based on preflight_failures. -#}
==================================================
** Preflight Check Report **
==================================================

System Checks
--------------------------------------------------
{% for entry in preflight_results %}
- {{ entry['Check'] }}: {% if entry['Result'] == 'PASS' %}✅ Passed{% elif entry['Result'] == 'FAIL' %}❌ Failed{% else %}ℹ️ INFO{% endif %}

{% if entry['Result'] in ['FAIL', 'INFO'] %}
- Reason: {{ entry['Reason'] }}
{% endif %}

{% endfor %}
==================================================
** Summary **
--------------------------------------------------
{% if preflight_failures | length == 0 %}
✅ All Critical Checks Passed! You are good to go.
{% else %}
❌ Critical Failures Detected:
- {{ preflight_failures | join(', ') }}

** Action Required: Please resolve these issues before proceeding.
{% endif %}

0 comments on commit 11764b3

Please sign in to comment.