pve admin tooling

This commit is contained in:
2025-08-02 16:49:43 +03:00
parent 898d733ebf
commit d4ebd93920
10 changed files with 254 additions and 1 deletion

View File

@@ -1,6 +1,32 @@
# Proxmox Virtual Environment
User-facing docs: https://wiki.k-space.ee/en/hosting/proxmox
## Adding new node
1. Upgrade existing nodes.
1. Install new nodes:
    - Hostname `pveXX.proxmox.infra.k-space.ee`
    - Boot disk ZRAID-1
    - An address in the 172.21 range or DHCP may be used as the initial IP; the installer configuration will be overwritten by cluster join and Ansible.
1. Add `non-free-firmware` as a component in `/etc/apt/sources.list` for the Debian (not PVE) bookworm, bookworm-updates, and bookworm-security entries (next to `main` and `contrib`); see the example after this list.
1. Upgrade the new nodes.
    - (unsure if still needed nowadays: disabling pve-enterprise and enabling pve-no-subscription)
1. Add the new node to DNS (secretspace/ns1) and Ansible.
1. Apply Ansible and reboot.
1. `$ systemctl status watchdog-mux` should say `Watchdog driver 'IPMI', version 1` and NOT `Software Watchdog`.
1. Join the cluster in UI → Datacenter.
    - The IP to use is the last one listed, the IPv6 address on vmbr0. <!-- TODO: might have changed -->
1. `$ passwd` on the new node.
1. `$ vim ~/.ssh/authorized_keys` → sort in the new key. **Keys are managed manually**, since PVE manages the file as well.
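
For the `non-free-firmware` step, the resulting `/etc/apt/sources.list` entries should look roughly like this (a sketch; the exact mirror URLs on our nodes may differ):

```
deb http://deb.debian.org/debian bookworm main contrib non-free-firmware
deb http://deb.debian.org/debian bookworm-updates main contrib non-free-firmware
deb http://security.debian.org/debian-security bookworm-security main contrib non-free-firmware
```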
- TODO: prometheus node exporter
- TODO: create-external-cluster-resources.py in pve90
- TODO: PVE backup server. We want local snapshots and offsite.
- TODO: reinstate restic for /etc and /root
- TODO: d12 discard
## K-SPACE Hyper-Converged CEPH setup
> [!WARNING]
> K-SPACE Kubernetes uses PVE's CEPH cluster; k8s pools are not visible in the general PVE UI.
1. Configure a mesh network

View File

@@ -0,0 +1,7 @@
#!/bin/bash
# Announce node boot via Telegram; tgmsg comes from telegram.env (installed by Ansible).
source /root/telegram.env
tgmsg 'booted; Check nomigrate Start'
sleep 300 # minimum uptime for nomigrate kube VMs before taking another node offline
tgmsg "$(uptime -p)"

View File

@@ -0,0 +1,32 @@
#!/bin/bash
# Exit nonzero if VMs are still running on a node (e.g. before reboot/maintenance).
# USAGE: $0 [includingNomigrate (0|1)] [node (default: this host)]
# By default VMs tagged `nomigrate` are ignored; pass 1 to count them too.
set -euo pipefail
if ! command -v jq >/dev/null; then
    echo "jq not found"
    exit 1
fi
includingNomigrate=0
if [[ "$#" -gt 0 ]]; then
    includingNomigrate="$1"
fi
host="$(hostname)"
if [[ "$#" -gt 1 ]]; then
    host="$2"
fi
function running_ids {
    if [[ "$includingNomigrate" == 1 ]]; then
        pvesh get "/nodes/${host}/qemu" --output-format json |\
            jq -r 'map(select( .status == "running" ) | .vmid) | sort | @csv'
    else
        # Tags are a ;-separated string; `// ""` guards against untagged VMs.
        pvesh get "/nodes/${host}/qemu" --output-format json |\
            jq -r 'map(select( .status == "running" and (.tags // "" | split(";") | all(.!="nomigrate")) ) | .vmid) | sort | @csv'
    fi
}
running="$(running_ids)"
if [[ "$running" != "" ]]; then
    echo "ERROR: VMs running on $host: $running"
    exit 1
fi
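# Example (hypothetical invocation; installed under /root/admin_scripts by Ansible):
#   ./check_vms_stopped.sh 1 pve91   # also count nomigrate VMs, on node pve91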

View File

@@ -0,0 +1,4 @@
#!/bin/bash
set -euo pipefail
# List all users/groups granted the K-SPACE_VM_USER role anywhere in the ACL tree.
pvesh get /access/acl --output-format json | jq -r '.[] | select(.roleid == "K-SPACE_VM_USER") | .ugid' | sort -u

View File

@@ -0,0 +1,25 @@
#!/bin/bash
# Shut down all running VMs tagged `nomigrate` on a node, in parallel.
# USAGE: $0 [node (default: $HOSTNAME)]
#set -e
if ! command -v jq >/dev/null; then
    echo "jq not found"
    exit 1
fi
host="$HOSTNAME"
if [[ "$#" -ge 1 ]]; then
    host="$1"
fi
function shutdown_ids {
    pvesh get "/nodes/${host}/qemu" --output-format json |\
        jq -r 'map(select( .status == "running" and (.tags // "" | split(";") | any(.=="nomigrate")) ) | .vmid)[]'
}
# Process substitution keeps the loop in this shell, so `wait` below actually
# waits for the backgrounded pvesh calls (a pipeline would run it in a subshell).
while IFS= read -r vmid; do
    pvesh create "/nodes/${host}/qemu/${vmid}/status/shutdown" -timeout 1 &
    sleep 1 # stagger the API calls
done < <(shutdown_ids)
wait

View File

@@ -0,0 +1,54 @@
#!/bin/bash
set -e
# https://wiki.k-space.ee/en/hosting/proxmox
# The base image is not pulled via virt-builder from Debian (see below); the
# whole script will probably be replaced by a self-service suite anyway.
if [[ "$#" -ne 2 ]]; then
echo "ERROR: expected exactly 2 arguments"
echo "USAGE: $0 <storage> <vmid>"
exit 1
fi
storage="$1"
vmid="$2"
img=debian-12
ident=d12.v2
size=100G
# will error if vmid exists
qm create "$vmid" \
    --cpu x86-64-v3 --numa 1 \
    --cores 16 --vcpus 8 \
    --memory 4096 \
    --scsihw virtio-scsi-pci \
    --ide0 none,media=cdrom --ide2 "$storage":cloudinit,format=raw \
    --boot order='ide0;scsi0' \
    --serial0 socket --vga serial0 \
    --net0 virtio,bridge=vmbr0 \
    --ipconfig0 ip='193.40.103.99/24',gw='193.40.103.1',ip6='2001:bb8:4008:20::99/64',gw6='2001:bb8:4008:20::1' \
    --searchdomain zoo.k-space.ee --nameserver '1.1.1.1 8.8.8.8' \
    --ostype l26 --hotplug disk,network,usb,memory,cpu --onboot 1 \
    --agent 1,fstrim_cloned_disks=1 \
    --name "$ident.$HOSTNAME" --description "https://wiki.k-space.ee/en/hosting/proxmox"$'\n\n'"Base template: $ident"$'\n\n'"User: UNDOCUMENTED"
# TODO: the virt-builder debian-12 image is outdated, so fetch the official
# cloud image directly as a drop-in replacement:
wget 'https://cdimage.debian.org/images/cloud/bookworm/latest/debian-12-genericcloud-amd64.raw' -O "$img".img
function cleanup {
rm "$img".img
}
trap cleanup EXIT
# Preseed unattended-upgrades and add udev rules that bring hotplugged
# CPUs/memory online automatically.
virt-customize -a "$img".img \
    --root-password disabled \
    --install qemu-guest-agent,sshguard \
    --run-command 'echo "unattended-upgrades unattended-upgrades/enable_auto_updates boolean true" | debconf-set-selections' \
    --run-command 'dpkg-reconfigure -f noninteractive unattended-upgrades' \
    --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online}="1"' \
    --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="memory", ACTION=="add", TEST=="state", ATTR{state}=="offline", ATTR{state}="online"'
qm set "$vmid" --scsi0 "$storage":0,import-from="$PWD/$img.img",discard=on,ssd=1
qm disk resize "$vmid" scsi0 "$size"
qm template "$vmid"
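# A possible next step (sketch, not part of this script; new vmid 123 and the
# name are hypothetical):
#   qm clone "$vmid" 123 --name example-vm --full --storage "$storage"
#   qm start 123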

View File

@@ -19,8 +19,94 @@
        dest: /etc/network/interfaces
      notify: reload networking
    - name: admin convenience packages
      tags: dep
      apt:
        state: latest
        pkg:
          - byobu
          - mosh
          - vim
          - ncdu
          - htop
          # - git
    - name: scripting dependencies
      tags: dep
      apt:
        state: latest
        pkg:
          - jq
          - yq
          - curl
          - guestfs-tools
          - restic
    # Adding the non-free-firmware component is currently left manual: it is
    # hard to do reliably across upgrades, the sources format will change with
    # the next major upgrade, and we are not planning to add new nodes at the moment.
    - name: CPU microcode (Intel)
      tags: dep
      when: "'GenuineIntel' in ansible_processor"
      apt:
        state: latest
        pkg: intel-microcode
    - name: CPU microcode (AMD)
      tags: dep
      when: "'AuthenticAMD' in ansible_processor"
      apt:
        state: latest
        pkg: amd64-microcode
    - name: enable hardware watchdog
      tags: dep
      ansible.builtin.lineinfile:
        path: /etc/default/pve-ha-manager
        regexp: 'WATCHDOG_MODULE=ipmi_watchdog$'
        line: 'WATCHDOG_MODULE=ipmi_watchdog'
    - name: dedup on rpool
      ansible.builtin.shell: zfs set dedup=on rpool
    # https://forum.proxmox.com/threads/problem-activating-memory-hotplug.66790/
    # https://lists.proxmox.com/pipermail/pve-devel/2016-December/024519.html
    # Reproducible in 2020, 2022, and 2025.
    - name: increase max_mem_regions
      ansible.builtin.copy:
        content: 'options vhost max_mem_regions=512'
        dest: /etc/modprobe.d/vhost.conf
  handlers:
    - name: reload networking
      ansible.builtin.systemd_service:
        name: networking.service
        state: reloaded

- name: PVE admin tooling
  hosts: proxmox
  tasks:
    - name: README
      ansible.builtin.copy:
        content: |
          https://git.k-space.ee/k-space/ansible/src/branch/main/proxmox
          https://wiki.k-space.ee/en/hosting/proxmox
        dest: /root/README
    - name: admin_scripts directory
      ansible.builtin.copy:
        src: admin_scripts/
        dest: /root/admin_scripts/
    - name: load secrets
      ansible.builtin.include_vars:
        file: ../secrets/pve-telegram.yaml
    - name: install telegram.env
      ansible.builtin.template:
        src: templates/telegram.env.j2
        dest: /root/telegram.env
    - name: install broadcast_reboot.service
      ansible.builtin.copy:
        src: templates/broadcast_reboot.service
        dest: /etc/systemd/system/broadcast_reboot.service
    - name: enable broadcast_reboot.service
      ansible.builtin.systemd_service:
        name: broadcast_reboot.service
        daemon_reload: true
        enabled: true
        state: started
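
# Typical apply (sketch; the playbook filename and inventory path are assumptions):
#   ansible-playbook -i inventory proxmox.yaml
#   ansible-playbook -i inventory proxmox.yaml --tags dep   # dependency tasks only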

View File

@@ -0,0 +1,10 @@
[Unit]
Description=Broadcasts boot
# Wants= actually pulls the target in; After= alone only orders against it.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/root/admin_scripts/broadcast_reboot.sh

[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Telegram notification helper; token and chat id are templated in by Ansible.
tg_token='{{ tgtoken }}'
chatid='{{ tgchatid }}'
function tgmsg {
    local clustername
    clustername="$(pvesh get /cluster/status --output-format json | jq -r '.[] | select(.id == "cluster") | .name')"
    # NOTE: $1 is spliced into the JSON verbatim; avoid quotes and backslashes in messages.
    curl -X POST -H 'Content-Type: application/json' \
        -d '{"chat_id": "'"${chatid}"'", "text": "'"$HOSTNAME@$clustername: $1"'"}' \
        "https://api.telegram.org/bot${tg_token}/sendMessage"
}
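# Usage once sourced (as broadcast_reboot.sh does):
#   source /root/telegram.env
#   tgmsg 'maintenance starting'   # hypothetical message text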