pve admin tooling
@@ -1,6 +1,32 @@
# Proxmox Virtual Environment
User-facing docs: https://wiki.k-space.ee/en/hosting/proxmox

## K-Space Hyper Converged CEPH setup
## Adding new node
1. Upgrade existing nodes.
1. Install new nodes:
    - Hostname `pveXX.proxmox.infra.k-space.ee`
    - Boot disk ZRAID-1
    - 172.21 or DHCP may be used as the initial IP. The installer configuration will be overwritten by cluster join and Ansible.
1. Add `non-free-firmware` as a component in `/etc/apt/sources.list` to the Debian (not PVE) bookworm, bookworm-updates and bookworm-security entries, next to `main` and `contrib` (see the sketch below this list).
1. Upgrade new nodes.
    - (unsure if still needed nowadays: disabling pve-enterprise and enabling pve-no-subscription repositories)
1. Add new node to DNS (secretspace/ns1) and Ansible.
1. Apply Ansible and reboot.
1. `$ systemctl status watchdog-mux` should say `Watchdog driver 'IPMI', version 1` and NOT `Software Watchdog`.
1. Join to cluster in UI → Datacenter.
    - The IP to use is the last one listed, the IPv6 address on vmbr0 <!-- TODO: might have changed -->
1. `$ passwd` on the new node.
1. `$ vim ~/.ssh/authorized_keys` → sort in the new key. **Keys are managed manually** since PVE manages the file as well.

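For the `non-free-firmware` step above, a minimal sketch of the resulting Debian entries, assuming the stock `deb.debian.org`/`security.debian.org` mirrors (adjust to whatever mirrors the node already uses):

```
deb http://deb.debian.org/debian bookworm main contrib non-free-firmware
deb http://deb.debian.org/debian bookworm-updates main contrib non-free-firmware
deb http://security.debian.org/debian-security bookworm-security main contrib non-free-firmware
```
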
TODO: prometheus node exporter
TODO: create-external-cluster-resources.py in pve90
TODO: PVE backup server. We want local snapshots and offsite.
TODO: reinstate restic for /etc and /root
TODO: d12 discard

## K-SPACE Hyper-Converged CEPH setup
> [!WARNING]
> K-SPACE Kubernetes uses PVE's CEPH cluster; k8s pools are not visible in the general PVE UI.

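Those pools can still be inspected from any node's shell with the stock Ceph CLI, for example:

```
ceph osd pool ls detail   # all pools, including the k8s ones
ceph df                   # per-pool usage
```
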
1. Configure a mesh network (see the sketch below)

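A rough sketch of the "routed (simple)" full-mesh variant from the Proxmox docs, assuming three nodes with direct links on hypothetical interfaces `ens19`/`ens20` and an example `10.15.15.0/24` cluster network; the actual k-space interface names and addressing are not documented here:

```
# /etc/network/interfaces fragment on node 1 (10.15.15.50); mirror the pattern on the other nodes
auto ens19
iface ens19 inet static
        address 10.15.15.50/24
        up   ip route add 10.15.15.51/32 dev ens19
        down ip route del 10.15.15.51/32

auto ens20
iface ens20 inet static
        address 10.15.15.50/24
        up   ip route add 10.15.15.52/32 dev ens20
        down ip route del 10.15.15.52/32
```

The Proxmox docs also describe broadcast-bond and RSTP loop variants of the same mesh.
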
proxmox/admin_scripts/broadcast_reboot.sh (executable file)
@@ -0,0 +1,7 @@
#!/bin/bash
# Announce (re)boots of this node to the cluster Telegram chat.
source /root/telegram.env

tgmsg 'booted; Check nomigrate Start'

sleep 300 # nomigrate kube minimum uptime before taking another node offline
tgmsg "$(uptime -p)"
proxmox/admin_scripts/confirm_norunning.sh (executable file)
@@ -0,0 +1,32 @@
#!/bin/bash
set -euo pipefail
# USAGE: confirm_norunning.sh [includingNomigrate (0|1)] [node (=hostname)]
# Exits non-zero if any (relevant) VMs are still running on the node.

if ! which jq > /dev/null; then
    echo "jq not found"
    exit 1
fi

includingNomigrate=0
if [[ "$#" -gt 0 ]]; then
    includingNomigrate="$1"
fi
host="$(hostname)"
if [[ "$#" -gt 1 ]]; then
    host="$2"
fi

function running_ids {
    if [[ "$includingNomigrate" == 1 ]]; then
        pvesh get "/nodes/${host}/qemu" --output-format json |\
            jq -r 'map(select( .status == "running" ) | .vmid) | sort | @csv'
    else
        # ".tags // \"\"": untagged VMs have no .tags field; treat them as migratable
        pvesh get "/nodes/${host}/qemu" --output-format json |\
            jq -r 'map(select( .status == "running" and ((.tags // "") | split(";") | all(.!="nomigrate")) ) | .vmid) | sort | @csv'
    fi
}

running_ids="$(running_ids)"
if [[ "$running_ids" != "" ]]; then
    echo "ERROR: VMs running on $host: $running_ids"
    exit 1
fi
proxmox/admin_scripts/list_users.sh (executable file)
@@ -0,0 +1,4 @@
#!/bin/bash
set -euo pipefail

# List users/groups that have been granted the K-SPACE_VM_USER role anywhere in the ACL.
pvesh get /access/acl -o json | jq -r '.[] | select(.roleid == "K-SPACE_VM_USER") | .ugid' | sort | uniq
proxmox/admin_scripts/shutdown_nomigrates.sh (executable file)
@@ -0,0 +1,25 @@
#!/bin/bash
#set -e

if ! which jq > /dev/null; then
    echo "jq not found"
    exit 1
fi

#echo "USAGE: $0 [node (=hostname)]"
host="$HOSTNAME"
if [[ "$#" -ge 1 ]]; then
    host="$1"
fi

function shutdown_ids {
    # ".tags // \"\"": untagged VMs have no .tags field; they can never be "nomigrate"
    pvesh get "/nodes/${host}/qemu" --output-format json |\
        jq -r 'map(select( .status == "running" and ((.tags // "") | split(";") | any(.=="nomigrate")) ) | .vmid)[]'
}

# Process substitution (instead of a pipe) keeps the loop in the current shell,
# so the "wait" below actually waits for the backgrounded shutdown calls.
while IFS= read -r vmid; do
    pvesh create "/nodes/${host}/qemu/${vmid}/status/shutdown" -timeout 1 &
    sleep 1
done < <(shutdown_ids)

wait
proxmox/admin_scripts/template.sh (executable file)
@@ -0,0 +1,54 @@
#!/bin/bash
set -e

# https://wiki.k-space.ee/en/hosting/proxmox
# image does not come from debian; the whole thing will probably be replaced with a self-service tool anyway

if [[ "$#" -ne 2 ]]; then
    echo "ERROR: expected exactly 2 arguments"
    echo "USAGE: $0 <storage> <vmid>"
    exit 1
fi
storage="$1"
vmid="$2"
img=debian-12
ident=d12.v2
size=100G

# will error if vmid exists
qm create "$vmid" \
    --cpu x86-64-v3 --numa 1 \
    --cores 16 --vcpus 8 \
    --memory 4096 \
    --scsihw virtio-scsi-pci \
    --ide0 none,media=cdrom --ide2 "$storage":cloudinit,format=raw \
    --boot order='ide0;scsi0' \
    --serial0 socket --vga serial0 \
    --net0 virtio,bridge=vmbr0 \
    --ipconfig0 ip='193.40.103.99/24',gw='193.40.103.1',ip6='2001:bb8:4008:20::99/64',gw6='2001:bb8:4008:20::1' \
    --searchdomain zoo.k-space.ee --nameserver '1.1.1.1 8.8.8.8' \
    --ostype l26 --hotplug disk,network,usb,memory,cpu --onboot 1 \
    --agent 1,fstrim_cloned_disks=1 \
    --name "$ident.$HOSTNAME" --description "https://wiki.k-space.ee/en/hosting/proxmox"$'\n\n'"Base template: $ident"$'\n\n'"User: UNDOCUMENTED"

# Whole script supposed to be replaced by self-service suite anyway.
# TODO: virt-builder version is crap, replacing it with drop-in:
wget 'https://cdimage.debian.org/images/cloud/bookworm/latest/debian-12-genericcloud-amd64.raw' -O "$img".img

function cleanup {
    rm "$img".img
}
trap cleanup EXIT

virt-customize -a "$img".img \
    --root-password disabled \
    --install qemu-guest-agent,sshguard \
    --run-command 'echo "unattended-upgrades unattended-upgrades/enable_auto_updates boolean true" | debconf-set-selections' \
    --run-command 'dpkg-reconfigure -f noninteractive unattended-upgrades' \
    --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online}="1"' \
    --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="memory", ACTION=="add", TEST=="state", ATTR{state}=="offline", ATTR{state}="online"'

qm set "$vmid" --scsi0 "$storage":0,import-from="$PWD/$img.img",discard=on,ssd=1
qm disk resize "$vmid" scsi0 "$size"

qm template "$vmid"
@@ -19,8 +19,94 @@
        dest: /etc/network/interfaces
      notify: reload networking

    - name: admin convenience packages
      tags: dep
      apt:
        state: latest
        pkg:
          - byobu
          - mosh
          - vim
          - ncdu
          - htop
          # - git

    - name: scripting dependencies
      tags: dep
      apt:
        state: latest
        pkg:
          - jq
          - yq
          - curl
          - guestfs-tools
          - restic

    # adding non-free-firmware component currently left manual, as it is hard to do reliably across upgrades + format will change with next major upg + not planning to add new nodes atm

    - name: CPU microcode (Intel)
      tags: dep
      when: "'GenuineIntel' in ansible_processor"
      apt:
        state: latest
        pkg: intel-microcode

    - name: CPU microcode (AMD)
      tags: dep
      when: "'AuthenticAMD' in ansible_processor"
      apt:
        state: latest
        pkg: amd64-microcode

    - name: enable hardware watchdog
      tags: dep
      ansible.builtin.lineinfile:
        path: /etc/default/pve-ha-manager
        regexp: 'WATCHDOG_MODULE=ipmi_watchdog$'
        line: 'WATCHDOG_MODULE=ipmi_watchdog'

    - name: dedup on rpool
      ansible.builtin.shell: zfs set dedup=on rpool

    # https://forum.proxmox.com/threads/problem-activating-memory-hotplug.66790/
    # https://lists.proxmox.com/pipermail/pve-devel/2016-December/024519.html can reproduce in 2020, 2022, 2025
    - name: increase max_mem_regions
      ansible.builtin.copy:
        content: 'options vhost max_mem_regions=512'
        dest: /etc/modprobe.d/vhost.conf

  handlers:
    - name: reload networking
      ansible.builtin.systemd_service:
        name: networking.service
        state: reloaded

- name: PVE admin tooling
  hosts: proxmox
  tasks:
    - name: README
      ansible.builtin.copy:
        content: |
          https://git.k-space.ee/k-space/ansible/src/branch/main/proxmox
          https://wiki.k-space.ee/en/hosting/proxmox
        dest: /root/README
    - name: admin_scripts directory
      ansible.builtin.copy:
        src: admin_scripts/
        dest: /root/admin_scripts/

    - name: load secrets
      ansible.builtin.include_vars:
        file: ../secrets/pve-telegram.yaml
    - name: install telegram.env
      ansible.builtin.template:
        src: templates/telegram.env.j2
        dest: /root/telegram.env
    - name: install broadcast_reboot.service
      ansible.builtin.copy:
        src: templates/broadcast_reboot.service
        dest: /etc/systemd/system/broadcast_reboot.service
    - name: enable broadcast_reboot.service
      ansible.builtin.systemd_service:
        name: broadcast_reboot.service
        enabled: true
        state: started
proxmox/templates/broadcast_reboot.service (normal file)
@@ -0,0 +1,10 @@
[Unit]
Description=Broadcasts boot
# After= alone does not pull in network-online.target; Wants= makes the ordering effective
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/root/admin_scripts/broadcast_reboot.sh

[Install]
WantedBy=multi-user.target
proxmox/templates/telegram.env.j2 (normal file)
@@ -0,0 +1,8 @@
#!/bin/bash
# Sourced by the admin scripts: provides tgmsg() for sending Telegram notifications.
tg_token='{{ tgtoken }}'
chatid='{{ tgchatid }}'

function tgmsg {
    clustername="$(pvesh get /cluster/status --output-format json | jq -r '.[] | select(.id == "cluster") | .name')"
    curl -X POST -H 'Content-Type: application/json' -d '{"chat_id": "'"${chatid}"'", "text": "'"$HOSTNAME@$clustername: $1"'"}' "https://api.telegram.org/bot${tg_token}/sendMessage"
}