pve admin tooling
This commit is contained in:
@@ -1,6 +1,32 @@
|
|||||||
# Proxmox Virtual Environment
|
# Proxmox Virtual Environment
|
||||||
|
User-facing docs: https://wiki.k-space.ee/en/hosting/proxmox
|
||||||
|
|
||||||
## K-Space Hyper-Converged CEPH setup
|
## Adding new node
|
||||||
|
1. Upgrade existing nodes.
|
||||||
|
1. Install new nodes:
|
||||||
|
- Hostname `pveXX.proxmox.infra.k-space.ee`
|
||||||
|
- Boot disk ZRAID-1
|
||||||
|
- 172.21 or DHCP may be used as initial IP. Installer configuration will be overwritten by cluster join and ansible.
|
||||||
|
1. Add `non-free-firmware` as component to `/etc/apt/sources.list` to debian (not PVE) bookworm, bookworm-updates, bookworm-security (next to `main` and `contrib`)
|
||||||
|
1. Upgrade new nodes
|
||||||
|
- (unsure if needed nowadays: disabling pve-enterprise, and enabling pve-no-subscription)
|
||||||
|
1. Add new node to DNS (secretspace/ns1) and Ansible.
|
||||||
|
1. Apply Ansible and reboot.
|
||||||
|
1. `$ systemctl status watchdog-mux` should say `Watchdog driver 'IPMI', version 1` and NOT `Software Watchdog`
|
||||||
|
1. Join to cluster in UI → Datacenter.
|
||||||
|
- IP to use is the last, ipv6 with vmbr0 <!-- TODO: might have changed -->
|
||||||
|
1. `$ passwd` on new node
|
||||||
|
1. `$ vim ~/.ssh/authorized_keys` → sort the new key. **Keys are managed manually** since PVE manages the file as well.
|
||||||
|
|
||||||
|
TODO: prometheus node exporter
|
||||||
|
TODO: create-external-cluster-resources.py in pve90
|
||||||
|
TODO: PVE backup server. We want local snapshots and offsite.
|
||||||
|
TODO: reinstate restic for /etc and /root
|
||||||
|
TODO: d12 discard
|
||||||
|
|
||||||
|
## K-SPACE Hyper-Converged CEPH setup
|
||||||
|
> [!WARNING]
|
||||||
|
> K-SPACE kubernetes uses PVE's CEPH cluster, k8s pools are not visible in general PVE UI.
|
||||||
|
|
||||||
1. Configure a mesh network
|
1. Configure a mesh network
|
||||||
|
|
||||||
|
7
proxmox/admin_scripts/broadcast_reboot.sh
Executable file
7
proxmox/admin_scripts/broadcast_reboot.sh
Executable file
@@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Announce node boot to Telegram (started by broadcast_reboot.service).
# Requires /root/telegram.env (rendered from templates/telegram.env.j2) to
# define the tgmsg function.
set -euo pipefail

source /root/telegram.env

tgmsg 'booted; Check nomigrate Start'

sleep 300 # nomigrate kube minimum uptime before taking another node offline
tgmsg "$(uptime -p)"
|
32
proxmox/admin_scripts/confirm_norunning.sh
Executable file
32
proxmox/admin_scripts/confirm_norunning.sh
Executable file
@@ -0,0 +1,32 @@
|
|||||||
|
#!/bin/bash
# Confirm that no (migratable) VMs are running on a PVE node.
# USAGE: confirm_norunning.sh [includingNomigrate (=0)] [node (=hostname)]
#   includingNomigrate=1 also counts VMs tagged "nomigrate".
# Exits 1 with an error message when running VMs are found, 0 otherwise.
set -euo pipefail

if ! command -v jq > /dev/null; then
  echo "jq not found" >&2
  exit 1
fi

includingNomigrate="${1:-0}"
host="${2:-$(hostname)}"

# Print a CSV of running VM ids on $host.
# Unless includingNomigrate=1, VMs tagged "nomigrate" are excluded.
# NB: .tags is absent (null) on untagged VMs; default it to "" so that
# split(";") does not abort the jq filter.
function running_ids {
  if [[ "$includingNomigrate" == 1 ]]; then
    pvesh get "/nodes/${host}/qemu" --output-format json |
      jq -r 'map(select(.status == "running") | .vmid) | sort | @csv'
  else
    pvesh get "/nodes/${host}/qemu" --output-format json |
      jq -r 'map(select(.status == "running" and ((.tags // "") | split(";") | all(. != "nomigrate"))) | .vmid) | sort | @csv'
  fi
}

ids="$(running_ids)"
if [[ -n "$ids" ]]; then
  echo "ERROR: VMs running on $host: $ids" >&2
  exit 1
fi
|
4
proxmox/admin_scripts/list_users.sh
Executable file
4
proxmox/admin_scripts/list_users.sh
Executable file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/bash
# List the unique users/groups granted the K-SPACE_VM_USER role in the PVE ACL.
set -euo pipefail

pvesh get /access/acl -o json | jq -r '.[] | select(.roleid == "K-SPACE_VM_USER") | .ugid' | sort -u
|
25
proxmox/admin_scripts/shutdown_nomigrates.sh
Executable file
25
proxmox/admin_scripts/shutdown_nomigrates.sh
Executable file
@@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
# Shut down every running VM tagged "nomigrate" on a PVE node.
# USAGE: shutdown_nomigrates.sh [node (=hostname)]
# NOTE(review): set -e is left disabled on purpose — the backgrounded pvesh
# shutdown calls run with -timeout 1 and are expected to return non-zero.
#set -e

if ! command -v jq > /dev/null; then
  echo "jq not found" >&2
  exit 1
fi

host="${1:-$HOSTNAME}"

# Print the vmids (one per line) of running VMs tagged "nomigrate" on $host.
# .tags is absent (null) on untagged VMs; default it to "" so split(";")
# does not abort the jq filter.
function shutdown_ids {
  pvesh get "/nodes/${host}/qemu" --output-format json |
    jq -r 'map(select(.status == "running" and ((.tags // "") | split(";") | any(. == "nomigrate"))) | .vmid)[]'
}

# Fire the shutdowns off in parallel; -timeout 1 makes each pvesh call
# return almost immediately instead of waiting for the guest.
shutdown_ids | while IFS= read -r vmid; do
  pvesh create "/nodes/${host}/qemu/${vmid}/status/shutdown" -timeout 1 &
  sleep 1
done

wait
|
54
proxmox/admin_scripts/template.sh
Executable file
54
proxmox/admin_scripts/template.sh
Executable file
@@ -0,0 +1,54 @@
|
|||||||
|
#!/bin/bash
# Build a Debian 12 cloud-init VM template on a PVE node.
# USAGE: template.sh <storage> <vmid>
# https://wiki.k-space.ee/en/hosting/proxmox
# image does not come from debian, but whole thing probably replaced with self-service thing anyway
set -euo pipefail

if [[ "$#" -ne 2 ]]; then
  echo "ERROR: expected exactly 2 arguments" >&2
  echo "USAGE: $0 <storage> <vmid>" >&2
  exit 1
fi
storage="$1"
vmid="$2"
img=debian-12
ident=d12.v2
size=100G

# Register cleanup BEFORE the download so a failed/partial wget is removed
# on any exit path; -f so the trap is harmless if nothing was downloaded yet.
function cleanup {
  rm -f -- "$img".img
}
trap cleanup EXIT

# will error if vmid exists
qm create "$vmid" \
  --cpu x86-64-v3 --numa 1 \
  --cores 16 --vcpus 8 \
  --memory 4096 \
  --scsihw virtio-scsi-pci \
  --ide0 none,media=cdrom --ide2 "$storage":cloudinit,format=raw \
  --boot order='ide0;scsi0' \
  --serial0 socket --vga serial0 \
  --net0 virtio,bridge=vmbr0 \
  --ipconfig0 ip='193.40.103.99/24',gw='193.40.103.1',ip6='2001:bb8:4008:20::99/64',gw6='2001:bb8:4008:20::1' \
  --searchdomain zoo.k-space.ee --nameserver '1.1.1.1 8.8.8.8' \
  --ostype l26 --hotplug disk,network,usb,memory,cpu --onboot 1 \
  --agent 1,fstrim_cloned_disks=1 \
  --name "$ident.$HOSTNAME" --description "https://wiki.k-space.ee/en/hosting/proxmox"$'\n\n'"Base template: $ident"$'\n\n'"User: UNDOCUMENTED"

# Whole script supposed to be replaced by self-service suite anyway.
#TODO: virt-builder version is crap, replacing it with drop-in:
wget 'https://cdimage.debian.org/images/cloud/bookworm/latest/debian-12-genericcloud-amd64.raw' -O "$img".img

# NOTE(review): the first --run-command pipes the *output* of
# unattended-upgrades into debconf-set-selections; presumably an `echo` of
# that selection string was intended — confirm before changing.
virt-customize -a "$img".img \
  --root-password disabled \
  --install qemu-guest-agent,sshguard \
  --run-command 'unattended-upgrades unattended-upgrades/enable_auto_updates boolean true | debconf-set-selections' \
  --run-command 'dpkg-reconfigure -f noninteractive unattended-upgrades' \
  --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="cpu", ACTION=="add", TEST=="online", ATTR{online}=="0", ATTR{online}="1"' \
  --append-line '/lib/udev/rules.d/80-hotplug-cpu-mem.rules:SUBSYSTEM=="memory", ACTION=="add", TEST=="state", ATTR{state}=="offline", ATTR{state}="online"'

qm set "$vmid" --scsi0 "$storage":0,import-from="$PWD/$img.img",discard=on,ssd=1
qm disk resize "$vmid" scsi0 "$size"

qm template "$vmid"
|
@@ -19,8 +19,94 @@
|
|||||||
dest: /etc/network/interfaces
|
dest: /etc/network/interfaces
|
||||||
notify: reload networking
|
notify: reload networking
|
||||||
|
|
||||||
|
- name: admin convenience packages
  tags: dep
  apt:
    state: latest
    pkg:
      - byobu
      - mosh
      - vim
      - ncdu
      - htop
      # - git

- name: scripting dependencies
  tags: dep
  apt:
    state: latest
    pkg:
      - jq
      - yq
      - curl
      - guestfs-tools
      - restic

# adding non-free-firmware component currently left manual, as it is hard to do reliably across upgrades + format will change with next major upg + not planning to add new nodes atm

- name: CPU microcode (Intel)
  tags: dep
  when: "'GenuineIntel' in ansible_processor"
  apt:
    state: latest
    pkg: intel-microcode

- name: CPU microcode (AMD)
  tags: dep
  when: "'AuthenticAMD' in ansible_processor"
  apt:
    state: latest
    pkg: amd64-microcode

- name: enable hardware watchdog
  tags: dep
  ansible.builtin.lineinfile:
    path: /etc/default/pve-ha-manager
    # Also match a pre-existing (possibly commented) WATCHDOG_MODULE line so
    # it gets replaced instead of a second, conflicting line being appended.
    regexp: '^#?WATCHDOG_MODULE='
    line: 'WATCHDOG_MODULE=ipmi_watchdog'

- name: dedup on rpool
  # zfs set is effectively idempotent; suppress the perpetual "changed".
  ansible.builtin.shell: zfs set dedup=on rpool
  changed_when: false

# https://forum.proxmox.com/threads/problem-activating-memory-hotplug.66790/ https://lists.proxmox.com/pipermail/pve-devel/2016-December/024519.html can reproduce in 2020, 2022, 2025
- name: increase max_mem_regions
  ansible.builtin.copy:
    content: 'options vhost max_mem_regions=512'
    dest: /etc/modprobe.d/vhost.conf
|
||||||
|
|
||||||
handlers:
|
handlers:
|
||||||
- name: reload networking
|
- name: reload networking
|
||||||
ansible.builtin.systemd_service:
|
ansible.builtin.systemd_service:
|
||||||
name: networking.service
|
name: networking.service
|
||||||
state: reloaded
|
state: reloaded
|
||||||
|
|
||||||
|
- name: PVE admin tooling
  hosts: proxmox
  tasks:
    - name: README
      ansible.builtin.copy:
        content: |
          https://git.k-space.ee/k-space/ansible/src/branch/main/proxmox
          https://wiki.k-space.ee/en/hosting/proxmox
        dest: /root/README

    - name: admin_scripts directory
      ansible.builtin.copy:
        src: admin_scripts/
        dest: /root/admin_scripts/
        # Keep the repo's executable bit; without a mode the copied scripts
        # get umask-default permissions and are not executable.
        mode: preserve

    - name: load secrets
      ansible.builtin.include_vars:
        file: ../secrets/pve-telegram.yaml

    - name: install telegram.env
      ansible.builtin.template:
        src: templates/telegram.env.j2
        dest: /root/telegram.env
        # Contains the Telegram bot token — keep it root-only.
        mode: "0600"

    - name: install broadcast_reboot.service
      ansible.builtin.copy:
        src: templates/broadcast_reboot.service
        dest: /etc/systemd/system/broadcast_reboot.service

    - name: enable broadcast_reboot.service
      ansible.builtin.systemd_service:
        # Reload unit files so the freshly copied unit is known to systemd.
        daemon_reload: true
        name: broadcast_reboot.service
        enabled: true
        state: started
|
||||||
|
10
proxmox/templates/broadcast_reboot.service
Normal file
10
proxmox/templates/broadcast_reboot.service
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
[Unit]
Description=Broadcasts boot
# After= alone only orders the units; Wants= is needed to actually pull
# network-online.target into the boot transaction.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
ExecStart=/root/admin_scripts/broadcast_reboot.sh

[Install]
WantedBy=multi-user.target
|
8
proxmox/templates/telegram.env.j2
Normal file
8
proxmox/templates/telegram.env.j2
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/bash
# Sourced (not executed) by the admin scripts; provides the tgmsg function.
# Rendered by Ansible from secrets/pve-telegram.yaml.
tg_token='{{ tgtoken }}'
chatid='{{ tgchatid }}'

# tgmsg <text> — send "<host>@<cluster>: <text>" to the configured chat.
function tgmsg {
  local clustername payload
  clustername="$(pvesh get /cluster/status --output-format json | jq -r '.[] | select(.id == "cluster") | .name')"
  # Build the JSON with jq so quotes/newlines in the message cannot break
  # (or inject into) the payload.
  payload="$(jq -cn --arg chat_id "$chatid" --arg text "$HOSTNAME@$clustername: $1" '{chat_id: $chat_id, text: $text}')"
  curl -X POST -H 'Content-Type: application/json' -d "$payload" "https://api.telegram.org/bot${tg_token}/sendMessage"
}
|
@@ -56,6 +56,7 @@
|
|||||||
- misc
|
- misc
|
||||||
- kubernetes
|
- kubernetes
|
||||||
- doors
|
- doors
|
||||||
|
# do NOT put proxmox here! PVE manages its keys and admin keys are manual
|
||||||
tasks:
|
tasks:
|
||||||
- name: Generate /root/.ssh/authorized_keys
|
- name: Generate /root/.ssh/authorized_keys
|
||||||
ansible.builtin.copy:
|
ansible.builtin.copy:
|
||||||
|
Reference in New Issue
Block a user