From 17fc13592935fc7c8119e24980ea3a83f9773043 Mon Sep 17 00:00:00 2001 From: lassulus Date: Thu, 17 Nov 2022 11:58:59 +0100 Subject: l sync-containers3: don't kill container on lock loss --- lass/3modules/sync-containers3.nix | 124 ++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 22 deletions(-) (limited to 'lass/3modules') diff --git a/lass/3modules/sync-containers3.nix b/lass/3modules/sync-containers3.nix index 5b6fa37e1..104b92b24 100644 --- a/lass/3modules/sync-containers3.nix +++ b/lass/3modules/sync-containers3.nix @@ -99,7 +99,7 @@ in { set -efux consul lock sync_${ctr.name} ${pkgs.writers.writeDash "${ctr.name}-sync" '' set -efux - if ping -c 1 ${ctr.name}.r; then + if /run/wrappers/bin/ping -c 1 ${ctr.name}.r; then touch "$HOME"/incomplete rsync -a -e "ssh -i $CREDENTIALS_DIRECTORY/ssh_key" --inplace container_sync@${ctr.name}.r:disk "$HOME"/disk rm "$HOME"/incomplete @@ -108,6 +108,54 @@ in { ''; }; }; } + { "${ctr.name}_watcher" = { + path = with pkgs; [ + coreutils + consul + cryptsetup + curl + mount + util-linux + jq + ]; + serviceConfig = { + ExecStart = pkgs.writers.writeDash "${ctr.name}_watcher" '' + set -efux + while sleep 5; do + # get the payload + # check if the host reacted recently + case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in + 404) + echo 'got 404 from kv, should kill the container' + break + ;; + 500) + echo 'got 500 from kv, will kill container' + break + ;; + 200) + # echo 'got 200 from kv, will check payload' + export payload=$(consul kv get containers/${ctr.name}) + if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then + # echo 'we are the host, continuing' + continue + else + echo 'we are not host, killing container' + break + fi + ;; + *) + echo 'unknown state, continuing' + continue + ;; + esac + done + /run/current-system/sw/bin/nixos-container stop ${ctr.name} || : + umount /var/lib/sync-containers3/${ctr.name}/state || : + cryptsetup luksClose ${ctr.name} || : + ''; + }; + }; } { "${ctr.name}_scheduler" = { wantedBy = [ "multi-user.target" ]; path = with pkgs; [ @@ -116,36 +164,68 @@ in { cryptsetup mount util-linux + curl systemd + jq retry + bc ]; - serviceConfig = let - containerDirectory = lib.removeSuffix "/%i" config.systemd.services."container@${ctr.name}".environment.root; - in { + serviceConfig = { Restart = "always"; - RestartSec = "5s"; - ExecStart = "${pkgs.consul}/bin/consul lock -verbose -monitor-retry 100 container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" '' + RestartSec = "30s"; + ExecStart = pkgs.writers.writeDash "${ctr.name}_scheduler" '' set -efux + # get the payload + # check if the host reacted recently + case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in + 404) + # echo 'got 404 from kv, will create container' + ;; + 500) + # echo 'got 500 from kv, retrying again' + exit 0 + ;; + 200) + # echo 'got 200 from kv, will check payload' + export payload=$(consul kv get containers/${ctr.name}) + if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then + echo 'we are the host, starting container' + else + # echo 'we are not host, checking timestamp' + # if [ $(echo "$(date +%s) - $(jq -rn 'env.payload | fromjson.time') > 100" | bc) -eq 1 ]; then + if [ "$(jq -rn 'env.payload | fromjson.time | now - tonumber > 100')" = 'true' ]; then + echo 'last beacon is more than 100s ago, taking over' + else + # echo 'last beacon was recent. trying again' + exit 0 + fi + fi + ;; + *) + echo 'unknown state, bailing out' + exit 0 + ;; + esac if test -e /var/lib/sync-containers3/${ctr.name}/incomplete; then echo 'data is inconistent, start aborted' exit 1 fi - trap ${pkgs.writers.writeDash "stop-${ctr.name}" '' - set -efux - /run/current-system/sw/bin/nixos-container stop ${ctr.name} || : - umount /var/lib/sync-containers3/${ctr.name}/state || : - cryptsetup luksClose ${ctr.name} || : - ''} INT TERM EXIT - consul kv put containers/${ctr.name}/host ${config.networking.hostName} - cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} - mkdir -p /var/lib/sync-containers3/${ctr.name}/state - mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state - /run/current-system/sw/bin/nixos-container start ${ctr.name} - set +x - until /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null; do sleep 5; done - while retry -t 5 -d 60 -- /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r > /dev/null; do sleep 5; done - echo "lost tinc connection to container, shutting down" - ''}"; + consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null + consul lock -verbose -monitor-retry=100 -timeout 30s -name container_${ctr.name} container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" '' + set -efu + cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} || : + mkdir -p /var/lib/sync-containers3/${ctr.name}/state + mountpoint /var/lib/sync-containers3/${ctr.name}/state || mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state + /run/current-system/sw/bin/nixos-container start ${ctr.name} + # wait for system to become reachable for the first time + retry -t 10 -d 10 -- /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null + systemctl start ${ctr.name}_watcher.service + while systemctl is-active container@${ctr.name}.service >/devnull && /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r >/dev/null; do + consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null + sleep 10 + done + ''} + ''; }; }; } ]) (lib.attrValues cfg.containers))); -- cgit v1.2.3