debian-forge-composer/test/cases/openshift_virtualization.sh
Alexander Todorov 85ce42570d Refactor how the 'trap' command is used to avoid double calls
In many files there was a secondary call to `trap` for the sole purpose
of killing journalctl (watching worker logs) so that GitLab CI doesn't
hang.

The issue with this is that sometimes the code cleared the trap which
invokes the cleanup() function without reinstating it afterwards (and
not everywhere).

Instead of doing this back-and-forth, just make sure we don't leave any
journalctl processes dangling in the background!

NOTES:

- for some scripts, mainly the ostree- ones, there was no cleanup trap
  present; instead `trap` was configured inside the build_image() function.
  The trouble is that this function is executed multiple times and
  $WORKER_JOURNAL_PID changes value between these executions.
  That's why these scripts introduce the cleanup_on_exit() function (see
  the sketch below) where we make sure to kill any possibly dangling
  journalctl processes.
- The name `cleanup_on_exit()` was chosen because these same scripts
  often have a helper function named clean_up() which is sometimes used
  to remove virtual machines and other artifacts between calls of
  build_image().
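
A minimal sketch of the resulting pattern (names taken from the notes
above; exact bodies vary per script):

    # installed once, at script level
    cleanup_on_exit() {
        # reap any journalctl still following the worker logs,
        # no matter which build_image() run started it
        sudo pkill journalctl || echo "Nothing killed"
    }
    trap cleanup_on_exit EXIT

    build_image() {
        sudo journalctl -af -n 1 -u "$WORKER_UNIT" &
        WORKER_JOURNAL_PID=$!   # different value on every call
        # ... build and test steps ...
        sudo pkill -P "$WORKER_JOURNAL_PID" || true
    }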
2024-04-19 13:16:11 +03:00

306 lines
9.1 KiB
Bash
Executable file

#!/bin/bash
set -euo pipefail
source /usr/libexec/osbuild-composer-test/set-env-variables.sh
source /usr/libexec/tests/osbuild-composer/shared_lib.sh
BRANCH_NAME="${CI_COMMIT_BRANCH:-local}"
BUILD_ID="${CI_JOB_ID:-$(uuidgen)}"
# Provision the software under test.
/usr/libexec/osbuild-composer-test/provision.sh none
TEMPDIR=$(mktemp -d)
function cleanup() {
    greenprint "== Script execution stopped or finished - Cleaning up =="

    # kill dangling journalctl processes to prevent GitLab CI from hanging
    sudo pkill journalctl || echo "Nothing killed"

    # since this function can be called at any time, ensure that we don't expand unbound variables
    OC_CLI="${OC_CLI:-}"
    VM_NAME="${VM_NAME:-}"
    PVC_NAME="${PVC_NAME:-}"

    [[ "$OC_CLI" && "$VM_NAME" ]] && $OC_CLI delete vm "$VM_NAME"
    [[ "$OC_CLI" && "$PVC_NAME" ]] && $OC_CLI delete pvc "$PVC_NAME"

    sudo rm -rf "$TEMPDIR"
}
trap cleanup EXIT
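# NOTE: with `set -euo pipefail` this EXIT trap also fires when any command
# fails, so cleanup() must tolerate running at any point in the script
# (hence the ${VAR:-} guards above).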
ARCH=$(uname -m)
# OpenShift doesn't like upper-case, dots and underscores
TEST_ID=$(echo "$DISTRO_CODE-$ARCH-$BRANCH_NAME-$BUILD_ID" | tr "[:upper:]" "[:lower:]" | tr "_." "-")
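# illustrative example: a hypothetical "fedora-39-x86_64-main-1234"
# becomes "fedora-39-x86-64-main-1234"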
IMAGE_KEY=image-${TEST_ID}
ARTIFACTS="${ARTIFACTS:-/tmp/artifacts}"
# Set up temporary files.
BLUEPRINT_FILE=${TEMPDIR}/blueprint.toml
COMPOSE_START=${TEMPDIR}/compose-start-${IMAGE_KEY}.json
COMPOSE_INFO=${TEMPDIR}/compose-info-${IMAGE_KEY}.json
SSH_KEY="${TEMPDIR}/id_ssh"
ssh-keygen -t rsa-sha2-512 -f "$SSH_KEY" -N ""
SSH_PUB_KEY=$(cat "$SSH_KEY.pub")
# Get the compose log.
get_compose_log () {
    COMPOSE_ID=$1
    LOG_FILE=${ARTIFACTS}/osbuild-${ID}-${VERSION_ID}-openshift-virtualization.log

    # Download the logs.
    sudo composer-cli compose log "$COMPOSE_ID" | tee "$LOG_FILE" > /dev/null
}
# Get the compose metadata.
get_compose_metadata () {
    COMPOSE_ID=$1
    METADATA_FILE=${ARTIFACTS}/osbuild-${ID}-${VERSION_ID}-openshift-virtualization.json

    # Download the metadata.
    sudo composer-cli compose metadata "$COMPOSE_ID" > /dev/null

    # Find the tarball and extract it.
    TARBALL=$(basename "$(find . -maxdepth 1 -type f -name "*-metadata.tar")")
    sudo tar -xf "$TARBALL"
    sudo rm -f "$TARBALL"

    # Move the JSON file into place.
    sudo cat "${COMPOSE_ID}".json | jq -M '.' | tee "$METADATA_FILE" > /dev/null
}
# Write a basic blueprint for our image.
tee "$BLUEPRINT_FILE" > /dev/null << EOF
name = "bash"
description = "A base system with bash"
version = "0.0.1"
[[packages]]
name = "bash"
[[packages]]
name = "cloud-init"
[customizations.services]
enabled = ["sshd", "cloud-init", "cloud-init-local", "cloud-config", "cloud-final"]
[[customizations.user]]
name = "admin"
description = "admin"
key = "$SSH_PUB_KEY"
home = "/home/admin/"
shell = "/usr/bin/bash"
groups = ["users", "wheel"]
EOF
# Prepare the blueprint for the compose.
greenprint "📋 Preparing blueprint"
sudo composer-cli blueprints push "$BLUEPRINT_FILE"
sudo composer-cli blueprints depsolve bash
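# The pushed blueprint can be inspected manually if needed, e.g.:
#   sudo composer-cli blueprints show bash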
# Get worker unit file so we can watch the journal.
WORKER_UNIT=$(sudo systemctl list-units | grep -o -E "osbuild.*worker.*\.service")
sudo journalctl -af -n 1 -u "${WORKER_UNIT}" &
WORKER_JOURNAL_PID=$!
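# Note: $! is the PID of the `sudo` wrapper rather than of journalctl
# itself; cleanup() runs a blanket `pkill journalctl` so the follower
# can't be left dangling even if we exit before stopping it below.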
# Start the compose and upload to OpenShift
greenprint "🚀 Starting compose"
sudo composer-cli --json compose start bash qcow2 | tee "$COMPOSE_START"
COMPOSE_ID=$(get_build_info ".build_id" "$COMPOSE_START")
# Wait for the compose to finish.
greenprint "⏱ Waiting for compose to finish: ${COMPOSE_ID}"
while true; do
    sudo composer-cli --json compose info "${COMPOSE_ID}" | tee "$COMPOSE_INFO" > /dev/null
    COMPOSE_STATUS=$(get_build_info ".queue_status" "$COMPOSE_INFO")

    # Is the compose finished?
    if [[ $COMPOSE_STATUS != RUNNING ]] && [[ $COMPOSE_STATUS != WAITING ]]; then
        break
    fi

    # Wait 30 seconds and try again.
    sleep 30
done
# Capture the compose logs from osbuild.
greenprint "💬 Getting compose log and metadata"
get_compose_log "$COMPOSE_ID"
get_compose_metadata "$COMPOSE_ID"
# Kill the journal monitor: -P matches children of the sudo wrapper,
# i.e. the journalctl process itself
sudo pkill -P "${WORKER_JOURNAL_PID}"
# Did the compose finish with success?
if [[ $COMPOSE_STATUS != FINISHED ]]; then
    redprint "Something went wrong with the compose. 😢"
    exit 1
fi
greenprint "📥 Downloading the image"
sudo composer-cli compose image "${COMPOSE_ID}" > /dev/null
IMAGE_FILENAME="${COMPOSE_ID}-disk.qcow2"
# allow anyone to read b/c this file is owned by root
sudo chmod a+r "${IMAGE_FILENAME}"
# Delete the compose so we don't run out of disk space
sudo composer-cli compose delete "${COMPOSE_ID}" > /dev/null
# install the OpenShift CLI & virtctl binaries
sudo dnf -y install wget
# https://docs.openshift.com/container-platform/4.13/cli_reference/openshift_cli/getting-started-cli.html
wget --no-check-certificate https://downloads-openshift-console.apps.ocp-virt.prod.psi.redhat.com/amd64/linux/oc.tar --directory-prefix "$TEMPDIR"
# https://docs.openshift.com/container-platform/4.13/virt/virt-using-the-cli-tools.html
wget --no-check-certificate https://hyperconverged-cluster-cli-download-openshift-cnv.apps.ocp-virt.prod.psi.redhat.com/amd64/linux/virtctl.tar.gz --directory-prefix "$TEMPDIR"
pushd "$TEMPDIR"
tar -xvf oc.tar
tar -xzvf virtctl.tar.gz
popd
OC_CLI="$TEMPDIR/oc"
VIRTCTL="$TEMPDIR/virtctl"
chmod a+x "$OC_CLI"
chmod a+x "$VIRTCTL"
# Authenticate via the gitlab-ci service account
# oc describe secret gitab-ci-token-g7sw2
$OC_CLI login --token="$OPENSHIFT_TOKEN" --server=https://api.ocp-virt.prod.psi.redhat.com:6443 --insecure-skip-tls-verify=true
$OC_CLI whoami
OPENSHIFT_PROJECT="image-builder"
$OC_CLI project $OPENSHIFT_PROJECT
# import the image into a data volume; total quota on the namespace seems to be 40GiB
# Note: ocs-external-storagecluster-ceph-rbd is the default StorageClass, see
# https://console-openshift-console.apps.ocp-virt.prod.psi.redhat.com/k8s/cluster/storage.k8s.io~v1~StorageClass
PVC_NAME="image-builder-data-volume-$TEST_ID"
$VIRTCTL image-upload --insecure dv "$PVC_NAME" --size=10Gi --storage-class=ocs-external-storagecluster-ceph-rbd --image-path="${IMAGE_FILENAME}"
# Note: --size=10Gi corresponds to the size of the filesystem inside the image, not the actual size of the qcow2 file
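# Optionally confirm the import completed before continuing, e.g.:
#   $OC_CLI get dv "$PVC_NAME" -o jsonpath='{.status.phase}'  # expect "Succeeded"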
PVC_VOLUME_ID=$($OC_CLI get pvc "$PVC_NAME" -o json | jq -r ".spec.volumeName")
VM_NAME="image-builder-vm-$TEST_ID"
VM_YAML_FILE=${TEMPDIR}/vm.yaml
tee "$VM_YAML_FILE" > /dev/null << EOF
apiVersion: kubevirt.io/v1alpha3
kind: VirtualMachine
metadata:
  name: $VM_NAME
  namespace: $OPENSHIFT_PROJECT
  labels:
    app: $VM_NAME
spec:
  dataVolumeTemplates:
    - apiVersion: cdi.kubevirt.io/v1alpha1
      kind: DataVolume
      metadata:
        name: $PVC_NAME
      spec:
        pvc:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 15G
          storageClassName: ocs-external-storagecluster-ceph-rbd
          volumeName: $PVC_VOLUME_ID
          volumeMode: Filesystem
        source:
          pvc:
            name: $PVC_NAME
            namespace: $OPENSHIFT_PROJECT
  running: true
  template:
    spec:
      domain:
        cpu:
          cores: 1
          sockets: 1
          threads: 1
        devices:
          disks:
            - bootOrder: 1
              disk:
                bus: virtio
              name: disk-0
            - disk:
                bus: virtio
              name: cloudinitdisk
          interfaces:
            - bootOrder: 2
              masquerade: {}
              model: virtio
              name: nic0
          networkInterfaceMultiqueue: true
          rng: {}
        machine:
          type: pc-q35-rhel8.2.0
        resources:
          requests:
            memory: 2Gi
      evictionStrategy: LiveMigrate
      hostname: $VM_NAME
      networks:
        - name: nic0
          pod: {}
      terminationGracePeriodSeconds: 0
      volumes:
        - dataVolume:
            name: $PVC_NAME
          name: disk-0
        - cloudInitNoCloud:
            userData: |
              #cloud-config
              ssh_pwauth: True
              chpasswd:
                list: |
                  root:password
                expire: False
              hostname: $VM_NAME
          name: cloudinitdisk
EOF
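# The manifest can be sanity-checked without creating anything, e.g.:
#   $OC_CLI create -f "$VM_YAML_FILE" --dry-run=client > /dev/null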
# deploy VM using cloned PVC
# https://www.redhat.com/en/blog/getting-started-with-openshift-virtualization
$OC_CLI create -f "$VM_YAML_FILE"
# dump VM info
$OC_CLI describe vm "$VM_NAME"
# Wait for VM to become running
greenprint "⏱ Waiting for $VM_NAME to become running"
while true; do
    VM_STATUS=$($OC_CLI get vm "$VM_NAME" -o json | jq -r ".status.printableStatus")
    if [[ $VM_STATUS == Running ]]; then
        break
    fi

    # Wait 30 seconds and try again.
    sleep 30
done
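# Roughly equivalent one-liner, assuming the VM reports a Ready condition:
#   $OC_CLI wait vm/"$VM_NAME" --for=condition=Ready --timeout=15m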
# ssh into VM and check if it is running
greenprint "🛃 Checking that $VM_NAME is running"
set +e
for LOOP_COUNTER in {0..10}; do
    STATUS=$($VIRTCTL --namespace "$OPENSHIFT_PROJECT" -i "$SSH_KEY" --local-ssh-opts="-o StrictHostKeyChecking=no" ssh admin@"$VM_NAME" --command 'systemctl --wait is-system-running')
    if [[ $STATUS == running || $STATUS == degraded ]]; then
        greenprint "💚 Success"
        exit 0
    fi

    greenprint "... retrying $LOOP_COUNTER"
    sleep 10
done
set -e
redprint "❌ Failed after $LOOP_COUNTER retries"
exit 1