sonic-buildimage/files/initramfs-tools/arista-convertfs.j2

255 lines
7.1 KiB
Plaintext
Raw Permalink Normal View History

#!/bin/sh
case $1 in
prereqs)
exit 0
;;
esac
set -e
# set -x
total_mem=$(free | awk '/^Mem:/{print $2}')
tmpfs_size=$(( $total_mem / 20 * 17 ))
free_mem_thres=$(( $total_mem / 20 * 18 ))
tmp_mnt='/mnt/ramdisk-convfs'
root_mnt='/mnt/root-convfs'
root_dev=''
flash_dev=''
block_flash=''
aboot_flag=''
backup_file=''
prev_os=''
[kdump] Fix OOM events in crashkernel (#6447) A few issues where discovered with crashkernel on Arista platforms. 1) platforms using `docker_inram=on` would end up OOM in kdump environment. This happens because the same initramfs is used by SONiC and the crashkernel. With `docker_inram=on` the `dockerfs.tar.gz` is extracted in a `tmpfs` created for the occasion. Since `dockerfs.tar.gz` weights more than 1.5G, it doesn't fit into the kdump environment and ends up OOM. This OOM event can in turn trigger a panic. 2) Arista platforms with `secureboot` enabled would fail to load the crashkernel because the kernel parameter would be discarded on boot. This happens because the `boot0` in secureboot mode is strict about kernel parameter injection. 3) The secureboot path allowlist would remove kernel crash reports. 4) The kdump service would fail on Arista products since `/boot/` is empty in `secureboot` **- How I did it** 1) To prevent an OOM event in the crashkernel the fix is to avoid the codepaths in `union-mount` that create tmpfs and populate them. Some more codepath specific to Arista devices are also skipped to make the kdump process faster. This relies on detecting that the initramfs is starting in a kdump environment and skipping some initialization. The `/usr/sbin/kdump-config` tool appends a few kernel cmdline arguments when loading the crashkernel. The most unique one is `systemd.unit=kdump-tools.service` which is used in a few initramfs hooks to set `in_kdump`. 2) To allow `kdump` to work in `secureboot` environment the cmdline generation in boot0 was slightly modified. The codepath to load kernel parameters changed by SONiC is now running for booting in secure mode. It was altered to prevent an append only behavior which would grow the `kernel-cmdline` at every reboot. This ever growing behavior would lead `kexec` to fail to load the kernel due to a too long cmdline. 3) To get the kernel crash under /var/crash this path has to be added to `allowlist_paths` 4) The `/host/image-XXX/boot` folder is now populated in `secureboot` mode but not used. **- How to verify it** Regular boot: - enable kdump - enable docker_inram=on via kernel-params - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness OOM events on the console - after: crash kernel works and crash available under /var/crash Secure boot: - enable kdump - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness no kdump - after: crash kernel works and crash available under /var/crash Co-authored-by: Boyang Yu <byu@arista.com>
2021-02-02 03:55:09 -06:00
sonic_fast_reboot=false
in_kdump=false
# Wait until get the fullpath of flash device, e.g., /dev/sda
wait_get_flash_dev() {
local try_rounds=30
while [ $try_rounds -gt 0 ]; do
for dev in $(ls /sys/block); do
local is_mmc=$(echo "$dev" | grep 'mmcblk.*boot.*' | cat)
if [ -n "$is_mmc" ]; then
continue
fi
local devid="$(realpath "/sys/block/$dev/device")/"
local is_device=$(echo "$devid" | grep '^/sys/devices/' | cat)
local is_flash=$(echo "$devid" | grep "$block_flash" | cat)
if [ -n "$is_device" -a -n "$is_flash" ]; then
flash_dev="/dev/$dev"
return 0
fi
done
sleep 1
try_rounds=$(( $try_rounds - 1 ))
done
return 1
}
# Wait for root_dev to be ready
wait_for_root_dev() {
local try_rounds=30
while [ $try_rounds -gt 0 ]; do
if blkid | sed 's/"//g' | grep -q "$root_dev"; then
return 0
fi
if [ -e "$root_dev" ]; then
return 0
fi
sleep 1
try_rounds=$(( $try_rounds - 1 ))
done
return 1
}
# Alway run cleanup before exit
cleanup() {
if grep -q "$root_mnt" /proc/mounts; then
umount "$root_mnt"
fi
if grep -q "$tmp_mnt" /proc/mounts; then
umount "$tmp_mnt"
fi
[ -e "$root_mnt" ] && rmdir "$root_mnt"
[ -e "$tmp_mnt" ] && rmdir "$tmp_mnt"
}
trap cleanup EXIT
notification() {
cat << EOF
A failure happend in modifying the root file system which stopped the upgrade. Manual interventions are needed to fix the issue. Note that:
1) files in the old root file system may have been lost and the old partition table may have been corrupted;
2) The files in the old root file system were copied to $tmp_mnt;
3) The old partition table was dumped to the file $tmp_mnt/$backup_file by sfdisk;
4) Quitting the current shell will lose all files mentioned above permanently.
EOF
}
run_cmd() {
if ! eval "$1"; then
echo "$2"
notification
sh
exit 1
fi
}
fixup_flash_permissions() {
# properly set flash permissions for others
# this allows the sonic admin user to have read access on the flash
local flash_mnt="$1"
chmod o+rx "$flash_mnt"
# remove all the filesystem acls from the flash
setfacl -Rb "$flash_mnt"
}
# Update ROOT device referring to the path under block_flash
# This is for the occasional name swap between /dev/sda and /dev/sdb
update_root() {
# Check that root=/dev/*, ignoring any cases like root=UUID=*
[ "${ROOT#/dev}" = "${ROOT}" ] && return 0
# Replace the beginning chars of ROOT by the ones of flash_dev with same index
{% raw %}
prefix_length="${#flash_dev}"
{% endraw %}
part_id="${ROOT:$prefix_length}"
ROOT="$flash_dev$part_id"
echo "ROOT=$ROOT" > /conf/param.conf
}
# Extract kernel parameters
set -- $(cat /proc/cmdline)
for x in "$@"; do
case "$x" in
block_flash=*)
block_flash="${x#block_flash=}"
;;
Aboot=*)
aboot_flag="${x#Aboot=}"
;;
loop=*)
2017-04-21 19:23:36 -05:00
x1="${x#loop=}"
image_dir="${x1%/*}"
;;
docker_inram=*)
docker_inram="${x#docker_inram=}"
;;
prev_os=*)
prev_os="${x#prev_os=}"
;;
SONIC_BOOT_TYPE=warm*|SONIC_BOOT_TYPE=fast*)
sonic_fast_reboot=true
;;
[kdump] Fix OOM events in crashkernel (#6447) A few issues where discovered with crashkernel on Arista platforms. 1) platforms using `docker_inram=on` would end up OOM in kdump environment. This happens because the same initramfs is used by SONiC and the crashkernel. With `docker_inram=on` the `dockerfs.tar.gz` is extracted in a `tmpfs` created for the occasion. Since `dockerfs.tar.gz` weights more than 1.5G, it doesn't fit into the kdump environment and ends up OOM. This OOM event can in turn trigger a panic. 2) Arista platforms with `secureboot` enabled would fail to load the crashkernel because the kernel parameter would be discarded on boot. This happens because the `boot0` in secureboot mode is strict about kernel parameter injection. 3) The secureboot path allowlist would remove kernel crash reports. 4) The kdump service would fail on Arista products since `/boot/` is empty in `secureboot` **- How I did it** 1) To prevent an OOM event in the crashkernel the fix is to avoid the codepaths in `union-mount` that create tmpfs and populate them. Some more codepath specific to Arista devices are also skipped to make the kdump process faster. This relies on detecting that the initramfs is starting in a kdump environment and skipping some initialization. The `/usr/sbin/kdump-config` tool appends a few kernel cmdline arguments when loading the crashkernel. The most unique one is `systemd.unit=kdump-tools.service` which is used in a few initramfs hooks to set `in_kdump`. 2) To allow `kdump` to work in `secureboot` environment the cmdline generation in boot0 was slightly modified. The codepath to load kernel parameters changed by SONiC is now running for booting in secure mode. It was altered to prevent an append only behavior which would grow the `kernel-cmdline` at every reboot. This ever growing behavior would lead `kexec` to fail to load the kernel due to a too long cmdline. 3) To get the kernel crash under /var/crash this path has to be added to `allowlist_paths` 4) The `/host/image-XXX/boot` folder is now populated in `secureboot` mode but not used. **- How to verify it** Regular boot: - enable kdump - enable docker_inram=on via kernel-params - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness OOM events on the console - after: crash kernel works and crash available under /var/crash Secure boot: - enable kdump - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness no kdump - after: crash kernel works and crash available under /var/crash Co-authored-by: Boyang Yu <byu@arista.com>
2021-02-02 03:55:09 -06:00
systemd.unit=kdump-tools.service)
in_kdump=true
;;
esac
done
# Check aboot
[ -z "$aboot_flag" ] && exit 0
[kdump] Fix OOM events in crashkernel (#6447) A few issues where discovered with crashkernel on Arista platforms. 1) platforms using `docker_inram=on` would end up OOM in kdump environment. This happens because the same initramfs is used by SONiC and the crashkernel. With `docker_inram=on` the `dockerfs.tar.gz` is extracted in a `tmpfs` created for the occasion. Since `dockerfs.tar.gz` weights more than 1.5G, it doesn't fit into the kdump environment and ends up OOM. This OOM event can in turn trigger a panic. 2) Arista platforms with `secureboot` enabled would fail to load the crashkernel because the kernel parameter would be discarded on boot. This happens because the `boot0` in secureboot mode is strict about kernel parameter injection. 3) The secureboot path allowlist would remove kernel crash reports. 4) The kdump service would fail on Arista products since `/boot/` is empty in `secureboot` **- How I did it** 1) To prevent an OOM event in the crashkernel the fix is to avoid the codepaths in `union-mount` that create tmpfs and populate them. Some more codepath specific to Arista devices are also skipped to make the kdump process faster. This relies on detecting that the initramfs is starting in a kdump environment and skipping some initialization. The `/usr/sbin/kdump-config` tool appends a few kernel cmdline arguments when loading the crashkernel. The most unique one is `systemd.unit=kdump-tools.service` which is used in a few initramfs hooks to set `in_kdump`. 2) To allow `kdump` to work in `secureboot` environment the cmdline generation in boot0 was slightly modified. The codepath to load kernel parameters changed by SONiC is now running for booting in secure mode. It was altered to prevent an append only behavior which would grow the `kernel-cmdline` at every reboot. This ever growing behavior would lead `kexec` to fail to load the kernel due to a too long cmdline. 3) To get the kernel crash under /var/crash this path has to be added to `allowlist_paths` 4) The `/host/image-XXX/boot` folder is now populated in `secureboot` mode but not used. **- How to verify it** Regular boot: - enable kdump - enable docker_inram=on via kernel-params - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness OOM events on the console - after: crash kernel works and crash available under /var/crash Secure boot: - enable kdump - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness no kdump - after: crash kernel works and crash available under /var/crash Co-authored-by: Boyang Yu <byu@arista.com>
2021-02-02 03:55:09 -06:00
# Check kdump
[ "$in_kdump" = true ] && exit 0
# Skip this script for warm-reboot/fast-reboot from sonic
[kdump] Fix OOM events in crashkernel (#6447) A few issues where discovered with crashkernel on Arista platforms. 1) platforms using `docker_inram=on` would end up OOM in kdump environment. This happens because the same initramfs is used by SONiC and the crashkernel. With `docker_inram=on` the `dockerfs.tar.gz` is extracted in a `tmpfs` created for the occasion. Since `dockerfs.tar.gz` weights more than 1.5G, it doesn't fit into the kdump environment and ends up OOM. This OOM event can in turn trigger a panic. 2) Arista platforms with `secureboot` enabled would fail to load the crashkernel because the kernel parameter would be discarded on boot. This happens because the `boot0` in secureboot mode is strict about kernel parameter injection. 3) The secureboot path allowlist would remove kernel crash reports. 4) The kdump service would fail on Arista products since `/boot/` is empty in `secureboot` **- How I did it** 1) To prevent an OOM event in the crashkernel the fix is to avoid the codepaths in `union-mount` that create tmpfs and populate them. Some more codepath specific to Arista devices are also skipped to make the kdump process faster. This relies on detecting that the initramfs is starting in a kdump environment and skipping some initialization. The `/usr/sbin/kdump-config` tool appends a few kernel cmdline arguments when loading the crashkernel. The most unique one is `systemd.unit=kdump-tools.service` which is used in a few initramfs hooks to set `in_kdump`. 2) To allow `kdump` to work in `secureboot` environment the cmdline generation in boot0 was slightly modified. The codepath to load kernel parameters changed by SONiC is now running for booting in secure mode. It was altered to prevent an append only behavior which would grow the `kernel-cmdline` at every reboot. This ever growing behavior would lead `kexec` to fail to load the kernel due to a too long cmdline. 3) To get the kernel crash under /var/crash this path has to be added to `allowlist_paths` 4) The `/host/image-XXX/boot` folder is now populated in `secureboot` mode but not used. **- How to verify it** Regular boot: - enable kdump - enable docker_inram=on via kernel-params - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness OOM events on the console - after: crash kernel works and crash available under /var/crash Secure boot: - enable kdump - reboot - generate a crash `echo c > /proc/sysrq-trigger` - before: witness no kdump - after: crash kernel works and crash available under /var/crash Co-authored-by: Boyang Yu <byu@arista.com>
2021-02-02 03:55:09 -06:00
[ "$sonic_fast_reboot" = true ] && [ "$prev_os" != eos ] && exit 0
# Get flash dev name
if [ -z "$block_flash" ]; then
echo "Error: flash device info is not provided"
exit 1
fi
if ! wait_get_flash_dev; then
echo "Error: flash device is not found"
exit 1
fi
# If root=/dev/*, update ROOT to the device under block_flash
update_root
root_dev="$ROOT"
if [ -z "$root_dev" ]; then
echo "Error: root device name is not provided"
exit 1
fi
if ! wait_for_root_dev; then
echo "Error: timeout in waiting for $root_dev"
exit 1
fi
# exit when the root is ext4
if ! blkid | grep "$root_dev.*vfat" -q; then
mkdir -p "$root_mnt"
mount -t ext4 "$root_dev" "$root_mnt"
fixup_flash_permissions "$root_mnt"
umount "$root_mnt"
rmdir "$root_mnt"
exit 0
fi
# Check memory size for tmpfs
free_mem=$(free | awk '/^Mem:/{print $4}')
if [ "$free_mem" -lt "$free_mem_thres" ]; then
echo "Error: memory is not enough"
exit 1
fi
# Backup partition table
mkdir -p "$root_mnt"
mount "$root_dev" "$root_mnt"
backup_file=backup.$(date +%Y-%m-%d.%H-%M-%S)
sfdisk -d "$flash_dev" > "$root_mnt/$backup_file"
# Check total size of files in root
total_file_size=$(du -s "$root_mnt" | awk '{print $1}')
if [ "$total_file_size" -gt "$tmpfs_size" ]; then
echo "Error: total file size is too large"
exit 1
fi
# Create tmpfs, and copy files to tmpfs
mkdir -p "$tmp_mnt"
mount -t tmpfs -o size="${tmpfs_size}k" tmpfs "$tmp_mnt"
cp -a "$root_mnt/." "$tmp_mnt/"
umount "$root_mnt"
#### Lines below will modify the root file system, so any failure will be trapped to shell for manual interventions.
if [ $(echo -n "$root_dev" | tail -c 1) == "1" ]; then
# Create a new partition table (content in flash_dev will be deleted)
err_msg="Error: Failed to zero out first MB"
cmd="dd if=/dev/zero of=$flash_dev bs=512 count=2048"
run_cmd "$cmd" "$err_msg"
err_msg="Error: repartitioning $flash_dev failed"
cmd="echo '2048' | sfdisk $flash_dev || (sleep 3; blockdev --rereadpt $flash_dev && fdisk -l $flash_dev | grep -q ${root_dev}.*Linux)"
run_cmd "$cmd" "$err_msg"
fi
sleep 2
err_msg="Error: timeout in waiting for $root_dev after repartition"
cmd="wait_for_root_dev"
run_cmd "$cmd" "$err_msg"
err_msg="Error: formatting to ext4 failed"
cmd="/usr/local/sbin/mke2fs -t ext4 -F -O '^huge_file,^metadata_csum' $root_dev"
run_cmd "$cmd" "$err_msg"
err_msg="Error: mounting $root_dev to $root_mnt failed"
cmd="mount -t ext4 $root_dev $root_mnt"
run_cmd "$cmd" "$err_msg"
if [ x"$docker_inram" != x"on" ]; then
err_msg="Error: extract docker directory"
cmd="[ -f $tmp_mnt/$image_dir/{{ FILESYSTEM_DOCKERFS }} ] && rm -rf $root_mnt/$image_dir/{{ DOCKERFS_DIR }} && mkdir -p $root_mnt/$image_dir/{{ DOCKERFS_DIR }} && tar xzf $tmp_mnt/$image_dir/{{ FILESYSTEM_DOCKERFS }} -C $root_mnt/$image_dir/{{ DOCKERFS_DIR }} && rm -f $tmp_mnt/$image_dir/{{ FILESYSTEM_DOCKERFS }}"
run_cmd "$cmd" "$err_msg"
fi
err_msg="Error: copying files form $tmp_mnt to $root_mnt failed"
cmd="cp -a $tmp_mnt/. $root_mnt/"
run_cmd "$cmd" "$err_msg"
fixup_flash_permissions "$root_mnt"