Applying some fixes for K8s agents in CI (#15493)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
This commit is contained in:
Alexei-V-Ivanov-AMD 2025-03-26 15:35:11 -05:00 committed by GitHub
parent 27df5199d9
commit dd8a29da99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 8 additions and 5 deletions

View File

@ -134,9 +134,10 @@ if [[ $commands == *"--shard-id="* ]]; then
# assign shard-id for each shard
commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
@ -163,9 +164,10 @@ if [[ $commands == *"--shard-id="* ]]; then
fi
done
else
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--rm \
-e HIP_VISIBLE_DEVICES=0 \

View File

@ -12,7 +12,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"