From 7026acebcf4dc6490095459bdf9dfe62c2106dd8 Mon Sep 17 00:00:00 2001
From: "@dt21e8"
Date: Thu, 13 Apr 2023 12:33:48 +0300
Subject: [PATCH] (add) deepspeed_mpi specific container, deepspeed_config for MPI with nodetaints

---
Review notes (text between the three-dash line and the first "diff --git" is
commentary: it is not part of the commit and is skipped when the patch is applied).

- Dockerfile: dropped the redundant "RUN mkdir /deepspeed" (WORKDIR creates the
  directory), added "--no-cache-dir" to pip, made the examples clone shallow,
  sorted the apt package list, created /var/run/sshd for sshd and added
  "StrictHostKeyChecking no" next to "UserKnownHostsFile /dev/null".
- deepspeed-config.yaml: restored the image placeholders, removed the literal
  "$@" argument (the command is not run through a shell) and made the GPU
  request and limit equal.
- The "index" blob ids in the diff headers are those of the original submission
  and were not recomputed after the edits above.

 examples/v2beta1/deepspeed/Dockerfile         | 36 +++++++
 .../v2beta1/deepspeed/deepspeed-config.yaml   | 96 +++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 examples/v2beta1/deepspeed/Dockerfile
 create mode 100644 examples/v2beta1/deepspeed/deepspeed-config.yaml

diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
new file mode 100644
index 000000000..9fd9b861e
--- /dev/null
+++ b/examples/v2beta1/deepspeed/Dockerfile
@@ -0,0 +1,36 @@
+# Official PyTorch image with CUDA support
+FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+
+# Install the build toolchain, Open MPI and OpenSSH, then configure SSH for mpi-operator.
+# mpi-operator mounts the .ssh folder from a Secret, so the client must not try to
+# write known_hosts (UserKnownHostsFile /dev/null) and, with nothing recorded there,
+# must not stop on unknown host keys (StrictHostKeyChecking no).
+# StrictModes is disabled so sshd skips ownership/permission checks on the mounted files.
+# /var/run/sshd is the privilege-separation directory sshd expects to exist at start-up.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    libopenmpi-dev \
+    openssh-server \
+    wget \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /var/run/sshd \
+    && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
+    && echo "    UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install DeepSpeed library (no pip cache is kept in the image layer)
+RUN pip install --no-cache-dir deepspeed
+
+# Workspace for DeepSpeed examples (WORKDIR creates the directory if it is missing)
+WORKDIR /deepspeed
+
+# Clone the DeepSpeedExamples repository; a shallow clone is enough to run the examples
+RUN git clone --depth 1 https://github.com/microsoft/DeepSpeedExamples/
+
+# Set the working directory to DeepSpeedExamples/training for models
+WORKDIR /deepspeed/DeepSpeedExamples/training
+
+# Set the default command to bash
+CMD ["/bin/bash"]
diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml
new file mode 100644
index 000000000..ed038d51d
--- /dev/null
+++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml
@@ -0,0 +1,96 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: deepspeed-mpijob
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
+          # Change your image name and version in here
+          - image: <YOUR-DEEPSPEED-IMAGE>:<VERSION>
+            name: deepspeed-mpijob-container
+            # The command is executed directly, without a shell, so every entry is
+            # passed as a literal argument; -np matches Worker replicas * slotsPerWorker.
+            command:
+            - mpirun
+            - --allow-run-as-root
+            - -np
+            - "2"
+            - -bind-to
+            - none
+            - -map-by
+            - slot
+            - -x
+            - NCCL_DEBUG=INFO
+            - -x
+            - LD_LIBRARY_PATH
+            - -x
+            - PATH
+            - -mca
+            - pml
+            - ob1
+            - -mca
+            - btl
+            - ^openib
+            - python
+            - cifar/cifar10_deepspeed.py
+            - --deepspeed_mpi
+            - --deepspeed
+            - --deepspeed_config
+            - ds_config.json
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          # OPTIONAL: Taint toleration for the specific nodepool
+          #
+          # Taints and tolerations are used to ensure that the DeepSpeed worker pods
+          # are scheduled on the desired nodes. By applying taints to nodes, you can
+          # repel pods that do not have the corresponding tolerations. This is useful
+          # in situations where you want to reserve nodes with specific resources
+          # (e.g. GPU nodes) for particular workloads, like the DeepSpeed training
+          # job.
+          #
+          # In this example, the tolerations are set to allow the DeepSpeed worker
+          # pods to be scheduled on nodes with the specified taints (i.e., the node
+          # pool with GPU resources). This ensures that the training job can
+          # utilize the available GPU resources on those nodes, improving the
+          # efficiency and performance of the training process.
+          #
+          # You can remove the taint tolerations if you do not have any taints on your cluster.
+          tolerations:
+          # Change the nodepool name in here
+          - effect: NoSchedule
+            key: nodepool
+            operator: Equal
+            value: nodepool-256ram32cpu2gpu-0
+          # Taint toleration effect for GPU nodes
+          - effect: NoSchedule
+            key: nvidia.com/gpu
+            operator: Equal
+            value: present
+          containers:
+          # Container with the DeepSpeed training image built from the provided Dockerfile and the DeepSpeed support
+          # Change your image name and version in here
+          - image: <YOUR-DEEPSPEED-IMAGE>:<VERSION>
+            name: deepspeed-mpijob-container
+            resources:
+              limits:
+                # Optional: varies to nodepool group
+                cpu: 30
+                memory: 230Gi
+                # One GPU per worker, matching slotsPerWorker; Kubernetes requires the
+                # GPU request and limit to be equal when both are set.
+                nvidia.com/gpu: 1
+              requests:
+                # Optional: varies to nodepool group
+                cpu: 16
+                memory: 128Gi
+                nvidia.com/gpu: 1