From 7026acebcf4dc6490095459bdf9dfe62c2106dd8 Mon Sep 17 00:00:00 2001
From: "@dt21e8" <dogukanuraztuna@gmail.com>
Date: Thu, 13 Apr 2023 12:33:48 +0300
Subject: [PATCH] (add) deepspeed_mpi specific container, deepspeed_config for
 MPI with nodetaints

---
 examples/v2beta1/deepspeed/Dockerfile         | 31 +++++++
 .../v2beta1/deepspeed/deepspeed-config.yaml   | 93 +++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 examples/v2beta1/deepspeed/Dockerfile
 create mode 100644 examples/v2beta1/deepspeed/deepspeed-config.yaml
diff --git a/examples/v2beta1/deepspeed/Dockerfile b/examples/v2beta1/deepspeed/Dockerfile
new file mode 100644
index 000000000..9fd9b861e
--- /dev/null
+++ b/examples/v2beta1/deepspeed/Dockerfile
@@ -0,0 +1,31 @@
+# Official PyTorch image with CUDA support
+FROM pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+
+# mpi-operator mounts the .ssh folder from a Secret, so ssh must not write there: UserKnownHostsFile points at /dev/null and host-key prompts are disabled.
+# StrictModes is turned off so sshd skips ownership/permission checks on the mounted files; /var/run/sshd is sshd's privilege separation directory.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    libopenmpi-dev \
+    openssh-server \
+    wget \
+    && rm -rf /var/lib/apt/lists/* \
+    && mkdir -p /var/run/sshd \
+    && printf '    StrictHostKeyChecking no\n    UserKnownHostsFile /dev/null\n' >> /etc/ssh/ssh_config \
+    && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config
+
+# Install the DeepSpeed library; --no-cache-dir keeps pip's download cache out of the image layer
+RUN pip install --no-cache-dir deepspeed
+
+# Workspace for DeepSpeed examples (WORKDIR creates the directory when it does not exist)
+WORKDIR "/deepspeed"
+
+# Clone the DeepSpeedExamples repository; a shallow clone is enough to run the examples
+RUN git clone --depth 1 https://github.com/microsoft/DeepSpeedExamples/
+
+# Set the working directory to DeepSpeedExamples/training for models
+WORKDIR "/deepspeed/DeepSpeedExamples/training"
+
+# Set the default command to bash
+CMD ["/bin/bash"]
diff --git a/examples/v2beta1/deepspeed/deepspeed-config.yaml b/examples/v2beta1/deepspeed/deepspeed-config.yaml
new file mode 100644
index 000000000..ed038d51d
--- /dev/null
+++ b/examples/v2beta1/deepspeed/deepspeed-config.yaml
@@ -0,0 +1,93 @@
+apiVersion: kubeflow.org/v2beta1
+kind: MPIJob
+metadata:
+  name: deepspeed-mpijob
+spec:
+  slotsPerWorker: 1
+  runPolicy:
+    cleanPodPolicy: Running
+  mpiReplicaSpecs:
+    Launcher:
+      replicas: 1
+      template:
+        spec:
+          containers:
+          # Launcher container: uses the DeepSpeed training image built from the provided Dockerfile
+          # Change your image name and version in here
+          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+            name: deepspeed-mpijob-container
+            command:
+              # No shell is involved: each item is passed verbatim as one argument (no $VAR or "$@" expansion)
+              - mpirun
+              - --allow-run-as-root
+              - -np
+              - "2"
+              - -bind-to
+              - none
+              - -map-by
+              - slot
+              - -x
+              - NCCL_DEBUG=INFO
+              - -x
+              - LD_LIBRARY_PATH
+              - -x
+              - PATH
+              - -mca
+              - pml
+              - ob1
+              - -mca
+              - btl
+              - ^openib
+              - python
+              - cifar/cifar10_deepspeed.py
+              - --deepspeed_mpi
+              - --deepspeed
+              - --deepspeed_config
+              - ds_config.json
+    Worker:
+      replicas: 2
+      template:
+        spec:
+          # OPTIONAL: Taint toleration for the specific nodepool
+          #
+          # Taints and tolerations are used to ensure that the DeepSpeed worker pods
+          # are scheduled on the desired nodes. By applying taints to nodes, you can
+          # repel pods that do not have the corresponding tolerations. This is useful
+          # in situations where you want to reserve nodes with specific resources
+          # (e.g. GPU nodes) for particular workloads, like the DeepSpeed training
+          # job.
+          #
+          # In this example, the tolerations are set to allow the DeepSpeed worker
+          # pods to be scheduled on nodes with the specified taints (i.e., the node
+          # pool with GPU resources). This ensures that the training job can
+          # utilize the available GPU resources on those nodes, improving the
+          # efficiency and performance of the training process.
+          #
+          # You can remove the tolerations if your cluster has no such taints.
+          tolerations:
+          # Change the nodepool name in here
+          - effect: NoSchedule
+            key: nodepool
+            operator: Equal
+            value: nodepool-256ram32cpu2gpu-0
+          # Toleration for the taint placed on GPU nodes
+          - effect: NoSchedule
+            key: nvidia.com/gpu
+            operator: Equal
+            value: present
+          containers:
+          # Worker container: uses the DeepSpeed training image built from the provided Dockerfile
+          # Change your image name and version in here
+          - image: <YOUR-DEEPSPEED-CONTAINER-NAME>:<VERSION>
+            name: deepspeed-mpijob-container
+            resources:
+              limits:
+                # Optional: adjust to your nodepool. The GPU limit must equal the GPU request (one GPU per slot).
+                cpu: 30
+                memory: 230Gi
+                nvidia.com/gpu: 1
+              requests:
+                # Optional: adjust to your nodepool. The GPU request must equal the GPU limit.
+                cpu: 16
+                memory: 128Gi
+                nvidia.com/gpu: 1