diff --git a/examples/v2beta1/deepspeed/README.MD b/examples/v2beta1/deepspeed/README.MD new file mode 100644 index 000000000..7128f1b6d --- /dev/null +++ b/examples/v2beta1/deepspeed/README.MD @@ -0,0 +1,8 @@ +# DeepSpeed Example + +This demo introduces the basic usage of deepspeed with mpi-operator. + +## References + +* https://github.com/microsoft/DeepSpeedExamples/blob/master/training/HelloDeepSpeed/README.md +* https://www.alibabacloud.com/help/en/ack/cloud-native-ai-suite/user-guide/deepspeed-distributed-training diff --git a/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml b/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml new file mode 100644 index 000000000..9bd863dd0 --- /dev/null +++ b/examples/v2beta1/deepspeed/deepspeed-helloworld.yaml @@ -0,0 +1,32 @@ +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: deepspeed-helloworld +spec: + slotsPerWorker: 1 + runPolicy: + cleanPodPolicy: Running + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed + name: deepspeed-helloworld + command: + - deepspeed + args: + - /workspace/DeepSpeedExamples/HelloDeepSpeed/train_bert_ds.py + - --checkpoint_dir + - /workspace + Worker: + replicas: 2 + template: + spec: + containers: + - image: registry.cn-beijing.aliyuncs.com/acs/deepspeed:hello-deepspeed + name: deepspeed-helloworld + resources: + limits: + nvidia.com/gpu: 8