From a2469e2fb8dc6a35d5f1dbff30a2366c4306fc9d Mon Sep 17 00:00:00 2001
From: Engin Eren <engin.eren@desy.de>
Date: Wed, 6 Jul 2022 11:05:25 +0200
Subject: [PATCH] syntax error

---
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. Hunk line counts match the headers,
but the indentation of context lines and the position of blank context lines
in the wganHCAL.py hunk are assumed -- TODO confirm against the target files
(or apply with `git apply --ignore-whitespace`).

 pytorch_job_wganHCAL_nccl.yaml | 64 +++++++---------------------------
 wganHCAL.py                    |  2 +-
 2 files changed, 14 insertions(+), 52 deletions(-)

diff --git a/pytorch_job_wganHCAL_nccl.yaml b/pytorch_job_wganHCAL_nccl.yaml
index fcdf040..b3618b4 100644
--- a/pytorch_job_wganHCAL_nccl.yaml
+++ b/pytorch_job_wganHCAL_nccl.yaml
@@ -1,7 +1,7 @@
 apiVersion: "kubeflow.org/v1"
 kind: "PyTorchJob"
 metadata:
-  name: "pytorch-dist-wganHCAL-nccl"
+  name: "pytorch-dist-wganhcal-nccl"
 spec:
   pytorchReplicaSpecs:
     Master:
@@ -9,42 +9,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-            - name: eos
-              hostPath:
-                path: /var/eos
-            - name: krb-secret-vol
-              secret:
-                secretName: krb-secret
-            - name: nvidia-driver
-              hostPath:
-                path: /opt/nvidia-driver
-                type: ""
           containers:
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-                - name: PATH
-                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-                - name: LD_LIBRARY_PATH
-                  value: /opt/nvidia-driver/lib64
                 - name: PYTHONUNBUFFERED
                   value: "1"
               command: [sh, -c]
               args:
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-                - name: eos
-                  mountPath: /eos
-                - name: krb-secret-vol
-                  mountPath: "/secret/krb-secret-vol"
-                - name: nvidia-driver
-                  mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources:
                 limits:
                   nvidia.com/gpu: 1
@@ -53,42 +34,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-            - name: eos
-              hostPath:
-                path: /var/eos
-            - name: krb-secret-vol
-              secret:
-                secretName: krb-secret
-            - name: nvidia-driver
-              hostPath:
-                path: /opt/nvidia-driver
-                type: ""
           containers:
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-                - name: PATH
-                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-                - name: LD_LIBRARY_PATH
-                  value: /opt/nvidia-driver/lib64
                 - name: PYTHONUNBUFFERED
                   value: "1"
               command: [sh, -c]
               args:
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-                - name: eos
-                  mountPath: /eos
-                - name: krb-secret-vol
-                  mountPath: "/secret/krb-secret-vol"
-                - name: nvidia-driver
-                  mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources:
                 limits:
                   nvidia.com/gpu: 1
diff --git a/wganHCAL.py b/wganHCAL.py
index 262c458..9309eda 100644
--- a/wganHCAL.py
+++ b/wganHCAL.py
@@ -217,7 +217,7 @@ def parse_args():
 
 
     # postprocess args
-    args.device = f'cuda:{args.local_rank}' # PytorchJob/launch.py
+    args.device = 'cuda:{}'.format(args.local_rank) # PytorchJob/launch.py
     args.batch_size = max(args.batch_size, args.world_size * 2) # min valid batchsize
     return args
 
-- 
GitLab