# Patch summary (preamble text; patch tools skip everything before "diff --git").
#
# pytorch_job_wganHCAL_nccl.yaml, applied identically to Master and Worker:
#   * job name lowercased: pytorch-dist-wganHCAL-nccl -> pytorch-dist-wganhcal-nccl
#     (presumably because Kubernetes object names must be lowercase RFC 1123
#     names -- confirm).
#   * explicit volumes / volumeMounts (eos, krb-secret-vol, nvidia-driver) and
#     the PATH / LD_LIBRARY_PATH env entries are removed; three pod labels
#     (mount-kerberos-secret, mount-eos, mount-nvidia-driver) are added.
#     NOTE(review): looks like the labels are meant to have the cluster inject
#     the equivalent mounts -- nothing in this patch shows that mechanism.
#   * the Kerberos ticket copy (cp/chmod of krb5cc_1000) is dropped from args.
#   * image tag ddp -> ddpv3.
#   * --lrCrit 0.0001 -> 0.00001 (now equal to --lrGen).
#
# wganHCAL.py:
#   * the f-string building args.device is replaced by str.format().
#
# NOTE(review): this layout was rebuilt from a whitespace-collapsed copy of the
# patch. Line counts match every hunk header, but indentation widths and the
# positions of the blank context lines in the wganHCAL.py hunk are inferred --
# verify against the real files before applying.
diff --git a/pytorch_job_wganHCAL_nccl.yaml b/pytorch_job_wganHCAL_nccl.yaml
index fcdf0407205c813895c65c8207c729516fb1932b..b3618b45dda216c69c2dcead56605bab479b634e 100644
--- a/pytorch_job_wganHCAL_nccl.yaml
+++ b/pytorch_job_wganHCAL_nccl.yaml
@@ -1,7 +1,7 @@
 apiVersion: "kubeflow.org/v1"
 kind: "PyTorchJob"
 metadata:
-  name: "pytorch-dist-wganHCAL-nccl"
+  name: "pytorch-dist-wganhcal-nccl"
 spec:
   pytorchReplicaSpecs:
     Master:
@@ -9,42 +9,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-            - name: eos
-              hostPath:
-                path: /var/eos
-            - name: krb-secret-vol
-              secret:
-                secretName: krb-secret
-            - name: nvidia-driver
-              hostPath:
-                path: /opt/nvidia-driver
-                type: ""
           containers:
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-                - name: PATH
-                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-                - name: LD_LIBRARY_PATH
-                  value: /opt/nvidia-driver/lib64
                 - name: PYTHONUNBUFFERED
                   value: "1"
               command: [sh, -c]
               args:
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-                - name: eos
-                  mountPath: /eos
-                - name: krb-secret-vol
-                  mountPath: "/secret/krb-secret-vol"
-                - name: nvidia-driver
-                  mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources:
                 limits:
                   nvidia.com/gpu: 1
@@ -53,42 +34,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-            - name: eos
-              hostPath:
-                path: /var/eos
-            - name: krb-secret-vol
-              secret:
-                secretName: krb-secret
-            - name: nvidia-driver
-              hostPath:
-                path: /opt/nvidia-driver
-                type: ""
           containers:
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-                - name: PATH
-                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-                - name: LD_LIBRARY_PATH
-                  value: /opt/nvidia-driver/lib64
                 - name: PYTHONUNBUFFERED
                   value: "1"
               command: [sh, -c]
               args:
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-                - name: eos
-                  mountPath: /eos
-                - name: krb-secret-vol
-                  mountPath: "/secret/krb-secret-vol"
-                - name: nvidia-driver
-                  mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources:
                 limits:
                   nvidia.com/gpu: 1
diff --git a/wganHCAL.py b/wganHCAL.py
index 262c4580ff9f04d1fbdc88498002253fc9974687..9309edaa410146255e93ee3a7f0b20a4a9cf1a24 100644
--- a/wganHCAL.py
+++ b/wganHCAL.py
@@ -217,7 +217,7 @@ def parse_args():
 
 
     # postprocess args
-    args.device = f'cuda:{args.local_rank}'  # PytorchJob/launch.py
+    args.device = 'cuda:{}'.format(args.local_rank)  # PytorchJob/launch.py
     args.batch_size = max(args.batch_size, args.world_size * 2)  # min valid batchsize
     return args
 