From a2469e2fb8dc6a35d5f1dbff30a2366c4306fc9d Mon Sep 17 00:00:00 2001
From: Engin Eren <engin.eren@desy.de>
Date: Wed, 6 Jul 2022 11:05:25 +0200
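Subject: [PATCH] Fix syntax error; simplify wganHCAL PyTorchJob manifest

wganHCAL.py:
  * build args.device with str.format() instead of an f-string.

pytorch_job_wganHCAL_nccl.yaml (same changes in both pod templates):
  * rename the job to the all-lowercase "pytorch-dist-wganhcal-nccl".
  * drop the explicit eos / krb-secret-vol / nvidia-driver volumes and
    volumeMounts, the PATH and LD_LIBRARY_PATH env entries, and the
    krb5cc copy step from the container args; add the pod labels
    mount-kerberos-secret, mount-eos and mount-nvidia-driver instead.
  * bump the image tag from "ddp" to "ddpv3".
  * lower --lrCrit from 0.0001 to 0.00001.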

---
 pytorch_job_wganHCAL_nccl.yaml | 64 +++++++---------------------------
 wganHCAL.py                    |  2 +-
 2 files changed, 14 insertions(+), 52 deletions(-)

diff --git a/pytorch_job_wganHCAL_nccl.yaml b/pytorch_job_wganHCAL_nccl.yaml
index fcdf040..b3618b4 100644
--- a/pytorch_job_wganHCAL_nccl.yaml
+++ b/pytorch_job_wganHCAL_nccl.yaml
@@ -1,7 +1,7 @@
 apiVersion: "kubeflow.org/v1"
 kind: "PyTorchJob"
 metadata:
-  name: "pytorch-dist-wganHCAL-nccl"
+  name: "pytorch-dist-wganhcal-nccl"
 spec:
   pytorchReplicaSpecs:
     Master:
@@ -9,42 +9,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-          - name: eos
-            hostPath:
-              path: /var/eos
-          - name: krb-secret-vol
-            secret:
-              secretName: krb-secret
-          - name: nvidia-driver
-            hostPath:
-              path: /opt/nvidia-driver
-              type: ""
           containers:
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-              - name: PATH
-                value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-              - name: LD_LIBRARY_PATH
-                value: /opt/nvidia-driver/lib64
               - name: PYTHONUNBUFFERED
                 value: "1"
               command: [sh, -c] 
               args:  
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-              - name: eos
-                mountPath: /eos
-              - name: krb-secret-vol
-                mountPath: "/secret/krb-secret-vol"
-              - name: nvidia-driver
-                mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources: 
                 limits:
                   nvidia.com/gpu: 1
@@ -53,42 +34,23 @@ spec:
       restartPolicy: OnFailure
       template:
         metadata:
+          labels:
+            mount-kerberos-secret: "true"
+            mount-eos: "true"
+            mount-nvidia-driver: "true"
           annotations:
             sidecar.istio.io/inject: "false"
         spec:
-          volumes:
-          - name: eos
-            hostPath:
-              path: /var/eos
-          - name: krb-secret-vol
-            secret:
-              secretName: krb-secret
-          - name: nvidia-driver
-            hostPath:
-              path: /opt/nvidia-driver
-              type: ""  
           containers: 
             - name: pytorch
-              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
+              image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
               imagePullPolicy: Always
               env:
-              - name: PATH
-                value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-              - name: LD_LIBRARY_PATH
-                value: /opt/nvidia-driver/lib64
               - name: PYTHONUNBUFFERED
                 value: "1"
               command: [sh, -c] 
               args:  
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 
-                  && python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
-              volumeMounts:
-              - name: eos
-                mountPath: /eos
-              - name: krb-secret-vol
-                mountPath: "/secret/krb-secret-vol"
-              - name: nvidia-driver
-                mountPath: /opt/nvidia-driver
+                - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
               resources: 
                 limits:
                   nvidia.com/gpu: 1
diff --git a/wganHCAL.py b/wganHCAL.py
index 262c458..9309eda 100644
--- a/wganHCAL.py
+++ b/wganHCAL.py
@@ -217,7 +217,7 @@ def parse_args():
 
 
     # postprocess args
-    args.device = f'cuda:{args.local_rank}'  # PytorchJob/launch.py
+    args.device = 'cuda:{}'.format(args.local_rank)  # PytorchJob/launch.py
     args.batch_size = max(args.batch_size,
                           args.world_size * 2)  # min valid batchsize
     return args
-- 
GitLab