From e05ea40abfed6a5fbbfd53b0040ad5b0a1f87e0c Mon Sep 17 00:00:00 2001
From: Engin Eren <engin.eren@desy.de>
Date: Fri, 22 Apr 2022 11:56:38 +0200
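Subject: [PATCH] Train the WGAN on a single-energy dataset

Delete pytorch_job_regressor_nccl_singlePod.yaml and rename
pytorch_job_regressor_nccl.yaml to pytorch_job_wgan_nccl.yaml. The renamed
job now runs wgan.py instead of regressor.py (--epochs 10 --exp wganv0) and
uses 2 Worker replicas instead of 1. wgan.py now loads
/eos/user/e/eneren/scratch/40GeV40k.hdf5 with train_size=40000 instead of
the 75k training file.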

---
 pytorch_job_regressor_nccl_singlePod.yaml     | 50 -------------------
 ...or_nccl.yaml => pytorch_job_wgan_nccl.yaml |  6 +--
 wgan.py                                       |  2 +-
 3 files changed, 4 insertions(+), 54 deletions(-)
 delete mode 100644 pytorch_job_regressor_nccl_singlePod.yaml
 rename pytorch_job_regressor_nccl.yaml => pytorch_job_wgan_nccl.yaml (94%)

diff --git a/pytorch_job_regressor_nccl_singlePod.yaml b/pytorch_job_regressor_nccl_singlePod.yaml
deleted file mode 100644
index 38994c9..0000000
--- a/pytorch_job_regressor_nccl_singlePod.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
-apiVersion: "kubeflow.org/v1"
-kind: "PyTorchJob"
-metadata:
-  name: "pytorch-dist-regressor-nccl"
-spec:
-  pytorchReplicaSpecs:
-    Master:
-      replicas: 1
-      restartPolicy: OnFailure
-      template:
-        metadata:
-          annotations:
-            sidecar.istio.io/inject: "false"
-        spec:
-          volumes:
-          - name: eos
-            hostPath:
-              path: /var/eos
-          - name: krb-secret-vol
-            secret:
-              secretName: krb-secret
-          - name: nvidia-driver
-            hostPath:
-              path: /opt/nvidia-driver
-              type: ""
-          containers:
-            - name: pytorch
-              image: registry.hub.docker.com/engineren/pytorchjob:regCuda10debug
-              imagePullPolicy: Always
-              env:
-              - name: PATH
-                value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
-              - name: LD_LIBRARY_PATH
-                value: /opt/nvidia-driver/lib64
-              - name: PYTHONUNBUFFERED
-                value: "1"
-              command: [sh, -c] 
-              args:  
-                - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 && python -u regressor.py --backend nccl;
-              volumeMounts:
-              - name: eos
-                mountPath: /eos
-              - name: krb-secret-vol
-                mountPath: "/secret/krb-secret-vol"
-              - name: nvidia-driver
-                mountPath: /opt/nvidia-driver
-              resources: 
-                limits:
-                  nvidia.com/gpu: 1
-    
\ No newline at end of file
diff --git a/pytorch_job_regressor_nccl.yaml b/pytorch_job_wgan_nccl.yaml
similarity index 94%
rename from pytorch_job_regressor_nccl.yaml
rename to pytorch_job_wgan_nccl.yaml
index 74f177d..82d0aae 100644
--- a/pytorch_job_regressor_nccl.yaml
+++ b/pytorch_job_wgan_nccl.yaml
@@ -37,7 +37,7 @@ spec:
               command: [sh, -c] 
               args:  
                 - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 
-                  && python -u regressor.py --backend nccl --epochs 1;
+                  && python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
               volumeMounts:
               - name: eos
                 mountPath: /eos
@@ -49,7 +49,7 @@ spec:
                 limits:
                   nvidia.com/gpu: 1
     Worker:
-      replicas: 1
+      replicas: 2
       restartPolicy: OnFailure
       template:
         metadata:
@@ -81,7 +81,7 @@ spec:
               command: [sh, -c] 
               args:  
                 - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 
-                  && python -u regressor.py --backend nccl --epochs 1;
+                  && python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
               volumeMounts:
               - name: eos
                 mountPath: /eos
diff --git a/wgan.py b/wgan.py
index 1273dc6..67161e4 100644
--- a/wgan.py
+++ b/wgan.py
@@ -261,7 +261,7 @@ def run(args):
 
 
     print ("loading data")
-    dataset = HDF5Dataset('/eos/user/e/eneren/run_prod75k/hdf5/training_75k.hdf5', transform=None, train_size=75000)
+    dataset = HDF5Dataset('/eos/user/e/eneren/scratch/40GeV40k.hdf5', transform=None, train_size=40000)
     sampler = DistributedSampler(dataset, shuffle=True)    
     train_loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler, num_workers=1, pin_memory=False)
 
-- 
GitLab