From e05ea40abfed6a5fbbfd53b0040ad5b0a1f87e0c Mon Sep 17 00:00:00 2001 From: Engin Eren <engin.eren@desy.de> Date: Fri, 22 Apr 2022 11:56:38 +0200 Subject: [PATCH] training with single energies --- pytorch_job_regressor_nccl_singlePod.yaml | 50 ------------------- ...or_nccl.yaml => pytorch_job_wgan_nccl.yaml | 6 +-- wgan.py | 2 +- 3 files changed, 4 insertions(+), 54 deletions(-) delete mode 100644 pytorch_job_regressor_nccl_singlePod.yaml rename pytorch_job_regressor_nccl.yaml => pytorch_job_wgan_nccl.yaml (94%) diff --git a/pytorch_job_regressor_nccl_singlePod.yaml b/pytorch_job_regressor_nccl_singlePod.yaml deleted file mode 100644 index 38994c9..0000000 --- a/pytorch_job_regressor_nccl_singlePod.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: "kubeflow.org/v1" -kind: "PyTorchJob" -metadata: - name: "pytorch-dist-regressor-nccl" -spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - volumes: - - name: eos - hostPath: - path: /var/eos - - name: krb-secret-vol - secret: - secretName: krb-secret - - name: nvidia-driver - hostPath: - path: /opt/nvidia-driver - type: "" - containers: - - name: pytorch - image: registry.hub.docker.com/engineren/pytorchjob:regCuda10debug - imagePullPolicy: Always - env: - - name: PATH - value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin - - name: LD_LIBRARY_PATH - value: /opt/nvidia-driver/lib64 - - name: PYTHONUNBUFFERED - value: "1" - command: [sh, -c] - args: - - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 && python -u regressor.py --backend nccl; - volumeMounts: - - name: eos - mountPath: /eos - - name: krb-secret-vol - mountPath: "/secret/krb-secret-vol" - - name: nvidia-driver - mountPath: /opt/nvidia-driver - resources: - limits: - nvidia.com/gpu: 1 - \ No newline at end of file diff --git a/pytorch_job_regressor_nccl.yaml b/pytorch_job_wgan_nccl.yaml similarity index 94% rename from pytorch_job_regressor_nccl.yaml rename to pytorch_job_wgan_nccl.yaml index 74f177d..82d0aae 100644 --- a/pytorch_job_regressor_nccl.yaml +++ b/pytorch_job_wgan_nccl.yaml @@ -37,7 +37,7 @@ spec: command: [sh, -c] args: - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - && python -u regressor.py --backend nccl --epochs 1; + && python -u wgan.py --backend nccl --epochs 10 --exp wganv0; volumeMounts: - name: eos mountPath: /eos @@ -49,7 +49,7 @@ spec: limits: nvidia.com/gpu: 1 Worker: - replicas: 1 + replicas: 2 restartPolicy: OnFailure template: metadata: @@ -81,7 +81,7 @@ spec: command: [sh, -c] args: - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - && python -u regressor.py --backend nccl --epochs 1; + && python -u wgan.py --backend nccl --epochs 10 --exp wganv0; volumeMounts: - name: eos mountPath: /eos diff --git a/wgan.py b/wgan.py index 1273dc6..67161e4 100644 --- a/wgan.py +++ b/wgan.py @@ -261,7 +261,7 @@ def run(args): print ("loading data") - dataset = HDF5Dataset('/eos/user/e/eneren/run_prod75k/hdf5/training_75k.hdf5', transform=None, train_size=75000) + dataset = HDF5Dataset('/eos/user/e/eneren/scratch/40GeV40k.hdf5', transform=None, train_size=40000) sampler = DistributedSampler(dataset, shuffle=True) train_loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler, num_workers=1, pin_memory=False) -- GitLab