diff --git a/pytorch_job_regressor_nccl_singlePod.yaml b/pytorch_job_regressor_nccl_singlePod.yaml deleted file mode 100644 index 38994c9fdd2b69b38d5a64e05db1ca7289adac96..0000000000000000000000000000000000000000 --- a/pytorch_job_regressor_nccl_singlePod.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: "kubeflow.org/v1" -kind: "PyTorchJob" -metadata: - name: "pytorch-dist-regressor-nccl" -spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - metadata: - annotations: - sidecar.istio.io/inject: "false" - spec: - volumes: - - name: eos - hostPath: - path: /var/eos - - name: krb-secret-vol - secret: - secretName: krb-secret - - name: nvidia-driver - hostPath: - path: /opt/nvidia-driver - type: "" - containers: - - name: pytorch - image: registry.hub.docker.com/engineren/pytorchjob:regCuda10debug - imagePullPolicy: Always - env: - - name: PATH - value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin - - name: LD_LIBRARY_PATH - value: /opt/nvidia-driver/lib64 - - name: PYTHONUNBUFFERED - value: "1" - command: [sh, -c] - args: - - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 && python -u regressor.py --backend nccl; - volumeMounts: - - name: eos - mountPath: /eos - - name: krb-secret-vol - mountPath: "/secret/krb-secret-vol" - - name: nvidia-driver - mountPath: /opt/nvidia-driver - resources: - limits: - nvidia.com/gpu: 1 - \ No newline at end of file diff --git a/pytorch_job_regressor_nccl.yaml b/pytorch_job_wgan_nccl.yaml similarity index 94% rename from pytorch_job_regressor_nccl.yaml rename to pytorch_job_wgan_nccl.yaml index 74f177dd45108395cedd9efc7f9b5893165c18f7..82d0aae1958de3bf15c17eebe22c0aaf723d82c8 100644 --- a/pytorch_job_regressor_nccl.yaml +++ b/pytorch_job_wgan_nccl.yaml @@ -37,7 +37,7 @@ spec: command: [sh, -c] args: - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - && python -u regressor.py --backend nccl --epochs 1; + && python -u wgan.py --backend nccl --epochs 10 --exp wganv0; volumeMounts: - name: eos mountPath: /eos @@ -49,7 +49,7 @@ spec: limits: nvidia.com/gpu: 1 Worker: - replicas: 1 + replicas: 2 restartPolicy: OnFailure template: metadata: @@ -81,7 +81,7 @@ spec: command: [sh, -c] args: - cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - && python -u regressor.py --backend nccl --epochs 1; + && python -u wgan.py --backend nccl --epochs 10 --exp wganv0; volumeMounts: - name: eos mountPath: /eos diff --git a/wgan.py b/wgan.py index 1273dc6dc4fe2ef09fae9883617bdcb2d02148c8..67161e4dd53474547ec50c0e701a1e714f5efac3 100644 --- a/wgan.py +++ b/wgan.py @@ -261,7 +261,7 @@ def run(args): print ("loading data") - dataset = HDF5Dataset('/eos/user/e/eneren/run_prod75k/hdf5/training_75k.hdf5', transform=None, train_size=75000) + dataset = HDF5Dataset('/eos/user/e/eneren/scratch/40GeV40k.hdf5', transform=None, train_size=40000) sampler = DistributedSampler(dataset, shuffle=True) train_loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler, num_workers=1, pin_memory=False)