Skip to content
Snippets Groups Projects
Commit e05ea40a authored by Engin Eren
Browse files

training with single energies

parent cb390740
No related branches found
No related tags found
1 merge request: !3 Test
Pipeline #3879419 passed
# Kubeflow PyTorchJob that runs regressor.py with the NCCL backend.
# Only the Master replica spec is defined in this manifest.
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-regressor-nccl"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Keep the Istio sidecar out of the training pod.
            sidecar.istio.io/inject: "false"
        spec:
          volumes:
            # Host directory /var/eos, mounted into the container at /eos.
            - name: eos
              hostPath:
                path: /var/eos
            # Secret holding the credential file that the container
            # command copies to /tmp (presumably a Kerberos ticket cache,
            # judging by the krb5cc_ name -- confirm).
            - name: krb-secret-vol
              secret:
                secretName: krb-secret
            # Host directory whose bin/ and lib64/ are added to PATH and
            # LD_LIBRARY_PATH below.
            - name: nvidia-driver
              hostPath:
                path: /opt/nvidia-driver
                # Empty type: no check is performed before mounting.
                type: ""
          containers:
            - name: pytorch
              image: registry.hub.docker.com/engineren/pytorchjob:regCuda10debug
              imagePullPolicy: Always
              env:
                - name: PATH
                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
                - name: LD_LIBRARY_PATH
                  value: /opt/nvidia-driver/lib64
                # Quoted so the value stays a string, as env values must be.
                - name: PYTHONUNBUFFERED
                  value: "1"
              command: [sh, -c]
              args:
                # Folded scalar: the lines below are joined with single
                # spaces into one shell command line. The credential file
                # is copied out of the secret mount and restricted to
                # mode 600 before training starts.
                - >-
                  cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0
                  && chmod 600 /tmp/krb5cc_0
                  && python -u regressor.py --backend nccl;
              volumeMounts:
                - name: eos
                  mountPath: /eos
                - name: krb-secret-vol
                  mountPath: "/secret/krb-secret-vol"
                - name: nvidia-driver
                  mountPath: /opt/nvidia-driver
              resources:
                limits:
                  nvidia.com/gpu: 1
\ No newline at end of file
......@@ -37,7 +37,7 @@ spec:
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u regressor.py --backend nccl --epochs 1;
&& python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
volumeMounts:
- name: eos
mountPath: /eos
......@@ -49,7 +49,7 @@ spec:
limits:
nvidia.com/gpu: 1
Worker:
replicas: 1
replicas: 2
restartPolicy: OnFailure
template:
metadata:
......@@ -81,7 +81,7 @@ spec:
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u regressor.py --backend nccl --epochs 1;
&& python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
volumeMounts:
- name: eos
mountPath: /eos
......
......@@ -261,7 +261,7 @@ def run(args):
print ("loading data")
dataset = HDF5Dataset('/eos/user/e/eneren/run_prod75k/hdf5/training_75k.hdf5', transform=None, train_size=75000)
dataset = HDF5Dataset('/eos/user/e/eneren/scratch/40GeV40k.hdf5', transform=None, train_size=40000)
sampler = DistributedSampler(dataset, shuffle=True)
train_loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler, num_workers=1, pin_memory=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment