Skip to content
Snippets Groups Projects
Commit e05ea40a authored by Engin Eren's avatar Engin Eren
Browse files

training with single energies

parent cb390740
Branches
No related tags found
1 merge request: !3 "Test"
Pipeline #3879419 passed
# Kubeflow PyTorchJob that runs regressor.py with the NCCL backend on a
# single Master replica using one GPU.
---
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-regressor-nccl"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            # Keep the Istio sidecar out of the training pod.
            sidecar.istio.io/inject: "false"
        spec:
          volumes:
            # Host directory exposed to the container as /eos.
            - name: eos
              hostPath:
                path: /var/eos
            # Secret holding the file krb5cc_1000 that the start-up
            # command copies (presumably a Kerberos credential cache).
            - name: krb-secret-vol
              secret:
                secretName: krb-secret
            # Host-installed NVIDIA driver files; PATH and
            # LD_LIBRARY_PATH below point into this mount.
            - name: nvidia-driver
              hostPath:
                path: /opt/nvidia-driver
                # Empty type: no check is performed on the host path
                # before mounting (the Kubernetes default).
                type: ""
          containers:
            - name: pytorch
              image: registry.hub.docker.com/engineren/pytorchjob:regCuda10debug
              imagePullPolicy: Always
              env:
                # The final PATH entry adds the mounted driver binaries.
                - name: PATH
                  value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
                - name: LD_LIBRARY_PATH
                  value: /opt/nvidia-driver/lib64
                # Quoted so the value stays a string, as env vars require.
                - name: PYTHONUNBUFFERED
                  value: "1"
              command: [sh, -c]
              args:
                # Folded scalar: the lines below are joined with single
                # spaces into one shell command line. It copies the file
                # from the secret mount to /tmp/krb5cc_0, restricts it to
                # mode 600, then launches the training script.
                - >-
                  cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0
                  && chmod 600 /tmp/krb5cc_0
                  && python -u regressor.py --backend nccl;
              volumeMounts:
                - name: eos
                  mountPath: /eos
                - name: krb-secret-vol
                  mountPath: "/secret/krb-secret-vol"
                - name: nvidia-driver
                  mountPath: /opt/nvidia-driver
              resources:
                limits:
                  nvidia.com/gpu: 1
\ No newline at end of file
......@@ -37,7 +37,7 @@ spec:
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u regressor.py --backend nccl --epochs 1;
&& python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
volumeMounts:
- name: eos
mountPath: /eos
......@@ -49,7 +49,7 @@ spec:
limits:
nvidia.com/gpu: 1
Worker:
replicas: 1
replicas: 2
restartPolicy: OnFailure
template:
metadata:
......@@ -81,7 +81,7 @@ spec:
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u regressor.py --backend nccl --epochs 1;
&& python -u wgan.py --backend nccl --epochs 10 --exp wganv0;
volumeMounts:
- name: eos
mountPath: /eos
......
......@@ -261,7 +261,7 @@ def run(args):
print ("loading data")
dataset = HDF5Dataset('/eos/user/e/eneren/run_prod75k/hdf5/training_75k.hdf5', transform=None, train_size=75000)
dataset = HDF5Dataset('/eos/user/e/eneren/scratch/40GeV40k.hdf5', transform=None, train_size=40000)
sampler = DistributedSampler(dataset, shuffle=True)
train_loader = DataLoader(dataset, batch_size=args.batch_size, sampler=sampler, num_workers=1, pin_memory=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment