Skip to content
Snippets Groups Projects
Commit a2469e2f authored by Engin Eren
Browse files

syntax error

parent 01e0cd3b
No related branches found
No related tags found
1 merge request: !43 "crit peter"
Pipeline #4190217 passed
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
name: "pytorch-dist-wganHCAL-nccl"
name: "pytorch-dist-wganhcal-nccl"
spec:
pytorchReplicaSpecs:
Master:
@@ -9,42 +9,23 @@ spec:
restartPolicy: OnFailure
template:
metadata:
labels:
mount-kerberos-secret: "true"
mount-eos: "true"
mount-nvidia-driver: "true"
annotations:
sidecar.istio.io/inject: "false"
spec:
volumes:
- name: eos
hostPath:
path: /var/eos
- name: krb-secret-vol
secret:
secretName: krb-secret
- name: nvidia-driver
hostPath:
path: /opt/nvidia-driver
type: ""
containers:
- name: pytorch
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
imagePullPolicy: Always
env:
- name: PATH
value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
- name: LD_LIBRARY_PATH
value: /opt/nvidia-driver/lib64
- name: PYTHONUNBUFFERED
value: "1"
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
volumeMounts:
- name: eos
mountPath: /eos
- name: krb-secret-vol
mountPath: "/secret/krb-secret-vol"
- name: nvidia-driver
mountPath: /opt/nvidia-driver
- python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
resources:
limits:
nvidia.com/gpu: 1
@@ -53,42 +34,23 @@ spec:
restartPolicy: OnFailure
template:
metadata:
labels:
mount-kerberos-secret: "true"
mount-eos: "true"
mount-nvidia-driver: "true"
annotations:
sidecar.istio.io/inject: "false"
spec:
volumes:
- name: eos
hostPath:
path: /var/eos
- name: krb-secret-vol
secret:
secretName: krb-secret
- name: nvidia-driver
hostPath:
path: /opt/nvidia-driver
type: ""
containers:
- name: pytorch
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
imagePullPolicy: Always
env:
- name: PATH
value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
- name: LD_LIBRARY_PATH
value: /opt/nvidia-driver/lib64
- name: PYTHONUNBUFFERED
value: "1"
command: [sh, -c]
args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0
&& python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
volumeMounts:
- name: eos
mountPath: /eos
- name: krb-secret-vol
mountPath: "/secret/krb-secret-vol"
- name: nvidia-driver
mountPath: /opt/nvidia-driver
- python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
resources:
limits:
nvidia.com/gpu: 1
@@ -217,7 +217,7 @@ def parse_args():
# postprocess args
args.device = f'cuda:{args.local_rank}' # PytorchJob/launch.py
args.device = 'cuda:{}'.format(args.local_rank) # PytorchJob/launch.py
args.batch_size = max(args.batch_size,
args.world_size * 2) # min valid batchsize
return args
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment