Skip to content
Snippets Groups Projects
Commit a2469e2f authored by Engin Eren
Browse files

syntax error

parent 01e0cd3b
No related branches found
No related tags found
1 merge request: !43 "crit peter"
Pipeline #4190217 passed
apiVersion: "kubeflow.org/v1" apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob" kind: "PyTorchJob"
metadata: metadata:
name: "pytorch-dist-wganHCAL-nccl" name: "pytorch-dist-wganhcal-nccl"
spec: spec:
pytorchReplicaSpecs: pytorchReplicaSpecs:
Master: Master:
...@@ -9,42 +9,23 @@ spec: ...@@ -9,42 +9,23 @@ spec:
restartPolicy: OnFailure restartPolicy: OnFailure
template: template:
metadata: metadata:
labels:
mount-kerberos-secret: "true"
mount-eos: "true"
mount-nvidia-driver: "true"
annotations: annotations:
sidecar.istio.io/inject: "false" sidecar.istio.io/inject: "false"
spec: spec:
volumes:
- name: eos
hostPath:
path: /var/eos
- name: krb-secret-vol
secret:
secretName: krb-secret
- name: nvidia-driver
hostPath:
path: /opt/nvidia-driver
type: ""
containers: containers:
- name: pytorch - name: pytorch
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
imagePullPolicy: Always imagePullPolicy: Always
env: env:
- name: PATH
value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
- name: LD_LIBRARY_PATH
value: /opt/nvidia-driver/lib64
- name: PYTHONUNBUFFERED - name: PYTHONUNBUFFERED
value: "1" value: "1"
command: [sh, -c] command: [sh, -c]
args: args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
&& python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
volumeMounts:
- name: eos
mountPath: /eos
- name: krb-secret-vol
mountPath: "/secret/krb-secret-vol"
- name: nvidia-driver
mountPath: /opt/nvidia-driver
resources: resources:
limits: limits:
nvidia.com/gpu: 1 nvidia.com/gpu: 1
...@@ -53,42 +34,23 @@ spec: ...@@ -53,42 +34,23 @@ spec:
restartPolicy: OnFailure restartPolicy: OnFailure
template: template:
metadata: metadata:
labels:
mount-kerberos-secret: "true"
mount-eos: "true"
mount-nvidia-driver: "true"
annotations: annotations:
sidecar.istio.io/inject: "false" sidecar.istio.io/inject: "false"
spec: spec:
volumes:
- name: eos
hostPath:
path: /var/eos
- name: krb-secret-vol
secret:
secretName: krb-secret
- name: nvidia-driver
hostPath:
path: /opt/nvidia-driver
type: ""
containers: containers:
- name: pytorch - name: pytorch
image: gitlab-registry.cern.ch/eneren/pytorchjob:ddp image: gitlab-registry.cern.ch/eneren/pytorchjob:ddpv3
imagePullPolicy: Always imagePullPolicy: Always
env: env:
- name: PATH
value: /opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/nvidia-driver/bin
- name: LD_LIBRARY_PATH
value: /opt/nvidia-driver/lib64
- name: PYTHONUNBUFFERED - name: PYTHONUNBUFFERED
value: "1" value: "1"
command: [sh, -c] command: [sh, -c]
args: args:
- cp /secret/krb-secret-vol/krb5cc_1000 /tmp/krb5cc_0 && chmod 600 /tmp/krb5cc_0 - python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.00001 --lrGen 0.00001;
&& python -u wganHCAL.py --backend nccl --epochs 50 --exp wganHCALv1 --lrCrit 0.0001 --lrGen 0.00001;
volumeMounts:
- name: eos
mountPath: /eos
- name: krb-secret-vol
mountPath: "/secret/krb-secret-vol"
- name: nvidia-driver
mountPath: /opt/nvidia-driver
resources: resources:
limits: limits:
nvidia.com/gpu: 1 nvidia.com/gpu: 1
...@@ -217,7 +217,7 @@ def parse_args(): ...@@ -217,7 +217,7 @@ def parse_args():
# postprocess args # postprocess args
args.device = f'cuda:{args.local_rank}' # PytorchJob/launch.py args.device = 'cuda:{}'.format(args.local_rank) # PytorchJob/launch.py
args.batch_size = max(args.batch_size, args.batch_size = max(args.batch_size,
args.world_size * 2) # min valid batchsize args.world_size * 2) # min valid batchsize
return args return args
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment