Skip to content
Snippets Groups Projects
Commit 71f7364d authored by Dejan Golubovic
Browse files

fixes

parent f3ad0ab6
No related branches found
No related tags found
No related merge requests found
Pipeline #4770210 passed with stage
in 4 minutes and 54 seconds
......@@ -2,10 +2,9 @@ apiVersion: v1
kind: Pod
metadata:
name: clear-gpu-memory-
namespace: default
spec:
containers:
- name: clearGPUmem
- name: clear-gpu-mem
command: ["/bin/bash", "-c"]
args: ["python3 /terminate_gpu_processes.py"]
image: registry.cern.ch/ml/monit_used_gpus:v4
......
......@@ -8,6 +8,7 @@ import random
# Load the Kubernetes client configuration from the in-cluster environment.
# NOTE(review): this presumes the script runs inside a pod - confirm.
config.load_incluster_config()
k8s_client = client.ApiClient()
# apps/v1 API client; used further down to delete the daemonset.
api = client.AppsV1Api(k8s_client)
# core/v1 API client; used to create and read the GPU-clearing pods.
core_api = client.CoreV1Api(k8s_client)
# Custom-objects API client; used further down to delete a namespaced custom object.
k8s_co_client = client.CustomObjectsApi()
# Number of workers; embedded in the generated job name.
n_workers = 16
......@@ -15,9 +16,27 @@ namespace = 'kubeflow-user-example-com'
# Random 10-character suffix (lowercase letters and digits) appended to the job name.
random_string = ''.join(random.choices(string.ascii_lowercase + string.digits, k=10))
# Job name of the form 'desy-3dgan-<n_workers>-workers-<random suffix>'.
job_name = 'desy-3dgan-' + str(n_workers) + '-workers-' + random_string
# Resource name string for GPUs.
gpu_type = 'nvidia.com/gpu'
# Paths of the YAML manifests read by this script.
pod_file = '/home/jovyan/a100-analysis/pod_clear_gpu.yaml'
daemonset_file = '/home/jovyan/a100-analysis/daemonset.yaml'
pytorchjob_file = '/home/jovyan/a100-analysis/pytorchjob.yaml'
# Load the manifest of the pod that clears GPU memory on a single node.
with open(pod_file) as f:
    pod_dict = yaml.safe_load(f)

# The manifest holds a pod-name prefix (first 17 characters) and a node-name
# prefix (first 42 characters); the node index is appended to each. Capture
# the prefixes once so the loop never re-slices a value it already modified.
pod_name_prefix = pod_dict['metadata']['name'][:17]
node_name_prefix = pod_dict['spec']['nodeName'][:42]

# Run one clearing pod per node, sequentially, waiting for each to finish.
for i in range(9):
    pod_name = pod_name_prefix + str(i)
    node_name = node_name_prefix + str(i)
    pod_dict['metadata']['name'] = pod_name
    pod_dict['spec']['nodeName'] = node_name

    print('Clearing GPU memory of the node ' + node_name)
    core_api.create_namespaced_pod(body=pod_dict, namespace=namespace)

    # Poll every 10 seconds until the pod reaches a terminal phase.
    # 'Completed' is not a pod phase: a pod whose containers exited
    # successfully reports 'Succeeded', and one that exited with an error
    # reports 'Failed'. Both must end the wait, otherwise a failing pod
    # would block the script forever.
    # NOTE(review): this presumes the manifest's restartPolicy lets the pod
    # terminate (Never/OnFailure) - confirm against pod_clear_gpu.yaml.
    while True:
        time.sleep(10)
        pod = core_api.read_namespaced_pod(name=pod_name, namespace=namespace)
        # The status may not be populated yet right after creation.
        phase = pod.status.phase if pod.status else None
        if phase in ('Succeeded', 'Failed'):
            break

    if phase == 'Failed':
        # Best effort: report the failure and carry on with the next node.
        print('Failed to clear GPU memory of the node ' + node_name)
    else:
        print('Finished clearing GPU memory of the node ' + node_name)
"""
with open(daemonset_file) as f:
daemonset_dict = yaml.safe_load(f)
daemonset_name = daemonset_dict['metadata']['name']
......@@ -78,4 +97,5 @@ k8s_co_client.delete_namespaced_custom_object(
print('Deleting daemonset ' + daemonset_name)
resp = api.delete_namespaced_daemon_set(daemonset_dict['metadata']['name'], namespace)
print(resp)
\ No newline at end of file
print(resp)
"""
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment