Skip to content
Snippets Groups Projects
Commit 997ab411 authored by Alex Iribarren's avatar Alex Iribarren
Browse files

Merge branch 'all_types_cleanup' into 'master'

Cleanup for all cases

See merge request !3
parents acf1f9e6 405aedaa
No related branches found
No related tags found
1 merge request!3Cleanup for all cases
Pipeline #2567310 passed
testing_cleanup:
  only:
    - schedules
  script:
    # Required to use ai-tools and delete puppet managed machines
    - echo "${IMAGECI_PWD}" | kinit ${IMAGECI_USER}@CERN.CH
    # Required to use openstack client and delete non-puppet machines
    - export OS_USERNAME=${IMAGECI_USER}
    - export OS_PASSWORD=${IMAGECI_PWD}
    # Run against the Virtual machines tenant
    - python3 ./cleanup.py --tenant_name "IT Linux Support - CI VMs" --metadata_cleanup "centos_test_cleanup" --metadata_managed "puppet_managed"
    # Run against the Physical machines tenant
    - python3 ./cleanup.py --tenant_name "IT Linux Support - CI Physical" --metadata_cleanup "centos_test_cleanup" --metadata_managed "puppet_managed"
# Based on the ai-tools image so that ai-kill is available for deleting
# puppet-managed nodes (the old python:3.7 base had no ai-tools).
FROM gitlab-registry.cern.ch/ai-config-team/ai-tools
# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="Daniel Juarez Gonzalez djuarezg@cern.ch"
# python-novaclient and python-glanceclient for openstack management
RUN pip install python-novaclient python-glanceclient pyyaml
COPY cleanup.py /
# No ENTRYPOINT: cleanup.py now requires CLI arguments (--tenant_name ...)
# which the CI job supplies explicitly via "python3 ./cleanup.py ...".
# Openstack machines cleanup
Docker image for cleaning up machines generated during <https://gitlab.cern.ch/linuxsupport/koji-image-build> and <https://gitlab.cern.ch/imageci> and old CentOS test images.
Marking a machine's metadata key `centos_test_cleanup` as `false` will avoid the cleaning up. This is useful for debugging why a test failed.
## Behaviour
This image will run in a scheduled job that performs the following actions:
1. Checks for any virtual or physical machines running on a given Openstack tenant (`--tenant_name`)
1. For each node marked with `centos_test_cleanup` metadata property
1. For all retrieved servers, it deletes any failed machine, no matter how or from where it spawned
1. If marked with `puppet_managed==true` it will be processed as a puppet machine and will be deleted using `ai-kill` to remove Foreman entries
1. If marked with `puppet_managed==false` it will be processed as a non-puppet machine and will be deleted using `openstack server delete`
1. According to metadata options checks for machines that ran longer than `ACTIVE_HOURS_THRESHOLD`.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import yaml
import sys
from keystoneauth1 import loading
import argparse
import logging
from subprocess import Popen, PIPE
from keystoneauth1 import session
from keystoneauth1.identity import v3
from novaclient import client as novaclient
from glanceclient import client as glanceclient
from datetime import datetime, timedelta
# Openstack credentials, injected by the CI job environment.
OS_USERNAME = os.environ['OS_USERNAME']
OS_PASSWORD = os.environ['OS_PASSWORD']
# Can't use tenant name as it has spaces.
# NOTE(review): legacy variable only used by the old loader-based auth; the CI
# no longer exports it, so read it non-fatally instead of crashing at import.
OS_TENANT_ID = os.environ.get('OS_TENANT_ID', '')
# Check urls and API versions on https://openstack.cern.ch/project/api_access/
OS_AUTH_URL = 'https://keystone.cern.ch/v3'
NOVA_VERSION = '2.1'
GLANCE_VERSION = '2'
# Legacy metadata key, superseded by the --metadata_cleanup argument; kept so
# older code paths referencing it keep working.
METADATA_KEY = 'centos_test_cleanup'
# Maximum hours for a test machine to live, 1 week.
# VM_ACTIVE_HOURS_THRESHOLD is the legacy alias of ACTIVE_HOURS_THRESHOLD.
VM_ACTIVE_HOURS_THRESHOLD = 168
ACTIVE_HOURS_THRESHOLD = 168
# Maximum hours for a test image to exist, 2 weeks
IMAGE_AGE_HOURS_THRESHOLD = 336
# ai-tools command used for deleting managed nodes
AIKILL_BIN_PATH = '/usr/bin/ai-kill'
# Parse arguments from command line
parser = argparse.ArgumentParser()
parser.add_argument('-tn', '--tenant_name', required=True)
parser.add_argument('-mc', '--metadata_cleanup', default='centos_test_cleanup')
parser.add_argument('-mm', '--metadata_managed', default='puppet_managed')
args = parser.parse_args()
# Force logger to print through stderr and add default format to messages
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stderr)
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
def delete_aikill(h):
    """
    Delete the given host using the ai-kill tool (also removes Foreman entries).

    :param h: hostname of the host to delete
    :return: tuple (details, returncode) with ai-kill's stdout and exit code;
             returncode is -1 when ai-kill could not be executed at all
    """
    # Initialize the outputs up-front: if Popen itself raises, the except path
    # previously returned unbound locals and crashed with UnboundLocalError.
    details = b''
    returncode = -1
    try:
        # Renamed from 'args' to avoid shadowing the module-level argparse result.
        cmd = [AIKILL_BIN_PATH, h]
        aikill = Popen(cmd, stdout=PIPE, stderr=PIPE)
        (details, err) = aikill.communicate()
        returncode = aikill.returncode
        if returncode != 0:
            logging.error('%s returned non-zero status (%s)' % (cmd,
                          err.strip()))
        if len(err) > 0:
            logging.error('ai-kill failed (%s)' % err.decode('utf-8'
                          ).strip())
    except Exception as e:
        logging.error('ai-kill failed (%s)' % e)
    return (details, returncode)
# Set metadata keys for filtering servers.
# Only process machines marked with this metadata key (default: centos_test_cleanup).
METADATA_CLEANUP = args.metadata_cleanup
# Use this metadata key to differentiate managed and unmanaged machines, as their deletion is different
METADATA_MANAGED = args.metadata_managed
# Tenant (project) name passed on the command line; may contain spaces.
OS_PROJECT_NAME = args.tenant_name
# Override the env var as it is required by ai-kill
os.environ['OS_PROJECT_NAME'] = OS_PROJECT_NAME
try:
    # Openstack configuration: Keystone v3 password auth against the tenant
    # given on the command line. (The old loader/OS_TENANT_ID-based auth was
    # merge residue and has been removed.)
    auth = v3.Password(
        auth_url=OS_AUTH_URL,
        username=OS_USERNAME,
        password=OS_PASSWORD,
        project_name=OS_PROJECT_NAME,
        user_domain_name='Default',
        project_domain_name='Default',
    )
    sess = session.Session(auth=auth)

    # Test instances cleanup
    # ----------------------
    logging.info('Begin test machines cleanup...')
    nova = novaclient.Client(NOVA_VERSION, session=sess)
    # Retrieve machines with metadata filtering. We cannot use search_opts with metadata.
    servers = nova.servers.list()

    # Non-Puppet servers only
    unmanaged_cleanup_servers = [server for server in servers
                                 if METADATA_CLEANUP in server.metadata
                                 and METADATA_MANAGED in server.metadata
                                 and server.metadata[METADATA_CLEANUP] == 'true'
                                 and server.metadata[METADATA_MANAGED] == 'false']
    removal_threshold_datetime = datetime.now() - timedelta(hours=ACTIVE_HOURS_THRESHOLD)

    # Process non-puppetized
    logging.info('There are %s CentOS unmanaged test servers configured for cleanup running in the tenant, checking...'
                 % len(unmanaged_cleanup_servers))
    for server in unmanaged_cleanup_servers:
        # Delete failed machine creations
        if server.status == 'SHUTOFF' or server.status == 'ERROR':
            logging.info('%s is in %s state, deleting...' % (server.name, server.status))
            server.delete()
            # Fix: already deleted, don't fall through to the age check and
            # attempt a second delete on the same server.
            continue
        # We do not care about any other condition, just delete if it ran for longer than ACTIVE_HOURS_THRESHOLD
        if datetime.strptime(server.created, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s ran longer than %d hours, deleting...' % (server.name, ACTIVE_HOURS_THRESHOLD))
            server.delete()

    # Puppet servers only
    managed_cleanup_servers = [server for server in servers
                               if METADATA_CLEANUP in server.metadata
                               and METADATA_MANAGED in server.metadata
                               and server.metadata[METADATA_CLEANUP] == 'true'
                               and server.metadata[METADATA_MANAGED] == 'true']

    # Process puppetized
    logging.info('There are %s CentOS managed test servers configured for cleanup running in the tenant, checking...'
                 % len(managed_cleanup_servers))
    for server in managed_cleanup_servers:
        # Delete failed machine creations
        if server.status == 'SHUTOFF' or server.status == 'ERROR':
            logging.info('%s is in %s state, deleting...' % (server.name, server.status))
            (out, returncode) = delete_aikill(server.name)
            if out:
                logging.info(out.strip())
            # Fix: already handled, don't run ai-kill a second time below.
            continue
        # We do not care about any other condition, just delete if it ran for longer than ACTIVE_HOURS_THRESHOLD
        if datetime.strptime(server.created, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s ran longer than %d hours, deleting...' % (server.name, ACTIVE_HOURS_THRESHOLD))
            (out, returncode) = delete_aikill(server.name)
            if out:
                logging.info(out.strip())

    # Test images cleanup, which will probably only run on "IT Linux Support - CI VMs" tenant
    # -------------------
    logging.info('Begin test image cleanup...')
    glance = glanceclient.Client(GLANCE_VERSION, session=sess)
    removal_threshold_datetime = datetime.now() - timedelta(hours=IMAGE_AGE_HOURS_THRESHOLD)
    # Retrieve images with METADATA_CLEANUP as a property
    for image in glance.images.list(filters={METADATA_CLEANUP: 'true'}):
        # Make super sure we only delete what we want
        if image[METADATA_CLEANUP] != 'true' or \
                image.gitops != 'enable' or \
                image.visibility == 'public' or \
                image.os_edition != 'Test' or \
                image.os_distro not in ['CC', 'C'] or \
                image.os_distro_major not in ['7', '8']:
            continue
        # Delete images that existed for longer than the existence threshold
        if datetime.strptime(image.created_at, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s (%s) existed in TEST status for more than %d hours, deleting...' % (image.name, image.id, IMAGE_AGE_HOURS_THRESHOLD))
            glance.images.delete(image.id)
    logging.info('Cleanup checks finished')
except Exception as e:
    # Best-effort job: log and exit non-zero; the schedule retries next run.
    logging.error('There was an error during the cleanup, will retry on the next run')
    sys.exit(str(e))
logging.info('Finished cleaning up CentOS test servers and images')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment