Skip to content
Snippets Groups Projects
Commit 997ab411 authored by Alex Iribarren's avatar Alex Iribarren
Browse files

Merge branch 'all_types_cleanup' into 'master'

Cleanup for all cases

See merge request !3
parents acf1f9e6 405aedaa
No related branches found
No related tags found
1 merge request!3Cleanup for all cases
Pipeline #2567310 passed
testing_cleanup:
  only:
    - schedules
  script:
    # Required to use ai-tools and delete puppet managed machines
    - echo "${IMAGECI_PWD}" | kinit ${IMAGECI_USER}@CERN.CH
    # Required to use openstack client and delete non-puppet machines
    - export OS_USERNAME=${IMAGECI_USER}
    - export OS_PASSWORD=${IMAGECI_PWD}
    # Run against the Virtual machines tenant
    - python3 ./cleanup.py --tenant_name "IT Linux Support - CI VMs" --metadata_cleanup "centos_test_cleanup" --metadata_managed "puppet_managed"
    # Run against the Physical machines tenant
    - python3 ./cleanup.py --tenant_name "IT Linux Support - CI Physical" --metadata_cleanup "centos_test_cleanup" --metadata_managed "puppet_managed"
# Based on the ai-tools image so that ai-kill is available for deleting
# puppet-managed nodes (the old python:3.7 base had no ai-tools).
FROM gitlab-registry.cern.ch/ai-config-team/ai-tools
# LABEL replaces the deprecated MAINTAINER instruction.
LABEL maintainer="Daniel Juarez Gonzalez djuarezg@cern.ch"
# python-novaclient and python-glanceclient for openstack management
RUN pip install python-novaclient python-glanceclient pyyaml
COPY cleanup.py /
# No ENTRYPOINT: cleanup.py now requires CLI arguments (--tenant_name ...)
# which the CI job supplies explicitly via "python3 ./cleanup.py ...".
# Openstack machines cleanup
Docker image for cleaning up machines generated during <https://gitlab.cern.ch/linuxsupport/koji-image-build> and <https://gitlab.cern.ch/imageci> and old CentOS test images.
Marking a machine's metadata key `centos_test_cleanup` as `false` will avoid the cleaning up. This is useful for debugging why a test failed.
## Behaviour
This image will run in a scheduled job that performs the following actions:
1. Checks for any virtual or physical machines running on a given Openstack tenant (`--tenant_name`)
1. For each node marked with `centos_test_cleanup` metadata property
1. For all retrieved servers, it deletes any failed machine, no matter how or from where it spawned
1. If marked with `puppet_managed==true` it will be processed as a puppet machine and will be deleted using `ai-kill` to remove Foreman entries
1. If marked with `puppet_managed==false` it will be processed as a non-puppet machine and will be deleted using `openstack server delete`
1. According to metadata options checks for machines that ran longer than `ACTIVE_HOURS_THRESHOLD`.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import yaml
import sys
from keystoneauth1 import loading
import argparse
import logging
from subprocess import Popen, PIPE
from keystoneauth1 import session
from keystoneauth1.identity import v3
from novaclient import client as novaclient
from glanceclient import client as glanceclient
from datetime import datetime, timedelta
# Openstack credentials, injected by the CI job environment.
OS_USERNAME = os.environ['OS_USERNAME']
OS_PASSWORD = os.environ['OS_PASSWORD']
# Can't use tenant name as it has spaces.
# NOTE(review): legacy variable only used by the old loader-based auth; the CI
# no longer exports it, so read it non-fatally instead of crashing at import.
OS_TENANT_ID = os.environ.get('OS_TENANT_ID', '')
# Check urls and API versions on https://openstack.cern.ch/project/api_access/
OS_AUTH_URL = 'https://keystone.cern.ch/v3'
NOVA_VERSION = '2.1'
GLANCE_VERSION = '2'
# Legacy metadata key, superseded by the --metadata_cleanup argument; kept so
# older code paths referencing it keep working.
METADATA_KEY = 'centos_test_cleanup'
# Maximum hours for a test machine to live, 1 week.
# VM_ACTIVE_HOURS_THRESHOLD is the legacy alias of ACTIVE_HOURS_THRESHOLD.
VM_ACTIVE_HOURS_THRESHOLD = 168
ACTIVE_HOURS_THRESHOLD = 168
# Maximum hours for a test image to exist, 2 weeks
IMAGE_AGE_HOURS_THRESHOLD = 336
# ai-tools command used for deleting managed nodes
AIKILL_BIN_PATH = '/usr/bin/ai-kill'
# Parse arguments from command line
parser = argparse.ArgumentParser()
parser.add_argument('-tn', '--tenant_name', required=True)
parser.add_argument('-mc', '--metadata_cleanup', default='centos_test_cleanup')
parser.add_argument('-mm', '--metadata_managed', default='puppet_managed')
args = parser.parse_args()
# Force logger to print through stderr and add default format to messages
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler(sys.stderr)
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
def delete_aikill(h):
    """
    Delete the given host using the ai-kill tool (also removes Foreman entries).

    :param h: hostname of the host to delete
    :return: tuple (details, returncode) with ai-kill's stdout and exit code;
             returncode is -1 when ai-kill could not be executed at all
    """
    # Initialize the outputs up-front: if Popen itself raises, the except path
    # previously returned unbound locals and crashed with UnboundLocalError.
    details = b''
    returncode = -1
    try:
        # Renamed from 'args' to avoid shadowing the module-level argparse result.
        cmd = [AIKILL_BIN_PATH, h]
        aikill = Popen(cmd, stdout=PIPE, stderr=PIPE)
        (details, err) = aikill.communicate()
        returncode = aikill.returncode
        if returncode != 0:
            logging.error('%s returned non-zero status (%s)' % (cmd,
                          err.strip()))
        if len(err) > 0:
            logging.error('ai-kill failed (%s)' % err.decode('utf-8'
                          ).strip())
    except Exception as e:
        logging.error('ai-kill failed (%s)' % e)
    return (details, returncode)
# Set metadata keys for filtering servers.
# Only process machines marked with this metadata key (default: centos_test_cleanup).
METADATA_CLEANUP = args.metadata_cleanup
# Use this metadata key to differentiate managed and unmanaged machines, as their deletion is different
METADATA_MANAGED = args.metadata_managed
# Tenant (project) name passed on the command line; may contain spaces.
OS_PROJECT_NAME = args.tenant_name
# Override the env var as it is required by ai-kill
os.environ['OS_PROJECT_NAME'] = OS_PROJECT_NAME
try:
    # Openstack configuration: Keystone v3 password auth against the tenant
    # given on the command line. (The old loader/OS_TENANT_ID-based auth was
    # merge residue and has been removed.)
    auth = v3.Password(
        auth_url=OS_AUTH_URL,
        username=OS_USERNAME,
        password=OS_PASSWORD,
        project_name=OS_PROJECT_NAME,
        user_domain_name='Default',
        project_domain_name='Default',
    )
    sess = session.Session(auth=auth)

    # Test instances cleanup
    # ----------------------
    logging.info('Begin test machines cleanup...')
    nova = novaclient.Client(NOVA_VERSION, session=sess)
    # Retrieve machines with metadata filtering. We cannot use search_opts with metadata.
    servers = nova.servers.list()

    # Non-Puppet servers only
    unmanaged_cleanup_servers = [server for server in servers
                                 if METADATA_CLEANUP in server.metadata
                                 and METADATA_MANAGED in server.metadata
                                 and server.metadata[METADATA_CLEANUP] == 'true'
                                 and server.metadata[METADATA_MANAGED] == 'false']
    removal_threshold_datetime = datetime.now() - timedelta(hours=ACTIVE_HOURS_THRESHOLD)

    # Process non-puppetized
    logging.info('There are %s CentOS unmanaged test servers configured for cleanup running in the tenant, checking...'
                 % len(unmanaged_cleanup_servers))
    for server in unmanaged_cleanup_servers:
        # Delete failed machine creations
        if server.status == 'SHUTOFF' or server.status == 'ERROR':
            logging.info('%s is in %s state, deleting...' % (server.name, server.status))
            server.delete()
            # Fix: already deleted, don't fall through to the age check and
            # attempt a second delete on the same server.
            continue
        # We do not care about any other condition, just delete if it ran for longer than ACTIVE_HOURS_THRESHOLD
        if datetime.strptime(server.created, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s ran longer than %d hours, deleting...' % (server.name, ACTIVE_HOURS_THRESHOLD))
            server.delete()

    # Puppet servers only
    managed_cleanup_servers = [server for server in servers
                               if METADATA_CLEANUP in server.metadata
                               and METADATA_MANAGED in server.metadata
                               and server.metadata[METADATA_CLEANUP] == 'true'
                               and server.metadata[METADATA_MANAGED] == 'true']

    # Process puppetized
    logging.info('There are %s CentOS managed test servers configured for cleanup running in the tenant, checking...'
                 % len(managed_cleanup_servers))
    for server in managed_cleanup_servers:
        # Delete failed machine creations
        if server.status == 'SHUTOFF' or server.status == 'ERROR':
            logging.info('%s is in %s state, deleting...' % (server.name, server.status))
            (out, returncode) = delete_aikill(server.name)
            if out:
                logging.info(out.strip())
            # Fix: already handled, don't run ai-kill a second time below.
            continue
        # We do not care about any other condition, just delete if it ran for longer than ACTIVE_HOURS_THRESHOLD
        if datetime.strptime(server.created, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s ran longer than %d hours, deleting...' % (server.name, ACTIVE_HOURS_THRESHOLD))
            (out, returncode) = delete_aikill(server.name)
            if out:
                logging.info(out.strip())

    # Test images cleanup, which will probably only run on "IT Linux Support - CI VMs" tenant
    # -------------------
    logging.info('Begin test image cleanup...')
    glance = glanceclient.Client(GLANCE_VERSION, session=sess)
    removal_threshold_datetime = datetime.now() - timedelta(hours=IMAGE_AGE_HOURS_THRESHOLD)
    # Retrieve images with METADATA_CLEANUP as a property
    for image in glance.images.list(filters={METADATA_CLEANUP: 'true'}):
        # Make super sure we only delete what we want
        if image[METADATA_CLEANUP] != 'true' or \
                image.gitops != 'enable' or \
                image.visibility == 'public' or \
                image.os_edition != 'Test' or \
                image.os_distro not in ['CC', 'C'] or \
                image.os_distro_major not in ['7', '8']:
            continue
        # Delete images that existed for longer than the existence threshold
        if datetime.strptime(image.created_at, '%Y-%m-%dT%H:%M:%SZ') < removal_threshold_datetime:
            logging.info('%s (%s) existed in TEST status for more than %d hours, deleting...' % (image.name, image.id, IMAGE_AGE_HOURS_THRESHOLD))
            glance.images.delete(image.id)
    logging.info('Cleanup checks finished')
except Exception as e:
    # Best-effort job: log and exit non-zero; the schedule retries next run.
    logging.error('There was an error during the cleanup, will retry on the next run')
    sys.exit(str(e))
logging.info('Finished cleaning up CentOS test servers and images')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment