From 010838710a6feb768e775b7fd11374dd28391ca1 Mon Sep 17 00:00:00 2001
From: Maarten Litmaath <Maarten.Litmaath@cern.ch>
Date: Sun, 30 Jul 2023 23:20:45 +0200
Subject: [PATCH] Adjustments for JAliEn and EL9, various cleanups etc.

---
 docs/site/vobox_htc_arc.md | 393 +++++++++++++++++--------------------
 1 file changed, 176 insertions(+), 217 deletions(-)

diff --git a/docs/site/vobox_htc_arc.md b/docs/site/vobox_htc_arc.md
index a51cc27..25abe11 100644
--- a/docs/site/vobox_htc_arc.md
+++ b/docs/site/vobox_htc_arc.md
@@ -1,36 +1,26 @@
-# HTCondor/ARC Installation on VOBox
+# HTCondor/ARC Installation on VObox
 
-This documentation describes how to configure VOBox to enable it submit ALICE jobs to [HTCondor CEs](#htcondor) or [ARC](#arc).
-Reference the appropriate section as needed.
+This documentation describes how to configure a VObox to enable it to submit ALICE jobs to [HTCondor CEs](#htcondor) or [ARC](#arc).
+Refer to the appropriate section as needed.
 
-## HTCondor
-
-The VOBox will run its own HTCondor services that are __independent__ of the HTCondor services for your CE and batch system.
-The following instructions assume you are using  __CentOS/EL 7.5+__.
-
-### Install HTCondor
+The VObox will typically have been set up first as a __WLCG VObox__ as documented here: <br/><br/>
 
-1. Go to the repositories folder:
+   https://twiki.cern.ch/twiki/bin/view/LCG/WLCGvoboxDeployment
 
-    ```console
-    ~# cd /etc/yum.repos.d/
-    ```
+## HTCondor
 
-2. Depending on your OS version, download the relevant repository:
+The VObox will run its own HTCondor services that are __independent__ of the HTCondor services for your CE and batch system.
+The following instructions assume you are using __CentOS/EL 7.5+__.  See below for installations compatible with __EL 9__.
 
-    | | |
-    |-|-|
-    |__CentOS/EL7__| ```~# wget http://research.cs.wisc.edu/htcondor/yum/repo.d/htcondor-stable-rhel7.repo``` |
+### Install HTCondor on CentOS 7
 
-3. Import RPM key for the repository:
+1. Install the EGI UMD 4 repository rpm:
 
     ```console
-    ~# cd /etc/pki/rpm-gpg/
-    ~# wget http://research.cs.wisc.edu/htcondor/yum/RPM-GPG-KEY-HTCondor
-    ~# rpm --import RPM-GPG-KEY-HTCondor
-    ``` 
+    ~# yum install http://repository.egi.eu/sw/production/umd/4/centos7/x86_64/updates/umd-release-4.1.3-1.el7.centos.noarch.rpm
+    ```
 
-4. Install HTCondor 8.5.5 or later:
+2. Install HTCondor __9.0.16__ or a later __9.0.x__ version (not yet 10.x):
 
     ```console
     ~# cd 
@@ -38,9 +28,9 @@ The following instructions assume you are using  __CentOS/EL 7.5+__.
     ~# yum install condor
     ```
 
-### AliEn Configuration
+### JAliEn Configuration
 
-This configuration is needed for HTCondor that _may_ run a _JobRouter_ (not needed for standard deployments).
+This configuration is needed for HTCondor that _used to_ run a _JobRouter_ (not needed anymore).
 
 1. Go to the HTCondor configuration folder:
 
@@ -59,92 +49,45 @@ This configuration is needed for HTCondor that _may_ run a _JobRouter_ (not need
     ??? info "config.d/01_alice_jobrouter.config"
 
         ```bash
-        DAEMON_LIST = MASTER, SCHEDD, COLLECTOR    # , JOB_ROUTER    # (router not needed for standard deployments)
+        DAEMON_LIST = MASTER, SCHEDD, COLLECTOR
 
         # the next line is needed since recent HTCondor versions
 
         COLLECTOR_HOST = $(FULL_HOSTNAME)
 
-        CERTIFICATE_MAPFILE = /etc/condor/certificate_mapfile
         GSI_DAEMON_DIRECTORY = /etc/grid-security
         GSI_DAEMON_CERT = $(GSI_DAEMON_DIRECTORY)/hostcert.pem
         GSI_DAEMON_KEY  = $(GSI_DAEMON_DIRECTORY)/hostkey.pem
         GSI_DAEMON_TRUSTED_CA_DIR = $(GSI_DAEMON_DIRECTORY)/certificates
 
-        SEC_CLIENT_AUTHENTICATION_METHODS = FS, GSI
+        SEC_CLIENT_AUTHENTICATION_METHODS = SCITOKENS, FS, GSI
         SEC_DEFAULT_AUTHENTICATION_METHODS = FS, GSI
         SEC_DAEMON_AUTHENTICATION_METHODS = FS, GSI
 
+	AUTH_SSL_CLIENT_CADIR = /etc/grid-security/certificates
+
         COLLECTOR.ALLOW_ADVERTISE_MASTER = condor@fsauth/$(FULL_HOSTNAME)
         COLLECTOR.ALLOW_ADVERTISE_SCHEDD = $(FULL_HOSTNAME)
 
-        GRIDMAP = /etc/grid-security/grid-mapfile
-
         ALL_DEBUG = D_FULLDEBUG D_COMMAND
         SCHEDD_DEBUG = D_FULLDEBUG
-
-        # NOTE: the max jobs parameters below will need to be increased
-
-        # MaxJobs: typically ~10% more than the number of 1-core slots in the batch system
-
-        JOB_ROUTER_DEFAULTS = \
-         [ requirements=target.WantJobRouter is True; \
-           EditJobInPlace = True; \
-           MaxIdleJobs = 50; \
-           MaxJobs = 200; \
-           delete_WantJobRouter = true; \
-           delete_JobLeaseDuration = True; \
-           set_JobUniverse = 9; \
-           set_remote_jobuniverse = 5; \
-         ]
-
-        # NOTE: it typically is better _not_ to use such static entries, but rather the command below
-
-        #JOB_ROUTER_ENTRIES = \
-        #   [ GridResource = "condor your-CE.your-domain your-CE.your-domain:9619"; \
-        #     eval_set_GridResource = "condor your-CE.your-domain your-CE.your-domain:9619"; \
-        #     name = "My cluster"; \
-        #   ]
-
-        # configure a script to get the proper entries from the ALICE LDAP server (provided below)
-
-        JOB_ROUTER_ENTRIES_CMD = /var/lib/condor/get_job_routes.sh
-
-        JOB_ROUTER_ENTRIES_REFRESH = 300
-
-        JOB_ROUTER_POLLING_PERIOD = 10
-
-        JOB_ROUTER_ROUND_ROBIN_SELECTION = True
-
-        JOB_ROUTER_SCHEDD2_NAME = $(FULL_HOSTNAME)
-
-        JOB_ROUTER_SCHEDD2_POOL = $(FULL_HOSTNAME):9618
-        JOB_ROUTER_DEBUG = D_FULLDEBUG
-
         GRIDMANAGER_DEBUG = D_FULLDEBUG
-        JOB_ROUTER_SCHEDD2_SPOOL=/var/lib/condor/spool
 
         FRIENDLY_DAEMONS = condor@fsauth/$(FULL_HOSTNAME), root@fsauth/$(FULL_HOSTNAME), $(FULL_HOSTNAME)
-
         ALLOW_DAEMON = $(FRIENDLY_DAEMONS)
 
         SCHEDD.ALLOW_WRITE = $(FRIENDLY_DAEMONS), *@cern.ch/$(FULL_HOSTNAME)
-        ALLOW_DAEMON = $(ALLOW_DAEMON) $(FRIENDLY_DAEMONS)
 
-        # ========== FULL DEBUGS =============
-
-        GRIDMANAGER_DEBUG = D_FULLDEBUG
-
-        # more stuff from the CERN VOBOXes
+        # more stuff from the CERN VOboxes
 
         CONDOR_FSYNC = False
-        GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = 1000   # to be increased (see MaxJobs above)
+        GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = 10000
 
         GRIDMANAGER_JOB_PROBE_INTERVAL = 600
 
         GRIDMANAGER_MAX_PENDING_REQUESTS = 500
         GRIDMANAGER_GAHP_CALL_TIMEOUT = 3600
-        GRIDMANAGER_SELECTION_EXPR = (ClusterId % 2)          # 2 should be enough already
+        GRIDMANAGER_SELECTION_EXPR = (ClusterId % 2)
         GRIDMANAGER_GAHP_RESPONSE_TIMEOUT = 300
         GRIDMANAGER_DEBUG =
         ALLOW_DAEMON = $(ALLOW_DAEMON), $(FULL_HOSTNAME), $(IP_ADDRESS), unauthenticated@unmapped
@@ -162,142 +105,170 @@ This configuration is needed for HTCondor that _may_ run a _JobRouter_ (not need
     ~# service condor restart
     ~# chkconfig condor on
     ```
-    
+
 5. Check HTCondor is running and produces the following initial output:
 
     ```console
     ~# pstree | grep condor
 
      |-condor_master-+-condor_collecto
-     |               |-condor_job_rout
      |               |-condor_procd
      |               |-condor_schedd
      |               `-condor_shared_p
     ```
 
-### LDAP and VOBox Configuration
+### Install HTCondor on EL 9
 
-In the __Environment__ section add/adjust the values as needed:
+1. Install the HTCondor 10.x repository rpm:
 
-| Definition | Description |
-|:-----------|:------------|
-| ```USE_JOB_ROUTER=( 1 | 0)``` | Whether is is necessary to use job router service |
-| ```GRID_RESOURCE=condor your-CE.your-domain your-CE.your-domain:9619``` | HTCondor resource for explicitly defined for<br> submission to vanilla universe, otherwise<br> system default resource will be selected |
-| ```ROUTES_LIST=[ your-ce01.your-domain:9619 ] [ your-ce02.your-domain:9619 ]``` | Routes list example |
-| ```USE_EXTERNAL_CLOUD=(1 | 0)``` | Whether to use external cloud |
-| ```SUBMIT_ARGS=-append "+TestClassAd=1"```<br>```SUBMIT_ARGS=<String>``` | Specify extra options for condor_submit<br> command. Example: add extra ClassAds<br> to the job description |   
+    ```console
+    ~# yum install https://research.cs.wisc.edu/htcondor/repo/10.x/htcondor-release-current.el9.noarch.rpm
+    ```
 
-In ```~/.alien/Environment``` on the VOBox:
+2. Install HTCondor 10.x:
 
-```console
-d=$HOME/htcondor
-mkdir -p $d
+    ```console
+    ~# cd 
+    ~# yum update
+    ~# yum install condor
+    ```
 
-export HTCONDOR_LOG_PATH=$d
-```
+3. Add the following configuration contents:
 
-!!! warning ""
-    Mind the firewall settings on the VOBox. See [Network setup](../vobox/#network) for more details.
+    ??? info "/etc/condor/config.d/00-minicondor.vobox"
 
-### Miscellaneous Scripts
+        ```bash
+	# HTCONDOR CONFIGURATION TO CREATE A POOL WITH ONE MACHINE
+	# --> modified to allow it to be used ONLY for submitting to REMOTE CEs!
+	#
+	# This file was created upon initial installation of HTCondor.
+	# It contains configuration settings to set up a secure HTCondor
+	# installation consisting of **just one single machine**.
+	# YOU WILL WANT TO REMOVE THIS FILE IF/WHEN YOU DECIDE TO ADD ADDITIONAL
+	# MACHINES TO YOUR HTCONDOR INSTALLATION!  Most of these settings do
+	# not make sense if you have a multi-server pool.
+	#
+	# See the Quick Start Installation guide at:
+	#     https://htcondor.org/manual/quickstart.html
+	#
+
+	# ---  NODE ROLES  ---
+
+	# Every pool needs one Central Manager, some number of Submit nodes and
+	# as many Execute nodes as you can find. Consult the manual to learn
+	# about additional roles.
+
+	use ROLE: CentralManager
+	use ROLE: Submit
+	# --> next line commented out to prevent jobs from running on this host:
+	# use ROLE: Execute
+
+	# --- NETWORK SETTINGS ---
+
+	# Configure HTCondor services to listen to port 9618 on the IPv4
+	# loopback interface.
+	# --> next line commented out to allow job submissions to remote CEs:
+	# NETWORK_INTERFACE = 127.0.0.1
+	BIND_ALL_INTERFACES = False
+	CONDOR_HOST = 127.0.0.1
+	# --> next line added to avoid condor_status errors:
+	CONDOR_HOST = $(HOSTNAME)
+
+	# --- SECURITY SETTINGS ---
+
+	# Verify authenticity of HTCondor services by checking if they are
+	# running with an effective user id of user "condor".
+	SEC_DEFAULT_AUTHENTICATION = REQUIRED
+	SEC_DEFAULT_INTEGRITY = REQUIRED
+	ALLOW_DAEMON = condor@$(UID_DOMAIN)
+	ALLOW_NEGOTIATOR = condor@$(UID_DOMAIN)
+
+	# Configure so only user root or user condor can run condor_on,
+	# condor_off, condor_restart, and condor_userprio commands to manage
+	# HTCondor on this machine.
+	# If you wish any user to do so, comment out the line below.
+	ALLOW_ADMINISTRATOR = root@$(UID_DOMAIN) condor@$(UID_DOMAIN)
+
+	# Allow anyone (on the loopback interface) to submit jobs.
+	ALLOW_WRITE = *
+	# Allow anyone (on the loopback interface) to run condor_q or condor_status.
+	ALLOW_READ = *
+
+	# --- PERFORMANCE TUNING SETTINGS ---
+
+	# Since there is just one server in this pool, we can tune various
+	# polling intervals to be much more responsive than the system defaults
+	# (which are tuned for pools with thousands of servers).  This will
+	# enable jobs to be scheduled faster, and job monitoring to happen more
+	# frequently.
+	SCHEDD_INTERVAL = 5
+	NEGOTIATOR_INTERVAL = 2
+	NEGOTIATOR_CYCLE_DELAY = 5
+	STARTER_UPDATE_INTERVAL = 5
+	SHADOW_QUEUE_UPDATE_INTERVAL = 10
+	UPDATE_INTERVAL = 5
+	RUNBENCHMARKS = 0
+
+	# --- COMMON CHANGES ---
+
+	# Uncomment the lines below and do 'sudo condor_reconfig' if you wish
+	# condor_q to show jobs from all users with one line per job by default.
+	#CONDOR_Q_DASH_BATCH_IS_DEFAULT = False
+	#CONDOR_Q_ONLY_MY_JOBS = False
+        ```
 
-The following script helps fill the routes list from LDAP:
+    ??? info "/etc/condor/config.d/99-alice-vobox.conf"
 
-??? info "Routes script"
+        ```bash
+	# non-standard settings for an ALICE VObox
 
-    ```bash
-    #!/bin/bash
-    # print HTCondor job routes obtained from the ALICE LDAP server
-    #
-    # example settings in /etc/condor/config.d:
-    #
-    # JOB_ROUTER_ENTRIES_CMD = /var/lib/condor/get_job_routes.sh
-    # JOB_ROUTER_ENTRIES_REFRESH = 600
-    #
-    # version 1.3 (2017/04/04)
-    # author: Maarten Litmaath
-
-    usage()
-    {
-        echo "Usage: $0 [-n] [ FQHN ]" >&2
-        exit 1
-    }
-
-    LOG=/tmp/job-routes-$(date '+%y%m%d').log
-    LDAP_ADDR=alice-ldap.cern.ch:8389
-    h=$(hostname -f)
-
-    case $1 in
-    -n)
-        LOG=
-        shift
-    esac
-
-    case $1 in
-    -*)
-        usage
-        ;;
-    ?*.?*.?*)
-        h=$1
-        ;;
-    ?*)
-        usage
-    esac
-
-    f="(&(objectClass=AlienCE)(host=$h))"
-
-    #
-    # wrapped example output lines returned by the ldapsearch:
-    #
-    # environment: ROUTES_LIST=\
-    # [ "condor ce503.cern.ch ce503.cern.ch:9619" ] \
-    # [ "condor ce504.cern.ch ce504.cern.ch:9619"; optional extra stuff ] \
-    # [ "condor ce505.cern.ch ce505.cern.ch:9619" ] \
-    # [ "condor ce506.cern.ch ce506.cern.ch:9619" ]
-    #
-    # or a simpler format (the port currently is needed for the SAM VO feed):
-    #
-    # environment: ROUTES_LIST=\
-    # [ ce503.cern.ch:9619 ] \
-    # [ ce504.cern.ch:9619; optional extra stuff ] \
-    # [ ce505.cern.ch:9619 ] \
-    # [ ce506.cern.ch:9619 ]
-    #
-    # the next line may even be absent:
-    #
-    # environment: USE_EXTERNAL_CLOUD=0
-    #
-
-    if [ "x$LOG" = x ]
-    then
-        LOG=/dev/null
-    else
-        echo == $(date) >> $LOG
-        exec 2>> $LOG
-    fi
-
-    ldapsearch -LLL -x -h $LDAP_ADDR -b o=alice,dc=cern,dc=ch "$f" environment |
-        perl -p00e 's/\r?\n //g' | perl -ne '
-            if (s/^environment: ROUTES_LIST *= *//i) {
-                s/\[ *([^]" ]+)(:\d+) *([];])/[ "condor $1 $1$2" $3/g;
-                s/\[ *([^]" ]+) *([];])/[ "condor $1 $1:9619" $2/g;
-                s/\[ *[^"]*"/[ "/g;
-                s/\[ *("[^"]+")/[ GridResource = $1; eval_set_GridResource = $1/g;
-                $routes = $_;
-                next;
-            }
-            if (s/^environment: USE_EXTERNAL_CLOUD *= *//i) {
-                $extern = "; set_WantExternalCloud = True" if /1/;
-                next;
-            }
-            END {
-                $extern .= " ]";
-                $routes =~ s/;? *]/$extern/eg;
-                print $routes;
-            }
-        ' | tee -a $LOG
+	CONDOR_FSYNC = False
+
+	DELEGATE_JOB_GSI_CREDENTIALS_LIFETIME = 0
+
+	GRIDMANAGER_DEBUG =
+	GRIDMANAGER_GAHP_CALL_TIMEOUT = 3600
+	GRIDMANAGER_GAHP_RESPONSE_TIMEOUT = 300
+	GRIDMANAGER_JOB_PROBE_INTERVAL = 600
+	GRIDMANAGER_MAX_PENDING_REQUESTS = 500
+	GRIDMANAGER_MAX_SUBMITTED_JOBS_PER_RESOURCE = 10000
+	GRIDMANAGER_SELECTION_EXPR = (ClusterId % 2)
+
+	GSI_SKIP_HOST_CHECK = true
+        ```
+
+4. Restart HTCondor now and automatically at boot time:
+
+    ```console
+    ~# systemctl enable --now condor
     ```
 
+5. Check HTCondor is running and produces the following initial output:
+
+    ```console
+    ~# pstree | grep condor
+        |-condor_master-+-condor_collecto
+        |               |-condor_negotiat
+        |               |-condor_procd
+        |               |-condor_schedd
+        |               `-condor_shared_p
+    ```
+
+### LDAP and VObox Configuration
+
+In the __Environment__ section, add or adjust the following values as needed:
+
+| Definition | Description |
+|:-----------|:------------|
+| ```CE_LCGCE=your-ce01.your-domain:9619, your-ce02.your-domain:9619, ...``` | CE list example |
+| ```USE_TOKEN={0 | 1 | 2}``` | use X509 proxy, WLCG token, or both |
+| ```SUBMIT_ARGS=-append "+TestClassAd=1"```<br>```SUBMIT_ARGS=<String>``` | Specify extra options for condor_submit<br> command. Example: add extra ClassAds<br> to the job description |   
+
+!!! warning ""
+    Mind the firewall settings on the VObox. See [Network setup](../vobox/#network) for more details.
+
+### Miscellaneous Scripts
+
 Cleanup script for job logs and stdout/stderr files removal:
 
 ??? info "Clean up script"
@@ -340,50 +311,38 @@ Cleanup script for job logs and stdout/stderr files removal:
 Crontab line for the cleanup script:
 
 ```console
-37 * * * * /bin/sh $HOME/htcondor-cleanup.sh
+37 * * * * /bin/sh $HOME/cron/htcondor-cleanup.sh
 ```
 
 ## ARC
 
-!!! warning "ARC Instructions: Work In Progress"
-    Please note that these instructions are being actively updated and may not be complete.
-
 ### LDAP Configuration
 
-Add and adjust the following configuration as needed:
+Add or adjust the following configuration parameters as needed:
 
-??? info "LDAP configuration"
+??? info "LDAP configuration examples"
 
     ```bash
-    # VOMS organization to be with BDII job status search
-    ALIEN_VOBOX_ORG=“alice"
- 
-    # a LDAP address of a BDII for a site. VobBox CE module takes number of running and queued jobs from it.
+    # optional (normally not needed): the site BDII to take running and queued job numbers from
  
     CE_SITE_BDII=ldap://site-bdii.gridpp.rl.ac.uk:2170/mds-vo-name=RAL-LCG2,o=grid
  
-    # specifies whether to use BDII for number of running/queued jobs retrieval
-    CE_USE_BDII -  (1 - use it, 0 - use arcstat instead)
+    # specifies whether to use BDII and which GLUE schema version (only 2 is supported in JAliEn)
+    CE_USE_BDII=2
  
     # a list of ARC CEs to be used for jobagent submission
-    # a list of resources can also be set through ~/.arc/client.conf file (see `man client.conf`)
-    CE_LCGCE=(arc-ce01.gridpp.rl.ac.uk:2811/nordugrid-Condor-grid3000M,arc-ce02.gridpp.rl.ac.uk:2811/nordugrid-Condor-grid3000M,arc-ce03.gridpp.rl.ac.uk:2811/nordugrid-Condor-grid3000M,arc-ce04.gridpp.rl.ac.uk:2811/nordugrid-Condor-grid3000M)
+    CE_LCGCE=arc-ce01.gridpp.rl.ac.uk:2811/nordugrid-Condor-grid3000M, ...
  
-    # arguments for arcsub command
-    CE_SUBMITARG =" -b FastestQueue"
+    # arguments for arcsub command (load-balancing is done by the JAliEn CE itself)
+    CE_SUBMITARG=--direct
   
-    # Submit additional parameters to arcsub, it will be used in case CE_USE_BDII=1, additional params for XRSL generation can be passed with a space-separated list:  “xrsl:a=b xrsl:c=d"
-    CE_SUBMITARG_LIST
- 
-    # specifies delay in minutes after which try to clean up completed jobs (default: 1440 mins)
-    ARCCLEAN_RUN_DELAY=1000
- 
-    # specifies delay in minutes after which try to check whether jobs file is sane (default: 60 minutes)
-    ARC_VALIDATE_JOBS_FILE_DELAY=100
+    # additional parameters to arcsub, in particular to pass XRSL clauses as shown
+    CE_SUBMITARG_LIST=xrsl:(queue=mcore_alice)(memory="2000")(count="8")(countpernode="8")(walltime="1500")(cputime="12000")
     ```
 
 !!! example "Debug ARC for Operations"
-    Set the following variable in ```~/.alien/Environment``` file to get ```arc*``` CLI tools to debug output into ```CE.log``` file:
+    Set the following variable in the ```~/.alien/config/CE.env``` file to make the ```arc*``` CLI tools write debug output into the ```CE.log.N``` files:
     ```bash
     ARC_DEBUG=1
     ```
+
-- 
GitLab