From 1f173506733d456572a5f6ba7b9537ecd6215a2b Mon Sep 17 00:00:00 2001
From: Jakub Wozniak <jakub.wozniak@cern.ch>
Date: Wed, 10 Apr 2024 12:00:02 +0200
Subject: [PATCH] Revert "Merge branch
 'NXCALS-7436-Race-condition-when-starting-multiple-python' into 'develop'"

This reverts merge request !1960
---
 .../templates/config/spark-env.sh             | 159 ++++++++----------
 1 file changed, 66 insertions(+), 93 deletions(-)

diff --git a/ansible/roles/python-wheels-upload/templates/config/spark-env.sh b/ansible/roles/python-wheels-upload/templates/config/spark-env.sh
index a5a22f3c69..fcb99ffdf4 100644
--- a/ansible/roles/python-wheels-upload/templates/config/spark-env.sh
+++ b/ansible/roles/python-wheels-upload/templates/config/spark-env.sh
@@ -72,23 +72,6 @@
 # To determine that, we check the spark-defaults.conf file and the script input arguments ($@).
 # If you use spark-submit and set this inside the Python script, we are unable to determine that. (jwozniak)
 
-NXCALS_VERSION="{{nxcals_version}}"
-
-#Global error handling
-handle_error() {
-    echo "An error occurred on line $1"
-    exit 1
-}
-trap 'handle_error $LINENO' ERR
-
-if command -v flock &> /dev/null; then
-    FLOCK="flock -x 100"
-  else
-    echo "flock is not installed, race condition possible if running multiple scripts for a single venv in parallel!"
-    FLOCK=
-fi
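-
-# For reference, the locking dropped here is the standard flock(1) file-descriptor
-# idiom: the critical section runs in a subshell whose fd 100 is redirected to a
-# lock file, and an exclusive lock is taken on that fd first. A minimal sketch of
-# the pattern, with an illustrative lock path that is not taken from this patch:
-#
-#     (
-#         flock -x 100    # block until we hold an exclusive lock on fd 100
-#         echo "critical section: only one process at a time runs this"
-#     ) 100>/tmp/example.lock    # bind fd 100 to the lock file for the subshell
-#
-# When flock is absent, FLOCK stays empty and the subshell runs unlocked,
-# which is what the warning above is about.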
-
-
 function md5_cmd {
   if [[ $(uname) == "Darwin" ]]; then
     md5
@@ -97,7 +80,7 @@ function md5_cmd {
   fi
 }
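
A usage sketch for md5_cmd, assuming callers hash stdin. Note that GNU md5sum
appends a " -" filename field for stdin input while BSD md5 prints only the
digest, so a field filter keeps the two outputs comparable:

    # illustrative only: portable digest of stdin on Linux and macOS
    digest=$(printf 'payload' | md5_cmd | awk '{print $1}')
    echo "digest=$digest"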
 
-
+NXCALS_VERSION="{{nxcals_version}}"
 
 function get_writable_dir() {
   dir=$(dirname "$(mktemp -u)")
@@ -114,43 +97,6 @@ if [ ! "$NXCALS_WORKSPACE_TEMP_DIR" ]; then
   get_writable_dir
 fi
 
-function pack_venv() {
-  if [ ! "$NXCALS_PACK_ALL_PACKAGES" ]; then
-    venv-pack --python-prefix "$PYTHON_PREFIX" --output "$PACKED_VENV_FILE" \
-      --exclude nxcals-bundle/nxcals_jars/\* --exclude nxcals-bundle/jars/\* --exclude nxcals-bundle/examples/\* \
-      --exclude \*/pyspark/jars/\* --exclude \*/pyspark/examples/\*
-  else
-    venv-pack --python-prefix "$PYTHON_PREFIX" --output "$PACKED_VENV_FILE"
-  fi
-}
-
-function fix_venv() {
-    echo "Extracing packed venv to fix symlink to exec..."
-    # Fix packed venv - symlinks to python exec may be broken
-    FIXED_VENV_DIR="$NXCALS_WORKSPACE_TEMP_DIR/venv"
-    mkdir -p $FIXED_VENV_DIR
-    tar -xzf $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz -C $FIXED_VENV_DIR
-
-    echo "Fixing symlink to exec in venv..."
-
-    for file in $FIXED_VENV_DIR/bin/python*; do
-        if [ -L "$file" ]; then
-          newTarget=$PYTHON_PREFIX/bin/python$PYTHON_VERSION
-          rm $file
-          ln -s $newTarget $file
-        fi
-    done
-
-    echo "Packing again venv..."
-    OLD_PWD=`pwd`
-    rm $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz
-    cd $FIXED_VENV_DIR
-    tar -czf $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz ./*
-    cd $OLD_PWD
-    rm -r $FIXED_VENV_DIR
-}
-
-
 #A must, as pySpark uses those 2 variables to set the python on the executor. Both vars must be set. The driver uses the current python3 and the executor must use the LCG one.
 #Exclusion is only for the jupyter setting - it shouldn't be overwritten
 JUPYTER_PYSPARK_REGEX='^\s*jupyter(\s.*|$)'
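
The assignments guarded by this regex live outside the hunk, but a sketch of how
such a jupyter-aware guard typically looks; the values below are illustrative
assumptions, not taken from this file:

    # illustrative sketch: keep a jupyter driver setting intact, otherwise
    # point the driver at the current python3 and the executors at the
    # python shipped inside the packed venv
    if [[ ! "$PYSPARK_DRIVER_PYTHON" =~ $JUPYTER_PYSPARK_REGEX ]]; then
        export PYSPARK_DRIVER_PYTHON="$(command -v python3)"
        export PYSPARK_PYTHON="./environment/bin/python"
    fi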
@@ -166,7 +112,6 @@ OLD_SPARK_CONF_DIR="$SPARK_CONF_DIR"
 export SPARK_CONF_DIR="$NXCALS_WORKSPACE_TEMP_DIR/conf"
 
 PACKED_VENV_FILE="${NXCALS_WORKSPACE_TEMP_DIR}"/{{spark_packed_venv_name}}
-LOCK="${NXCALS_WORKSPACE_TEMP_DIR}"/.lock
 
 echo "ENVIRONMENT:"
 echo "NXCALS_VERSION=${NXCALS_VERSION}"
@@ -179,7 +124,6 @@ echo "PACKED_VENV_FILE=${PACKED_VENV_FILE}"
 echo "PYSPARK_PYTHON=${PYSPARK_PYTHON}"
 echo "PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON}"
 echo "PYSPARK_DRIVER_PYTHON_OPTS=${PYSPARK_DRIVER_PYTHON_OPTS}"
-echo "LOCK=${LOCK}"
 echo
 echo "IMPORTANT:"
 echo "Rebuilding of the packed venv is required in cases there are new or modified packages provided by a user. In order to recreate the packed venv please:"
@@ -188,41 +132,39 @@ echo "        - execute your script again (rebuild will be performed during the
 echo "Target directory can be set with an env variable NXCALS_WORKSPACE_TEMP_DIR (if not set a temp dir will be used)."
 echo "Adding NXCALS related files to packed venv can be enabled by setting NXCALS_PACK_ALL_PACKAGES with any value."
 
+if [ ! -e "$NXCALS_WORKSPACE_TEMP_DIR/conf/spark-defaults.conf" ]; then
+  echo "Copying $SPARK_DEFAULTS to $SPARK_CONF_DIR ..."
 
-(
-  $FLOCK
-  if [ ! -e "$NXCALS_WORKSPACE_TEMP_DIR/conf/spark-defaults.conf" ]; then
-      echo "Copying $SPARK_DEFAULTS to $SPARK_CONF_DIR ..."
-      mkdir -p "$SPARK_CONF_DIR"
+  mkdir -p "$SPARK_CONF_DIR"
 
-      cp "$OLD_SPARK_CONF_DIR"/spark-defaults.conf "$SPARK_CONF_DIR"
-      cp "$OLD_SPARK_CONF_DIR"/log4j2.properties "$SPARK_CONF_DIR"
-      NEW_SPARK_CONF="$SPARK_CONF_DIR/spark-defaults.conf"
+  if [ $? -ne 0 ]; then
+    echo "ERROR: could not create $NXCALS_WORKSPACE_TEMP_DIR/conf. Aborting."
+    exit 1
+  fi
 
-      # make the spark.jars paths absolute, otherwise they are relative to the current working directory
-      # Mac OSX requires escaping brackets
+  cp "$OLD_SPARK_CONF_DIR"/* "$SPARK_CONF_DIR"
+  NEW_SPARK_CONF="$SPARK_CONF_DIR/spark-defaults.conf"
 
+  # make the spark.jars paths absolute, otherwise they are relative to the current working directory
+  # Mac OSX requires escaping brackets
+  if [[ $(uname) == "Darwin" ]]; then
+      sed -i -r 's,\([^/]\)nxcals_jars/\([^,]*\),\1'"$SPARK_HOME"'/nxcals_jars/\2,g' "$NEW_SPARK_CONF"
+  else
+      sed -i -r 's,([^/])nxcals_jars/([^,]*),\1'"$SPARK_HOME"'/nxcals_jars/\2,g' "$NEW_SPARK_CONF"
+  fi
 
-      if [[ $(uname) == "Darwin" ]]; then
-          sed -i -r 's,\([^/]\)nxcals_jars/\([^,]*\),\1'"$SPARK_HOME"'/nxcals_jars/\2,g' "$NEW_SPARK_CONF"
-      else
-          sed -i -r 's,([^/])nxcals_jars/([^,]*),\1'"$SPARK_HOME"'/nxcals_jars/\2,g' "$NEW_SPARK_CONF"
-      fi
+  # Replace the placeholder for the virtual_env path in spark-defaults.conf
+  # The archive is specified with '#environment' because that is how spark
+  # knows where to unzip it on the executors under a new directory called environment.
+  # For further information: http://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html
 
-      # Replace the placeholder for the virtual_env path in spark-defaults.conf
-      # The archive is specified with '#environment' because that is how spark
-      # knows where to unzip it on the executors under a new directory called environment.
-      # For further information: http://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html
-      sed -i -r 's@spark.yarn.dist.archives.*@spark.yarn.dist.archives '"$NXCALS_WORKSPACE_TEMP_DIR"'/'{{spark_packed_venv_name}}'#'{{spark_bundle_pyspark_venv_name}}'@g' "$NEW_SPARK_CONF"
-  fi
-) 100>$LOCK
+  sed -i -r 's@spark.yarn.dist.archives.*@spark.yarn.dist.archives '"$NXCALS_WORKSPACE_TEMP_DIR"'/'{{spark_packed_venv_name}}'#'{{spark_bundle_pyspark_venv_name}}'@g' "$NEW_SPARK_CONF"
+fi
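
To make the spark.yarn.dist.archives substitution above concrete, the rewritten
line in spark-defaults.conf follows Spark's archive#alias convention; the path
below is illustrative, the real values come from the template variables:

    # spark.yarn.dist.archives  /tmp/nxcals/nxcals-python3-env.tar.gz#environment
    # YARN ships the archive to each executor and unpacks it into ./environment,
    # which is why the executor python can be addressed relative to that alias.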
 
 echo "Trying to determine YARN usage to make Python work correctly (conf/spark-env.sh)..."
+grep -q -e "^\s*spark.master\s*yarn" "${SPARK_CONF_DIR}/spark-defaults.conf"
 
-exit_code=0
-grep -q -e "^\s*spark.master\s*yarn" "${SPARK_CONF_DIR}/spark-defaults.conf" || exit_code=$?
-
-if [[ "${exit_code}" == "0" || $@ =~ .*master.*yarn.* ]]; then
+if [[ $? == '0' || $@ =~ .*master.*yarn.* ]]; then
   echo "Using YARN"
   # Normally $PYSPARK_PYTHON is set in the spark_session_builder.get_or_create().
   # But when user calls pyspark directly, this option is required.
@@ -238,17 +180,48 @@ if [[ "${exit_code}" == "0" || $@ =~ .*master.*yarn.* ]]; then
       echo "ERROR: YARN cluster doesn't support Python in version $PYTHON_VERSION. Supported versions are either 3.9 or 3.11"
       exit 1
     else
-      (
-        $FLOCK
-        if [[ ! -e "$PACKED_VENV_FILE" ]]; then
-            echo "Creating packed venv..."
-            pack_venv
-            fix_venv
-            echo -e "Packed venv created.\n...done!"
-        else
-            echo "Packed venv already built..."
+      echo "Creating packed venv..."
+
+      ERROR=0
+
+      if [ ! "$NXCALS_PACK_ALL_PACKAGES" ]; then
+        venv-pack --python-prefix "$PYTHON_PREFIX" --output "$PACKED_VENV_FILE" \
+          --exclude nxcals-bundle/nxcals_jars/\* --exclude nxcals-bundle/jars/\* --exclude nxcals-bundle/examples/\* \
+          --exclude \*/pyspark/jars/\* --exclude \*/pyspark/examples/\* || ERROR=1
+      else
+        venv-pack --python-prefix "$PYTHON_PREFIX" --output "$PACKED_VENV_FILE" || ERROR=1
+      fi
+
+      echo "INFO: Extracing packed venv to fix symlink to exec..."
+      # Fix packed venv - symlinks to python exec may be broken
+      FIXED_VENV_DIR="$NXCALS_WORKSPACE_TEMP_DIR/venv"
+      mkdir $FIXED_VENV_DIR
+      tar -xzf $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz -C $FIXED_VENV_DIR || ERROR=1
+
+      echo "INFO: Fixing symlink to exec in venv..."
+
+      for file in $FIXED_VENV_DIR/bin/python*; do
+        if [ -L "$file" ]; then
+          newTarget=$PYTHON_PREFIX/bin/python$PYTHON_VERSION
+          rm $file || ERROR=1
+          ln -s $newTarget $file || ERROR=1
         fi
-      ) 100>$LOCK
+      done
+
+      echo "INFO: Packing again venv..."
+      OLD_PWD=`pwd`
+      rm $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz
+      cd $FIXED_VENV_DIR
+      tar -czf $NXCALS_WORKSPACE_TEMP_DIR/nxcals-python3-env.tar.gz ./* || ERROR=1
+      cd $OLD_PWD
+      rm -r $FIXED_VENV_DIR
+
+      if [ $ERROR -ne 0 ]; then
+        echo "ERROR: could not create packed venv. Aborting."
+        exit 3
+      else
+        echo -e "Packed venv created.\n...done!"
+      fi
     fi
   fi
   echo "IMPORTANT:"
-- 
GitLab