From 7a550f9487d2f31da49656bdfb107c45f44c344f Mon Sep 17 00:00:00 2001
From: Haakon Andre Reme-Ness <haakon.andre.reme-ness@cern.ch>
Date: Wed, 15 Jan 2025 10:20:27 +0100
Subject: [PATCH] Changes to splitting

---
 docs/jdl_syntax.md                    | 149 +++++++-------------------
 docs/user/jdlSplit/af.md              |  29 +++++
 docs/user/jdlSplit/directaccess.md    |  27 +++++
 docs/user/jdlSplit/directory.md       |  25 +++++
 docs/user/jdlSplit/file.md            |  20 ++++
 docs/user/jdlSplit/parentdirectory.md |  24 +++++
 docs/user/jdlSplit/production.md      |  21 ++++
 docs/user/jdlSplit/se.md              |  30 ++++++
 8 files changed, 213 insertions(+), 112 deletions(-)
 create mode 100644 docs/user/jdlSplit/af.md
 create mode 100644 docs/user/jdlSplit/directaccess.md
 create mode 100644 docs/user/jdlSplit/directory.md
 create mode 100644 docs/user/jdlSplit/file.md
 create mode 100644 docs/user/jdlSplit/parentdirectory.md
 create mode 100644 docs/user/jdlSplit/production.md
 create mode 100644 docs/user/jdlSplit/se.md

diff --git a/docs/jdl_syntax.md b/docs/jdl_syntax.md
index cf5d91c..a6fde73 100644
--- a/docs/jdl_syntax.md
+++ b/docs/jdl_syntax.md
@@ -18,7 +18,7 @@ Different strategies have different optional or mandatory fields.
 
 ### SplitArguments
 ``` 
-    Reduntant field, but will add this for splitjobs to Arguments for job 
+    Redundant field, but this field will be added to Arguments for all subjobs
 
     usage: SplitArguments="[arguments for executable]"
     
@@ -26,93 +26,16 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-### **Split strategies options**
----
-
-### production
-``` 
-    Duplicate the job a number of time equal to End-Start defined.
-    #alien_counter# begins the counter at Start provided.
-
-    usage: Split="production:[Start]-[End]"
-    
-    
-```
+## **Split strategies options**
+- [production](user/jdlSplit/production)
+- [file](user/jdlSplit/file)
+- [directory](user/jdlSplit/directory)
+- [parentdirectory](user/jdlSplit/parentdirectory)
+- [se](user/jdlSplit/se)
+- [af](user/jdlSplit/af)
+- [directaccess](user/jdlSplit/directaccess)
 
----
-
-### file
-``` 
-    Divides inputdata files based on full LFN path, resulting in one file per subjob as LFN's are unique.
-
-    usage: Split="file"
-    
-    
-```
-
----
-
-### directory
-``` 
-    Divides inputdata files based on lowest directoy in LFN path.
-    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a/alice
-
-    usage: Split="directory"
-    
-    optional:
-            SplitMaxInputFileNumber
-            SplitMaxInputFileSize
-    
-```
-
----
-
-### parentdirectory
-``` 
-    Divides inputdata files based on parent of the lowest directoy in LFN path.
-    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a
-
-    usage: Split="parentdirectory"
-    
-    optional:
-            SplitMaxInputFileNumber
-            SplitMaxInputFileSize
-    
-```
-
----
-
-### se
-``` 
-    Divides inputdata files based on which Storage Elements files are stored on.
-
-    usage: Split="se"
-    
-    mandatory:
-            SplitMaxInputFileNumber
-    
-    optional:
-            SplitMinInputFileNumber
-    
-```
-
-### af (under development)
-``` 
-    Analysis Facility split meant for cases where files all share a Storage Element and forcing jobs to run on that site 
-
-    usage: Split="af"
-    
-    mandatory:
-            SplitMaxInputFileNumber/SplitMaxInputFileSize
-            
-    
-    optional:
-            ForceOnlySEInput
-            MaxInputMissingThreshold
-    
-```
-
----
+## Other split arguments ##
 
 ### SplitMaxInputFileNumber
 ``` 
@@ -147,7 +70,7 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-###  ForceOnlySEInput (under development)
+### ForceOnlySEInput
 ``` 
     Used by Analysis Facility to force only inputdata files located on site provided in Requirements of JDL to be used.
     Other files are ignored for the job. Has a default threshhold of missing files before it fails.
@@ -159,20 +82,41 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-###  MaxInputMissingThreshold (under development)
+### MaxInputMissingThreshold
+``` 
+    Sets a percentage value of missing files from the SE before an af split fails. Only used with af split
+
+    usage: MaxInputMissingThreshold="[percentage]"    
+    
+```
+
+---
+
+###  OrderLFN
 ``` 
-    Sets a percentage value of missing files before an af split fails
+    Order inputdata files in the JDL based on a given strategy (Usually will already be alphabetical by default)
 
-    usage: MaxInputMissingThreshold="[percentage]"
+    usage: OrderLFN = "options"
     
+    options:
+           random --> Shuffle all files randomly
+           size --> Order by size, matching largest with smallest and so forth
+           alphabetical --> order by name
+           "any string" --> String needs to be followed by the numbers that are the basis of the sorting.
+                            Examples of usage: epn or tf
     
 ```
 
 ---
 
-### **#alien# pattern**
+## **#alien# pattern**
+
+This pattern is replaced by a value based on input data or a counter in the final JDL. This could be used anywhere
+in the JDL but is often used in Arguments or Outputs. 
 
-This pattern is replaced by a value based on subjob or a counter in the final JDL
+Examples: OutputDir = "/alice/data/2024/LHC24an/556767/cpass0/0200/#alienfilename/.root//#";
+          OutputDir = "/alice/data/2024/LHC24an/556767/cpass0/0200/#alien_counter_03i#";
+          
 
 ###  counter
 ``` 
@@ -208,22 +152,3 @@ This pattern is replaced by a value based on subjob or a counter in the final JD
 ```
 
 ---
-
-###  OrderBy
-``` 
-    Order inputdata files in the JDL based on a given strategy (Usually will be alphabetical by default)
-
-    usage: OrderBy = "options"
-    
-    options:
-           random --> Shuffle all files randomly
-           size --> Order by size, matching largest with smalles and so forth
-           epn --> Order by epn
-           tf --> Order by timeframes
-           alphabetical --> order by name
-           
-    
-    
-```
-
----
\ No newline at end of file
diff --git a/docs/user/jdlSplit/af.md b/docs/user/jdlSplit/af.md
new file mode 100644
index 0000000..7c5a3d5
--- /dev/null
+++ b/docs/user/jdlSplit/af.md
@@ -0,0 +1,29 @@
+### **Analysis Facility split strategy**
+
+Analysis Facility split is meant for cases where all files share a Storage Element, and it forces jobs to run on that site.
+Currently, only the closeSE in the requirements is used to determine the SE.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j/jalien
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Analysis Facility split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "af";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[ForceOnlySEInput](../../jdl_syntax#forceonlyseinput)
+[MaxInputMissingThreshold](../../jdl_syntax#maxinputmissingthreshold)
+[OrderLFN](../../jdl_syntax#orderlfn)
diff --git a/docs/user/jdlSplit/directaccess.md b/docs/user/jdlSplit/directaccess.md
new file mode 100644
index 0000000..ae06767
--- /dev/null
+++ b/docs/user/jdlSplit/directaccess.md
@@ -0,0 +1,27 @@
+### **Direct Access split strategy**
+
+Splits all inputdata files evenly into different subjobs. This will force downloads of files to sites unless
+this is taken into account and mitigated by having all files on the same site and setting a closeSE requirement.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Direct Access split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "directaccess";
+SplitMaxInputFileNumber = "10";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[OrderLFN](../../jdl_syntax#orderlfn)
+
diff --git a/docs/user/jdlSplit/directory.md b/docs/user/jdlSplit/directory.md
new file mode 100644
index 0000000..4205bb0
--- /dev/null
+++ b/docs/user/jdlSplit/directory.md
@@ -0,0 +1,25 @@
+### **Directory split strategy**
+
+Separates inputdata files into different subjobs based on full path to lowest directory in LFN path and file size 
+or number of files restrictions.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j/jalien
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Directory split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "directory";
+```
+Optional flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+[OrderLFN](../../jdl_syntax#orderlfn)
+    
diff --git a/docs/user/jdlSplit/file.md b/docs/user/jdlSplit/file.md
new file mode 100644
index 0000000..34f9d02
--- /dev/null
+++ b/docs/user/jdlSplit/file.md
@@ -0,0 +1,20 @@
+### **File split strategy**
+
+Divides inputdata files based on full LFN path, resulting in one file per subjob as LFNs are unique. Will also add 
+Close.SE requirements to the LFN to match with sites that have local access to the inputdata file.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"File split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputData = {
+"LF:/alice/cern.ch/user/j/jalien/jobs/inputdatafile"
+};
+Split = "file";
+```
\ No newline at end of file
diff --git a/docs/user/jdlSplit/parentdirectory.md b/docs/user/jdlSplit/parentdirectory.md
new file mode 100644
index 0000000..38e51dd
--- /dev/null
+++ b/docs/user/jdlSplit/parentdirectory.md
@@ -0,0 +1,24 @@
+### **Parentdirectory split strategy**
+
+Separates inputdata files into different subjobs based on full path to the parent of the lowest directory in LFN path 
+and file size or number of files restrictions.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Parentdirectory split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "parentdirectory";
+```
+Optional flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+[OrderLFN](../../jdl_syntax#orderlfn)
diff --git a/docs/user/jdlSplit/production.md b/docs/user/jdlSplit/production.md
new file mode 100644
index 0000000..bea41ca
--- /dev/null
+++ b/docs/user/jdlSplit/production.md
@@ -0,0 +1,21 @@
+### **Production split strategy**
+
+Duplicates the job a number of times equal to the interval defined by a start and an end number,
+such as: production:[Start]-[End].
+This will also set the counter for [#alien_counter#](../../jdl_syntax#counter) which is useful for
+Monte Carlo Simulations.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Production split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "production:1-10";
+```
+
diff --git a/docs/user/jdlSplit/se.md b/docs/user/jdlSplit/se.md
new file mode 100644
index 0000000..5460509
--- /dev/null
+++ b/docs/user/jdlSplit/se.md
@@ -0,0 +1,30 @@
+### **Storage Element split strategy**
+
+Separates inputdata files into different subjobs based on the physical Storage Elements the files are stored on
+and file size or number of files restrictions. There is also a default minimum value for subjob size, where inputdata files
+are merged to form a larger subjob if it is too small; this minimum subjob size value can be set by the user. Requirements 
+for matching with sites that locally have inputdata files are also added. Maximum number of files per subjob or size 
+per subjob MUST be set.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Storage Element split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "se";
+SplitMaxInputFileNumber = "10";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[SplitMinInputFileNumber](../../jdl_syntax#splitmininputfilenumber)
+[OrderLFN](../../jdl_syntax#orderlfn)
-- 
GitLab