From 087b02fa319af23fd1da21d7eabd48d18cfa39ac Mon Sep 17 00:00:00 2001
From: Haakon Andre Reme-Ness <haakon.andre.reme-ness@cern.ch>
Date: Wed, 8 May 2024 03:10:22 +0200
Subject: [PATCH 1/3] Added JDL syntax for job splitting

---
 docs/jdl_syntax.md | 210 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 210 insertions(+)
 create mode 100644 docs/jdl_syntax.md

diff --git a/docs/jdl_syntax.md b/docs/jdl_syntax.md
new file mode 100644
index 0000000..d1042f3
--- /dev/null
+++ b/docs/jdl_syntax.md
@@ -0,0 +1,210 @@
+# JDL syntax reference
+
+## Different split options
+Splitting a job into smaller subjobs is based on the strategy defined in the JDL and will split
+files provided by InputData or InputDataCollection to different subjobs with the same executable.
+Different strategies have different optional or mandatory fields.
+
+
+### Split
+``` 
+    Will only split the job if this field is defined.    
+
+    usage: Split="[strategy]"
+    
+```
+
+---
+
+### SplitArguments
+``` 
+    Reduntant field, but will add this for splitjobs to Arguments for job 
+
+    usage: SplitArguments="[arguments for executable]"
+    
+```
+
+---
+
+### **Split strategies options**
+---
+
+### production
+``` 
+    Duplicate the job a number of time equal to End-Start defined.
+    #alien_counter# begins the counter at Start provided.
+
+    usage: Split="production:[Start]-[End]"
+    
+    
+```
+
+---
+
+### file
+``` 
+    Divides inputdata files based on full LFN path, resulting in one file per subjob as LFN's are unique.
+
+    usage: Split="file"
+    
+    
+```
+
+---
+
+### directory
+``` 
+    Divides inputdata files based on lowest directoy in LFN path.
+    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a/alice
+
+    usage: Split="directory"
+    
+    optional:
+            SplitMaxInputFileNumber
+            SplitMaxInputFileSize
+    
+```
+
+---
+
+### parentdirectory
+``` 
+    Divides inputdata files based on parent of the lowest directoy in LFN path.
+    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a
+
+    usage: Split="parentdirectory"
+    
+    optional:
+            SplitMaxInputFileNumber
+            SplitMaxInputFileSize
+    
+```
+
+---
+
+### se
+``` 
+    Divides inputdata files based on which Storage Elements files are stored on.
+
+    usage: Split="se"
+    
+    mandatory:
+            SplitMaxInputFileNumber
+    
+    optional:
+            SplitMinInputFileNumber
+    
+```
+
+### af (under development)
+``` 
+    Analysis Facility split meant for cases where files all share a Storage Element and forcing jobs to run on that site 
+
+    usage: Split="af"
+    
+    mandatory:
+            SplitMaxInputFileNumber/SplitMaxInputFileSize
+            
+    
+    optional:
+            ForceOnlySEInput
+            MaxInputMissingThreshold
+    
+```
+
+---
+
+### SplitMaxInputFileNumber
+``` 
+    Sets a maximum limit for number of inputdata files per subjob
+
+    usage: SplitMaxInputFileNumber="[number]"
+    
+    
+```
+
+---
+### SplitMaxInputFileSize
+``` 
+    Sets a maximum limit for combined size of inputdata files per subjob
+
+    usage: SplitMaxInputFileSize="[number]"
+    
+    
+```
+
+---
+
+### SplitMinInputFileNumber
+``` 
+    Sets a minimum limit for number of inputdata files per subjob, used by storage element split
+    to merge subjobs with less inputdata files than the limit 
+
+    usage: SplitMinInputFileNumber="[number]"
+    
+    
+```
+
+---
+
+###  ForceOnlySEInput (under development)
+``` 
+    Used by Analysis Facility to force only inputdata files located on site provided in Requirements of JDL to be used.
+    Other files are ignored for the job. Has a default threshhold of missing files before it fails.
+
+    usage: ForceOnlySEInput="[true/false]"
+    
+    
+```
+
+---
+
+###  MaxInputMissingThreshold (under development)
+``` 
+    Sets a percentage value of missing files before an af split fails
+
+    usage: MaxInputMissingThreshold="[percentage]"
+    
+    
+```
+
+---
+
+### **#alien# pattern**
+
+This pattern is replaced by a value based on subjob or a counter in the final JDL
+
+###  counter
+``` 
+    An increasing subjob counter; the number of digits can optionally be defined.
+
+    usage: #alien_counter# --> 1,2,3....
+    
+    options:
+           #alien_counter_[number of digits]i# --> #alien_counter_03i# = 001, 002, 003... 
+    
+    
+```
+
+---
+
+###  counter
+``` 
+    Replace this pattern with a value based on either the first or last of the inputdata files in the subjob. 
+    Default if not provided is first.
+
+    usage: #alien[first/last][option]# 
+    
+    options:
+           dir --> /alice/cern.ch/user/a/alice/LHC22f3.xml = alice
+           fulldir --> /alice/cern.ch/user/a/alice/LHC22f3.xml = /alice/cern.ch/user/a/alice/LHC22f3.xml
+           filename/[pattern to be replaced]/[new value] --> filename/.xml/.new/ --> /alice/cern.ch/user/a/alice/LHC22f3.xml= LHC22f3.new
+           
+    example:
+            #alienlastdir#
+            #alienfilename/.root//#
+    
+    
+```
+
+---
\ No newline at end of file
-- 
GitLab


From aec56a90bc23e2b38b9fdba381949beb0fbe8b72 Mon Sep 17 00:00:00 2001
From: Haakon Andre Reme-Ness <harn@hvl.no>
Date: Wed, 4 Sep 2024 09:51:28 +0200
Subject: [PATCH 2/3] Added OrderBy

---
 docs/jdl_syntax.md | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/docs/jdl_syntax.md b/docs/jdl_syntax.md
index d1042f3..cf5d91c 100644
--- a/docs/jdl_syntax.md
+++ b/docs/jdl_syntax.md
@@ -188,7 +188,7 @@ This pattern is replaced by a value based on subjob or a counter in the final JD
 
 ---
 
-###  counter
+###  file patterns
 ``` 
     Replace this pattern with a value based on either the first or last of the inputdata files in the subjob. 
     Default if not provided is first.
@@ -205,6 +205,25 @@ This pattern is replaced by a value based on subjob or a counter in the final JD
             #alienfilename/.root//#
     
     
+```
+
+---
+
+###  OrderBy
+``` 
+    Order inputdata files in the JDL based on a given strategy (Usually will be alphabetical by default)
+
+    usage: OrderBy = "options"
+    
+    options:
+           random --> Shuffle all files randomly
+           size --> Order by size, matching largest with smalles and so forth
+           epn --> Order by epn
+           tf --> Order by timeframes
+           alphabetical --> order by name
+           
+    
+    
 ```
 
 ---
\ No newline at end of file
-- 
GitLab


From 8fdd82da1375f382c4b9afadd10318e1670d1feb Mon Sep 17 00:00:00 2001
From: hremenes <hremenes@cern.ch>
Date: Wed, 15 Jan 2025 09:47:49 +0100
Subject: [PATCH 3/3] Redid splitting docs

---
 docs/jdl_syntax.md                    | 149 +++++++-------------------
 docs/user/jdlSplit/af.md              |  29 +++++
 docs/user/jdlSplit/directaccess.md    |  27 +++++
 docs/user/jdlSplit/directory.md       |  25 +++++
 docs/user/jdlSplit/file.md            |  20 ++++
 docs/user/jdlSplit/parentdirectory.md |  24 +++++
 docs/user/jdlSplit/production.md      |  21 ++++
 docs/user/jdlSplit/se.md              |  30 ++++++
 8 files changed, 213 insertions(+), 112 deletions(-)
 create mode 100644 docs/user/jdlSplit/af.md
 create mode 100644 docs/user/jdlSplit/directaccess.md
 create mode 100644 docs/user/jdlSplit/directory.md
 create mode 100644 docs/user/jdlSplit/file.md
 create mode 100644 docs/user/jdlSplit/parentdirectory.md
 create mode 100644 docs/user/jdlSplit/production.md
 create mode 100644 docs/user/jdlSplit/se.md

diff --git a/docs/jdl_syntax.md b/docs/jdl_syntax.md
index cf5d91c..a6fde73 100644
--- a/docs/jdl_syntax.md
+++ b/docs/jdl_syntax.md
@@ -18,7 +18,7 @@ Different strategies have different optional or mandatory fields.
 
 ### SplitArguments
 ``` 
-    Reduntant field, but will add this for splitjobs to Arguments for job 
+    Redundant field, but this field will be added to Arguments for all subjobs
 
     usage: SplitArguments="[arguments for executable]"
     
@@ -26,93 +26,16 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-### **Split strategies options**
----
-
-### production
-``` 
-    Duplicate the job a number of time equal to End-Start defined.
-    #alien_counter# begins the counter at Start provided.
-
-    usage: Split="production:[Start]-[End]"
-    
-    
-```
+## **Split strategies options**
+- [production](user/jdlSplit/production)
+- [file](user/jdlSplit/file)
+- [directory](user/jdlSplit/directory)
+- [parentdirectory](user/jdlSplit/parentdirectory)
+- [se](user/jdlSplit/se)
+- [af](user/jdlSplit/af)
+- [directaccess](user/jdlSplit/directaccess)
 
----
-
-### file
-``` 
-    Divides inputdata files based on full LFN path, resulting in one file per subjob as LFN's are unique.
-
-    usage: Split="file"
-    
-    
-```
-
----
-
-### directory
-``` 
-    Divides inputdata files based on lowest directoy in LFN path.
-    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a/alice
-
-    usage: Split="directory"
-    
-    optional:
-            SplitMaxInputFileNumber
-            SplitMaxInputFileSize
-    
-```
-
----
-
-### parentdirectory
-``` 
-    Divides inputdata files based on parent of the lowest directoy in LFN path.
-    Example: /alice/cern.ch/user/a/alice/LHC22f3.xml --> /alice/cern.ch/user/a
-
-    usage: Split="parentdirectory"
-    
-    optional:
-            SplitMaxInputFileNumber
-            SplitMaxInputFileSize
-    
-```
-
----
-
-### se
-``` 
-    Divides inputdata files based on which Storage Elements files are stored on.
-
-    usage: Split="se"
-    
-    mandatory:
-            SplitMaxInputFileNumber
-    
-    optional:
-            SplitMinInputFileNumber
-    
-```
-
-### af (under development)
-``` 
-    Analysis Facility split meant for cases where files all share a Storage Element and forcing jobs to run on that site 
-
-    usage: Split="af"
-    
-    mandatory:
-            SplitMaxInputFileNumber/SplitMaxInputFileSize
-            
-    
-    optional:
-            ForceOnlySEInput
-            MaxInputMissingThreshold
-    
-```
-
----
+## Other split arguments ##
 
 ### SplitMaxInputFileNumber
 ``` 
@@ -147,7 +70,7 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-###  ForceOnlySEInput (under development)
+### ForceOnlySEInput
 ``` 
     Used by Analysis Facility to force only inputdata files located on site provided in Requirements of JDL to be used.
     Other files are ignored for the job. Has a default threshhold of missing files before it fails.
@@ -159,20 +82,41 @@ Different strategies have different optional or mandatory fields.
 
 ---
 
-###  MaxInputMissingThreshold (under development)
+### MaxInputMissingThreshold
+``` 
+    Sets a percentage value of missing files from the SE before an af split fails. Only used with af split
+
+    usage: MaxInputMissingThreshold="[percentage]"    
+    
+```
+
+---
+
+###  OrderLFN
 ``` 
-    Sets a percentage value of missing files before an af split fails
+    Order inputdata files in the JDL based on a given strategy (Usually will already be alphabetical by default)
 
-    usage: MaxInputMissingThreshold="[percentage]"
+    usage: OrderLFN = "options"
     
+    options:
+           random --> Shuffle all files randomly
+           size --> Order by size, matching largest with smallest and so forth
+           alphabetical --> order by name
+           "any string" --> The string must be followed by numbers, which are the basis of the sorting.
+                            Examples of usage: epn or tf
     
 ```
 
 ---
 
-### **#alien# pattern**
+## **#alien# pattern**
+
+This pattern is replaced by a value based on input data or a counter in the final JDL. This could be used anywhere
+in the JDL but is often used in Arguments or Outputs. 
 
-This pattern is replaced by a value based on subjob or a counter in the final JDL
+Examples: OutputDir = "/alice/data/2024/LHC24an/556767/cpass0/0200/#alienfilename/.root//#";
+          OutputDir = "/alice/data/2024/LHC24an/556767/cpass0/0200/#alien_counter_03i#";
+          
 
 ###  counter
 ``` 
@@ -208,22 +152,3 @@ This pattern is replaced by a value based on subjob or a counter in the final JD
 ```
 
 ---
-
-###  OrderBy
-``` 
-    Order inputdata files in the JDL based on a given strategy (Usually will be alphabetical by default)
-
-    usage: OrderBy = "options"
-    
-    options:
-           random --> Shuffle all files randomly
-           size --> Order by size, matching largest with smalles and so forth
-           epn --> Order by epn
-           tf --> Order by timeframes
-           alphabetical --> order by name
-           
-    
-    
-```
-
----
\ No newline at end of file
diff --git a/docs/user/jdlSplit/af.md b/docs/user/jdlSplit/af.md
new file mode 100644
index 0000000..7c5a3d5
--- /dev/null
+++ b/docs/user/jdlSplit/af.md
@@ -0,0 +1,29 @@
+### **Analysis Facility split strategy**
+
+The Analysis Facility split is meant for cases where all files share a Storage Element, and forces jobs to run on that site.
+Currently, only the closeSE in the requirements is used to determine the SE.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j/jalien
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Analysis Facility split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "af";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[ForceOnlySEInput](../../jdl_syntax#forceonlyseinput)
+[MaxInputMissingThreshold](../../jdl_syntax#maxinputmissingthreshold)
+[OrderLFN](../../jdl_syntax#orderlfn)
diff --git a/docs/user/jdlSplit/directaccess.md b/docs/user/jdlSplit/directaccess.md
new file mode 100644
index 0000000..ae06767
--- /dev/null
+++ b/docs/user/jdlSplit/directaccess.md
@@ -0,0 +1,27 @@
+### **Direct Access split strategy**
+
+Splits all inputdata files evenly into different subjobs. This will force downloads of files to sites unless
+this is taken into account and mitigated by having all files on the same site and setting a closeSE requirement.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Direct Access split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "directaccess";
+SplitMaxInputFileNumber = "10";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[OrderLFN](../../jdl_syntax#orderlfn)
+
diff --git a/docs/user/jdlSplit/directory.md b/docs/user/jdlSplit/directory.md
new file mode 100644
index 0000000..4205bb0
--- /dev/null
+++ b/docs/user/jdlSplit/directory.md
@@ -0,0 +1,25 @@
+### **Directory split strategy**
+
+Separates inputdata files into different subjobs based on the full path to the lowest directory in the LFN path
+and on file size or number-of-files restrictions.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j/jalien
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Directory split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "directory";
+```
+Optional flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+[OrderLFN](../../jdl_syntax#orderlfn)
+    
diff --git a/docs/user/jdlSplit/file.md b/docs/user/jdlSplit/file.md
new file mode 100644
index 0000000..34f9d02
--- /dev/null
+++ b/docs/user/jdlSplit/file.md
@@ -0,0 +1,20 @@
+### **File split strategy**
+
+Divides inputdata files based on the full LFN path, resulting in one file per subjob as LFNs are unique. Will also add
+Close.SE requirements to the LFN to match with sites that have local access to the inputdata file.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"File split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputData = {
+"LF:/alice/cern.ch/user/j/jalien/jobs/inputdatafile"
+};
+Split = "file";
+```
\ No newline at end of file
diff --git a/docs/user/jdlSplit/parentdirectory.md b/docs/user/jdlSplit/parentdirectory.md
new file mode 100644
index 0000000..38e51dd
--- /dev/null
+++ b/docs/user/jdlSplit/parentdirectory.md
@@ -0,0 +1,24 @@
+### **Parentdirectory split strategy**
+
+Separates inputdata files into different subjobs based on the full path to the parent of the lowest directory in the LFN path
+and on file size or number-of-files restrictions.
+
+Example: /alice/cern.ch/user/j/jalien/inputdatafile --> /alice/cern.ch/user/j
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Parentdirectory split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "parentdirectory";
+```
+Optional flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+[OrderLFN](../../jdl_syntax#orderlfn)
diff --git a/docs/user/jdlSplit/production.md b/docs/user/jdlSplit/production.md
new file mode 100644
index 0000000..bea41ca
--- /dev/null
+++ b/docs/user/jdlSplit/production.md
@@ -0,0 +1,21 @@
+### **Production split strategy**
+
+Duplicates the job a number of times equal to the interval defined by a start and an end number,
+such as: production:[Start]-[End].
+This will also set the counter for [#alien_counter#](../../jdl_syntax#counter) which is useful for
+Monte Carlo Simulations.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Production split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "production:1-10";
+```
+
diff --git a/docs/user/jdlSplit/se.md b/docs/user/jdlSplit/se.md
new file mode 100644
index 0000000..5460509
--- /dev/null
+++ b/docs/user/jdlSplit/se.md
@@ -0,0 +1,30 @@
+### **Storage Element split strategy**
+
+Separates inputdata files into different subjobs based on the physical Storage Elements the files are stored on
+and on file size or number-of-files restrictions. There is also a default minimum value for subjob size: subjobs that
+are too small are merged to form a larger subjob. This minimum subjob size can be set by the user. Requirements
+for matching with sites that have the inputdata files locally are also added. The maximum number of files per subjob or
+the maximum size per subjob MUST be set.
+
+Example JDL:
+``` 
+User = "jalien";
+JobTag = {
+"Storage Element split!"
+};
+Packages = {
+"VO_ALICE@O2Physics::daily-20241202-0000-1"
+};
+Executable = "/alice/cern.ch/user/j/jalien/bin/splitting.sh";
+InputDataCollection = "LF:/alice/cern.ch/user/j/jalien/inputdatafileCollection.xml,nodownload";
+Split = "se";
+SplitMaxInputFileNumber = "10";
+```
+Mandatory flags in JDL:
+[SplitMaxInputFileNumber](../../jdl_syntax#splitmaxinputfilenumber)
+or
+[SplitMaxInputFileSize](../../jdl_syntax#splitmaxinputfilesize)
+
+Optional flags in JDL:
+[SplitMinInputFileNumber](../../jdl_syntax#splitmininputfilenumber)
+[OrderLFN](../../jdl_syntax#orderlfn)
-- 
GitLab