Commit dd68544d authored by Domenico Giordano

removed config_files folder

parent fe9dfab0
# Config Files
In this folder you can find all the configuration files used by the Python files saved in
> control_room/airflow-compose/dags

In particular, inside *publish_fluentd* you can find 3 different config files (a minimal loading example follows the list):
- **etl_test**: the config defines WHICH DATA TO EXTRACT FROM HDFS for the ***testing*** part (window size, slide steps, which sensor data we want, on which machines we will run the experiments (hostgroup choice) ...)
- **etl_train**: the config defines WHICH DATA TO EXTRACT FROM HDFS for the ***training*** part (window size, slide steps, which sensor data we want ...)
- **experiment**: the config defines HOW TO ANALYSE THE DATA (window size, slide steps, which algorithms are used, ad-hoc algorithm parameters ...)
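
For reference, here is a minimal, purely illustrative sketch (file name and access pattern chosen for illustration, not taken from the DAG code) of how one of these YAML configs could be loaded from Python:

```python
import yaml  # PyYAML

# Load the "experiment" config (illustrative file name) and inspect a few keys.
with open("experiment.yaml") as fh:
    config = yaml.safe_load(fh)

print(config["history_steps"])          # e.g. 48
print(list(config["algo_and_params"]))  # e.g. ['PCA_S1000', 'LOF_200_S1000', ...]
```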
---
# ----------------------------------------------------------------
# CONFIGURATION FILE TO DEFINE HOW TO ANALYSE YOUR DATA
# ----------------------------------------------------------------
# ----------------------------------------------------------------
# WINDOW OF ANALYSIS
# ----------------------------------------------------------------
# The length of every window in terms of steps.
# This number is dependent on the data we feed to the algorithm. This
# value should typically match the corresponding ETL config file for the test.
history_steps: 48
# The number of steps between two subsequent windows.
# This number is dependent on the data we feed to the algorithm. This
# value should typically match the corresponding ETL config file for the test.
# Note that if slide_steps has the same value as history_steps we have a
# non-overlapping scenario.
slide_steps: 48
# ----------------------------------------------------------------
# ALGORITHMS
# ----------------------------------------------------------------
# Algorithms to use for the analysis
# The keys correspond to the identifier of the algorithm.
# Note that it should be unique since this will be used on all your plots.
# Every element will have the following sub-keys:
# - import_path: to indicate the path to the single algorithm class,
#   e.g. path.of.module.classname
#   e.g. "adcern.analyser.AEDenseTF2"
#   e.g. "pyod.models.pca.PCA"
#   The last token is split off at the final "." and we do the equivalent of:
#     from pyod.models.pca import PCA
#   and then instantiate the object PCA()
#   (a commented sketch of this is given just before algo_and_params below).
# - parameters: dictionary of parameters to pass to the class of the algo
#   during its initialization
# - train_on_test: default False; if True there won't be any training phase,
#   every algorithm will operate directly on the current windows at test time.
#   The only thing that comes from the train data is the normalization: the
#   test data on which we perform direct prediction are normalized with the
#   mu and sigma of the data extracted as ETL for the train period.
# - subsample_for_train: default 0. It defines the number of training samples
#   to extract to train your model. If -1 all the available data will be used.
#   If 0 it uses the default value in the max_samples_for_train field.
#   Note that subsample_for_train is used in combination with
#   train_on_test = False because this allows for a bigger training set,
#   where subsampling makes sense.
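# As an illustration only (a rough sketch, not necessarily the exact adcern
# code), an entry such as PCA_S1000 below could be resolved in Python roughly
# as follows, where `parameters` is the dictionary taken from the entry:
#   import importlib
#   module_path, class_name = "pyod.models.pca.PCA".rsplit(".", 1)
#   algo_class = getattr(importlib.import_module(module_path), class_name)
#   algo = algo_class(**parameters)   # parameters: {} -> PCA()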
algo_and_params:
  PCA_S1000:
    import_path: pyod.models.pca.PCA
    family: Traditional
    train_on_test: False
    subsample_for_train: 1000
    parameters: {}
# PCA_SALL:
# import_path: pyod.models.pca.PCA
# family: Traditional
# train_on_test: False
# subsample_for_train: -1
# parameters: {}
  LOF_200_S1000:
    import_path: pyod.models.lof.LOF
    family: Traditional
    train_on_test: False
    subsample_for_train: 1000
    parameters:
      n_neighbors: 200
#KNN_200_S1000:
# import_path: pyod.models.knn.KNN
# family: Traditional
# train_on_test: False
# subsample_for_train: 1000
# parameters:
# n_neighbors: 200
#OCSVM_S1000:
# import_path: pyod.models.ocsvm.OCSVM
# family: Traditional
# train_on_test: False
# subsample_for_train: 1000
# parameters:
# nu: 0.1
  IForest_S1000:
    import_path: pyod.models.iforest.IForest
    family: Traditional
    train_on_test: False
    subsample_for_train: 1000
    parameters:
      n_estimators: 100
# IForest_SALL:
# import_path: pyod.models.iforest.IForest
# family: Traditional
# train_on_test: False
# subsample_for_train: -1
# parameters:
# n_estimators: 100
  PERC_85_SALL:
    import_path: adcern.analyser_baseline.PercScore
    family: Traditional
    train_on_test: True
    subsample_for_train: -1
    parameters:
      nr_timeseries: 1110
      nr_timesteps: 48
      percentage_above: 0.85
# PERC_60_SALL:
# import_path: adcern.analyser_baseline.PercScore
# family: Traditional
# train_on_test: True
# subsample_for_train: -1
# parameters:
# nr_timeseries: 5
# nr_timesteps: 48
# percentage_above: 0.60
# AE_LSTM_SALL:
# import_path: adcern.analyser_deep.AELstmTF2
# family: Deep
# train_on_test: False
# subsample_for_train: -1
# parameters:
# nr_timeseries: 5
# nr_timesteps: 48
# epochs: 20
# verbose: 0
#AE_Dense_SALL:
# import_path: adcern.analyser_deep.AEDenseTF2
# family: Deep
# train_on_test: False
# subsample_for_train: -1
# parameters:
# epochs: 20
# verbose: 0
#AE_CNN_SALL:
# import_path: adcern.analyser_deep.AECnnTF2
# family: Deep
# train_on_test: False
# subsample_for_train: -1
# parameters:
# nr_timeseries: 5
# nr_timesteps: 48
# epochs: 20
# verbose: 0
# ForecastCNN_SALL:
# import_path: adcern.analyser_forecasting.ForecastCNN
# family: Deep
# train_on_test: False
# subsample_for_train: -1
# parameters:
# nr_timeseries: 5
# nr_timesteps: 48
# chunk_len: 6
# epochs: 20
# verbose: 0
# pyod.models.iforest.IForest: {}
# pyod.models.pca.PCA: {}
# pyod.models.ocsvm.OCSVM:
# nu: 0.1
# adcern.analyser_baseline.PercScore:
# nr_timeseries: 11
# nr_timesteps: 48
# percentage_above: 0.85
# pyod.models.knn.KNN: {}
# adcern.analyser_deep.AEDenseTF2:
# epochs: 20
# verbose: 0
# adcern.analyser_deep.AECnnTF2:
# nr_timeseries: 11
# nr_timesteps: 48
# epochs: 20
# verbose: 0
# adcern.analyser_deep.AELstmTF2:
# nr_timeseries: 11
# nr_timesteps: 48
# epochs: 20
# verbose: 0
# adcern.analyser_forecasting.ForecastCNN:
# nr_timeseries: 11
# nr_timesteps: 48
# chunk_len: 6
# epochs: 20
# verbose: 0
# ----------------------------------------------------------------
# SUB-SAMPLING
# ----------------------------------------------------------------
# Max number of training samples to use for traditional algorithms
max_samples_for_train: 1000
# Max number of training samples to use for deep algorithms
max_samples_for_train_deep: 10000
# Random seed used to subsample the data to feed to the algorithm.
# This is used only if max_samples_for_train or max_samples_for_train_deep
# are set, otherwise no subsampling is performed.
random_seed: 42
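# Purely illustrative sketch (assuming the training windows sit in a pandas
# DataFrame; this is not the exact adcern code) of how such a reproducible
# subsampling could look:
#   n = min(max_samples_for_train, len(train_df))
#   train_subset = train_df.sample(n=n, random_state=random_seed)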
# ----------------------------------------------------------------
# ANALYSIS OUTPUT
# ----------------------------------------------------------------
# Path to save the training time of your algo.
# Note that this path should be accessible to the executor of the analysis
# (e.g. your VM or your container). Typically this is in a local folder if
# we work with our VM as main executor or in a shared volume if we work on
# a cluster.
folder_training_time: "/eos/project/i/it-cloud-data-analytics/experiments/always_on/time"
# Path to save the anomaly scores created by your algo.
# Note that this path should be accessible to the executor of the analysis
# (e.g. your VM or your container). Typically this is in a local folder if
# we work with our VM as main executor or in a shared volume if we work on
# a cluster.
local_scores_folder: "/eos/project/i/it-cloud-data-analytics/experiments/always_on/scores"
# Number of most serious anomalies to send to MONIT for every temporal
# window of analysis.
# Note that this applies only if your analysis executor is properly connected
# to the MONIT infrastructure via Fluentd (--log-driver docker option).
# If not sure, ignore this.
publish_per_windows: 4
# ----------------------------------------------------------------
# ANNOTATION EVALUATION
# ----------------------------------------------------------------
# Hostgroup name in the form of absolute path.
# Note that this will also be used to retrieve the annotations from Grafana.
hostgroup_abs_path: "cloud_compute/level2/batch/gva_shared_016"
# Start and End date of the benchmark you are running.
# Only annotations in this range will be considered.
# Note that the underscore between the date and time is fundamental
# because this will be passed as a parameter on the command line
# Note that if the DAGs experiment has produced less scores than the interval
# specified here, only the intersection will be considered.
start_benchmark: "2020-02-13_16:00:00"
end_benchmark: "2020-08-13_00:00:00"
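# Purely illustrative (the flag name below is hypothetical, not taken from
# the repo): the value is passed to the CLI as a single token, e.g.
#   --start_benchmark 2020-02-13_16:00:00
# with a space instead of the underscore it would be split into two arguments.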
# Path to save the artifacts of the evaluation section
# Note that it might contain: annotations, plots, summarized table etc.
evaluation_artifact_path: "/eos/project/i/it-cloud-data-analytics/experiments/always_on/results"
...
---
# ----------------------------------------------------------------
# CONFIGURATION FILE TO DEFINE WHICH DATA TO EXTRACT FROM HDFS
# ----------------------------------------------------------------
# Absolute path identifier of the cell/hostgroup that you want to mine.
# Note that it is in a list format, but only one hostgroup is supported so far.
hostgroups:
- cloud_compute/level2/batch/gva_project_013
#- cloud_compute/level2/main/gva_shared_016
# The pattern of the names of your data folders and ".metadata" files.
# This is the template name: each variable in brackets is replaced by the
# corresponding value via Jinja templating.
# Note that this name is the same used both for HDFS and for your local copies.
code_project_name: "batch_013_{{ start_date }}_{{ end_date }}_{{ start_date_normalization}}_{{ end_date_normalization }}"
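# Purely illustrative example (the dates below are made up): with
# start_date = "2020-02-13", end_date = "2020-08-13" and the two
# normalization dates set to "2020-02-06" and "2020-02-13", the template
# above would render to:
#   batch_013_2020-02-13_2020-08-13_2020-02-06_2020-02-13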
# ----------------------------------------------------------------
# LOCAL
# ----------------------------------------------------------------
# Local area of your VM where to save your data and metadata.
# Data are saved in folders containing one parquet file only;
# metadata are saved in a file with the same name as the respective folder
# plus the ".metadata" extension.
local_cache_folder: "/eos/project/i/it-cloud-data-analytics/eos-datalake/always_on/"
# ----------------------------------------------------------------
# HDFS
# ----------------------------------------------------------------
# HDFS Area where Spark saves the aggregated data of your cell
# Note that the saving can create multiple files depending on the number of
# partitions that the workers were using.
hdfs_out_folder: "/project/it_cloud_data_analytics/always_on/"
# HDFS Area where Spark saves the aggregated data of your cell
# Note that here we force it to be one partition only.
hdfs_cache_folder: "/project/it_cloud_data_analytics/always_on/"
# HDFS Area where Spark saves the normalization coefficients computed on the
# normalization chunk of data between:
# date_start_normalization and date_end_normalization_excluded
normalization_out_folder: "/project/it_cloud_data_analytics/normalization"
# Whether you want to overwrite (true) or not (false) the raw data in HDFS.
# If not sure, leave it true.
overwrite_on_hdfs: true
# Whether you want to overwrite (true) or not (false) the normalization
# coefficients in HDFS. If not sure, leave it true.
overwrite_normalization: true
# ----------------------------------------------------------------
# TEMPORAL DETAILS
# ----------------------------------------------------------------
# The level of aggregation of your raw time series data.
# The aggregator is typically the mean operator.
# e.g. if 5, it means that we summarize the data every 5 minutes: the value
# with timestamp 7.45 represents the mean of the previous 5 minutes
# (from 7.40 to 7.45) and is labelled with the 7.45 timestamp.
aggregate_every_n_minutes: 10
# The length of your windows of data.
# e.g. if aggregate_every_n_minutes = 10 and history_steps = 6, every
# window summarizes 6 * 10 = 60 minutes.
history_steps: 48
# The number of steps you want to move your window.
# e.g. if aggregate_every_n_minutes = 10 and slide_steps = 2, you get a
# window of data that is translated by 10 * 2 = 20 min with respect to the
# previous one.
# Note that if slide_steps has the same value as history_steps you have
# non-overlapping windows.
slide_steps: 48
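# Worked example with the values used here: aggregate_every_n_minutes = 10
# and history_steps = 48 give windows of 48 * 10 = 480 minutes (8 hours),
# and slide_steps = 48 makes consecutive windows non-overlapping.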
# Used to create windows with future steps.
# If not sure keep this to 0
future_steps: 0
# Dates representing the start/end of the data and normalization chunks.
# - start_date -> the starting date of the data chunk of the ETL
# - end_date -> the ending date of the data chunk of the ETL
# - start_date_normalization -> the starting date of the chunk of data used
#   to learn the normalization coefficients (typically this chunk precedes
#   the data chunk)
# - end_date_normalization -> the ending date of the chunk of data used
#   to learn the normalization coefficients
# Note that the upper extremum is excluded (i.e. data will stop at 23:59
# of the day preceding date_end_excluded).
date_start: "{{ start_date }}"
date_end_excluded: "{{ end_date }}"
date_start_normalization: "{{ start_date_normalization }}"
date_end_normalization_excluded: "{{ end_date_normalization }}"
# ----------------------------------------------------------------
# METRICS
# ----------------------------------------------------------------
# List of plugins to mine.
# Note that it is a dictionary where every key represents the name your plugin
# will have and the value is a dictionary with:
# 'plugin_instance', 'type', 'type_instance', 'plugin_name'.
# The values assigned to these keys define an AND-filter: you will get only
# the data that have all those attributes ('plugin_instance', 'type',
# 'type_instance', 'plugin_name') equal to the specified values.
# Note that if you do not want to filter on one attribute, do not express it.
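# Purely illustrative example (not actual query code): the cpu__percent_idle
# entry below corresponds to an AND-filter roughly equivalent to:
#   plugin_name == 'cpu' AND plugin_instance == ''
#   AND type == 'percent' AND type_instance == 'idle'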
selected_plugins:
# EXTRA FOR THE SHARED
# cloud_contextswitch_involuntary:
# type: contextswitch
# type_instance: involuntary
# plugin_name: cloud
# 5 METRICS - FUNDAMENTAL
  load_longterm:
    value_instance: longterm
    plugin_name: load
  cpu__percent_idle:
    plugin_instance: ''
    type: percent
    type_instance: idle
    plugin_name: cpu
  memory__memory_free:
    plugin_instance: ''
    type: memory
    type_instance: free
    plugin_name: memory
  vmem__vmpage_io_memory_in:
    plugin_instance: ''
    type: vmpage_io
    type_instance: memory
    value_instance: in
    plugin_name: vmem
  swap_swapfile_swap_free:
    type: swap
    type_instance: free
    plugin_name: swap
  # +5 THAT SHOULD IMPROVE
  cpu__percent_wait:
    plugin_instance: ''
    type: percent
    type_instance: wait
    plugin_name: cpu
  cpu__percent_system:
    plugin_instance: ''
    type: percent
    type_instance: system
    plugin_name: cpu
  vmem__vmpage_io_memory_out:
    plugin_instance: ''
    type: vmpage_io
    type_instance: memory
    value_instance: out
    plugin_name: vmem
  interface__if_octets__tx:
    type: if_octets
    type_instance: ''
    value_instance: tx
    plugin_name: interface
  interface__if_octets__rx:
    type: if_octets
    type_instance: ''
    value_instance: rx
    plugin_name: interface
# # +5 TO CHALLENGE ALGOS IN HIGH DIMENSIONALITY
# df_var_percent_bytes_free:
# plugin_instance: var
# type: percent_bytes
# type_instance: free
# plugin_name: df
# uptime__uptime_:
# plugin_instance: ''
# type: uptime
# type_instance: ''
# plugin_name: uptime
# processes__fork_rate_:
# plugin_instance: ''
# type: fork_rate
# type_instance: ''
# plugin_name: processes
# processes__ps_state_sleeping:
# plugin_instance: ''
# type: ps_state
# type_instance: sleeping
# plugin_name: processes
# processes__ps_state_blocked:
# plugin_instance: ''
# type: ps_state
# type_instance: blocked
# plugin_name: processes
...
---
# ----------------------------------------------------------------
# CONFIGURATION FILE TO DEFINE WHICH DATA TO EXTRACT FROM HDFS
# ----------------------------------------------------------------
# ----------------------------------------------------------------
# HOSTGROUP INFO
# ----------------------------------------------------------------
# Absolute path identifier of the cell/hostgroup that you want to mine.
# Note that it is in a list format, but only one hostgroup is supported so far.
hostgroups:
- cloud_compute/level2/batch/gva_project_013
#- cloud_compute/level2/main/gva_shared_016
# The pattern of the names of your data folders and ".metadata" files.
# This is the template name: each variable in brackets is replaced by the
# corresponding value via Jinja templating.
# Note that this name is the same used both for HDFS and for your local copies.
code_project_name: "batch_013_{{ start_date }}_{{ end_date }}_{{ start_date_normalization}}_{{ end_date_normalization }}"
# ----------------------------------------------------------------
# LOCAL
# ----------------------------------------------------------------
# Local area of your VM where to save your data and metadata.
# Data are saved in folders containing one parquet file only;
# metadata are saved in a file with the same name as the respective folder
# plus the ".metadata" extension.
local_cache_folder: "/eos/project/i/it-cloud-data-analytics/eos-datalake/always_on/"
# ----------------------------------------------------------------
# HDFS
# ----------------------------------------------------------------
# HDFS Area where Spark saves the aggregated data of your cell
# Note that the saving can create multiple files depending on the number of
# partitions that the workers were using.
hdfs_out_folder: "/project/it_cloud_data_analytics/always_on/"
# HDFS Area where Spark saves the aggregated data of your cell
# Note that here we force it to be one partition only.
hdfs_cache_folder: "/project/it_cloud_data_analytics/always_on/"
# HDFS Area where Spark saves the normalization coefficients computed on the
# normalization chunk of data between:
# date_start_normalization and date_end_normalization_excluded
normalization_out_folder: "/project/it_cloud_data_analytics/normalization"
# Whether you want to overwrite (true) or not (false) the raw data in HDFS.
# If not sure, leave it true.
overwrite_on_hdfs: true
# Whether you want to overwrite (true) or not (false) the normalization
# coefficients in HDFS. If not sure, leave it true.
overwrite_normalization: true
# ----------------------------------------------------------------
# TEMPORAL DETAILS
# ----------------------------------------------------------------
# The level of aggregation of your raw time series data.
# The aggregator is typically the mean operator.
# e.g. if 5, it means that we summarize the data every 5 minutes: the value
# with timestamp 7.45 represents the mean of the previous 5 minutes
# (from 7.40 to 7.45) and is labelled with the 7.45 timestamp.
aggregate_every_n_minutes: 10
# The length of your windows of data.
# e.g. if aggregate_every_n_minutes = 10 and history_steps = 6, every
# window summarizes 6 * 10 = 60 minutes.
history_steps: 48
# The number of steps you want to move your window.
# e.g. if aggregate_every_n_minutes = 10 and slide_steps = 2, you get a
# window of data that is translated by 10 * 2 = 20 min with respect to the
# previous one.
# Note that if slide_steps has the same value as history_steps you have
# non-overlapping windows.
slide_steps: 1
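# Worked example with the values used here: history_steps = 48 still gives
# 8-hour windows, but slide_steps = 1 with aggregate_every_n_minutes = 10
# means consecutive windows start only 1 * 10 = 10 minutes apart,
# i.e. they are heavily overlapping.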
# Used to create windows with future steps.
# If not sure keep this to 0
future_steps: 0
# Dates representing the start/end of the data and normalization chunks.
# - start_date -> the starting date of the data chunk of the ETL
# - end_date -> the ending date of the data chunk of the ETL
# - start_date_normalization -> the starting date of the chunk of data used
#   to learn the normalization coefficients (typically this chunk precedes
#   the data chunk)
# - end_date_normalization -> the ending date of the chunk of data used
#   to learn the normalization coefficients
# Note that the upper extremum is excluded (i.e. data will stop at 23:59
# of the day preceding date_end_excluded).
date_start: "{{ start_date }}"
date_end_excluded: "{{ end_date }}"
date_start_normalization: "{{ start_date_normalization }}"
date_end_normalization_excluded: "{{ end_date_normalization }}"
# ----------------------------------------------------------------
# METRICS
# ----------------------------------------------------------------
# List of plugins to mine.
# Note that it is a dictionary where every key represents the name your plugin
# will have and the value is a dictionary with:
# 'plugin_instance', 'type', 'type_instance', 'plugin_name'.
# The values assigned to these keys define an AND-filter: you will get only
# the data that have all those attributes ('plugin_instance', 'type',
# 'type_instance', 'plugin_name') equal to the specified values.
# Note that if you do not want to filter on one attribute, do not express it.
selected_plugins:
# EXTRA FOR THE SHARED
# cloud_contextswitch_involuntary:
# type: contextswitch
# type_instance: involuntary
# plugin_name: cloud
# 5 METRICS - FUNDAMENTAL
  load_longterm:
    value_instance: longterm
    plugin_name: load
  cpu__percent_idle:
    plugin_instance: ''
    type: percent
    type_instance: idle
    plugin_name: cpu
  memory__memory_free:
    plugin_instance: ''
    type: memory
    type_instance: free
    plugin_name: memory
  vmem__vmpage_io_memory_in:
    plugin_instance: ''
    type: vmpage_io
    type_instance: memory
    value_instance: in
    plugin_name: vmem
  swap_swapfile_swap_free:
    type: swap