Commit 7c49116b authored by Domenico Giordano

support yaml

parent 03f2d049
---
# ----------------------------------------------------------------
# CONFIGURATION FILE TO DEFINE WHICH DATA TO EXTRACT FROM HDFS
# ----------------------------------------------------------------
# the variables having value __templated__ are
# replaced by the BashOperator using the Jinja templates
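# For illustration only (hypothetical DAG snippet, not part of this file):
# Airflow renders Jinja macros before the BashOperator runs, so e.g.
#   date_start: "{{ ds }}"
# would be replaced by the execution date of the DAG run.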
# ----------------------------------------------------------------
# HOSTGROUP INFO
# ----------------------------------------------------------------
# Absolute path identifier of the cell/hostgroup that you want to mine.
# Note that it is in a list format, but only one hostgroup is supported so far.
hostgroups:
- cloud_compute/level2/batch/gva_project_013
# The pattern of the names of your data folders and ".metadata" files.
# This is the template name; each variable in brackets is replaced by the
# corresponding value via Jinja templating (see the hypothetical example
# below). Note that this name is used both for HDFS and for your local copies.
code_project_name: "__templated__"
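# Purely hypothetical example of such a template (the real pattern is
# defined by the Jinja variables, not by this file):
# code_project_name: "{hostgroup}_{date_start}_{date_end_excluded}"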
# ----------------------------------------------------------------
# LOCAL
# ----------------------------------------------------------------
# Local area of your VM where data and metadata are saved.
# Data are saved in folders containing a single Parquet file;
# metadata are saved in a file with the same name as the respective folder
# plus the ".metadata" extension.
local_cache_folder: "__templated__"
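# Resulting layout, following the convention above (placeholder names):
#   <local_cache_folder>/<code_project_name>/          <- folder, one Parquet
#   <local_cache_folder>/<code_project_name>.metadata  <- metadata file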
# ----------------------------------------------------------------
# HDFS
# ----------------------------------------------------------------
# HDFS area where Spark saves the aggregated data of your cell.
# Note that the save can create multiple files, depending on the number of
# partitions the workers were using.
hdfs_out_folder: "__templated__"
# HDFS area where Spark saves the aggregated data of your cell.
# Note that here we force it to be one partition only.
hdfs_cache_folder: "__templated__"
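# e.g. (illustrative file names): hdfs_out_folder may contain several
# part-00000 ... part-000NN files, one per Spark partition, while
# hdfs_cache_folder holds a single part-00000 file.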
# HDFS area where Spark saves the normalization coefficients computed on the
# normalization chunk of data between:
# date_start_normalization and date_end_normalization_excluded
normalization_out_folder: "__templated__"
# Whether you want to overwrite (true) or not (false) the raw data in HDFS.
# If not sure, leave true.
overwrite_on_hdfs: true
# Whether you want to overwrite (true) or not (false) the normalization
# coefficients in HDFS. If not sure, leave true.
overwrite_normalization: true
# ----------------------------------------------------------------
# TEMPORAL DETAILS
# ----------------------------------------------------------------
# The level of aggregation of your raw time series data.
# The aggregator is typically the mean operator.
# e.g. if 5, the data are summarized every 5 minutes: the value with
# timestamp 7:45 represents the mean of the previous 5 minutes
# (from 7:40 to 7:45) and carries 7:45 as its timestamp.
aggregate_every_n_minutes: 10
# The length of your windows of data.
# e.g. if aggregate_every_n_minutes = 10 and history_steps = 6, every
# window summarizes 6 * 10 = 60 minutes.
history_steps: 48
# The number of steps you want to move your window.
# e.g. if aggregate_every_n_minutes = 10 and slide_steps = 2, each window
# is shifted by 2 * 10 = 20 minutes with respect to the previous one.
# Note that if slide_steps equals history_steps you get non-overlapping
# windows.
slide_steps: 1
# Used to create windows with future steps.
# If not sure, keep this at 0.
future_steps: 0
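# Worked example with the values above (aggregate_every_n_minutes = 10,
# history_steps = 48, slide_steps = 1, future_steps = 0):
#   window length = 48 * 10 = 480 min = 8 h
#   window shift  =  1 * 10 =  10 min
# i.e. consecutive windows overlap for 470 of their 480 minutes.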
# Dates representing the start/end of the data and normalization chunks.
# - date_start -> the starting date of the ETL data chunk
# - date_end_excluded -> the ending date of the ETL data chunk
# - date_start_normalization -> the starting date of the chunk of data used
#   to learn the normalization coefficients (typically this chunk precedes
#   the data chunk)
# - date_end_normalization_excluded -> the ending date of the chunk of data
#   used to learn the normalization coefficients
# Note that the upper extremum is excluded (i.e. data stop at 23:59
# of the day preceding the *_end_excluded date).
date_start: "__templated__"
date_end_excluded: "__templated__"
date_start_normalization: "__templated__"
date_end_normalization_excluded: "__templated__"
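# Illustrative values only (hypothetical; the real dates are injected via
# Jinja templating, upper extrema excluded as noted above):
# date_start: "2020-03-01"
# date_end_excluded: "2020-04-01"
# date_start_normalization: "2020-02-01"
# date_end_normalization_excluded: "2020-03-01"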
# ----------------------------------------------------------------
# METRICS
# ----------------------------------------------------------------
# List of plugins to mine.
# Note that it is a dictionary where every key is the name you give to the
# plugin and the value is a dictionary with:
# 'plugin_instance', 'type', 'type_instance', 'plugin_name'
# The values assigned to these keys define an and-filter:
# you will get only the data whose attributes
# ('plugin_instance', 'type', 'type_instance', 'plugin_name') all match
# the specified values.
# Note that if you do not want to filter on one attribute, do not express it.
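# Worked example: the cpu__percent_idle entry below keeps only records with
# plugin_name == "cpu" AND type == "percent" AND type_instance == "idle"
# AND an empty plugin_instance; any attribute left out (e.g. value_instance)
# is not filtered on.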
selected_plugins:
# EXTRA FOR THE SHARED
# cloud_contextswitch_involuntary:
# type: contextswitch
# type_instance: involuntary
# plugin_name: cloud
# 5 METRICS - FUNDAMENTAL
load_longterm:
value_instance: longterm
plugin_name: load
cpu__percent_idle:
plugin_instance: ''
type: percent
type_instance: idle
plugin_name: cpu
memory__memory_free:
plugin_instance: ''
type: memory
type_instance: free
plugin_name: memory
vmem__vmpage_io_memory_in:
plugin_instance: ''
type: vmpage_io
type_instance: memory
value_instance: in
plugin_name: vmem
swap_swapfile_swap_free:
type: swap
type_instance: free
plugin_name: swap
# +5 THAT SHOULD IMPROVE
# cpu__percent_wait:
# plugin_instance: ''
# type: percent
# type_instance: wait
# plugin_name: cpu
# cpu__percent_system:
# plugin_instance: ''
# type: percent
# type_instance: system
# plugin_name: cpu
# vmem__vmpage_io_memory_out:
# plugin_instance: ''
# type: vmpage_io
# type_instance: memory
# value_instance: out
# plugin_name: vmem
# interface__if_octets__tx:
# type: if_octets
# type_instance: ''
# value_instance: tx
# plugin_name: interface
# interface__if_octets__rx:
# type: if_octets
# type_instance: ''
# value_instance: rx
# plugin_name: interface
# # +5 TO CHALLENGE ALGOS IN HIGH DIMENSIONALITY
# df_var_percent_bytes_free:
# plugin_instance: var
# type: percent_bytes
# type_instance: free
# plugin_name: df
# uptime__uptime_:
# plugin_instance: ''
# type: uptime
# type_instance: ''
# plugin_name: uptime
# processes__fork_rate_:
# plugin_instance: ''
# type: fork_rate
# type_instance: ''
# plugin_name: processes
# processes__ps_state_sleeping:
# plugin_instance: ''
# type: ps_state
# type_instance: sleeping
# plugin_name: processes
# processes__ps_state_blocked:
# plugin_instance: ''
# type: ps_state
# type_instance: blocked
# plugin_name: processes
...