Commit 68b005a2 authored by Domenico Giordano's avatar Domenico Giordano
Browse files
parents 98b521e9 f859632c
......@@ -12,6 +12,7 @@
#
# To run the same script manually, assuming only Docker is available, run:
#
# CI_PROJECT_DIR=`pwd | sed -e 's@/tests/spark_etl@@'`
# docker run --rm -e CI_USER=$CI_USER -e CI_USER_PASSWD=$CI_USER_PASSWD -e CI_PROJECT_DIR=${CI_PROJECT_DIR} -v /tmp:/tmp -v /builds:/builds -v `pwd`:/work -v /var/run/docker.sock:/var/run/docker.sock gitlab-registry.cern.ch/cloud-infrastructure/data-analytics/compose:qa /work/tests/spark_etl/ci_test_script.sh
#
# Consider opening the Spark connection ports in iptables
......
......@@ -118,11 +118,11 @@
{
"data": {
"text/plain": [
"dict_items([('spark.extraListeners', 'sparkmonitor.listener.JupyterSparkMonitorListener'), ('spark.driver.extraClassPath', '/usr/local/lib/swan/extensions/sparkmonitor/listener.jar')])"
]
 
 
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
......@@ -133,11 +133,11 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"ExecuteTime": {
 
 
"start_time": "2020-03-24T16:26:10.913524Z"
}
},
"outputs": [
{
......
......@@ -56,32 +56,32 @@
"end_time": "2020-04-06T16:57:54.952772Z",
"start_time": "2020-04-06T16:57:54.946536Z"
}
},
"outputs": [],
 
 
"from etl.spark_etl import cluster_utils"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
 
 
"start_time": "2020-04-06T16:57:59.373816Z"
}
},
"outputs": [],
"source": [
 
 
"sc, spark, conf = cluster_utils.set_spark()"
]
},
{
"cell_type": "markdown",
 
 
"source": [
"# Test data extraction "
]
},
{
......@@ -95,11 +95,11 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
 
 
"start_time": "2020-04-06T16:58:01.859192Z"
}
},
"outputs": [],
"source": [
......
......@@ -124,7 +124,7 @@ def run_tests():
# output dir in user HDFS area
outbasepath = "test_rally_errors"
# input file path with data to process with spark
inbasepath = "/project/monitoring/archive/openstack/logs/generic/rallytester/2020/0*/0[1-2]" # noqa: E501
inbasepath = "/project/monitoring/archive/openstack/logs/generic/rallytester/2021/0*/0[1-2]" # noqa: E501
# schema file
schemafile = 'rally_schema.json'
......@@ -138,7 +138,7 @@ def run_tests():
# in general, if enough data are collected in a single day,
# they are representative of the whole data structure expected
# on the other days
sdir = '/project/monitoring/archive/openstack/logs/generic/rallytester/2020/02/01' # noqa: E501
sdir = '/project/monitoring/archive/openstack/logs/generic/rallytester/2021/02/01' # noqa: E501
cluster_utils.get_schema(spark, sdir, schemafile)
myprint("Load the schema from the file")
......
......@@ -106,32 +106,32 @@
#output dir in user HDFS area
outbasepath="test_rally_errors"
#output base file name
outbasefile="rally_errors.parquet"
#input file path with data to process with spark
inbasepath="/project/monitoring/archive/openstack/logs/generic/rallytester/2020/0*/01"
inbasepath="/project/monitoring/archive/openstack/logs/generic/rallytester/2021/0*/01"
#schema file
schemafile='rally_schema.json'
```
%% Cell type:code id: tags:
``` python
!hdfs dfs -ls /project/monitoring/archive/openstack/logs/generic/rallytester/2020/02/01
!hdfs dfs -ls /project/monitoring/archive/openstack/logs/generic/rallytester/2021/02/01
```
%% Cell type:code id: tags:
``` python
cluster_utils.get_list_dirs('/project/monitoring/archive/openstack/logs/generic/rallytester/2020/02/01')
cluster_utils.get_list_dirs('/project/monitoring/archive/openstack/logs/generic/rallytester/2021/02/01')
```
%% Cell type:code id: tags:
``` python
full_df = spark.read.json('/project/monitoring/archive/openstack/logs/generic/rallytester/2020/02/01')
full_df = spark.read.json('/project/monitoring/archive/openstack/logs/generic/rallytester/2021/02/01')
```
%% Cell type:code id: tags:
``` python
......@@ -145,11 +145,11 @@
# in general, if enough data are collected in a single day,
# they are representative of the whole data structure expected
# on the other days
spark_df = cluster_utils.get_schema(spark,
'/project/monitoring/archive/openstack/logs/generic/rallytester/2020/02/01',
'/project/monitoring/archive/openstack/logs/generic/rallytester/2021/02/01',
schemafile) #<<<< If you do not have a schema file pre-defined
```
%% Cell type:code id: tags:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment