Commit 355443df authored by Domenico Giordano

moved spark_etl integration test to new dir

parent cec1bd94
@@ -216,14 +216,14 @@ spark_etl: &template_test_spark_etl
  rules:
    - if: '$CI_COMMIT_BRANCH =~ /^qa.*$/ || $CI_COMMIT_TAG =~ /^v.*$/'
  script:
-   - . ${CI_PROJECT_DIR}/tests/spark_etl/ci_test_script.sh
+   - . ${CI_PROJECT_DIR}/tests/spark_etl/integration/ci_test_script.sh
    - start_docker_compose
  after_script:
-   - . ${CI_PROJECT_DIR}/tests/spark_etl/ci_test_script.sh
+   - . ${CI_PROJECT_DIR}/tests/spark_etl/integration/ci_test_script.sh
    - stop_docker_compose
  artifacts:
    paths:
-     - $CI_PROJECT_DIR/tests/spark_etl/*
+     - $CI_PROJECT_DIR/tests/spark_etl/integration/*
    expire_in: 1 week
    when: always
#!/bin/bash -e
# This script is used by the GitLab CI of this repository to run a
# test of access to and processing of Spark data, based on the script test_spark_connector.sh
# Requirements:
# - spark_etl libs (distributed in docker image)
# - Kerberos authentication (passed via CI)
# - cvmfs spark libs (exposed via cvmfs)
# In order to access the needed libraries from cvmfs,
# a cvmfs service is started with docker-compose
#
# To run the same script manually, assuming only Docker is available, run
#
# CI_PROJECT_DIR=`pwd | sed -e 's@/tests/spark_etl/integration@@'`
# docker run --rm -e CI_USER=$CI_USER -e CI_USER_PASSWD=$CI_USER_PASSWD -e CI_PROJECT_DIR=${CI_PROJECT_DIR} -v /tmp:/tmp -v /builds:/builds -v `pwd`:/work -v /var/run/docker.sock:/var/run/docker.sock gitlab-registry.cern.ch/cloud-infrastructure/data-analytics/compose:qa bash -c '. /work/ci_test_script.sh; start_docker_compose'
#
# Consider opening the Spark connection ports in iptables:
#
# sudo iptables -I INPUT -p tcp -m multiport --dports 5001:6000 -m comment --comment "00200 firewall for hadoop jobs" -j ACCEPT
# sudo iptables -I DOCKER-USER -p tcp -m multiport --dports 5001:6000 -m comment --comment "00200 firewall for hadoop jobs" -j ACCEPT
#
function start_docker_compose() {
  # Obtain a Kerberos ticket for the CI user in a dedicated credential cache
  mkdir -p $KRB5DIR
  export KRB5CCNAME=$KRB5DIR/krb5cc_docker
  kdestroy -c $KRB5CCNAME
  echo $CI_USER_PASSWD | kinit -c $KRB5CCNAME $CI_USER@CERN.CH
  klist -c $KRB5CCNAME
  ls -l $KRB5DIR
  # Dump the resolved docker-compose configuration for debugging
  echo -e "\n------- DUMP docker-compose.yml ----\n"
  docker-compose -f docker-compose.yml config
  echo -e "\n------- END docker-compose.yml ----\n"
  # Start from a clean state, then bring up the test stack;
  # the exit code of srv_spark_etl becomes the exit code of the test
  docker-compose -f docker-compose.yml down --remove-orphans --volumes
  docker-compose pull
  docker-compose -f docker-compose.yml -p spark_etl up --remove-orphans --renew-anon-volumes --abort-on-container-exit --exit-code-from srv_spark_etl
  #docker-compose logs -f 2>&1 >> compose.log &
}
function stop_docker_compose(){
cd $WORK_DIR
docker-compose down --remove-orphans --volumes # First remove containers
docker container prune -f
docker-compose down --rmi all --remove-orphans --volumes # Then remove images
}
export WORK_DIR=$(readlink -f $(dirname $BASH_SOURCE))
echo WORK_DIR $WORK_DIR
cd $WORK_DIR
export CI_COMMIT_BRANCH=${CI_COMMIT_BRANCH:-noCI}
export COMMIT_TAG=${CI_COMMIT_TAG:-$CI_COMMIT_BRANCH}
export CVMFSDIR=/builds/cvmfs-${COMMIT_TAG}
export KRB5DIR=/builds/krb5-${COMMIT_TAG}
export IMAGE_TAG=${COMMIT_TAG}
# docker-compose pipeline to run the pyspark test,
# providing cvmfs via a container service.
# Uses docker-compose variable substitution, as described in
# https://docs.docker.com/compose/compose-file/#variable-substitution
version: '3.2'
# using this version to get cvmfs
#https://github.com/moby/moby/issues/34936
services:
srv_cvmfs:
image: gitlab-registry.cern.ch/hep-benchmarks/hep-workloads-builder/cvmfs-image:v1.0
command: -r sft.cern.ch -t /tmp/traces
privileged: true
#container_name: cnt_cvmfs_sft
volumes:
- $CVMFSDIR:/cvmfs:shared
network_mode: host
# This does not work
# https://docs.docker.com/compose/compose-file/
# https://docs.docker.com/storage/bind-mounts/#configure-bind-propagation
# - type: bind
# source: /tmp/cvmfs-stf
# target: /cvmfs
# propagation: shared
#ERROR: The Compose file './docker-compose.yml' is invalid because:
#services.srv_cvmfs.volumes contains unsupported option: 'propagation'
srv_spark_etl:
image: gitlab-registry.cern.ch/cloud-infrastructure/data-analytics/sparknotebook:${IMAGE_TAG}
command: sh -c "echo 'sleep to give time to /cvmfs to get alive' && /usr/bin/sleep 30 && /work/tests/spark_etl/test_spark_connector.sh"
user: root
depends_on:
- srv_cvmfs
volumes:
- $CVMFSDIR:/cvmfs:shared
- $CI_PROJECT_DIR:/work
- $KRB5DIR:$KRB5DIR
environment:
- KRB5CCNAME=${KRB5CCNAME}
network_mode: host
%% Cell type:markdown id: tags:
# Test access to Spark via pySpark
%% Cell type:markdown id: tags:
This notebook installs the data-analytics package
and tests its basic functionality.
To run it in SWAN, follow these steps:
1) pass your Kerberos credentials
2) install the package, using a specific tag (qa in this example); a sketch is given below
3) run the notebook
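%% Cell type:markdown id: tags:
A minimal sketch of step 2: install the package from the project repository with pip. The repository URL and the `--user` target are assumptions; adapt them to your environment.
%% Cell type:code id: tags:
``` python
# Hypothetical install of the data-analytics package at tag qa (adjust URL and tag as needed)
!pip install --user git+https://gitlab.cern.ch/cloud-infrastructure/data-analytics.git@qa
```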
%% Cell type:code id: tags:
``` python
import os, sys
# Prepend the user-local site-packages (populated by the --user install) to PYTHONPATH
# so the installed package can be imported
os.environ['PYTHONPATH'] = os.environ['HOME'] + '/.local/lib/python3.6/site-packages/:' + os.environ['PYTHONPATH']
```
%% Cell type:markdown id: tags:
# Test package
%% Cell type:code id: tags:
``` python
from etl.spark_etl import cluster_utils
```
%% Cell type:code id: tags:
``` python
# Start the spark session
sc, spark, conf = cluster_utils.set_spark()
```
%% Cell type:markdown id: tags:
# Test data extraction
%% Cell type:markdown id: tags:
In this example, access the rally data and extract a subset of it (a minimal sketch follows).
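%% Cell type:markdown id: tags:
A minimal sketch of such an extraction with plain PySpark; the HDFS path, the file format, and the column names are placeholders, not the actual rally data layout.
%% Cell type:code id: tags:
``` python
# Hypothetical example: read rally monitoring data and keep a small subset.
# Path and column names are placeholders; replace them with the real dataset.
df = spark.read.json("hdfs:///path/to/rally/data/*.json")
subset = df.select("timestamp", "metric", "value").limit(100)
subset.show(5)
```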