Commit 42d8afd0 authored by Crystal Chua's avatar Crystal Chua
Browse files

Merge branch 'master' of https://gitlab.cern.ch/eos/quarkdb

parents a5fa6613 9931243d
Pipeline #1565881 failed with stages
in 83 minutes and 15 seconds
......@@ -4,3 +4,4 @@ _book/
.vscode
*.sublime-project
*.sublime-workspace
docs/site
......@@ -8,6 +8,33 @@ stages:
before_script:
- export GTEST_COLOR="1"
# Build-stage job: renders the documentation with mkdocs in a fedora image and
# keeps the generated site as a job artifact in a directory named after the job.
make-docs:
stage: build
image: fedora:latest
script:
- dnf install -y python3-pip git
# NOTE(review): presumably pulls in the mkdocs theme submodule — confirm.
- git submodule update --recursive --init
- pip3 install mkdocs
# The changelog is copied into the docs tree as the release-notes page.
- cp CHANGELOG.md docs/docs/release-notes.md
- cd docs
- mkdocs build
- cd ..
# Rename the build output so it matches the artifact path declared below.
- mv docs/site "$CI_JOB_NAME"
artifacts:
paths:
- "$CI_JOB_NAME"
# Build-stage job: runs packaging/gitlab-build.sh inside the C8 build image,
# then keeps the RPMS and SRPMS directories as artifacts in a directory named
# after the job.
c8:
stage: build
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-c8
script:
- packaging/gitlab-build.sh
- mkdir ${CI_JOB_NAME}
# Gather binary packages (/root/rpmbuild/RPMS) and source packages (build/SRPMS)
# under the artifact directory.
- cp -r /root/rpmbuild/RPMS build/SRPMS ${CI_JOB_NAME}
artifacts:
paths:
- "$CI_JOB_NAME"
cc7:
stage: build
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-cc7
......@@ -62,6 +89,16 @@ cc7-test:
- quarkdb-tests
- quarkdb-stress-tests
# Test-stage job: installs the RPMs from the c8 job's artifacts into the C8
# build image, then runs quarkdb-tests and quarkdb-stress-tests.
c8-test:
stage: test
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-c8
dependencies:
- c8
script:
# c8/ is the artifact directory fetched from the c8 dependency listed above.
- yum -y localinstall c8/RPMS/*
- quarkdb-tests
- quarkdb-stress-tests
fedora-test:
stage: test
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-fedora
......@@ -87,62 +124,90 @@ fedora-tsan-test:
rpms:
stage: publish
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-cc7
image: gitlab-registry.cern.ch/eos/gitlab-eos/cc7:latest
dependencies:
- cc7
- c8
- fedora
script:
- for platform in cc7 fedora; do sudo -u stci -H packaging/repo-manager.py --action add --base /eos/project/s/storage-ci/www/quarkdb --ref ${CI_COMMIT_REF_NAME} --packages ${platform}/RPMS/* ${platform}/SRPMS/* ; done
- yum install -y createrepo
- automount
- cat "$STCI_PASSWORD" | kinit stci
- eosfusebind
- for platform in cc7 c8 fedora; do packaging/repo-manager.py --action add --base /eos/project/s/storage-ci/www/quarkdb --ref ${CI_COMMIT_REF_NAME} --packages ${platform}/RPMS/* ${platform}/SRPMS/* ; done
- sleep 60
tags:
- docker-cc7
- docker-privileged
retry: 2
only:
- branches@eos/quarkdb
- tags@eos/quarkdb
docs:
stage: publish
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-cc7
script:
- gitbook build
- chown -R stci _book
- SNAPSHOT=$(date +%s)
- TARGET="/eos/project/q/quarkdb/www/docs/${CI_COMMIT_REF_NAME}"
- STAGING_AREA="$TARGET-${SNAPSHOT}"
- sudo -u stci -H cp -r _book "$STAGING_AREA"
- sudo -u stci -H packaging/replace-directory.sh "$STAGING_AREA" "$TARGET"
tags:
- docker-cc7
retry: 2
only:
- branches@eos/quarkdb
- tags@eos/quarkdb
stage: publish
image: gitlab-registry.cern.ch/eos/gitlab-eos/cc7:latest
dependencies:
- make-docs
script:
- automount
- cat "$STCI_PASSWORD" | kinit stci
- eosfusebind
- yum install -y git tree
- SNAPSHOT=$(date +%s)
- TARGET="/eos/project/q/quarkdb/www/docs/${CI_COMMIT_REF_NAME}"
- STAGING_AREA="$TARGET-${SNAPSHOT}"
- tree
- cp -r make-docs "$STAGING_AREA"
- packaging/replace-directory.sh "$STAGING_AREA" "$TARGET"
- sleep 60
tags:
- docker-privileged
retry: 2
only:
- branches@eos/quarkdb
- tags@eos/quarkdb
coverage:
stage: publish
image: gitlab-registry.cern.ch/eos/quarkdb/build-image-cc7
image: gitlab-registry.cern.ch/eos/gitlab-eos/cc7:latest
dependencies:
- ubuntu-asan
script:
- chown -R stci build/coverage-report
- automount
- cat "$STCI_PASSWORD" | kinit stci
- eosfusebind
- SNAPSHOT=$(date +%s)
- TARGET="/eos/project/q/quarkdb/www/coverage/${CI_COMMIT_REF_NAME}"
- STAGING_AREA="$TARGET-${SNAPSHOT}"
- sudo -u stci -H cp -r build/coverage-report "$STAGING_AREA"
- sudo -u stci -H packaging/replace-directory.sh "$STAGING_AREA" "$TARGET"
- cp -r build/coverage-report "$STAGING_AREA"
- packaging/replace-directory.sh "$STAGING_AREA" "$TARGET"
- sleep 60
tags:
- docker-cc7
- docker-privileged
retry: 2
only:
- branches@eos/quarkdb
- tags@eos/quarkdb
# Manually triggered job: builds the source RPM and submits it to koji as a
# scratch build (quarkdb7), waiting for the build to finish.
cc7-koji-scratch:
stage: koji
image: gitlab-registry.cern.ch/linuxsupport/cc7-base
script:
- yum install -y koji
- yum install -y git rpm-build python python3
# Install the koji client configuration shipped in the repository.
- mkdir ~/.koji
- cp ci/koji/config ~/.koji
# Obtain a Kerberos ticket for qdbkoji using the QDBKOJI_PASSWORD variable.
- echo "${QDBKOJI_PASSWORD}" | kinit qdbkoji
- packaging/make-srpm.sh
- koji build --scratch --wait quarkdb7 ./build/SRPMS/*.src.rpm
when: manual
cc7-koji:
stage: koji
image: gitlab-registry.cern.ch/linuxsupport/cc7-base
script:
- yum install -y koji
- yum install -y git rpm-build python
- yum install -y git rpm-build python python3
- mkdir ~/.koji
- cp ci/koji/config ~/.koji
- echo "${QDBKOJI_PASSWORD}" | kinit qdbkoji
......@@ -155,6 +220,20 @@ cc7-koji:
only:
- tags@eos/quarkdb
# build-image-stage job for the C8 build image: points at ci/c8/Dockerfile and
# the registry location of the image, and only runs when BUILD_IMAGES is "1".
c8-image:
stage: build-image
variables:
TO: gitlab-registry.cern.ch/eos/quarkdb/build-image-c8
DOCKER_FILE: ci/c8/Dockerfile
NO_CACHE: 1
script:
# NOTE(review): the script is intentionally empty here; presumably the
# docker-image-build runner performs the build from the variables above — confirm.
- ""
tags:
- docker-image-build
only:
variables:
- $BUILD_IMAGES == "1"
cc7-image:
stage: build-image
variables:
......
......@@ -10,3 +10,9 @@
[submodule "deps/qclient"]
path = deps/qclient
url = https://gitlab.cern.ch/eos/qclient.git
[submodule "deps/asio"]
path = deps/asio
url = https://github.com/chriskohlhoff/asio
[submodule "deps/mkdocs-material"]
path = deps/mkdocs-material
url = https://github.com/squidfunk/mkdocs-material.git
# Changelog
All notable changes to this project will be documented in this file.
## Unreleased
- A race condition was sometimes causing elections to fail spuriously, so that
electing a stable leader required slightly more rounds than it should have.
### Bug fixes
- The mechanism meant to provide an early warning for potential ``MANIFEST``
corruption was flaky, and would sometimes report a problem where none existed.
### Improvements
- Implementation of an optional part of raft, pre-vote. This should prevent partitioned,
or otherwise flaky rejoining servers from triggering unnecessary and disruptive elections.
A node will first issue an experimental voting round before advancing its term, and start campaigning
in earnest only if it has a good chance of winning.
Many thanks to Franck Eyraud (JRC) for the bug report concerning the erroneous ``MANIFEST``-related
warning.
## 0.4.2 (2020-03-12)
### Bug fixes
- Under complicated conditions (follower is very far behind leader + network instabilities),
replication towards a particular follower could become stuck. (To work around this, restart the leader node.)
- Running ``DEL`` on a lease key would cause all nodes in a cluster to crash
with an assertion. ``DEL`` will now simply release the given lease, as if
``lease-release`` had been called.
### New features
- Implement command ``quarkdb-verify-checksum`` for manually running a full checksum scan.
- Addition of ``quarkdb-validate-checkpoint`` tool for ensuring that a given
checkpoint is valid -- useful to run in backup scripts before streaming a given
checkpoint for long-term storage.
### Improvements
- Security hardening of the redis parser for unauthenticated clients.
- Package and distribute ``quarkdb-ldb`` tool based on the one provided by RocksDB.
- Attempt to detect potential ``MANIFEST`` corruption early by measuring mtime lag
compared to newest SST file.
Many thanks to Crystal Chua (AARNet) for the bug report and all support offered
related to RocksDB's ``MANIFEST`` corruption issue, as well as to Pete Eby (ORNL)
for finding and reporting the bug causing replication to become stuck.
## 0.4.1 (2020-01-17)
### Bug fixes
- Fixed ability to subscribe to multiple channels with one command, when push types
are active. Previously, the server would erroneously send one "OK" response per
channel subscribed, breaking QClient.
### New features
- Possibility to choose between three different journal fsync policies through
``RAFT-SET-FSYNC-POLICY`` command.
- Implementation of ``CLIENT GETNAME``, and automatic tagging of intercluster
connections.
### Improvements
- Automatic fsync of the raft journal once per second.
- Better cluster resilience in case of sudden machine powercuts.
Many thanks to Franck Eyraud (JRC) for the bug reports relating to sudden poweroff, and valuable discussion on fsync behavior.
## 0.4.0 (2019-12-06)
### Bug fixes
- Locality hints ending with a pipe symbol (|) could subsequently trigger an
assertion and crash when encountered during ``LHSCAN``, due to faulty key parsing code.
The pipe symbol (|) has special meaning inside internal QuarkDB keys, and is used
to escape field separators (#).
### New features
- Addition of ``quarkdb-server`` binary to allow running QDB without XRootD.
- Add support for ``CLIENT SETNAME`` command as aid in debugging.
### Improvements
- Improvements to replication behaviour when one of the followers is very far behind the leader.
Previously, an excessive number of entries were kept in the request pipeline, which
wasted memory and could potentially trigger OOM.
- Switch to CLI11 for command line argument parsing.
- Upgrade rocksdb dependency to v6.2.4.
## 0.3.9 (2019-09-20)
### Bug fixes
- ``DEQUE-SCAN-BACK`` was returning the wrong cursor to signal end of
iteration: ``next:0`` while it should have been ``0``.
- A race condition was sometimes causing elections to fail spuriously.
Establishing a stable quorum would occasionally require slightly more election
rounds than it should have.
### New features
- Implementation of health indicators through ``QUARKDB-HEALTH`` command.
- Added support for RESPv3 push types, activated on a per-client basis through
``ACTIVATE-PUSH-TYPES`` command.
- Implementation of ``LHLOCDEL`` command for conditionally deleting a locality hash
field, only if the provided hint matches.
- Add convenience command ``DEQUE-CLEAR``.
- Add support for ``MATCHLOC`` in ``LHSCAN``, used to filter out results based
on locality hint.
- Add ``RECOVERY-SCAN`` command for scanning through complete keyspace, including
internal rocksdb keys.
- Add tool ``quarkdb-sst-inspect`` to allow low-level inspection of SST files.
- Add command ``RAFT-JOURNAL-SCAN`` to make searching through the contents of the
raft journal easier.
### Improvements
- Protection for a strange case of corruption which brought down a development
test cluster. (last-applied jumped ahead of commit-index by 1024, causing all
writes to stall). From now on, a similar kind of corruption should only take out
a single node, and not spread to the entire cluster.
- Add command ``RAFT-JOURNAL-SCAN`` to make searching through the contents of the
raft journal easier.
- ``KEYS`` is now implemented in terms of ``SCAN``, making prefix matching of the
keyspace just as efficient as with ``SCAN``. (Note: The use of ``KEYS`` is still
generally discouraged due to potentially huge response size)
- Add ``RECOVERY-SCAN`` command for scanning through complete keyspace, including
internal rocksdb keys.
- Add tool ``quarkdb-sst-inspect`` to allow low-level inspection of SST files.
- Removed unused tool ``quarkdb-scrub``.
## 0.3.8 (2019-05-27)
- Prevent elections from hanging on the TCP timeout when one of the member hosts
is dropping packets, which could bring down an otherwise healthy cluster.
......
......@@ -55,6 +55,11 @@ else()
set(BUILDING_WITH_TSAN FALSE)
endif()
#-------------------------------------------------------------------------------
# Link the xrootd library with jemalloc?
#-------------------------------------------------------------------------------
option(XROOTD_JEMALLOC "Link xrootd library with jemalloc?" OFF)
#-------------------------------------------------------------------------------
# Look inside the rocksdb cache, which is used to greatly reduce compilation
# time of QuarkDB.
......@@ -138,7 +143,7 @@ endif()
#-------------------------------------------------------------------------------
# Compiler options
#-------------------------------------------------------------------------------
add_definitions(-Wall -Wextra -Werror -Wno-unused-parameter -std=c++17 -g -fPIC)
add_definitions(-Wall -Wextra -Werror -Wno-unused-parameter -std=c++17 -g -fPIC -DASIO_STANDALONE)
#-------------------------------------------------------------------------------
# Build source and tests
......
# QuarkDB
[![build status](https://gitlab.cern.ch/eos/quarkdb/badges/master/build.svg)](https://gitlab.cern.ch/eos/quarkdb/commits/master)
[![build status](https://gitlab.cern.ch/eos/quarkdb/badges/master/pipeline.svg)](https://gitlab.cern.ch/eos/quarkdb/commits/master)
[![coverage report](https://gitlab.cern.ch/eos/quarkdb/badges/master/coverage.svg)](https://quarkdb.web.cern.ch/quarkdb/coverage/master/)
[QuarkDB](https://gitlab.cern.ch/eos/quarkdb) is a highly available datastore that implements a small subset
......
# A C8 image + build dependencies of quarkdb.
# Significantly improves CI build time, since all packages
# are already there.
FROM gitlab-registry.cern.ch/linuxsupport/c8-base:latest

# MAINTAINER is deprecated; the same metadata is carried by a label instead.
LABEL maintainer="Georgios Bitzes, georgios.bitzes@cern.ch, CERN 2020"

RUN dnf clean all

# Best-effort removal: a failure here is tolerated (|| true).
# NOTE(review): presumably whois-mkpasswd conflicts with the 'expect' package
# installed next — confirm.
RUN dnf remove -y whois-mkpasswd || true

RUN dnf install -y expect git

# Clone quarkdb only to run its dependency-preparation script, then discard the
# checkout. The steps are chained with && so that a failing clone or a failing
# prepare.sh aborts the image build; with ';' the exit status of the final
# 'rm -rf' would mask the failure and a half-provisioned image would be produced.
RUN git clone https://gitlab.cern.ch/eos/quarkdb.git \
 && cd quarkdb \
 && ci/c8/prepare.sh \
 && cd .. \
 && rm -rf quarkdb
#!/usr/bin/env bash
# Installs everything required to build quarkdb on this machine: a bootstrap
# toolchain, the build dependencies declared by the quarkdb source RPM, and
# rocksdb. Stops at the first command that fails.
set -o errexit

##------------------------------------------------------------------------------
## Stage 1: bootstrap packages. These have to be present before 'builddep'
## can be run against the quarkdb source RPM in the next stage.
##------------------------------------------------------------------------------
bootstrap_packages=(
  expect
  gcc-c++
  cmake3
  make
  rpm-build
  which
  git
  yum-utils
  libtsan
  dnf-plugins-core
  python3
  epel-release
)
dnf install -y "${bootstrap_packages[@]}"

##------------------------------------------------------------------------------
## Stage 2: generate the source RPM, then let dnf install the build
## dependencies it declares.
##------------------------------------------------------------------------------
./packaging/make-srpm.sh
dnf builddep -y build/SRPMS/*

##------------------------------------------------------------------------------
## Stage 3: install rocksdb.
##------------------------------------------------------------------------------
ci/install-rocksdb.sh
......@@ -6,7 +6,6 @@ set -e
## step.
##------------------------------------------------------------------------------
yum install -y https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/p/python36-3.6.8-1.el7.x86_64.rpm https://dl.fedoraproject.org/pub/epel/7/x86_64/Packages/p/python36-libs-3.6.8-1.el7.x86_64.rpm
yum install -y gcc-c++ cmake3 make rpm-build which git yum-plugin-priorities yum-utils
##------------------------------------------------------------------------------
......@@ -36,4 +35,4 @@ gitbook build # Gitbook will install more stuff during its first execution
## Install rocksdb
##------------------------------------------------------------------------------
scl enable devtoolset-7 "ci/install-rocksdb.sh"
scl enable devtoolset-8 "ci/install-rocksdb.sh"
#!/usr/bin/env bash
set -ex
apt-get install -y git g++ cmake zlib1g-dev openssl libssl-dev python libbz2-dev lcov uuid-dev libjemalloc-dev libdw-dev libdw1 liblz4-dev libzstd-dev
apt-get install -y git g++ cmake zlib1g-dev openssl libssl-dev python python3 libbz2-dev lcov uuid-dev libjemalloc-dev libdw-dev libdw1 liblz4-dev libzstd-dev
This diff is collapsed.
Subproject commit 22afb86087a77037cd296d27134756c9b0d2cb75
Subproject commit dc5d86d52e380fa0ec558a616992143312de0a65
This diff is collapsed.
Subproject commit 1afe3d97a472fa6d2ad83ece5f46542d8b8aee80
Subproject commit d07e09a2cbcadf9b6de29d1c5e8c7af5eb9164f6
Subproject commit 641fae60f63619ed5d0c9d9e4c4ea5a0ffa3e253
Subproject commit 76a56d89a7740f8dbb01edabf1ea5abc95a67657
# Getting started
By the end of this chapter, you will have
a fully-functional QuarkDB cluster. The steps are:
* [Installation](INSTALLATION.md): Install the QuarkDB binaries into your local system,
along with the necessary dependencies.
* [Configuration](CONFIGURATION.md): Decide which nodes will be part of the QuarkDB
cluster, and configure them.
* [Troubleshooting](TROUBLESHOOTING.md): A list of common errors encountered
during setup, and how to solve them.
# Raft extensions in QuarkDB
Although we follow the raft algorithm closely, we have made several improvements.
Understanding the rest of this page requires a good grasp of
[raft](https://raft.github.io/raft.pdf); please read the paper first.
1. An RPC which only serves as a heartbeat.
Even though _appendEntries_ serves as a heartbeat, it can be problematic: A
pipelined storm of gigantic _appendEntries_ messages will heavily
influence message acknowledgement reception latencies. When using short raft
timeouts, this can easily lead to spurious timeouts and re-elections.
For this reason, we use a separate thread on the leader node which regularly
sends heartbeats, decoupled from replication.
The heartbeat request contains two fields:
* The raft term of the contacting node, for which it is a leader.
* The server identifier (host:port) of the contacting node.
The heartbeat response contains two fields as well:
* The raft term of the node being contacted.
* Whether or not the heartbeat was successful, that is, whether the contacted
node recognizes the sender as leader for the specified term.
The fact that this exchange doesn't access the journal at all makes this a
separate process from replication, as certain locks don't need to be taken,
and makes the cluster much more robust against spurious timeouts.
1. Veto responses on vote requests.
In QuarkDB, a node can reply in three different ways to a vote request:
* Granted
* Denied
* Veto
A veto is simply a stronger form of vote denial. The responder communicates to
the sender that, if they were to become a leader, a critical safety violation
would occur on raft invariants. Namely, if elected, the contacting node would
attempt to overwrite already committed journal entries with conflicting ones.
Clearly, raft already protects from the above scenario -- it's simply an extra,
paranoid precaution. It should never happen that a node receives a quorum of
positive votes, plus a non-zero number of vetoes. If it does happen, we print
a serious error in the logs, and the node does not become leader, despite
having received a quorum of positive votes.
However, this mechanism is quite useful in a different way as well: A node
receiving a veto _knows_ it cannot possibly be the next leader of this cluster,
as its journal does not contain all committed entries. Therefore, a veto
begins a period of election embargo, during which a node will willingly stop
attempting to elect itself, up until the moment it is contacted by a leader node.
This is useful in a number of ways:
* Entirely mitigates the "Disruptive Servers" scenario (see section 4.2.3 of the
Raft PhD thesis), in which a node which is not aware it has been removed from
the cluster repeatedly attempts to become elected, thus making the cluster
unavailable. In QuarkDB, such a node will get vetoed, and willingly stop
making further election attempts.
* During 1-way network partitions, where a machine is able to initiate TCP connections
to other boxes, but others are not able to do the same, QuarkDB nodes will be
unable to receive heartbeats, but able to initiate voting rounds. (We have
had this happen in production _:)_ ) The constant
election attempts would normally bring the entire cluster down, even if a quorum
of nodes are available, and the network partition only affects a single node.
However, the veto mechanism will quickly calm down the partitioned node, and
it will simply wait until the partition has been healed, without disrupting
the rest.
# QuarkDB
[QuarkDB](https://gitlab.cern.ch/eos/quarkdb) is a highly available datastore that implements a small subset
of the redis command set, developed by IT-ST at CERN.
We build on top of [rocksdb](https://github.com/facebook/rocksdb), an embeddable, transactional
key-value store.
High availability is achieved through multiple replicated nodes and the
[raft](https://raft.github.io) distributed consensus algorithm.
# Getting started
Visit [this chapter](GETTING-STARTED.md) for instructions on how to get a
QuarkDB cluster up and running.
There's also a short [screencast demo](https://asciinema.org/a/NdX791Ah4JVkGQnUQkBVm3dDJ),
which shows how to set up a test cluster on localhost.
# Summary
* [Introduction](README.md)
* [Getting started](GETTING-STARTED.md)
* [Installation](INSTALLATION.md)
* [Configuration](CONFIGURATION.md)
* [Troubleshooting](TROUBLESHOOTING.md)
* [Password authentication](AUTHENTICATION.md)
* [Backup & restore](BACKUP.md)
* [Bulkload mode](BULKLOAD.md)
* [Upgrading](UPGRADING.md)
* [Raft basics](RAFT.md)
* [Membership updates](MEMBERSHIP.md)
* Advanced topics
* [Write path](WRITEPATH.md)
* [Raft extensions](RAFT-EXTENSIONS.md)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment