Commit a65b8192 authored by Georgios Bitzes
Browse files

Implement ParanoidManifestChecker to try and catch potential MANIFEST corruption early

parent 7bbcef95
Pipeline #1432887 failed with stages
in 91 minutes and 15 seconds
......@@ -86,6 +86,7 @@ add_library(XrdQuarkDB SHARED
storage/KeyDescriptorBuilder.cc storage/KeyDescriptorBuilder.hh
storage/KeyLocators.hh
storage/LeaseInfo.hh
storage/ParanoidManifestChecker.cc storage/ParanoidManifestChecker.hh
storage/PatternMatching.hh
storage/Randomization.cc storage/Randomization.hh
storage/ReverseLocator.hh
......
......@@ -33,6 +33,8 @@
#include "storage/ExpirationEventIterator.hh"
#include "storage/ReverseLocator.hh"
#include "storage/InternalKeyParsing.hh"
#include "storage/ConsistencyScanner.hh"
#include "storage/ParanoidManifestChecker.hh"
#include "utils/IntToBinaryString.hh"
#include "utils/TimeFormatting.hh"
#include <sys/stat.h>
......@@ -153,6 +155,7 @@ StateMachine::StateMachine(std::string_view f, bool write_ahead_log, bool bulk_l
ensureClockSanity(!dirExists);
retrieveLastApplied();
manifestChecker.reset(new ParanoidManifestChecker(filename));
consistencyScanner.reset(new ConsistencyScanner(*this));
}
......@@ -184,6 +187,7 @@ void StateMachine::ensureClockSanity(bool justCreated) {
}
StateMachine::~StateMachine() {
manifestChecker.reset();
consistencyScanner.reset();
if(db) {
......@@ -1879,10 +1883,10 @@ rocksdb::Status StateMachine::noop(LogIndex index) {
}
//------------------------------------------------------------------------------
// Return health information about the state machine
// Return health information regarding free space
//------------------------------------------------------------------------------
std::vector<HealthIndicator> StateMachine::getHealthIndicators() {
std::string description = "FREE-SPACE-SM";
HealthIndicator StateMachine::getFreeSpaceHealth() {
std::string description = "SM-FREE-SPACE";
struct statfs out;
if(statfs(filename.c_str(), &out) != 0) {
......@@ -1911,7 +1915,22 @@ std::vector<HealthIndicator> StateMachine::getHealthIndicators() {
status = chooseWorstHealth(status, HealthStatus::kYellow);
}
return { HealthIndicator(status, description, SSTR(freeBytes << " bytes (" << percentFree << "%)")) };
return HealthIndicator(status, description, SSTR(freeBytes << " bytes (" << percentFree << "%)"));
}
//------------------------------------------------------------------------------
// Return health information about the state machine
//------------------------------------------------------------------------------
std::vector<HealthIndicator> StateMachine::getHealthIndicators() {
std::string description = "SM-MANIFEST-TIMEDIFF";
HealthStatus healthStatus = HealthStatus::kGreen;
Status status = manifestChecker->getLastStatus();
if(!status.ok()) {
healthStatus = HealthStatus::kRed;
}
return { getFreeSpaceHealth(), HealthIndicator(healthStatus, description, status.getMsg()) };
}
rocksdb::Status StateMachine::manualCompaction() {
......
......@@ -31,7 +31,6 @@
#include "utils/RequestCounter.hh"
#include "storage/KeyDescriptor.hh"
#include "storage/KeyLocators.hh"
#include "storage/ConsistencyScanner.hh"
#include "storage/KeyConstants.hh"
#include "storage/LeaseInfo.hh"
#include "health/HealthIndicator.hh"
......@@ -42,6 +41,9 @@
namespace quarkdb {
class ConsistencyScanner;
class ParanoidManifestChecker;
enum class LeaseAcquisitionStatus {
kKeyTypeMismatch,
kAcquired,
......@@ -366,7 +368,7 @@ private:
std::mutex writeMtx;
std::unique_ptr<rocksdb::DB> db;
std::unique_ptr<ParanoidManifestChecker> manifestChecker;
std::unique_ptr<ConsistencyScanner> consistencyScanner;
const std::string filename;
......@@ -375,6 +377,11 @@ private:
Timekeeper timeKeeper;
RequestCounter requestCounter;
//----------------------------------------------------------------------------
// Return health information regarding free space
//----------------------------------------------------------------------------
HealthIndicator getFreeSpaceHealth();
};
......
// ----------------------------------------------------------------------
// File: ParanoidManifestChecker.cc
// Author: Georgios Bitzes - CERN
// ----------------------------------------------------------------------
/************************************************************************
* quarkdb - a redis-like highly available key-value store *
* Copyright (C) 2020 CERN/Switzerland *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>.*
************************************************************************/
#include "storage/ParanoidManifestChecker.hh"
#include "utils/DirectoryIterator.hh"
#include "utils/StringUtils.hh"
#include <sys/stat.h>
namespace quarkdb {
//------------------------------------------------------------------------------
// Constructor receiving the rocksdb path: remember the path, then hand the
// main() member function to the assisted thread.
//------------------------------------------------------------------------------
ParanoidManifestChecker::ParanoidManifestChecker(std::string_view path)
  : mPath(path)
{
  // Started only after mPath has been initialized, since main() reads it.
  mThread.reset(&ParanoidManifestChecker::main, this);
}
//------------------------------------------------------------------------------
// Main thread checking the status on regular intervals: run checkDB on the
// stored path, log an error if the result is not ok, publish the result
// through mLastStatus, then wait five minutes. The loop exits once the
// assistant reports that termination was requested.
//------------------------------------------------------------------------------
void ParanoidManifestChecker::main(ThreadAssistant &assistant) {
  for(;;) {
    if(assistant.terminationRequested()) {
      break;
    }

    Status outcome = checkDB(mPath);
    const bool healthy = outcome.ok();

    if(!healthy) {
      qdb_error("Potential MANIFEST corruption for DB at " << mPath << "(" << outcome.getMsg() << ")");
    }

    // Publish the result (ok or not) for getLastStatus()
    mLastStatus.set(outcome);

    assistant.wait_for(std::chrono::minutes(5));
  }
}
//------------------------------------------------------------------------------
// Strict ordering of two timespec values: compare seconds first, and fall
// back to nanoseconds only when the seconds are equal.
//
// Takes const references, as the operands are never modified - this also
// allows comparing const objects and temporaries.
//------------------------------------------------------------------------------
bool operator<(const struct timespec &one, const struct timespec &two) {
  if(one.tv_sec != two.tv_sec) {
    return one.tv_sec < two.tv_sec;
  }

  return one.tv_nsec < two.tv_nsec;
}
//------------------------------------------------------------------------------
// Check the given DB path.
//
// Scans the directory, and records the most recent modification time among
// entries whose name starts with "MANIFEST", and among entries whose name
// ends with ".sst". Entries which cannot be stat'ed are ignored.
//
// Returns a non-ok Status (code 1) if the newest SST file is at least one
// hour newer than the newest MANIFEST, with the difference in seconds as
// message. If no MANIFEST or no SST file was seen, there is nothing to
// compare, and an ok Status is returned.
//------------------------------------------------------------------------------
Status ParanoidManifestChecker::checkDB(std::string_view path) {
  DirectoryIterator iter(path);
  struct dirent* entry = nullptr;

  // Value-initialized, and guarded by the "found" flags: the mtimes must
  // never be read before something has been stored in them.
  struct timespec manifestMtime = {};
  struct timespec sstMtime = {};
  bool manifestFound = false;
  bool sstFound = false;

  while((entry = iter.next())) {
    struct stat statbuf;
    if(stat(SSTR(path << "/" << entry->d_name).c_str(), &statbuf) != 0) {
      continue;
    }

    if(StringUtils::startsWith(entry->d_name, "MANIFEST")) {
      if(!manifestFound || manifestMtime < statbuf.st_mtim) {
        manifestMtime = statbuf.st_mtim;
      }
      manifestFound = true;
    }

    if(StringUtils::endsWith(entry->d_name, ".sst")) {
      if(!sstFound || sstMtime < statbuf.st_mtim) {
        sstMtime = statbuf.st_mtim;
      }
      sstFound = true;
    }
  }

  if(!manifestFound || !sstFound) {
    // Nothing meaningful to compare, eg a freshly created DB without any SST
    // files yet. Do not raise a false alarm.
    std::string msg = "MANIFEST or SST files not found, nothing to compare";
    return Status(0, msg);
  }

  // Use a wide type: time_t differences need not fit into an int.
  long long secDiff = static_cast<long long>(sstMtime.tv_sec) - static_cast<long long>(manifestMtime.tv_sec);
  std::string diff = SSTR(secDiff << " sec");

  // 1 hour should be more than enough (?)
  if(secDiff >= 3600) {
    return Status(1, diff);
  }

  return Status(0, diff);
}
//------------------------------------------------------------------------------
// Get last status: hands out a copy of whatever is currently held in
// mLastStatus, which main() updates after every check.
//------------------------------------------------------------------------------
Status ParanoidManifestChecker::getLastStatus() const {
  Status snapshot = mLastStatus.get();
  return snapshot;
}
}
// ----------------------------------------------------------------------
// File: ParanoidManifestChecker.hh
// Author: Georgios Bitzes - CERN
// ----------------------------------------------------------------------
/************************************************************************
* quarkdb - a redis-like highly available key-value store *
* Copyright (C) 2020 CERN/Switzerland *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>.*
************************************************************************/
#ifndef QUARKDB_PARANOID_MANIFEST_CHECKER_HH
#define QUARKDB_PARANOID_MANIFEST_CHECKER_HH
#include "utils/AssistedThread.hh"
#include <rocksdb/db.h>
#include <string_view>
#include "Status.hh"
#include "utils/Synchronized.hh"
namespace quarkdb {
//------------------------------------------------------------------------------
// We've observed in the past RocksDB corrupting its MANIFEST file, while new
// SST files were being written.
//
// This is an attempt at detecting this problem early, but we're not sure if
// it works, or how useful it might be.
//------------------------------------------------------------------------------
class ParanoidManifestChecker {
public:
  //----------------------------------------------------------------------------
  // Constructor receiving the rocksdb path. Starts the background thread
  // running main().
  //----------------------------------------------------------------------------
  ParanoidManifestChecker(std::string_view path);

  //----------------------------------------------------------------------------
  // Main thread checking the status on regular intervals
  //----------------------------------------------------------------------------
  void main(ThreadAssistant &assistant);

  //----------------------------------------------------------------------------
  // Check the given DB path. Static: does not depend on any instance state,
  // can be called without spinning up the background thread.
  //----------------------------------------------------------------------------
  static Status checkDB(std::string_view path);

  //----------------------------------------------------------------------------
  // Get last status, as stored by the most recent check in main()
  //----------------------------------------------------------------------------
  Status getLastStatus() const;

private:
  //----------------------------------------------------------------------------
  // Declaration order matters here: members are destroyed in reverse order of
  // declaration. mThread runs main(), which reads mPath and writes
  // mLastStatus, so it must be declared LAST in order to be torn down FIRST,
  // while the data it uses is still alive.
  //
  // NOTE(review): this relies on AssistedThread stopping and joining the
  // thread in its destructor - confirm against utils/AssistedThread.hh.
  //----------------------------------------------------------------------------
  std::string mPath;
  Synchronized<Status> mLastStatus;
  AssistedThread mThread;
};
}
#endif
......@@ -30,7 +30,7 @@ using namespace quarkdb;
//------------------------------------------------------------------------------
// Construct iterator object on the given path - must be a directory.
//------------------------------------------------------------------------------
DirectoryIterator::DirectoryIterator(const std::string &mypath)
DirectoryIterator::DirectoryIterator(std::string_view mypath)
: path(mypath), reachedEnd(false), dir(nullptr) {
dir = opendir(path.c_str());
......
......@@ -26,6 +26,7 @@
#include <dirent.h>
#include <string>
#include <string_view>
namespace quarkdb {
......@@ -35,7 +36,7 @@ public:
//----------------------------------------------------------------------------
// Construct iterator object on the given path - must be a directory.
//----------------------------------------------------------------------------
DirectoryIterator(const std::string &path);
DirectoryIterator(std::string_view path);
//----------------------------------------------------------------------------
// Destructor
......
......@@ -26,6 +26,7 @@
#include "storage/ReverseLocator.hh"
#include "storage/PatternMatching.hh"
#include "storage/ExpirationEventIterator.hh"
#include "storage/ConsistencyScanner.hh"
#include "StateMachine.hh"
#include "test-utils.hh"
#include <gtest/gtest.h>
......
......@@ -29,6 +29,7 @@
#include "raft/RaftReplicator.hh"
#include "raft/RaftConfig.hh"
#include "raft/RaftTrimmer.hh"
#include "storage/ConsistencyScanner.hh"
#include "Configuration.hh"
#include "QuarkDBNode.hh"
#include "../test-utils.hh"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment