Commit 5812e512 authored by Elvin Sindrilaru

MGM: Fix possible deadlock when starting the MGM with multiple file systems draining

parent 3d7502c8
Pipeline #471551 passed with stages
in 41 minutes and 2 seconds
......@@ -94,61 +94,64 @@ bool
FileSystem::SetConfigStatus(eos::common::FileSystem::fsstatus_t new_status)
{
using eos::mgm::FsView;
eos::common::FileSystem::fsstatus_t old_status = GetConfigStatus();
if (gOFS->mIsCentralDrain) {
eos_static_info("fsid=%d, centralized drain type", GetId());
int drain_tx = IsDrainTransition(old_status, new_status);
if (drain_tx) {
std::string out_msg;
if (drain_tx > 0) {
if (!gOFS->mDrainEngine.StartFsDrain(this, 0, out_msg)) {
eos_static_err("%s", out_msg.c_str());
return false;
}
} else if (drain_tx < 0) {
if (!gOFS->mDrainEngine.StopFsDrain(this, out_msg)) {
eos_static_err("%s", out_msg.c_str());
if (gOFS->MasterPtr->IsActivated()) {
eos::common::FileSystem::fsstatus_t old_status = GetConfigStatus();
if (gOFS->mIsCentralDrain) {
eos_static_info("fsid=%d, centralized drain type", GetId());
int drain_tx = IsDrainTransition(old_status, new_status);
if (drain_tx) {
std::string out_msg;
if (drain_tx > 0) {
if (!gOFS->mDrainEngine.StartFsDrain(this, 0, out_msg)) {
eos_static_err("%s", out_msg.c_str());
return false;
}
} else if (drain_tx < 0) {
if (!gOFS->mDrainEngine.StopFsDrain(this, out_msg)) {
eos_static_err("%s", out_msg.c_str());
}
}
}
}
} else {
eos_static_info("fsid=%d, distributed drain type", GetId());
} else {
eos_static_info("fsid=%d, distributed drain type", GetId());
if ((old_status == kDrainDead) || (old_status == kDrain)) {
// Stop draining
XrdSysMutexHelper scop_lock(mDrainJobMutex);
if ((old_status == kDrainDead) || (old_status == kDrain)) {
// Stop draining
XrdSysMutexHelper scop_lock(mDrainJobMutex);
if (mDrainJob) {
delete mDrainJob;
mDrainJob = 0;
SetDrainStatus(eos::common::FileSystem::kNoDrain);
if (mDrainJob) {
delete mDrainJob;
mDrainJob = 0;
SetDrainStatus(eos::common::FileSystem::kNoDrain);
}
}
}
if ((new_status == kDrain) || (new_status == kDrainDead)) {
// Create a drain job
XrdSysMutexHelper scope_lock(mDrainJobMutex);
if ((new_status == kDrain) || (new_status == kDrainDead)) {
// Create a drain job
XrdSysMutexHelper scope_lock(mDrainJobMutex);
// Check if there is still a drain job
if (mDrainJob) {
delete mDrainJob;
mDrainJob = 0;
}
// Check if there is still a drain job
if (mDrainJob) {
delete mDrainJob;
mDrainJob = 0;
}
if (ShouldBroadCast()) {
mDrainJob = new DrainJob(GetId());
} else {
// this is a filesystem on a ro-slave MGM, i.e. it does not drain
}
} else {
if (new_status == kEmpty) {
SetDrainStatus(eos::common::FileSystem::kDrained);
SetLongLong("stat.drainprogress", 100);
if (ShouldBroadCast()) {
mDrainJob = new DrainJob(GetId());
} else {
// this is a filesystem on a ro-slave MGM, i.e. it does not drain
}
} else {
SetDrainStatus(eos::common::FileSystem::kNoDrain);
if (new_status == kEmpty) {
SetDrainStatus(eos::common::FileSystem::kDrained);
SetLongLong("stat.drainprogress", 100);
} else {
SetDrainStatus(eos::common::FileSystem::kNoDrain);
}
}
}
}
......@@ -172,41 +175,32 @@ FileSystem::SetString(const char* key, const char* str, bool broadcast)
return eos::common::FileSystem::SetString(key, str, broadcast);
}
//------------------------------------------------------------------------------
// Tell whether going from old_status to new_status really changes the
// configuration status: true for a change, false when both are the same
//------------------------------------------------------------------------------
bool
FileSystem::IsConfigTransition(
  const eos::common::FileSystem::fsstatus_t old_status,
  const eos::common::FileSystem::fsstatus_t new_status)
{
  const bool is_transition = (old_status != new_status);
  return is_transition;
}
//------------------------------------------------------------------------------
// Check if this is a drain transition, i.e. it enables or disables draining
//------------------------------------------------------------------------------
int
FileSystem::IsDrainTransition(const eos::common::FileSystem::fsstatus_t
old_status,
const eos::common::FileSystem::fsstatus_t new_status)
old,
const eos::common::FileSystem::fsstatus_t status)
{
using eos::common::FileSystem;
// Enable draining
if ((old_status != FileSystem::kDrain) &&
(old_status != FileSystem::kDrainDead) &&
((new_status == FileSystem::kDrain) ||
(new_status == FileSystem::kDrainDead))) {
if (((old != FileSystem::kDrain) &&
(old != FileSystem::kDrainDead) &&
((status == FileSystem::kDrain) ||
(status == FileSystem::kDrainDead))) ||
(((old == FileSystem::kDrain) ||
(old == FileSystem::kDrainDead)) &&
(status == old))) {
return 1;
}
// Stop draining
if (((old_status == FileSystem::kDrain) ||
(old_status == FileSystem::kDrainDead)) &&
((new_status != FileSystem::kDrain) &&
(new_status != FileSystem::kDrainDead))) {
if (((old == FileSystem::kDrain) ||
(old == FileSystem::kDrainDead)) &&
((status != FileSystem::kDrain) &&
(status != FileSystem::kDrainDead))) {
return -1;
}
......
......@@ -45,18 +45,6 @@ class DrainJob;
class FileSystem : public eos::common::FileSystem
{
public:
//----------------------------------------------------------------------------
//! Check if this is a config transition or noop
//!
//! @param old_status current configuration status
//! @param new_status new configuration status to be set
//!
//! @return true if transition, otherwise false
//----------------------------------------------------------------------------
static
bool IsConfigTransition(const eos::common::FileSystem::fsstatus_t old_status,
const eos::common::FileSystem::fsstatus_t new_status);
//----------------------------------------------------------------------------
//! Check if this is a drain transition, i.e. it enables or disables draining
//!
......
......@@ -4103,4 +4103,21 @@ FsSpace::ResetDraining()
}
}
//------------------------------------------------------------------------------
// Go through every file system in the id view and set again the config status
// it already has, so that draining can eventually be triggered - used for the
// central draining case. The view is read-locked for the whole walk.
//------------------------------------------------------------------------------
void
FsView::ReapplyConfigStatus()
{
  eos_info("reapplying config status");
  eos::common::RWMutexReadLock rd_lock(ViewMutex);

  for (auto iter = mIdView.cbegin(); iter != mIdView.cend(); ++iter) {
    auto file_sys = iter->second;
    // Feed the current status straight back into the setter
    file_sys->SetConfigStatus(file_sys->GetConfigStatus());
  }
}
EOSMGMNAMESPACE_END
......@@ -1107,6 +1107,12 @@ public:
//----------------------------------------------------------------------------
void SetNextFsId(eos::common::FileSystem::fsid_t fsid);
//----------------------------------------------------------------------------
//! Reapply the config status of each file system so that draining is
//! eventually triggered - this is needed for the central draining
//----------------------------------------------------------------------------
void ReapplyConfigStatus();
private:
pthread_t hbthread; ///< Thread ID of the heartbeat thread
bool mIsHeartbeatOn; ///< True if heartbeat thread is running
......
......@@ -1132,6 +1132,13 @@ Master::Activate(XrdOucString& stdOut, XrdOucString& stdErr, int transitiontype)
}
fActivated.store(true);
// Reapply the fs config status so that draining eventually starts, given
// the space config
if (transitiontype == Transition::Type::kMasterToMaster) {
FsView::gFsView.ReapplyConfigStatus();
}
return true;
}
......
......@@ -368,12 +368,6 @@ proc_fs_config(std::string& identifier, std::string& key, std::string& value,
auto new_status = eos::common::FileSystem::GetConfigStatusFromString(
value.c_str());
// Nothing to do
if (!FileSystem::IsConfigTransition(old_status, new_status)) {
eos_static_info("msg=\"fsid=%d already in the desired state\"", fsid);
return 0;
}
if (value == "empty") {
// Check if this filesystem is really empty
if (gOFS->eosFsView->getNumFilesOnFs(fs->GetId()) != 0) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment