Commit f6acb0c3 authored by Elvin Sindrilaru

MGM: Don't broadcast unnecessary file system configuration values and have

the draining report the number of failed transfers.
parent 44a4a078
......@@ -427,9 +427,9 @@ public:
//! Set the draining status
//----------------------------------------------------------------------------
bool
SetDrainStatus(fsstatus_t status)
SetDrainStatus(fsstatus_t status, bool broadcast = true)
{
return SetString("drainstatus", GetDrainStatusAsString(status));
return SetString("drainstatus", GetDrainStatusAsString(status), broadcast);
}
//----------------------------------------------------------------------------
......
......@@ -28,15 +28,14 @@
EOSMGMNAMESPACE_BEGIN
/*----------------------------------------------------------------------------*/
bool
FileSystem::StartDrainJob()
/*----------------------------------------------------------------------------*/
/**
* @brief Start a drain job on this filesystem
* @return true if started otherwise false
*/
/*----------------------------------------------------------------------------*/
bool
FileSystem::StartDrainJob()
{
if (!ShouldBroadCast()) {
// this is a filesystem on a ro-slave MGM e.g. it does not drain
......@@ -57,15 +56,14 @@ FileSystem::StartDrainJob()
return true;
}
/*----------------------------------------------------------------------------*/
bool
FileSystem::StopDrainJob()
/*----------------------------------------------------------------------------*/
/**
* @brief Stop a drain job on this filesystem
* @return true if stopped otherwise false
*/
/*----------------------------------------------------------------------------*/
bool
FileSystem::StopDrainJob()
{
eos::common::FileSystem::fsstatus_t isstatus = GetConfigStatus();
......
......@@ -1227,6 +1227,7 @@ FsView::GetFileSystemFormat(std::string option)
format += "key=stat.drainfiles:format=ol|";
format += "key=stat.drainbytesleft:format=ol|";
format += "key=stat.drainretry:format=ol|";
format += "key=stat.drain.failed:format=ol|";
format += "key=graceperiod:format=ol|";
format += "key=stat.timeleft:format=ol|";
format += "key=stat.active:format=os|";
......@@ -1291,7 +1292,7 @@ FsView::GetFileSystemFormat(std::string option)
format += "key=stat.drainbytesleft:width=12:format=+l:tag=bytes-left:unit=B|";
format += "key=stat.timeleft:width=11:format=l:tag=timeleft|";
format += "key=stat.drainretry:width=6:format=l:tag=retry|";
format += "key=stat.wopen:width=6:format=l:tag=wopen";
format += "key=stat.drain.failed:width=12:format=+l:tag=failed";
} else if (option == "l") {
// long format
format = "header=1:key=host:width=24:format=-S|";
......
......@@ -132,7 +132,7 @@ XrdMgmOfs::FsConfigListener()
if (queue == MgmConfigQueue.c_str()) {
// This is an MGM configuration modification
if (!gOFS->MgmMaster.IsMaster()) {
// only an MGM slave needs to aplly this
// only an MGM slave needs to apply this
gOFS->ObjectManager.HashMutex.LockRead();
XrdMqSharedHash* hash = gOFS->ObjectManager.GetObject(queue.c_str(), "hash");
......@@ -162,10 +162,8 @@ XrdMgmOfs::FsConfigListener()
}
} else {
eos_info("Call SetConfig %s %s", key.c_str(), value.c_str());
gOFS->ConfEngine->SetConfigValue(0,
key.c_str(),
value.c_str(),
false);
gOFS->ConfEngine->SetConfigValue(0, key.c_str(),
value.c_str(), false);
gOFS->ConfEngine->ApplyEachConfig(key.c_str(), &value, (void*) &err);
}
}
......@@ -194,11 +192,12 @@ XrdMgmOfs::FsConfigListener()
"queue %s which is not registered ", queue.c_str());
} else {
FsView::gFsView.ViewMutex.LockRead();
if (FsView::gFsView.mIdView.count(fsid)) {
fs = FsView::gFsView.mIdView[fsid];
if (fs && FsView::gFsView.mNodeView.count(fs->GetQueue())) {
// check if the change notification is an actual change in the geotag
// check if the change notification is an actual change in the geotag
FsNode* node = FsView::gFsView.mNodeView[fs->GetQueue()];
static_cast<GeoTree*>(node)->getGeoTagInTree(fsid , oldgeotag);
oldgeotag.erase(0, 8); // to get rid of the "<ROOT>::" prefix
......@@ -206,12 +205,13 @@ XrdMgmOfs::FsConfigListener()
if (fs && (oldgeotag != newgeotag)) {
eos_warning("Received a geotag change for fsid %lu new geotag is "
"%s, old geotag was %s ", (unsigned long)fsid,
newgeotag.c_str(), oldgeotag.c_str());
"%s, old geotag was %s ", (unsigned long)fsid,
newgeotag.c_str(), oldgeotag.c_str());
FsView::gFsView.ViewMutex.UnLockRead();
eos::common::RWMutexWriteLock fs_rw_lock(FsView::gFsView.ViewMutex);
eos::common::FileSystem::fs_snapshot snapshot;
fs->SnapShotFileSystem(snapshot);
// Update node view tree structure
if (FsView::gFsView.mNodeView.count(snapshot.mQueue)) {
FsNode* node = FsView::gFsView.mNodeView[snapshot.mQueue];
......@@ -264,7 +264,7 @@ XrdMgmOfs::FsConfigListener()
}
} else {
FsView::gFsView.ViewMutex.UnLockRead();
}
}
} else {
FsView::gFsView.ViewMutex.UnLockRead();
}
......@@ -297,7 +297,6 @@ XrdMgmOfs::FsConfigListener()
if (gOFS->MgmMaster.IsMaster()) {
// only an MGM master needs to initiate draining
eos::common::FileSystem::fsid_t fsid = 0;
FileSystem* fs = 0;
long long errc = 0;
std::string configstatus = "";
std::string bootstatus = "";
......@@ -321,32 +320,22 @@ XrdMgmOfs::FsConfigListener()
if (fsid && errc && (cfgstatus >= eos::common::FileSystem::kRO) &&
(bstatus == eos::common::FileSystem::kOpsError)) {
// this is the case we take action and explicitly ask to start a drain job
// Case when we take action and explicitly ask to start a drain job
eos::common::RWMutexReadLock lock(FsView::gFsView.ViewMutex);
if (FsView::gFsView.mIdView.count(fsid)) {
fs = FsView::gFsView.mIdView[fsid];
} else {
fs = 0;
}
if (fs) {
fs->StartDrainJob();
FileSystem* fs = FsView::gFsView.mIdView[fsid];
fs->SetConfigStatus(eos::common::FileSystem::kDrain);
}
}
if (fsid && (!errc)) {
// make sure there is no drain job triggered by a previous filesystem errc!=0
// Make sure there is no drain job triggered by a previous filesystem errc!=0
eos::common::RWMutexReadLock lock(FsView::gFsView.ViewMutex);
if (FsView::gFsView.mIdView.count(fsid)) {
fs = FsView::gFsView.mIdView[fsid];
} else {
fs = 0;
}
if (fs) {
fs->StopDrainJob();
FileSystem* fs = FsView::gFsView.mIdView[fsid];
fs->SetConfigStatus(eos::common::FileSystem::kRW);
}
}
}
......
This diff is collapsed.
......@@ -55,12 +55,12 @@ public:
//! Constructor
//!
//! @param thread_pool drain thread pool to use for jobs
//! @param fs_id filesystem id
//! @param target_fs_id file system where to drain
//! @param src_fsid filesystem id to drain
//! @param dst_fsid file system where to drain
//----------------------------------------------------------------------------
DrainFs(eos::common::ThreadPool& thread_pool,
eos::common::FileSystem::fsid_t fs_id,
eos::common::FileSystem::fsid_t target_fs_id = 0);
eos::common::FileSystem::fsid_t src_fsid,
eos::common::FileSystem::fsid_t dst_fsid = 0);
//----------------------------------------------------------------------------
//! Destructor
......@@ -187,6 +187,8 @@ private:
//---------------------------------------------------------------------------
void Stop();
constexpr static std::chrono::seconds sRefreshTimeout {60};
constexpr static std::chrono::seconds sStallTimeout {600};
eos::common::FileSystem::fsid_t mFsId; ///< Drain source fsid
eos::common::FileSystem::fsid_t mTargetFsId; /// Drain target fsid
eos::common::FileSystem::eDrainStatus mStatus;
......@@ -194,7 +196,6 @@ private:
std::atomic<bool> mForceRetry; ///< Flag to retry failed transfers
std::atomic<std::uint32_t> mMaxRetries; ///< Max number of retries
std::atomic<std::uint32_t> mMaxJobs; ///< Max number of drain jobs
uint64_t mTotalFiles; ///< Total number of files to drain
std::chrono::seconds mDrainPeriod; ///< Allowed time for file system to drain
std::chrono::time_point<std::chrono::steady_clock> mDrainStart;
std::chrono::time_point<std::chrono::steady_clock> mDrainEnd;
......@@ -206,6 +207,12 @@ private:
std::list<std::shared_ptr<DrainTransferJob>> mJobsRunning;
eos::common::ThreadPool& mThreadPool;
std::future<State> mFuture;
uint64_t mTotalFiles; ///< Total number of files to drain
uint64_t mLastNumToDrain; ///< Last number of drain jobs recorded
//! Last timestamp when a refresh of failed transfers was performed
std::chrono::time_point<std::chrono::steady_clock> mLastRefreshTime;
//! Last timestamp when drain progress was recorded
std::chrono::time_point<std::chrono::steady_clock> mLastProgressTime;
};
EOSMGMNAMESPACE_END
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment