Fix test-client failing intermittently
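test-client is failing intermittently. Specifically, taped occasionally dumps core. There appear to be two different core dumps (hopefully caused by the same underlying issue): one for the maintenance process and one for the taped process. Below is a stack trace for the taped failure; it seems to be related to XRootD. Note that the abort is raised by glibc's malloc consistency check (malloc_printerr, frames #4-#6) while XRootD's sss security protocol is resolving a host name (frames #14-#17), which suggests the heap was already corrupted earlier rather than at the crash site itself: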
(gdb) bt full
#0 0x00007f3519c7094c in __pthread_kill_implementation () from /lib64/libc.so.6
No symbol table info available.
#1 0x00007f3519c23646 in raise () from /lib64/libc.so.6
No symbol table info available.
#2 0x00007f3519c0d7f3 in abort () from /lib64/libc.so.6
No symbol table info available.
#3 0x00007f3519c0e130 in __libc_message.cold () from /lib64/libc.so.6
No symbol table info available.
#4 0x00007f3519c7a9f7 in malloc_printerr () from /lib64/libc.so.6
No symbol table info available.
#5 0x00007f3519c7ddd2 in _int_malloc () from /lib64/libc.so.6
No symbol table info available.
#6 0x00007f3519c7e809 in malloc () from /lib64/libc.so.6
No symbol table info available.
#7 0x00007f3519c5ab44 in _IO_file_doallocate () from /lib64/libc.so.6
No symbol table info available.
#8 0x00007f3519c687e0 in _IO_doallocbuf () from /lib64/libc.so.6
No symbol table info available.
#9 0x00007f3519c66155 in __GI__IO_file_seekoff () from /lib64/libc.so.6
No symbol table info available.
#10 0x00007f3519c641b3 in fseeko64 () from /lib64/libc.so.6
No symbol table info available.
#11 0x00007f3519d1e5ec in __nss_files_fopen () from /lib64/libc.so.6
No symbol table info available.
#12 0x00007f3519d2267d in _nss_files_gethostbyaddr_r () from /lib64/libc.so.6
No symbol table info available.
#13 0x00007f3519d02ddf in gethostbyaddr_r@@GLIBC_2.2.5 () from /lib64/libc.so.6
No symbol table info available.
#14 0x00007f3519d0bdb0 in getnameinfo () from /lib64/libc.so.6
No symbol table info available.
#15 0x00007f3517cd4ff9 in XrdNetAddrInfo::Resolve() () from /lib64/libXrdUtils.so.3
No symbol table info available.
#16 0x00007f3517cd5583 in XrdNetAddrInfo::Name(char const*, char const**) () from /lib64/libXrdUtils.so.3
No symbol table info available.
#17 0x00007f3517359315 in XrdSecProtocolsssObject () from /lib64/libXrdSecsss-5.so
No symbol table info available.
#18 0x00007f35173652b0 in XrdSecPManager::Get(char const*, XrdNetAddrInfo&, XrdSecBuffer&, XrdOucErrInfo*) () from /lib64/libXrdSec-5.so
No symbol table info available.
#19 0x00007f3517363e18 in XrdSecGetProtocol () from /lib64/libXrdSec-5.so
No symbol table info available.
#20 0x00007f351a319f4e in XrdCl::XRootDTransport::GetCredentials (this=this@entry=0x7f33fc02caf0, credentials=@0x7f3501fec538: 0x0, hsData=hsData@entry=0x7f34c00048b0, info=info@entry=0x7f33fc0026d0)
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClXRootDTransport.cc:2643
protocolName = <error: Cannot access memory at address 0x1a1a1a1a1a1a1a1a>
log = 0x1d07ca0
ei = {_vptr.XrdOucErrInfo = 0x7f351a477220 <vtable for XrdOucErrInfo+16>, ErrInfo = {static Max_Error_Len = 2048, static Path_Offset = 1024, user = 0x7f351a41fead "", ucap = 0, code = 0,
message = "Plugin loaded secsss v5.6.1 from sec.protocol libXrdSecsss-5.so\000\000\000\000\000\000\000\000\000p\275\376\0015\177\000\000\220R>\0275\177\000\000\000\305\327\0275\177\000\000p\310\327\0275\177\000\000\242\000\000\000\000\000\000\000\346\000\000\000\000\000\000\200", '\000' <repeats 48 times>, "\242\000\000\000\000\000\000\000\346\000\000\000\000\000\000\200", '\000' <repeats 49 times>..., static uVMask = 65535, static uAsync = -2147483648, static uUrlOK = 1073741824, static uMProt = 536870912, static uReadR = 268435456, static uIPv4 = 134217728, static uIPv64 = 67108864,
static uPrip = 33554432, static uLclF = 16777216, static uRedirFlgs = 8388608, static uEcRedir = 4194304}, ErrCB = 0x0, {ErrCBarg = 139864536256112, ErrEnv = 0x7f34c0000e70}, mID = 0, dOff = -1,
reserved = 0, dataBuff = 0x0}
authHandler = 0x7f3517363d60 <XrdSecGetProtocol>
secuidc = <optimized out>
secgidc = <optimized out>
secuid = <optimized out>
secgid = <optimized out>
uidSetter = {pFsUid = <optimized out>, pFsGid = <optimized out>, pStreamName = <optimized out>, pPrevFsUid = -1, pPrevFsGid = -1, pOk = true}
srvAddrInfo = @0x7f33fc028ef0: {<XrdNetAddrInfo> = {static noPort = 1, static noPortRaw = 2, static old6Map4 = 4, static prefipv4 = 8, IP = {v6 = {sin6_family = 2, sin6_port = 17924,
sin6_flowinfo = 1191244810, sin6_addr = {__in6_u = {__u6_addr8 = "\000\000\000\000\000\000\000\000k for: r", __u6_addr16 = {0, 0, 0, 0, 8299, 28518, 14962, 29216}, __u6_addr32 = {0, 0,
1868963947, 1914714738}}}, sin6_scope_id = 980709231}, v4 = {sin_family = 2, sin_port = 17924, sin_addr = {s_addr = 1191244810}, sin_zero = "\000\000\000\000\000\000\000"}, Addr = {
sa_family = 2, sa_data = "\004F\n\364\000G\000\000\000\000\000\000\000"}}, {sockAddr = 0x7f33fc028ef0, unixPipe = 0x7f33fc028ef0}, hostName = 0x0, addrLoc = {Country = "\000e",
Region = 0 '\000', Locale = 0 '\000', TimeZone = -128 '\200', Flags = 0 '\000', Speed = 0, Latitude = 0, Longtitude = 0}, addrSize = 16, protType = 6 '\006', protFlgs = 0 '\000', sockNum = 0,
protName = 0x0, static isTLS = 1 '\001'}, static PortInSpec = -2147483648}
#21 0x00007f351a31a82a in XrdCl::XRootDTransport::DoAuthentication (this=this@entry=0x7f33fc02caf0, hsData=hsData@entry=0x7f34c00048b0, info=info@entry=0x7f33fc0026d0)
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClXRootDTransport.cc:2414
urlParams = std::map with 8 elements = {["eos.injection"] = "1", ["eos.lfn"] = "fxid:12", ["eos.rgid"] = "0", ["eos.ruid"] = "0", ["eos.space"] = "default", ["eos.workflow"] = "retrieve_written",
["oss.asize"] = "15360", ["xrdcl.requuid"] = "5c6a17d3-f8ab-45b2-a8ce-6f8ceba7409a"}
pars = 0x7f34c0002ea0 "&P=krb5,host/eos-mgm-0.eos-mgm.dev.svc.cluster.local@TEST.CTA&P=sss,0.+13:/etc/eos.keytab&P=unix\a"
it = <optimized out>
authBuffLen = <optimized out>
st = {<XrdCl::Status> = {status = 0, code = 0, errNo = 0}, pMessage = ""}
log = <optimized out>
sInfo = @0x7f33fc028230: {status = XrdCl::XRootDStreamInfo::AuthSent, pathId = 0 '\000'}
credentials = 0x0
protocolName = ""
msg = <optimized out>
req = <optimized out>
reqBuffer = <optimized out>
#22 0x00007f351a31d1f3 in XrdCl::XRootDTransport::HandShakeMain (this=this@entry=0x7f33fc02caf0, handShakeData=handShakeData@entry=0x7f34c00048b0, channelData=...)
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClXRootDTransport.cc:586
st = {<XrdCl::Status> = {status = 0, code = 1, errNo = 0}, pMessage = ""}
info = 0x7f33fc0026d0
sInfo = @0x7f33fc028230: {status = XrdCl::XRootDStreamInfo::AuthSent, pathId = 0 '\000'}
#23 0x00007f351a31d541 in XrdCl::XRootDTransport::HandShake (this=0x7f33fc02caf0, handShakeData=0x7f34c00048b0, channelData=...)
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClXRootDTransport.cc:486
info = 0x7f33fc0026d0
scopedLock = {mtx = 0x7f33fc002848}
#24 0x00007f351a39e528 in XrdCl::AsyncSocketHandler::HandleHandShake (this=this@entry=0x7f33fc024ca0, msg=std::unique_ptr<XrdCl::Message> = {...})
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClAsyncSocketHandler.cc:583
st = {<XrdCl::Status> = {status = 31904, code = 464, errNo = 0},
pMessage = "\000\000\000\000\000\000\000\000\330\317\376\0015\177", '\000' <repeats 11 times>, "\317\376\0015\177\000\000\240L\002\3743\177", '\000' <repeats 26 times>, ' ' <repeats 32 times>, "0\320\376\0015\177\000\000\001\000\000\000\000\000\000\000\240L\002\3743\177\000\000\240|\320\001", '\000' <repeats 12 times>, "\020\300\000\3743\177\000\000\001\000\000\000\000\000\000\000e\3559\0325\177", '\000' <repeats 34 times>, "% ++! \"*"...}
waitSeconds = <optimized out>
#25 0x00007f351a39e933 in XrdCl::AsyncSocketHandler::OnReadWhileHandshaking (this=0x7f33fc024ca0) at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClAsyncSocketHandler.cc:571
st = {<XrdCl::Status> = {status = 0, code = 0, errNo = 0}, pMessage = ""}
#26 0x00007f351a39ed65 in XrdCl::AsyncSocketHandler::EventRead (type=1 '\001', this=0x7f33fc024ca0) at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClAsyncSocketHandler.cc:248
No locals.
#27 XrdCl::AsyncSocketHandler::EventRead (type=1 '\001', this=0x7f33fc024ca0) at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClAsyncSocketHandler.cc:234
No locals.
#28 XrdCl::AsyncSocketHandler::Event (this=0x7f33fc024ca0, type=1 '\001') at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClAsyncSocketHandler.cc:224
No locals.
#29 0x00007f351a306c56 in (anonymous namespace)::SocketCallBack::Event (this=0x7f33fc028f50, chP=<optimized out>, cbArg=<optimized out>, evFlags=<optimized out>)
at /usr/src/debug/xrootd-5.6.1-1.el9.x86_64/xrootd/src/XrdCl/XrdClPollerBuiltIn.cc:83
ev = 1 '\001'
log = 0x1d07ca0
#30 0x00007f3517c911e7 in XrdSys::IOEvents::Poller::CbkXeq(XrdSys::IOEvents::Channel*, int, int, char const*) () from /lib64/libXrdUtils.so.3
No symbol table info available.
#31 0x00007f3517c924dc in XrdSys::IOEvents::PollE::Dispatch(XrdSys::IOEvents::Channel*, unsigned int) () from /lib64/libXrdUtils.so.3
No symbol table info available.
#32 0x00007f3517c926d8 in XrdSys::IOEvents::PollE::Begin(XrdSysSemaphore*, int&, char const**) () from /lib64/libXrdUtils.so.3
No symbol table info available.
#33 0x00007f3517c8f0bd in XrdSys::IOEvents::BootStrap::Start(void*) () from /lib64/libXrdUtils.so.3
No symbol table info available.
#34 0x00007f3517c97698 in XrdSysThread_Xeq () from /lib64/libXrdUtils.so.3
No symbol table info available.
#35 0x00007f3519c6ec02 in start_thread () from /lib64/libc.so.6
No symbol table info available.
#36 0x00007f3519cf3c40 in clone3 () from /lib64/libc.so.6
No symbol table info available.
We also see the following error in the logs. This has been happening for a while now; what is causing it?
{"epoch_time":1736241852.422857289,"local_time":"2025-01-07T10:24:12+0100","hostname":"tpsrv01-0","program":"cta-taped","log_level":"ERROR","pid":553,"tid":553,"message":"In Scheduler::reportRetrieveJobsBatch(): failed to report.","drive_name":"VDSTK01","instance":"CI","sched_backend":"ceph","SubprocessName":"maintenanceHandler","fileId":4294977310,"reportType":"FailureReport","exceptionMSG":"In EOSReporter::AsyncQueryHandler::HandleResponse(): failed to XrdCl::FileSystem::Query() [ERROR] Server responded with an error: [3000] Unable to trigger workflow - no workflow defined for <workflow>.<event> [EINVAL] &mgm.pcmd=event&mgm.fid=2734&mgm.logid=cta&mgm.event=sync::retrieve_failed&mgm.workflow=default&mgm.path=/dummy_path&mgm.ruid=0&mgm.rgid=0&mgm.errmsg=; Invalid argument code:400 errNo:3000 status:1"}
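It seems that the server does not recognize the sync::retrieve_failed event: according to the error message, no workflow is defined for that event under the "default" workflow.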