Object store leaks Archive requests
During a run of CTA, several files were written to tape but:
- the archival was not reported to EOS
- the archive request object is left behind in the object store
For a file without issues, we see:
[root@ctaeos /]# eos file info /eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f
File: '/eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f' Flags: 0644
Size: 420
Modify: Mon Feb 6 10:22:18 2017 Timestamp: 1486372938.419150339
Change: Mon Feb 6 10:22:10 2017 Timestamp: 1486372930.601715551
CUid: 3 CGid: 4 Fxid: 0000009c Fid: 156 Pid: 27 Pxid: 0000001b
XStype: adler XS: f6 88 83 b6 ETAG: 41875931136:f68883b6
replica Stripes: 1 Blocksize: 4k LayoutId: 00100012
#Rep: 2
# fs-id #...................................................................................................................................
# host # schedgroup # path # boot # configstatus # drain # active # geotag
#...................................................................................................................................
0 1 ...est.svc.cluster.local default.0 /fst booted rw nodrain online ...est.svc.cluster.local
1 65535 localhost tape /does_not_exist off nodrain
*******
and (noting the last log line, with report):
[root@tpsrv /]# grep 0b82c4f4-f826-4382-8bee-d3c9e00aae2f /cta-taped.log
<158>2017-02-06T10:22:21.619409+01:00 cta-taped[325]: LVL="Info" TID="325" MSG="Created tasks for migrating a file" thread="MainThread" TPVID="V10001" byteSizeThreshold="80000000000" maxFiles="500" archiveFileID="100" fSeq="59" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c"
<158>2017-02-06T10:22:22.416838+01:00 cta-taped[325]: LVL="Info" TID="342" MSG="Opened disk file for read" thread="DiskRead" TPVID="V10001" threadID="3" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c" actualURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c" fileId="100"
<158>2017-02-06T10:22:22.550667+01:00 cta-taped[325]: LVL="Info" TID="342" MSG="File successfully read from disk" thread="DiskRead" TPVID="V10001" threadID="3" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c" actualURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c" fileId="100" readWriteTime="0.000303" checksumingTime="0.000000" waitFreeMemoryTime="0.133492" waitDataTime="0.000000" waitReportingTime="0.000000" checkingErrorTime="0.000001" openingTime="0.001755" transferTime="0.135555" totalTime="0.135555" dataVolume="420" globalPayloadTransferSpeedMBps="0.003098" diskPerformanceMBps="0.003098" openRWCloseToTransferTimeRatio="0.015182" FILEID="100"
<159>2017-02-06T10:22:22.691859+01:00 cta-taped[325]: LVL="Debug" TID="350" MSG="Successfully opened the tape file for writing" thread="TapeWrite" TPVID="V10001" fileId="100" fileSize="420" fSeq="59" diskURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c"
<158>2017-02-06T10:22:22.698984+01:00 cta-taped[325]: LVL="Info" TID="350" MSG="File successfully transmitted to drive" thread="TapeWrite" TPVID="V10001" fileId="100" fileSize="420" fSeq="59" diskURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f?eos.lfn=fxid:0000009c" readWriteTime="0.010788" checksumingTime="0.000002" waitDataTime="0.000003" waitReportingTime="0.000024" transferTime="0.010817" totalTime="0.010806" dataVolume="420" headerVolume="480" driveTransferSpeedMBps="0.083287" payloadTransferSpeedMBps="0.038867" archiveFileID="100" reconciliationTime="0" LBPMode="LBP_Off"
<158>2017-02-06T10:22:24.507639+01:00 cta-taped[325]: LVL="Info" TID="351" MSG="Reported to the client a full file archival" thread="ReportPacker" TPVID="V10001" archiveFileId="100" diskInstance="ctaeos" diskFileId="156" lastKnownDiskPath="/eos/ctaeos/cta/testdir/0b82c4f4-f826-4382-8bee-d3c9e00aae2f" reportURL="eosQuery://ctaeos//eos/wfe/passwd?mgm.pcmd=event&mgm.fid=0000009c&mgm.logid=cta&mgm.event=archived&mgm.workflow=default&mgm.path=/eos/wfe/passwd&mgm.ruid=0&mgm.rgid=0"
A problematic file has those 3 symptoms:
- Orphaned ArchiveRequest:
[root@tpsrv /]# cta-objectstore-list rados://cta-eric@tapetest:cta-eric | grep ArchiveRequest | head -1 | xargs -i cta-objectstore-dump-object rados://cta-eric@tapetest:cta-eric {}
Object store path: rados://cta-eric@tapetest:cta-eric object name=ArchiveRequest-OStoreDBFactory-ctafrontend-291-20170206-10:16:24-1082
ArchiveRequest
{
"checksumtype":"ADLER32",
"checksumvalue":"0XF68883B6",
"diskfileid":"4434",
"diskinstance":"ctaeos",
"archivereporturl":"eosQuery:\/\/ctaeos\/\/eos\/wfe\/passwd?mgm.pcmd=event&mgm.fid=00001152&mgm.logid=cta&mgm.event=archived&mgm.workflow=default&mgm.path=\/eos\/wfe\/passwd&mgm.ruid=0&mgm.rgid=0",
"filesize":420,
"srcurl":"root:\/\/ctaeos.ctatest.svc.cluster.local\/\/eos\/ctaeos\/cta\/testdir\/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152",
"storageclass":"ctaStorageClass",
"creationlog":{
"host":"",
"time":0,
"username":""
},
"jobs":[
{
"copynb":1,
"lastmountwithfailure":0,
"maxretrieswithinmount":3,
"maxtotalretries":6,
"owner":"OStoreDBFactory-tpsrv-360-20170206-10:22:28",
"retrieswithinmount":0,
"status":1,
"tapepool":"ctasystest",
"tapepoolAddress":"archiveQueue-OStoreDBFactory-ctafrontend-291-20170206-10:16:24-107",
"totalRetries":0
}
],
"diskfileinfo":{
"recoveryblob":"\"fmd={ keylength.file=60 file=\/eos\/ctaeos\/cta\/testdir\/2d91ee97-f774-45c8-aef1-f1d1cc380f4c size=420 mtime=1486373058.359369000 ctime=1486373058.313686611 mode=0644 uid=3 gid=4 fxid=00001152 fid=4434 ino=1190242811904 pid=27 pxid=0000001b xstype=adler xs=f68883b6 etag=1190242811904:f68883b6 layout=replica nstripes=1 lid=00100012 nrep=1 fsid=1 } dmd={ keylength.file=24 file=\/eos\/ctaeos\/cta\/testdir\/ treesize=0 container=0 files=1100 mtime=1486373059.548281788 ctime=1486372925.755724633 mode=42777 uid=0 gid=0 fxid=0000001b fid=27 ino=27 pid=16 pxid=00000010 etag=1b:0.000 xattrn=CTA_StorageClass xattrv=ctaStorageClass xattrn=CTA_TapeFsId xattrv=65535 xattrn=sys.attr.link xattrv=\/eos\/ctaeos\/proc\/cta\/workflow xattrn=sys.forced.blocksize xattrv=4k xattrn=sys.forced.checksum xattrv=adler xattrn=sys.forced.layout xattrv=replica xattrn=sys.forced.nstripes xattrv=1 xattrn=sys.forced.space xattrv=default }\"",
"group":"adm",
"owner":"adm",
"path":"\/eos\/ctaeos\/cta\/testdir\/2d91ee97-f774-45c8-aef1-f1d1cc380f4c"
},
"requester":{
"name":"adm",
"group":"adm"
}
}
- No tape copy in EOS:
[root@ctaeos /]# eos file info /eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c
File: '/eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c' Flags: 0644
Size: 420
Modify: Mon Feb 6 10:24:27 2017 Timestamp: 1486373067.462492388
Change: Mon Feb 6 10:24:18 2017 Timestamp: 1486373058.313686611
CUid: 3 CGid: 4 Fxid: 00001152 Fid: 4434 Pid: 27 Pxid: 0000001b
XStype: adler XS: f6 88 83 b6 ETAG: 1190242811904:f68883b6
replica Stripes: 1 Blocksize: 4k LayoutId: 00100012
#Rep: 1
# fs-id #...................................................................................................................................
# host # schedgroup # path # boot # configstatus # drain # active # geotag
#...................................................................................................................................
0 1 ...est.svc.cluster.local default.0 /fst booted rw nodrain online ...est.svc.cluster.local
*******
- Missing log for reporting in cta-taped.log:
[root@tpsrv /]# grep 2d91ee97-f774-45c8-aef1-f1d1cc380f4c /cta-taped.log
<158>2017-02-06T10:24:56.440808+01:00 cta-taped[360]: LVL="Info" TID="392" MSG="Created tasks for migrating a file" thread="MigrationTaskInjector" TPVID="V10001" archiveFileID="1078" fSeq="1074" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152"
<158>2017-02-06T10:25:06.222680+01:00 cta-taped[360]: LVL="Info" TID="377" MSG="Opened disk file for read" thread="DiskRead" TPVID="V10001" threadID="3" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152" actualURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152" fileId="1078"
<158>2017-02-06T10:25:06.416446+01:00 cta-taped[360]: LVL="Info" TID="377" MSG="File successfully read from disk" thread="DiskRead" TPVID="V10001" threadID="3" path="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152" actualURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152" fileId="1078" readWriteTime="0.000762" checksumingTime="0.000000" waitFreeMemoryTime="0.192964" waitDataTime="0.000000" waitReportingTime="0.000000" checkingErrorTime="0.000002" openingTime="0.002147" transferTime="0.195879" totalTime="0.195879" dataVolume="420" globalPayloadTransferSpeedMBps="0.002144" diskPerformanceMBps="0.002144" openRWCloseToTransferTimeRatio="0.014851" FILEID="1078"
<159>2017-02-06T10:25:06.600933+01:00 cta-taped[360]: LVL="Debug" TID="388" MSG="Successfully opened the tape file for writing" thread="TapeWrite" TPVID="V10001" fileId="1078" fileSize="420" fSeq="1074" diskURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152"
<158>2017-02-06T10:25:06.611282+01:00 cta-taped[360]: LVL="Info" TID="388" MSG="File successfully transmitted to drive" thread="TapeWrite" TPVID="V10001" fileId="1078" fileSize="420" fSeq="1074" diskURL="root://ctaeos.ctatest.svc.cluster.local//eos/ctaeos/cta/testdir/2d91ee97-f774-45c8-aef1-f1d1cc380f4c?eos.lfn=fxid:00001152" readWriteTime="0.015563" checksumingTime="0.000004" waitDataTime="0.000088" waitReportingTime="0.000015" transferTime="0.015670" totalTime="0.015654" dataVolume="420" headerVolume="480" driveTransferSpeedMBps="0.057493" payloadTransferSpeedMBps="0.026830" archiveFileID="1078" reconciliationTime="0" LBPMode="LBP_Off"
Implementing #60 (closed) will probably help with debugging this issue.