Commit 1c11ec23 authored by Giuseppe Lo Presti

Allow abort of failed requests, full implementation

parent 2c05218e
Pipeline #331449 passed with stages
in 27 minutes and 20 seconds
......@@ -773,10 +773,7 @@ BEGIN
SubRequest.id, CastorFile.id, CastorFile.fileId, CastorFile.nsHost, SubRequest.subreqId
FROM SubRequest, CastorFile
WHERE SubRequest.castorFile = CastorFile.id
AND request = origReqId
AND status IN (dconst.SUBREQUEST_START, dconst.SUBREQUEST_RESTART, dconst.SUBREQUEST_RETRY,
dconst.SUBREQUEST_WAITSUBREQ, dconst.SUBREQUEST_WAITTAPERECALL,
dconst.SUBREQUEST_REPACK));
AND request = origReqId);
SELECT COUNT(*) INTO nbItems FROM processBulkAbortFileReqsHelper;
-- handle aborts in bulk while avoiding deadlocks
WHILE nbItems > 0 LOOP
......@@ -826,8 +823,10 @@ BEGIN
-- if the migration had failed, the file remained in the original tape
INSERT INTO ProcessRepackAbortHelperDCmigr (cfId) VALUES (sr.cfId);
WHEN abortedSRstatus IN (dconst.SUBREQUEST_FAILED,
dconst.SUBREQUEST_FINISHED,
dconst.SUBREQUEST_FAILED_FINISHED,
dconst.SUBREQUEST_FAILED_FINISHED) THEN
-- also for failed requests, trigger the restore of the CastorFile's tapeStatus
INSERT INTO ProcessRepackAbortHelperDCmigr (cfId) VALUES (sr.cfId);
WHEN abortedSRstatus IN (dconst.SUBREQUEST_FINISHED,
dconst.SUBREQUEST_ARCHIVED) THEN
-- nothing to be done here
NULL;
......
......@@ -349,6 +349,7 @@ def cancelRepack():
FROM StageRepackRequest
WHERE StageRepackRequest.RepackVID = :vid
AND status IN (6, 0, 1, 3, 4) -- SUBMITTED, STARTING, ONGOING, FAILED, ABORTING
ORDER BY creationTime DESC
'''
stAbort = '''
BEGIN
......
......@@ -211,6 +211,138 @@ BEGIN
END;
/
/* PL/SQL method to process bulk abort on a given Repack request.
 *
 * @param origReqId  id of the StageRepackRequest to abort
 *
 * All subrequests of the request are listed in ProcessBulkAbortFileReqsHelper and
 * then processed in batches. Within a batch, each CastorFile is locked (blocking
 * for the first item, NOWAIT for the following ones), the subrequest status is
 * re-read and the abort action matching that status is queued. A batch ends when
 * a lock cannot be obtained or after 1000 items; the queued updates are then
 * applied in bulk and committed.
 * Finally archiveSubReq is called on one FINISHED/FAILED_FINISHED subrequest of
 * the request, and the procedure commits again.
 *
 * Note that this procedure COMMITs the caller's transaction.
 * NO_DATA_FOUND is raised if no StageRepackRequest row has id = origReqId.
 */
CREATE OR REPLACE PROCEDURE processBulkAbortForRepack(origReqId IN INTEGER) AS
  abortedSRstatus INTEGER := -1;
  srsToUpdate "numList";
  dcmigrsToUpdate "numList";
  nbItems INTEGER;
  nbItemsDone INTEGER := 0;
  -- ORA-00054: resource busy and acquire with NOWAIT specified
  SrLocked EXCEPTION;
  PRAGMA EXCEPTION_INIT (SrLocked, -54);
  cfId INTEGER;
  -- scratch target for the "does any MigrationJob remain" probe
  mjId INTEGER;
  srId INTEGER;
  firstOne BOOLEAN := TRUE;
  commitWork BOOLEAN := FALSE;
  varOriginalVID VARCHAR2(2048);
BEGIN
  -- get the VID of the aborted repack request
  SELECT repackVID INTO varOriginalVID FROM StageRepackRequest WHERE id = origReqId;
  -- Gather the list of subrequests to abort, whatever their current status
  INSERT INTO ProcessBulkAbortFileReqsHelper (srId, cfId, fileId, nsHost, uuid) (
    SELECT /*+ INDEX_RS_ASC(Subrequest I_Subrequest_CastorFile)*/
           SubRequest.id, CastorFile.id, CastorFile.fileId, CastorFile.nsHost, SubRequest.subreqId
      FROM SubRequest, CastorFile
     WHERE SubRequest.castorFile = CastorFile.id
       AND request = origReqId);
  SELECT COUNT(*) INTO nbItems FROM processBulkAbortFileReqsHelper;
  -- handle aborts in bulk while avoiding deadlocks
  WHILE nbItems > 0 LOOP
    FOR sr IN (SELECT srId, cfId, fileId, nsHost, uuid FROM processBulkAbortFileReqsHelper) LOOP
      BEGIN
        IF firstOne THEN
          -- on the first item, we take a blocking lock as we are sure that we will not
          -- deadlock and we would like to process at least one item to not loop endlessly
          SELECT id INTO cfId FROM CastorFile WHERE id = sr.cfId FOR UPDATE;
          firstOne := FALSE;
        ELSE
          -- on the other items, we go for a non blocking lock. If we get it, that's
          -- good and we process this extra subrequest within the same session. If
          -- we do not get the lock, then we close the session here and go for a new
          -- one. This will prevent dead locks while ensuring that a minimal number of
          -- commits is performed.
          SELECT id INTO cfId FROM CastorFile WHERE id = sr.cfId FOR UPDATE NOWAIT;
        END IF;
        -- note the revalidation of the status and even of the existence of the subrequest
        -- as it may have changed before we got the lock on the Castorfile
        BEGIN
          SELECT /*+ INDEX(Subrequest PK_Subrequest_Id)*/ status
            INTO abortedSRstatus
            FROM SubRequest
           WHERE id = sr.srId;
        EXCEPTION WHEN NO_DATA_FOUND THEN
          -- the subrequest disappeared in the meantime: flag it so that it is
          -- simply dropped from the work list below instead of failing the abort
          abortedSRstatus := NULL;
        END;
        CASE
          WHEN abortedSRstatus IS NULL THEN
            -- subrequest no longer exists, nothing to abort
            NULL;
          WHEN abortedSRstatus = dconst.SUBREQUEST_START
            OR abortedSRstatus = dconst.SUBREQUEST_RESTART
            OR abortedSRstatus = dconst.SUBREQUEST_RETRY
            OR abortedSRstatus = dconst.SUBREQUEST_WAITSUBREQ THEN
            -- easy case, we only have to fail the subrequest
            INSERT INTO ProcessRepackAbortHelperSR (srId) VALUES (sr.srId);
          WHEN abortedSRstatus = dconst.SUBREQUEST_WAITTAPERECALL THEN
            -- recall case, fail the subRequest and cancel the recall if needed
            failRecallSubReq(sr.srId, sr.cfId);
          WHEN abortedSRstatus = dconst.SUBREQUEST_REPACK THEN
            -- trigger the update of the subrequest status to FAILED_FINISHED
            INSERT INTO ProcessRepackAbortHelperSR (srId) VALUES (sr.srId);
            -- delete migration jobs of this repack, hence stopping selectively the migrations
            DELETE FROM MigrationJob WHERE castorfile = sr.cfId AND originalVID = varOriginalVID;
            -- delete migrated segments if no migration jobs remain
            BEGIN
              SELECT id INTO mjId FROM MigrationJob WHERE castorfile = sr.cfId AND ROWNUM < 2;
            EXCEPTION WHEN NO_DATA_FOUND THEN
              DELETE FROM MigratedSegment WHERE castorfile = sr.cfId;
            END;
            -- trigger the restore of the CastorFile's tapeStatus to ONTAPE in all cases:
            -- if the migration had failed, the file remained in the original tape
            INSERT INTO ProcessRepackAbortHelperDCmigr (cfId) VALUES (sr.cfId);
          WHEN abortedSRstatus IN (dconst.SUBREQUEST_FAILED,
                                   dconst.SUBREQUEST_FAILED_FINISHED) THEN
            -- also for failed requests, trigger the restore of the CastorFile's tapeStatus
            INSERT INTO ProcessRepackAbortHelperDCmigr (cfId) VALUES (sr.cfId);
          WHEN abortedSRstatus IN (dconst.SUBREQUEST_FINISHED,
                                   dconst.SUBREQUEST_ARCHIVED) THEN
            -- nothing to be done here
            NULL;
          ELSE
            -- Any other status: the gathering query above does not filter on status,
            -- so such rows can reach this point. Leave the subrequest untouched
            -- rather than letting CASE_NOT_FOUND (ORA-06592) abort the whole procedure.
            NULL;
        END CASE;
        DELETE FROM processBulkAbortFileReqsHelper WHERE srId = sr.srId;
        nbItemsDone := nbItemsDone + 1;
      EXCEPTION WHEN SrLocked THEN
        -- could not lock this CastorFile without waiting: close the current batch
        commitWork := TRUE;
      END;
      -- commit anyway from time to time, to avoid too long redo logs
      IF commitWork OR nbItemsDone >= 1000 THEN
        -- exit the current loop and restart a new one, in order to commit without getting invalid ROWID errors
        EXIT;
      END IF;
    END LOOP;
    -- do the bulk updates
    SELECT srId BULK COLLECT INTO srsToUpdate FROM ProcessRepackAbortHelperSR;
    FORALL i IN 1 .. srsToUpdate.COUNT
      UPDATE /*+ INDEX(Subrequest PK_Subrequest_Id)*/ SubRequest
         SET diskCopy = NULL, lastModificationTime = getTime(),
             status = dconst.SUBREQUEST_FAILED_FINISHED,
             errorCode = 1701, errorMessage = 'Aborted explicitely'  -- ESTCLEARED
       WHERE id = srsToUpdate(i);
    SELECT cfId BULK COLLECT INTO dcmigrsToUpdate FROM ProcessRepackAbortHelperDCmigr;
    FORALL i IN 1 .. dcmigrsToUpdate.COUNT
      UPDATE CastorFile SET tapeStatus = dconst.CASTORFILE_ONTAPE WHERE id = dcmigrsToUpdate(i);
    -- commit
    COMMIT;
    -- reset all counters
    nbItems := nbItems - nbItemsDone;
    nbItemsDone := 0;
    firstOne := TRUE;
    commitWork := FALSE;
  END LOOP;
  -- archive the request
  BEGIN
    SELECT id, status INTO srId, abortedSRstatus
      FROM SubRequest
     WHERE request = origReqId
       AND status IN (dconst.SUBREQUEST_FINISHED, dconst.SUBREQUEST_FAILED_FINISHED)
       AND ROWNUM = 1;
    -- This procedure should really be called 'terminateSubReqAndArchiveRequest', and this is
    -- why we call it here: we need to trigger the logic to mark the whole request and all of its subrequests
    -- as ARCHIVED, so that they are cleaned up afterwards. Note that this is effectively
    -- a no-op for the status change of the single fetched SubRequest.
    archiveSubReq(srId, abortedSRstatus);
  EXCEPTION WHEN NO_DATA_FOUND THEN
    -- Should never happen, anyway ignore as there's nothing else to do
    NULL;
  END;
  COMMIT;
END;
/
/* Recompile all invalid procedures, triggers and functions */
/************************************************************/
BEGIN
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment