Commit 3723dd00 authored by Paul Millar's avatar Paul Millar
Browse files

client: add support for metalink source

Motivation:

FTS supports bulk operations, but only through its own specific format.
Metalink is a well-documented XML format (see RFC 5854) that describes
how to retrieve a set of files.

Support for metalink would allow FTS to accept a bulk transfer request
with the FTS user presenting the information in a standard fashion.

Modification:

Add initial support for parsing the metalink XML format.  The idea is
that the user presents a base URL that all files (within the metalink
file) are resolved against.

There are several limitations with the current implementation:

 *  Metalink allows for a file to have multiple sources.  This patch
    selects only the best url (the one with the lowest priority value).

 *  There are several types of source URLs supported by metalink (http,
    bittorrent, etc.).  The code currently assumes that any source URL
    is acceptable to FTS.

 *  In principle, the metalink allows a file to have different digest
    values, as calculated by different checksum algorithms.  Currently,
    only adler32 checksums are supported.

Result:

It is now possible to submit a bulk request to FTS using a metalink
file.
parent bbdadc18
Pipeline #3997378 passed with stages
in 10 minutes and 14 seconds
......@@ -20,6 +20,13 @@ import time
from .base import Base
from fts3.rest.client import Submitter, Delegator, Inquirer
import urllib.parse as urlparse
import defusedxml.ElementTree as ET
# Tell urllib.parse to append relative paths for URLs that use the s3
# scheme:
urlparse.uses_relative.append("s3")
urlparse.uses_netloc.append("s3")
DEFAULT_PARAMS = {
"checksum": "ADLER32",
......@@ -255,9 +262,23 @@ class JobSubmitter(Base):
self.opt_parser.add_option(
"-f",
"--file",
metavar="FILE",
dest="bulk_file",
type="string",
help="Name of configuration file",
help="Name of configuration or metalink file",
)
# Register --metalink-target.  The option uses a callback instead of a plain
# "store" action: the raw URL is handed to self._parse_metalink, which is
# responsible for assigning options.metalink_target.  The option is only
# meaningful together with -f/--file (validate() exits with an error when
# the metalink target is given without a bulk file).
self.opt_parser.add_option(
"--metalink-target",
dest="metalink_target",
action="callback",
metavar="URL",
type="string",
callback=self._parse_metalink,
help="Indicate that FILE (see option -f/--file) uses the "
"metalink format. This option takes the destination base "
"URL as an argument. For each transferred file, the "
"destination is built by resolving the file's name against "
"this base URL.",
)
self.opt_parser.add_option(
"--retry",
......@@ -313,8 +334,17 @@ class JobSubmitter(Base):
help="disable all checks, just copy the file",
)
def _parse_metalink(self, option, opt, value, parser):
    """optparse callback that records the ``--metalink-target`` base URL.

    The value is stored on ``parser.values.metalink_target`` and is
    guaranteed to end with "/", so that resolving a file name against it
    appends to the URL's path instead of replacing its last segment.

    Args:
        option: the Option instance that triggered the callback (unused).
        opt: the option string as seen on the command line.
        value: the destination base URL supplied by the user.
        parser: the option parser; its ``values`` object is updated.

    Raises:
        optparse.OptionValueError: if ``value`` is empty.  optparse turns
            this into a regular usage error instead of a traceback.
    """
    # Function-scope import: nothing else in this block needs optparse.
    from optparse import OptionValueError

    if not value:
        # value[-1] on an empty string would raise IndexError; report a
        # proper command-line error instead.
        raise OptionValueError(
            "%s requires a non-empty destination base URL" % opt
        )
    parser.values.metalink_target = value if value.endswith("/") else value + "/"
def validate(self):
self.checksum = None
if self.options.metalink_target and not self.options.bulk_file:
self.logger.critical(
"The --metalink-target option requires the metalink file be given as the --file/-f option."
)
sys.exit(1)
if not self.options.bulk_file:
if len(self.args) < 2:
self.logger.critical("Need a source and a destination")
......@@ -344,10 +374,60 @@ class JobSubmitter(Base):
"Multiple overwrite flags can not be used at the same time"
)
def _build_transfers_from_metalink(self, data):
    """Translate a metalink (RFC 5854) document into FTS transfer dicts.

    For every ``<file>`` element one transfer is produced with:

    * ``destinations``: the file name resolved against
      ``self.options.metalink_target``;
    * ``checksum``: ``"ADLER32:<value>"`` when an adler32 ``<hash>`` is
      present (other hash types are only logged);
    * ``filesize``: the integer content of ``<size>``, when present;
    * ``sources``: the single ``<url>`` with the lowest priority value
      (URLs without a priority count as 999999; the first URL wins ties).

    Files are skipped, with a warning, when they have no usable URL or
    when their name is not a plain relative path.

    Args:
        data: the metalink XML document as a string.

    Returns:
        A list of transfer dictionaries, in document order.

    Raises:
        KeyError: if a ``<file>`` lacks ``name`` or a ``<hash>`` lacks
            ``type``.
        ValueError: if a size or priority is not an integer.
    """
    ns = "{urn:ietf:params:xml:ns:metalink}"
    transfers = []
    root = ET.fromstring(data)
    for file_elem in root.findall(ns + "file"):
        name = file_elem.attrib["name"]

        # RFC 5854 forbids absolute paths and directory traversal in the
        # file name; honouring such a name would let the metalink place
        # files outside the destination base URL.
        segments = name.split("/")
        if not name or name.startswith("/") or segments[0] == "." or ".." in segments:
            self.logger.warning(
                "Skipping file %s as its name is not a safe relative path." % name
            )
            continue

        transfer = {
            "destinations": [urlparse.urljoin(self.options.metalink_target, name)]
        }

        # TODO: add support for other checksum types (e.g., MD5). Add support for multiple checksums
        for hash_elem in file_elem.findall(ns + "hash"):
            hash_type = hash_elem.attrib["type"]
            # .text is None for an empty element; pretty-printed XML may
            # surround the digest with whitespace.
            hash_value = (hash_elem.text or "").strip()
            if hash_type.lower() == "adler32" and hash_value:
                transfer["checksum"] = "ADLER32:" + hash_value
            self.logger.debug(
                "File %s has checksum %s value %s." % (name, hash_type, hash_value)
            )

        size_elem = file_elem.find(ns + "size")
        size_text = (size_elem.text or "").strip() if size_elem is not None else ""
        if size_text:
            transfer["filesize"] = int(size_text)
        else:
            self.logger.debug("File %s is missing size information." % name)

        # TODO: present multiple URLs (if present) to FTS when submitting the request.
        candidates = []
        for url_elem in file_elem.findall(ns + "url"):
            # TODO: Skip URLs that FTS can't handle.
            location = (url_elem.text or "").strip()
            if not location:
                # An empty <url/> must not mask a usable one.
                continue
            priority = int(url_elem.attrib.get("priority", 999999))
            candidates.append((priority, location))

        if not candidates:
            self.logger.warning(
                "Skipping file %s as there are no URLs we can use." % name
            )
            continue

        # min() returns the first of several equal-priority candidates.
        _, best_url = min(candidates, key=lambda candidate: candidate[0])
        transfer["sources"] = [best_url]
        transfers.append(transfer)
    return transfers
def _build_transfers(self):
if self.options.bulk_file:
with open(self.options.bulk_file, "r") as file:
filecontent = file.read()
if self.options.metalink_target:
return self._build_transfers_from_metalink(filecontent)
bulk = json.loads(filecontent)
if "files" in bulk:
return bulk["files"]
......@@ -363,7 +443,7 @@ class JobSubmitter(Base):
params = dict()
params.update(DEFAULT_PARAMS)
if self.options.bulk_file:
if self.options.bulk_file and not self.options.metalink_target:
with open(self.options.bulk_file, "r") as file:
filecontent = file.read()
bulk = json.loads(filecontent)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment