webservices / cern-search / cern-search-rest-api · Commits

Commit 35053fe9, authored Jun 10, 2021 by Carina Antunes

Add indico config

Parent: 0d38aae3
Changes: 38 files
.env
CELERY_LOG_LEVEL=error
CERN_SEARCH_INSTANCE=test
CERN_SEARCH_REMOTE_APP_RESOURCE=localhost
CERN_SEARCH_SERVER_NAME=localhost
...
...
@@ -16,11 +14,9 @@ WORKER_APP=invenio_app.celery
 DEFAULT_RECORDS_FILES_LOCATION=/usr/share/cern-search-api/files
-ENV=dev
-FLASK_DEBUG=True
 FLASK_SKIP_DOTENV=1
+FLASK_DEBUG='True'
 INVENIO_DEBUG=0
+ENV=development
 FLOWER_PASS=password
...
...
@@ -63,3 +59,5 @@ INVENIO_FILES_PROCESSOR_TIKA_SERVER_ENDPOINT=http://tika:9998
 SQLALCHEMY_POOL_SIZE=10
 SQLALCHEMY_MAX_OVERFLOW=15
+CERN_SEARCH_COPY_TO_METADATA='True'
.env-indico
CERN_SEARCH_INSTANCE=indico
INVENIO_INDEXER_DEFAULT_DOC_TYPE=events_v1.0.0
INVENIO_INDEXER_DEFAULT_INDEX=indico-events_v1.0.0
CERN_SEARCH_PROCESS_FILE_META='["collection"]'
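Both env files feed config.py (further down in this commit), which parses flag- and list-valued variables with ast.literal_eval rather than treating them as raw strings; that is why CERN_SEARCH_PROCESS_FILE_META is written as a quoted Python literal. A minimal sketch of that parsing, using the value above:

import ast
import os

# Sketch only: mirrors the ast.literal_eval(os.getenv(...)) pattern used in
# config.py; the sample value comes from .env-indico above.
os.environ["CERN_SEARCH_PROCESS_FILE_META"] = '["collection"]'
meta_fields = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))
print(meta_fields)  # ['collection'] -- a real Python list, not a string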
.isort.cfg
[settings]
line_length=120
-known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,werkzeug
+known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,sqlalchemy,werkzeug
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
...
...
Dockerfile
...
...
@@ -7,7 +7,7 @@
 # under the terms of the MIT License; see LICENSE file for more details.
 # Use CentOS7:
-FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:1b4abb66064462b81cacb82f9047d7f05e92f72f
+FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:b8a11ad40fd7ab069e460badb83a42c73b5d5b7b
 ARG build_env
 # CERN Search installation
...
...
@@ -33,11 +33,14 @@ ENV TIKA_LOG_PATH=${LOGS_DIR}
 # Install UI
 USER invenio
 # Collect static files
 RUN invenio collect -v
-RUN invenio webpack buildall
 # Move static files to instance folder
 RUN cp /${WORKING_DIR}/src/static/images/cernsearchicon.png ${INVENIO_INSTANCE_PATH}/static/images/cernsearchicon.png
+# Build assets
+RUN invenio webpack buildall
 EXPOSE 5000
 # uWSGI configuration
...
...
Dockerfile-base
...
...
@@ -10,24 +10,30 @@
 FROM inveniosoftware/centos8-python:3.8
 # Install pre-requisites
-RUN yum update -y && \
-    yum install -y \
+RUN yum update -y && yum install -y \
     gcc \
     openssl \
    openldap-devel \
    https://linuxsoft.cern.ch/cern/centos/8/CERN/x86_64/Packages/CERN-CA-certs-20200530-1.el8.cern.noarch.rpm \
    mailcap
+# Uninstall python3.6 due to poetry bug (and leave node)
+# https://github.com/python-poetry/poetry/issues/3463
+RUN rpm -e --nodeps python36 && node -v
 # Symlink python
-RUN ln -nsf /usr/bin/python3.8 /usr/bin/python
+RUN ln -nsf /usr/bin/python3.8 /usr/bin/python && python -V && whereis python
 # CERN Search installation
 WORKDIR /${WORKING_DIR}/src
 COPY poetry.lock pyproject.toml /${WORKING_DIR}/src/
 # Install dependencies globally
-RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python
+# Still using get-poetry due to https://github.com/python-poetry/poetry/issues/3870
+# RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | POETRY_VERSION=1.1.6 python
+# ENV PATH="${PATH}:/root/.local/bin"
+RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | POETRY_VERSION=1.1.6 python
 ENV PATH="${PATH}:/root/.poetry/bin"
-RUN poetry config virtualenvs.create false -vvv && \
+RUN poetry --version && poetry config virtualenvs.create false -vvv && \
     poetry install --no-root --no-dev --no-interaction --no-ansi
Makefile
...
...
@@ -31,6 +31,11 @@ build-env:
 	docker-compose -f $(DOCKER_FILE) up -d --remove-orphans
 .PHONY: build-env

+es:
+	docker-compose -f docker-compose.es.yml up -d --remove-orphans
+.PHONY: es
+
 rebuild-env:
 	docker-compose -f $(DOCKER_FILE) build --no-cache --parallel
 .PHONY: rebuild-env
...
...
cern_search_rest_api/celery.py
(new file, mode 100644)
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2021 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Celery utilities."""
# from flask_celeryext import create_celery_app
# from invenio_app.factory import create_app
#
# LOGGING_SENTRY_CELERY = True
#
# celery = create_celery_app(create_app(LOGGING_SENTRY_CELERY=LOGGING_SENTRY_CELERY))
# celery.flask_app.logger.debug("force logger init")
cern_search_rest_api/config.py
...
...
@@ -14,6 +14,7 @@ import ast
 import copy
 import os

+from elasticsearch.exceptions import TransportError
 from elasticsearch_dsl import A
 from flask import request
 from invenio_oauthclient.contrib import cern_openid
@@ -31,6 +32,7 @@ from cern_search_rest_api.modules.cernsearch.permissions import (
     record_read_permission_factory,
     record_update_permission_factory,
 )
+from cern_search_rest_api.modules.cernsearch.views import elasticsearch_version_conflict_engine_exception_handler


 def _(x):
...
...
@@ -109,6 +111,9 @@ SEARCH_INSTANCE_IMMUTABLE = ast.literal_eval(os.getenv("CERN_SEARCH_INSTANCE_IMM
 # File indexer capabilities enabled
 SEARCH_FILE_INDEXER = ast.literal_eval(os.getenv("CERN_SEARCH_FILE_INDEXER", "True"))

+# Copy to fields are moved to metadata
+SEARCH_COPY_TO_METADATA = ast.literal_eval(os.getenv("CERN_SEARCH_COPY_TO_METADATA", "False"))

 # Records REST configuration
 # ===========================
...
...
@@ -162,6 +167,9 @@ RECORDS_REST_ENDPOINTS = dict(
             }
         },
     },
+    error_handlers={
+        TransportError: elasticsearch_version_conflict_engine_exception_handler,
+    },
     )
 )
...
...
@@ -202,53 +210,123 @@ cern_rest_facets = {
        "url_match": simple_query_string("url"),
    },
}

indico_date_ranges = [
    {"key": "Over a year ago", "to": "now-1y/y"},
    {"key": "Up to a year ago", "from": "now-1y/y", "to": "now"},
    {"key": "Up to a month ago", "from": "now-1M/M", "to": "now"},
    {"key": "Up to a week ago", "from": "now-1w/w", "to": "now"},
    {"key": "Today", "from": "now/d", "to": "now"},
    {"key": "Tomorrow", "from": "now+1d/d", "to": "now+2d/d"},
    {"key": "This week", "from": "now/w", "to": "now+1w/w"},
    {"key": "Next week", "from": "now+1w/w", "to": "now+2w/w"},
    {"key": "After next week", "from": "now+2w/w"},
]
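The "from"/"to" values are Elasticsearch date math: an anchor ("now"), optional arithmetic ("-1y", "+2d"), and an optional rounding operator after "/" ("/y", "/M", "/w", "/d") that snaps the result to the start of the year, month, week, or day. A small Python sketch of what two of the expressions above resolve to (illustrative only; Elasticsearch evaluates these server-side):

from datetime import datetime

now = datetime(2021, 6, 10, 15, 30)  # pretend "now"
# "now-1y/y": go back one year, then round down to the start of that year
print(now.replace(year=now.year - 1, month=1, day=1, hour=0, minute=0))  # 2020-01-01 00:00:00
# "now/d": round down to the start of today
print(now.replace(hour=0, minute=0))                                     # 2021-06-10 00:00:00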
RECORDS_REST_FACETS = {
    "cernsearchqa-*": cern_rest_facets,
    "webservices": cern_rest_facets,
    "indico": {
        "aggs": {
            "event_type": {"terms": {"field": "_data.event_type"}},
            "speakers_chairs": {"terms": {"field": "_data.speakers_chairs.exact_match"}},
            "list_of_persons": {"terms": {"field": "_data.list_of_persons.exact_match"}},
-        }
+            "type_format": {"terms": {"field": "type_format"}},
+            "author": {"terms": {"field": "_data.authors.exact_match"}},
+            "person": {
+                "nested": {"path": "_data.persons"},
+                "aggs": {
+                    "name": {
+                        "terms": {"field": "_data.persons.name.exact_match_case_insensitive"},
+                        "aggs": {"most_common": {"terms": {"size": 1, "field": "_data.persons.name"}}},
+                    },
+                    "affiliation": {
+                        "terms": {"field": "_data.persons.affiliation.exact_match_case_insensitive"},
+                        "aggs": {"most_common": {"terms": {"size": 1, "field": "_data.persons.affiliation"}}},
+                    },
+                },
+            },
+            "venue": {
+                "terms": {"field": "_data.location.venue_name.exact_match_case_insensitive"},
+                "aggs": {"most_common": {"terms": {"size": 1, "field": "_data.location.venue_name"}}},
+            },
+            "keyword": {"terms": {"field": "_data.keywords.exact_match"}},
+            "start_range": {"date_range": {"field": "start_dt", "format": "yyyy-MM-dd", "ranges": indico_date_ranges}},
+            "end_range": {"date_range": {"field": "end_dt", "format": "yyyy-MM-dd", "ranges": indico_date_ranges}},
+            "category": {"terms": {"field": "category_path.title.exact_match"}},
+        },
+        "post_filters": {
+            "event_id": terms_filter("event_id"),
+            "type_format": terms_filter("type_format"),
+            "type": terms_filter("type"),
+            "person_name": terms_filter("_data.persons_index.name.exact_match_case_insensitive"),
+            "author": terms_filter("_data.authors.exact_match"),
+            "person_affiliation": terms_filter("_data.persons_index.affiliation.exact_match_case_insensitive"),
+            "venue": terms_filter("_data.location.venue_name"),
+            "keyword": terms_filter("_data.keywords.exact_match"),
+            "category": terms_filter("category_path.title.exact_match"),
+            "category_id": terms_filter("category_path.id"),
+            "exact_category_id": terms_filter("category_id"),
+        },
+        "nested": {
+            "_data.persons": {
+                "nperson": terms_filter("_data.persons.name"),
+                "naffiliation": terms_filter("_data.persons.affiliation"),
+            }
+        },
    },
}
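On the records REST endpoints these definitions surface as query-string parameters: each "aggs" entry comes back under "aggregations" in the response, and the "post_filters"/"nested" keys can be passed directly in the URL. A hypothetical client call (host, path and token are placeholders, not values from this commit):

import requests

response = requests.get(
    "https://search.example.cern.ch/api/records/",  # placeholder URL
    params={
        "q": "higgs",
        "type_format": "event",   # post_filter defined above
        "nperson": "Jane Doe",    # nested filter on _data.persons
    },
    headers={"Authorization": "Bearer <token>"},    # placeholder token
)
print(response.json()["aggregations"].keys())  # one bucket set per facet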
cern_sort_options = {
    "bestmatch": {
-        "fields": ["-_score"],
+        "fields": ["-_score", "-_updated"],
        "title": "Best match",
-        "default_order": "asc",
+        "default_order": "desc",
    },
    "mostrecent": {
-        "fields": ["_updated"],
+        "fields": ["-_updated"],
        "title": "Newest",
-        "default_order": "asc",
+        "default_order": "desc",
    },
}
RECORDS_REST_SORT_OPTIONS = {
    "webservices": cern_sort_options,
    "cernsearchqa-*": cern_sort_options,
-    "edms": {
+    "edms": cern_sort_options,
+    "indico": {
        "bestmatch": {
-            "fields": ["-_score"],
+            "fields": ["-_score", "-start_dt"],
            "title": "Best match",
-            "default_order": "asc",
+            "default_order": "desc",
        },
        "mostrecent": {
-            "fields": ["_updated"],
+            "fields": ["-start_dt"],
            "title": "Newest",
-            "default_order": "asc",
+            "default_order": "desc",
        },
    },
}
default_sort_options = dict(
    query="bestmatch",
    noquery="mostrecent",
)

RECORDS_REST_DEFAULT_SORT = dict(
    indico=default_sort_options,
    edms=default_sort_options,
    archives=default_sort_options,
    webservices=default_sort_options,
    test=default_sort_options,
)
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS = copy.deepcopy(irr_config.RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS)
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
    "mapper_parsing_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_mapper_parsing_exception_handler"
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
    "query_parsing_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_query_parsing_exception_handler"
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
    "query_shard_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_query_parsing_exception_handler"
# App
# ===
...
...
@@ -272,6 +350,21 @@ SECURITY_REGISTERABLE = False  # Avoid user registration outside of CERN SSO
 SECURITY_RECOVERABLE = False  # Avoid user password recovery
 SESSION_COOKIE_SECURE = True

+SQLALCHEMY_ENGINE_OPTIONS = {
+    "pool_size": int(os.getenv("SQLALCHEMY_POOL_SIZE", 5)),
+    "max_overflow": int(os.getenv("SQLALCHEMY_MAX_OVERFLOW", 10)),
+    "pool_recycle": int(os.getenv("SQLALCHEMY_POOL_RECYCLE", 300)),  # in seconds
+}
+
+SEARCH_CLIENT_CONFIG = dict(
+    # allow up to 25 connections to each node
+    maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
+    timeout=int(os.getenv("ELASTICSEARCH_TIMEOUT", 10)),
+)
+
+# Processes file metadata
+PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))

 # Celery Configuration
 # ====================
 FILES_PROCESSOR_QUEUE = os.getenv("CERN_SEARCH_FILES_PROCESSOR_QUEUE", "files_processor")
...
...
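The pool settings moved into this block are ordinary SQLAlchemy engine arguments. A sketch of what they mean when handed straight to create_engine (the DSN is a placeholder; the numbers are the defaults from the config above):

from sqlalchemy import create_engine

engine = create_engine(
    "postgresql://localhost/cernsearch",  # placeholder DSN, not from this commit
    pool_size=5,       # connections kept open in the pool
    max_overflow=10,   # short-lived extra connections allowed under load
    pool_recycle=300,  # recycle connections older than 300 s to avoid stale sockets
)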
@@ -308,17 +401,9 @@ CELERY_TASK_DEFAULT_QUEUE = "celery"
 CELERY_BROKER_POOL_LIMIT = os.getenv("BROKER_POOL_LIMIT", None)
-SQLALCHEMY_ENGINE_OPTIONS = {
-    "pool_size": int(os.getenv("SQLALCHEMY_POOL_SIZE", 5)),
-    "max_overflow": int(os.getenv("SQLALCHEMY_MAX_OVERFLOW", 10)),
-    "pool_recycle": int(os.getenv("SQLALCHEMY_POOL_RECYCLE", 300)),  # in seconds
-}
 CELERY_TASK_CREATE_MISSING_QUEUES = True
-SEARCH_CLIENT_CONFIG = dict(
-    # allow up to 25 connections to each node
-    maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
-    timeout=int(os.getenv("ELASTICSEARCH_TIMEOUT", 10)),
-)
-# Processes file metadata
-PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))
+CELERYCONF_V6 = {
+    # Fix: https://github.com/celery/celery/issues/1926
+    "worker_proc_alive_timeout": 10.0
+}
cern_search_rest_api/modules/cernsearch/errors.py
...
...
@@ -8,7 +8,7 @@
 # under the terms of the MIT License; see LICENSE file for more details.
 """Custom errors."""
-from invenio_rest.errors import RESTValidationError
+from invenio_rest.errors import RESTException, RESTValidationError


 class InvalidRecordFormatError(RESTValidationError):
...
...
@@ -32,3 +32,32 @@ class ObjectNotFoundError(SearchError):
     def __str__(self):
         """Return description."""
         return f"{self.message} not found."
+
+
+class Error(object):
+    """Represents a generic error.
+
+    .. note:: This is not an actual exception.
+    """
+
+    def __init__(self, cause: str):
+        """Init object.
+
+        :param cause: The string error.
+        """
+        self.res = dict(cause=cause)
+
+    def to_dict(self):
+        """Convert to dictionary.
+
+        :returns: A dictionary with field, message and, if initialized, the
+                  HTTP status code.
+        """
+        return self.res
+
+
+class ConflictError(RESTException):
+    """Conflict Error exception."""
+
+    code = 409
+    description = "An internal error occurred due to a conflict in the internal state."
cern_search_rest_api/modules/cernsearch/ext.py
...
...
@@ -32,7 +32,7 @@ class CERNSearch(object):
         app.register_blueprint(blueprint_record_files_content)
         current_celery.steps["worker"].add(DeclareDeadletter)
+        current_celery.conf.update(app.config["CELERYCONF_V6"])

         self.register_signals(app)
         app.extensions["cern-search"] = self
...
...
@@ -47,19 +47,18 @@ class CERNSearch(object):
     def register_signals(self, app):
         """Register signals."""
         if app.config["SEARCH_FILE_INDEXER"]:
-            from cern_search_rest_api.modules.cernsearch.indexer import (
-                index_file_content,
-            )
-            from invenio_files_processor.signals import file_processed
-            from invenio_files_rest.signals import file_deleted, file_uploaded
-            from invenio_indexer.signals import before_record_index
-            from invenio_records.signals import after_record_delete
+            from cern_search_rest_api.modules.cernsearch.indexer import index_file_content
             from cern_search_rest_api.modules.cernsearch.receivers import (
                 file_deleted_listener,
                 file_processed_listener,
                 file_uploaded_listener,
                 record_deleted_listener,
             )
+            from invenio_files_processor.signals import file_processed
+            from invenio_files_rest.signals import file_deleted, file_uploaded
+            from invenio_indexer.signals import before_record_index
+            from invenio_records.signals import after_record_delete

             file_uploaded.connect(file_uploaded_listener)
             file_processed.connect(file_processed_listener)
...
...
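register_signals relies on the blinker signals that Invenio packages expose: a receiver is connected once at extension init and fires on every matching event. A minimal sketch of the pattern (the listener below is illustrative, not the project's receiver, and the exact payload depends on each signal's contract):

from invenio_files_rest.signals import file_uploaded


def demo_listener(sender, **kwargs):
    # blinker passes the sender plus signal-specific keyword arguments.
    print("file_uploaded fired:", sender, kwargs)


file_uploaded.connect(demo_listener)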
cern_search_rest_api/modules/cernsearch/facets.py
...
...
@@ -121,8 +121,8 @@ def match_phrase_filter(field):
     return inner


-def _create_match_dsl(urlkwargs, definitions):
-    """Create a match DSL expression."""
+def _query_factory_dsl(urlkwargs, definitions):
+    """Create a list with query definitions applied to url args."""
     filters = []
     for name, filter_factory in definitions.items():
         values = request.values.getlist(name, type=text_type)
...
...
@@ -136,7 +136,7 @@ def _create_match_dsl(urlkwargs, definitions):
 def _match_filter(search, urlkwargs, definitions):
     """Ingest match filter in query."""
-    matches, urlkwargs = _create_match_dsl(urlkwargs, definitions)
+    matches, urlkwargs = _query_factory_dsl(urlkwargs, definitions)
     for match_ in matches:
         search = search.query(match_)
...
...
@@ -144,6 +144,17 @@ def _match_filter(search, urlkwargs, definitions):
     return (search, urlkwargs)


+def _nested_filter(search, urlkwargs, definitions):
+    """Ingest nested bool filter in query."""
+    for path, definition in definitions.items():
+        nested, urlkwargs = _query_factory_dsl(urlkwargs, definition)
+        if nested:
+            search = search.query(Q("nested", path=path, query=Q("bool", filter=nested)))
+
+    return (search, urlkwargs)
+
+
 def saas_facets_factory(search, index):
     """Add custom items to query.
...
...
@@ -160,5 +171,7 @@ def saas_facets_factory(search, index):
     if facets is not None:
         # Match filter
         search, urlkwargs = _match_filter(search, urlkwargs, facets.get("matches", {}))
+        # Nested filter
+        search, urlkwargs = _nested_filter(search, urlkwargs, facets.get("nested", {}))

     return (search, urlkwargs)
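Q("nested", path=path, query=Q("bool", filter=nested)) from elasticsearch_dsl wraps the collected terms filters in a standard nested/bool query. A sketch of the body it produces for one nperson URL argument, with the field name taken from the _data.persons definitions in config.py:

from elasticsearch_dsl import Q

nested = [Q("terms", **{"_data.persons.name": ["Jane Doe"]})]
q = Q("nested", path="_data.persons", query=Q("bool", filter=nested))
print(q.to_dict())
# {'nested': {'path': '_data.persons',
#             'query': {'bool': {'filter': [{'terms': {'_data.persons.name': ['Jane Doe']}}]}}}}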
cern_search_rest_api/modules/cernsearch/file_meta.py
...
...
@@ -27,6 +27,9 @@ def extract_metadata_from_processor(metadata):
     """Prepare metadata from processor."""
     extracted = {}

+    if not metadata:
+        return extracted
+
     if metadata.get("Author"):
         authors = metadata["Author"]
         extracted["authors"] = authors.strip(" ") if isinstance(authors, str) else ", ".join(authors)
...
...
@@ -50,6 +53,12 @@
 def mime_type_to_file_collection(mime_type):
     """Convert mime type to a friendly name collection."""
+    if isinstance(mime_type, list):
+        mime_type = mime_type[0]
+
+    if not isinstance(mime_type, str):
+        return FILE_EXT_DEFAULT_COLLECTION
+
     extensions = mimetypes.guess_all_extensions(mime_type.split(";")[0], strict=False)
     if not extensions:
         return FILE_EXT_DEFAULT_COLLECTION
...
...
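The new guards make the helper tolerant of processor output where Content-Type can arrive as a list or carry parameters such as "; charset=UTF-8". A quick look at the stdlib behaviour the function builds on:

import mimetypes

# Splitting on ";" drops Content-Type parameters before guessing extensions.
print(mimetypes.guess_all_extensions("application/pdf; charset=UTF-8".split(";")[0], strict=False))
# ['.pdf']
print(mimetypes.guess_all_extensions("application/x-does-not-exist", strict=False))
# [] -> the helper falls back to FILE_EXT_DEFAULT_COLLECTION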
cern_search_rest_api/modules/cernsearch/files.py
...
...
@@ -13,7 +13,7 @@ from io import BytesIO
 from flask import current_app
 from invenio_db import db
 from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
-from invenio_records_files.api import FilesIterator
+from invenio_records_files.api import FileObject, FilesIterator
 from invenio_records_files.models import RecordsBuckets

 from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
...
...
@@ -46,14 +46,20 @@ def persist_file_content(record: CernSearchRecord, file_content: dict, filename:
 def delete_previous_record_file_if_exists(obj: ObjectVersion):
     """Delete all previous associated files to record if existing, since only one file per record is allowed."""
     record = record_from_object_version(obj)  # type: CernSearchRecord
-    current_app.logger.debug("Cleanup old files: %s, count %s", str(obj), len(record.files))
+    current_app.logger.debug("Delete previous files: %s", str(obj))

+    current_app.logger.debug("Delete previous file")
     __delete_all_files_except(record.files, obj)
+    current_app.logger.debug("Delete previous file content")
     __delete_all_files_except(record.files_content, obj)


 def delete_object_version(obj: ObjectVersion):
     """Delete file on filesystem and soft delete on database."""
     if obj.deleted:
         return

     current_app.logger.debug("Delete Object Version: %s", str(obj))

     # Soft delete bucket
...
...
@@ -66,27 +72,27 @@ def delete_object_version(obj: ObjectVersion):
 def delete_file_instance(obj: ObjectVersion):
     """Delete file on filesystem and mark as not readable."""
-    current_app.logger.debug("Delete file instance: %s", str(obj))
-    if obj.file_id:
-        f = FileInstance.get(str(obj.file_id))  # type: FileInstance
-        is_readable = f.readable
-        # Mark file not readable
-        f.readable = False
-        # Remove the file on disk
-        if is_readable:
-            f.storage().delete()
+    if obj.deleted:
+        return
+
+    f = FileInstance.get(str(obj.file_id))  # type: FileInstance
+    if not f.readable:
+        return
+
+    current_app.logger.debug("Delete file instance: object %s - file %s", str(obj), str(f))
+
+    # Mark file not readable
+    f.readable = False
+    db.session.commit()
+
+    # Remove the file on disk
+    # This leaves the possibility of having a file on disk dangling in case the database removal works,
+    # and the disk file removal doesn't work.
+    f.storage().delete()


-def delete_record_file(obj: ObjectVersion):
+def delete_record_file(record: CernSearchRecord, obj: ObjectVersion):
     """Delete associated file to record."""
-    record = record_from_object_version(obj)  # type: CernSearchRecord
-    current_app.logger.debug("Cleanup file: %s", str(obj))
+    current_app.logger.debug("Delete file: %s", str(obj))

     delete_object_version(obj)

     if obj.key in record.files_content:
...
...
@@ -95,28 +101,36 @@ def delete_record_file(obj: ObjectVersion):
 def delete_all_record_files(record: CernSearchRecord):
     """Delete all associated files to record."""
-    current_app.logger.debug("Cleanup files: %s", str(record))
+    current_app.logger.debug("Delete all record files: %s", str(record))

     __delete_all_files(record.files)
     __delete_all_files(record.files_content)


 def __delete_all_files(objects: FilesIterator):
-    for file in objects:
+    for file in objects:  # type: FileObject
         delete_object_version(file.obj)


 def __delete_all_files_except(objects: FilesIterator, obj: ObjectVersion):
-    for file in objects:
-        if file.obj.key == obj.key:
+    for file in objects:  # type: FileObject
+        file_obj = file.obj  # type: ObjectVersion
+        if not file_obj.is_head or file_obj.deleted:
             continue
-        delete_object_version(file.obj)
+
+        # delete previous file object versions with same name
+        if file_obj.key == obj.key:
+            __delete_object_versions_except(obj, objects.bucket)
+            continue
+
+        # if file has different name, delete all version
+        delete_object_version(file_obj)


 def __delete_object_versions_except(obj: ObjectVersion, bucket: Bucket):
-    for version in ObjectVersion.get_versions(bucket, obj.key):
+    versions = ObjectVersion.get_versions(bucket, obj.key)
+    for version in versions:
         if version.version_id != obj.version_id:
             delete_file_instance(version)