Commit 35053fe9 authored by Carina Antunes

Add indico config

parent 0d38aae3
CELERY_LOG_LEVEL=error
CERN_SEARCH_INSTANCE=test
CERN_SEARCH_REMOTE_APP_RESOURCE=localhost
CERN_SEARCH_SERVER_NAME=localhost
@@ -16,11 +14,9 @@ WORKER_APP=invenio_app.celery
DEFAULT_RECORDS_FILES_LOCATION=/usr/share/cern-search-api/files
ENV=dev
FLASK_DEBUG=True
FLASK_SKIP_DOTENV=1
FLASK_DEBUG='True'
INVENIO_DEBUG=0
ENV=development
FLOWER_PASS=password
@@ -63,3 +59,5 @@ INVENIO_FILES_PROCESSOR_TIKA_SERVER_ENDPOINT=http://tika:9998
SQLALCHEMY_POOL_SIZE=10
SQLALCHEMY_MAX_OVERFLOW=15
CERN_SEARCH_COPY_TO_METADATA='True'
CERN_SEARCH_INSTANCE=indico
INVENIO_INDEXER_DEFAULT_DOC_TYPE=events_v1.0.0
INVENIO_INDEXER_DEFAULT_INDEX=indico-events_v1.0.0
CERN_SEARCH_PROCESS_FILE_META='["collection"]'
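Note that these values are plain strings until the application parses them; config.py (further down in this commit) runs them through ast.literal_eval. A minimal sketch of that round trip, with the values assumed from this file:

import ast
import os

# Assumed values, mirroring the entries above:
os.environ["CERN_SEARCH_COPY_TO_METADATA"] = "True"
os.environ["CERN_SEARCH_PROCESS_FILE_META"] = '["collection"]'

# ast.literal_eval safely evaluates the quoted literals into Python values:
copy_to_metadata = ast.literal_eval(os.getenv("CERN_SEARCH_COPY_TO_METADATA", "False"))
process_file_meta = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))

assert copy_to_metadata is True             # bool, not the string "True"
assert process_file_meta == ["collection"]  # list, not the string '["collection"]'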
[settings]
line_length=120
known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,werkzeug
known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,sqlalchemy,werkzeug
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
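For reference, multi_line_output = 3 with include_trailing_comma = True makes isort emit the "vertical hanging indent" style already used in this commit, e.g.:

from cern_search_rest_api.modules.cernsearch.permissions import (
    record_read_permission_factory,
    record_update_permission_factory,
)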
@@ -7,7 +7,7 @@
# under the terms of the MIT License; see LICENSE file for more details.
# Use CentOS7:
FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:1b4abb66064462b81cacb82f9047d7f05e92f72f
FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:b8a11ad40fd7ab069e460badb83a42c73b5d5b7b
ARG build_env
# CERN Search installation
@@ -33,11 +33,14 @@ ENV TIKA_LOG_PATH=${LOGS_DIR}
# Install UI
USER invenio
# Collect static files
RUN invenio collect -v
RUN invenio webpack buildall
# Move static files to instance folder
RUN cp /${WORKING_DIR}/src/static/images/cernsearchicon.png ${INVENIO_INSTANCE_PATH}/static/images/cernsearchicon.png
# Build assets
RUN invenio webpack buildall
EXPOSE 5000
# uWSGI configuration
@@ -10,24 +10,30 @@
FROM inveniosoftware/centos8-python:3.8
# Install pre-requisites
RUN yum update -y && \
yum install -y \
RUN yum update -y && yum install -y \
gcc \
openssl \
openldap-devel \
https://linuxsoft.cern.ch/cern/centos/8/CERN/x86_64/Packages/CERN-CA-certs-20200530-1.el8.cern.noarch.rpm \
mailcap
# Uninstall python3.6 due to a poetry bug (keep node installed)
# https://github.com/python-poetry/poetry/issues/3463
RUN rpm -e --nodeps python36 && node -v
# Symlink python
RUN ln -nsf /usr/bin/python3.8 /usr/bin/python
RUN ln -nsf /usr/bin/python3.8 /usr/bin/python && python -V && whereis python
# CERN Search installation
WORKDIR /${WORKING_DIR}/src
COPY poetry.lock pyproject.toml /${WORKING_DIR}/src/
# Install dependencies globally
RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | python
# Still using get-poetry due to https://github.com/python-poetry/poetry/issues/3870
# RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | POETRY_VERSION=1.1.6 python
# ENV PATH="${PATH}:/root/.local/bin"
RUN curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py | POETRY_VERSION=1.1.6 python
ENV PATH="${PATH}:/root/.poetry/bin"
RUN poetry config virtualenvs.create false -vvv && \
RUN poetry --version && poetry config virtualenvs.create false -vvv && \
poetry install --no-root --no-dev --no-interaction --no-ansi
@@ -31,6 +31,11 @@ build-env:
docker-compose -f $(DOCKER_FILE) up -d --remove-orphans
.PHONY: build-env
es:
docker-compose -f docker-compose.es.yml up -d --remove-orphans
.PHONY: es
rebuild-env:
docker-compose -f $(DOCKER_FILE) build --no-cache --parallel
.PHONY: rebuild-env
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2021 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Celery utilities."""
# from flask_celeryext import create_celery_app
# from invenio_app.factory import create_app
#
# LOGGING_SENTRY_CELERY = True
#
# celery = create_celery_app(create_app(LOGGING_SENTRY_CELERY=LOGGING_SENTRY_CELERY))
# celery.flask_app.logger.debug("force logger init")
@@ -14,6 +14,7 @@ import ast
import copy
import os
from elasticsearch.exceptions import TransportError
from elasticsearch_dsl import A
from flask import request
from invenio_oauthclient.contrib import cern_openid
@@ -31,6 +32,7 @@ from cern_search_rest_api.modules.cernsearch.permissions import (
record_read_permission_factory,
record_update_permission_factory,
)
from cern_search_rest_api.modules.cernsearch.views import elasticsearch_version_conflict_engine_exception_handler
def _(x):
@@ -109,6 +111,9 @@ SEARCH_INSTANCE_IMMUTABLE = ast.literal_eval(os.getenv("CERN_SEARCH_INSTANCE_IMM
# File indexer capabilities enabled
SEARCH_FILE_INDEXER = ast.literal_eval(os.getenv("CERN_SEARCH_FILE_INDEXER", "True"))
# Copy-to fields are moved to metadata
SEARCH_COPY_TO_METADATA = ast.literal_eval(os.getenv("CERN_SEARCH_COPY_TO_METADATA", "False"))
# Records REST configuration
# ===========================
@@ -162,6 +167,9 @@ RECORDS_REST_ENDPOINTS = dict(
}
},
},
error_handlers={
TransportError: elasticsearch_version_conflict_engine_exception_handler,
},
)
)
@@ -202,53 +210,123 @@ cern_rest_facets = {
"url_match": simple_query_string("url"),
},
}
indico_date_ranges = [
{"key": "Over a year ago", "to": "now-1y/y"},
{"key": "Up to a year ago", "from": "now-1y/y", "to": "now"},
{"key": "Up to a month ago", "from": "now-1M/M", "to": "now"},
{"key": "Up to a week ago", "from": "now-1w/w", "to": "now"},
{"key": "Today", "from": "now/d", "to": "now"},
{"key": "Tomorrow", "from": "now+1d/d", "to": "now+2d/d"},
{"key": "This week", "from": "now/w", "to": "now+1w/w"},
{"key": "Next week", "from": "now+1w/w", "to": "now+2w/w"},
{"key": "After next week", "from": "now+2w/w"},
]
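These buckets rely on Elasticsearch date math: "now-1y/y" rounds down to January 1st of the previous year, "now/w" to the start of the current week, and so on. A sketch of the aggregation they feed, using elasticsearch_dsl (client and index name are assumptions for illustration):

from elasticsearch_dsl import Search

search = Search(index="indico")  # assumed index name
search.aggs.bucket(
    "start_range", "date_range", field="start_dt", format="yyyy-MM-dd", ranges=indico_date_ranges
)
# search.to_dict() now contains the same aggregation as RECORDS_REST_FACETS below.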
RECORDS_REST_FACETS = {
"cernsearchqa-*": cern_rest_facets,
"webservices": cern_rest_facets,
"indico": {
"aggs": {
"event_type": {"terms": {"field": "_data.event_type"}},
"speakers_chairs": {"terms": {"field": "_data.speakers_chairs.exact_match"}},
"list_of_persons": {"terms": {"field": "_data.list_of_persons.exact_match"}},
}
"type_format": {"terms": {"field": "type_format"}},
"author": {"terms": {"field": "_data.authors.exact_match"}},
"person": {
"nested": {"path": "_data.persons"},
"aggs": {
"name": {
"terms": {"field": "_data.persons.name.exact_match_case_insensitive"},
"aggs": {"most_common": {"terms": {"size": 1, "field": "_data.persons.name"}}},
},
"affiliation": {
"terms": {"field": "_data.persons.affiliation.exact_match_case_insensitive"},
"aggs": {"most_common": {"terms": {"size": 1, "field": "_data.persons.affiliation"}}},
},
},
},
"venue": {
"terms": {"field": "_data.location.venue_name.exact_match_case_insensitive"},
"aggs": {"most_common": {"terms": {"size": 1, "field": "_data.location.venue_name"}}},
},
"keyword": {"terms": {"field": "_data.keywords.exact_match"}},
"start_range": {"date_range": {"field": "start_dt", "format": "yyyy-MM-dd", "ranges": indico_date_ranges}},
"end_range": {"date_range": {"field": "end_dt", "format": "yyyy-MM-dd", "ranges": indico_date_ranges}},
"category": {"terms": {"field": "category_path.title.exact_match"}},
},
"post_filters": {
"event_id": terms_filter("event_id"),
"type_format": terms_filter("type_format"),
"type": terms_filter("type"),
"person_name": terms_filter("_data.persons_index.name.exact_match_case_insensitive"),
"author": terms_filter("_data.authors.exact_match"),
"person_affiliation": terms_filter("_data.persons_index.affiliation.exact_match_case_insensitive"),
"venue": terms_filter("_data.location.venue_name"),
"keyword": terms_filter("_data.keywords.exact_match"),
"category": terms_filter("category_path.title.exact_match"),
"category_id": terms_filter("category_path.id"),
"exact_category_id": terms_filter("category_id"),
},
"nested": {
"_data.persons": {
"nperson": terms_filter("_data.persons.name"),
"naffiliation": terms_filter("_data.persons.affiliation"),
}
},
},
}
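The post_filters keys become query-string parameters on the REST API. A hypothetical request against the indico instance (host and values are placeholders):

import requests

resp = requests.get(
    "https://search-host.example/api/records/",  # placeholder host
    params={
        "q": "higgs",
        "type_format": "conference",  # handled by terms_filter("type_format")
        "keyword": "physics",         # handled by terms_filter("_data.keywords.exact_match")
        "nperson": "Jane Doe",        # handled by the nested filter on _data.persons
    },
    headers={"Accept": "application/json"},
)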
cern_sort_options = {
"bestmatch": {
"fields": ["-_score"],
"fields": ["-_score", "-_updated"],
"title": "Best match",
"default_order": "asc",
"default_order": "desc",
},
"mostrecent": {
"fields": ["_updated"],
"fields": ["-_updated"],
"title": "Newest",
"default_order": "asc",
"default_order": "desc",
},
}
RECORDS_REST_SORT_OPTIONS = {
"webservices": cern_sort_options,
"cernsearchqa-*": cern_sort_options,
"edms": {
"edms": cern_sort_options,
"indico": {
"bestmatch": {
"fields": ["-_score"],
"fields": ["-_score", "-start_dt"],
"title": "Best match",
"default_order": "asc",
"default_order": "desc",
},
"mostrecent": {
"fields": ["_updated"],
"fields": ["-start_dt"],
"title": "Newest",
"default_order": "asc",
"default_order": "desc",
},
},
}
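The leading "-" marks a descending field, so a request like ?sort=mostrecent on the indico instance translates to roughly the following Elasticsearch sort clause:

# Illustrative: invenio-records-rest turns fields=["-start_dt"] into a
# descending sort (the exact serialization may differ):
es_sort = {"sort": [{"start_dt": {"order": "desc"}}]}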
default_sort_options = dict(
query="bestmatch",
noquery="mostrecent",
)
RECORDS_REST_DEFAULT_SORT = dict(
indico=default_sort_options,
edms=default_sort_options,
archives=default_sort_options,
webservices=default_sort_options,
test=default_sort_options,
)
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS = copy.deepcopy(irr_config.RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS)
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
"mapper_parsing_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_mapper_parsing_exception_handler"
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
"query_parsing_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_query_parsing_exception_handler"
RECORDS_REST_ELASTICSEARCH_ERROR_HANDLERS[
"query_shard_exception"
] = "cern_search_rest_api.modules.cernsearch.views:elasticsearch_query_parsing_exception_handler"
# App
# ===
@@ -272,6 +350,21 @@ SECURITY_REGISTERABLE = False # Avoid user registration outside of CERN SSO
SECURITY_RECOVERABLE = False # Avoid user password recovery
SESSION_COOKIE_SECURE = True
SQLALCHEMY_ENGINE_OPTIONS = {
"pool_size": int(os.getenv("SQLALCHEMY_POOL_SIZE", 5)),
"max_overflow": int(os.getenv("SQLALCHEMY_MAX_OVERFLOW", 10)),
"pool_recycle": int(os.getenv("SQLALCHEMY_POOL_RECYCLE", 300)), # in seconds
}
SEARCH_CLIENT_CONFIG = dict(
# allow up to ELASTICSEARCH_MAX_SIZE connections to each node (default: 5)
maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
timeout=int(os.getenv("ELASTICSEARCH_TIMEOUT", 10)),
)
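invenio-search forwards these keyword arguments to the underlying elasticsearch-py client; the equivalent direct construction would be roughly the following (the host is an assumption):

from elasticsearch import Elasticsearch

client = Elasticsearch(
    ["http://localhost:9200"],  # assumed host
    maxsize=5,   # urllib3 pool: max connections kept open per node
    timeout=10,  # default request timeout, in seconds
)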
# Processes file metadata
PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))
# Celery Configuration
# ====================
FILES_PROCESSOR_QUEUE = os.getenv("CERN_SEARCH_FILES_PROCESSOR_QUEUE", "files_processor")
@@ -308,17 +401,9 @@ CELERY_TASK_DEFAULT_QUEUE = "celery"
CELERY_BROKER_POOL_LIMIT = os.getenv("BROKER_POOL_LIMIT", None)
SQLALCHEMY_ENGINE_OPTIONS = {
"pool_size": int(os.getenv("SQLALCHEMY_POOL_SIZE", 5)),
"max_overflow": int(os.getenv("SQLALCHEMY_MAX_OVERFLOW", 10)),
"pool_recycle": int(os.getenv("SQLALCHEMY_POOL_RECYCLE", 300)), # in seconds
}
CELERY_TASK_CREATE_MISSING_QUEUES = True
SEARCH_CLIENT_CONFIG = dict(
# allow up to ELASTICSEARCH_MAX_SIZE connections to each node (default: 5)
maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
timeout=int(os.getenv("ELASTICSEARCH_TIMEOUT", 10)),
)
# Processes file metadata
PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", "False"))
CELERYCONF_V6 = {
# Fix: https://github.com/celery/celery/issues/1926
"worker_proc_alive_timeout": 10.0
}
@@ -8,7 +8,7 @@
# under the terms of the MIT License; see LICENSE file for more details.
"""Custom errors."""
from invenio_rest.errors import RESTValidationError
from invenio_rest.errors import RESTException, RESTValidationError
class InvalidRecordFormatError(RESTValidationError):
@@ -32,3 +32,32 @@ class ObjectNotFoundError(SearchError):
def __str__(self):
"""Return description."""
return f"{self.message} not found."
class Error(object):
"""Represents a generic error.
.. note:: This is not an actual exception.
"""
def __init__(self, cause: str):
"""Init object.
:param cause: The string error.
"""
self.res = dict(cause=cause)
def to_dict(self):
"""Convert to dictionary.
:returns: A dictionary with the error cause.
"""
return self.res
class ConflictError(RESTException):
"""Conflict Error exception."""
code = 409
description = "An internal error occurred due to a conflict in the internal state."
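RECORDS_REST_ENDPOINTS above wires TransportError to elasticsearch_version_conflict_engine_exception_handler, which is not shown in this commit. A plausible sketch of such a handler, under the assumption that it maps Elasticsearch 409s onto the ConflictError above:

from elasticsearch.exceptions import TransportError


def version_conflict_handler(error: TransportError):  # hypothetical name, sketch only
    """Assumed behavior: translate an ES version-conflict error into HTTP 409."""
    if error.status_code == 409:
        return ConflictError().get_response()
    raise error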
@@ -32,7 +32,7 @@ class CERNSearch(object):
app.register_blueprint(blueprint_record_files_content)
current_celery.steps["worker"].add(DeclareDeadletter)
current_celery.conf.update(app.config["CELERYCONF_V6"])
self.register_signals(app)
app.extensions["cern-search"] = self
@@ -47,19 +47,18 @@ class CERNSearch(object):
def register_signals(self, app):
"""Register signals."""
if app.config["SEARCH_FILE_INDEXER"]:
from cern_search_rest_api.modules.cernsearch.indexer import (
index_file_content,
)
from invenio_files_processor.signals import file_processed
from invenio_files_rest.signals import file_deleted, file_uploaded
from invenio_indexer.signals import before_record_index
from invenio_records.signals import after_record_delete
from cern_search_rest_api.modules.cernsearch.indexer import index_file_content
from cern_search_rest_api.modules.cernsearch.receivers import (
file_deleted_listener,
file_processed_listener,
file_uploaded_listener,
record_deleted_listener,
)
from invenio_files_processor.signals import file_processed
from invenio_files_rest.signals import file_deleted, file_uploaded
from invenio_indexer.signals import before_record_index
from invenio_records.signals import after_record_delete
file_uploaded.connect(file_uploaded_listener)
file_processed.connect(file_processed_listener)
@@ -121,8 +121,8 @@ def match_phrase_filter(field):
return inner
def _create_match_dsl(urlkwargs, definitions):
"""Create a match DSL expression."""
def _query_factory_dsl(urlkwargs, definitions):
"""Create a list with query definitions applied to url args."""
filters = []
for name, filter_factory in definitions.items():
values = request.values.getlist(name, type=text_type)
@@ -136,7 +136,7 @@ def _create_match_dsl(urlkwargs, definitions):
def _match_filter(search, urlkwargs, definitions):
"""Ingest match filter in query."""
matches, urlkwargs = _create_match_dsl(urlkwargs, definitions)
matches, urlkwargs = _query_factory_dsl(urlkwargs, definitions)
for match_ in matches:
search = search.query(match_)
@@ -144,6 +144,17 @@ def _match_filter(search, urlkwargs, definitions):
return (search, urlkwargs)
def _nested_filter(search, urlkwargs, definitions):
"""Ingest nested bool filter in query."""
for path, definition in definitions.items():
nested, urlkwargs = _query_factory_dsl(urlkwargs, definition)
if nested:
search = search.query(Q("nested", path=path, query=Q("bool", filter=nested)))
return (search, urlkwargs)
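For a request like ?nperson=Jane%20Doe with the indico nested definitions above, _nested_filter produces a query equivalent to this sketch (illustrative values):

from elasticsearch_dsl import Q

q = Q(
    "nested",
    path="_data.persons",
    query=Q("bool", filter=[Q("terms", **{"_data.persons.name": ["Jane Doe"]})]),
)
# q.to_dict():
# {"nested": {"path": "_data.persons",
#             "query": {"bool": {"filter": [{"terms": {"_data.persons.name": ["Jane Doe"]}}]}}}}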
def saas_facets_factory(search, index):
"""Add custom items to query.
@@ -160,5 +171,7 @@ def saas_facets_factory(search, index):
if facets is not None:
# Match filter
search, urlkwargs = _match_filter(search, urlkwargs, facets.get("matches", {}))
# Nested filter
search, urlkwargs = _nested_filter(search, urlkwargs, facets.get("nested", {}))
return (search, urlkwargs)
@@ -27,6 +27,9 @@ def extract_metadata_from_processor(metadata):
"""Prepare metadata from processor."""
extracted = {}
if not metadata:
return extracted
if metadata.get("Author"):
authors = metadata["Author"]
extracted["authors"] = authors.strip(" ") if isinstance(authors, str) else ", ".join(authors)
@@ -50,6 +53,12 @@ def mime_type_to_file_collection(mime_type):
def mime_type_to_file_collection(mime_type):
"""Convert mime type to a friendly name collection."""
if isinstance(mime_type, list):
mime_type = mime_type[0]
if not isinstance(mime_type, str):
return FILE_EXT_DEFAULT_COLLECTION
extensions = mimetypes.guess_all_extensions(mime_type.split(";")[0], strict=False)
if not extensions:
return FILE_EXT_DEFAULT_COLLECTION
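The new guards handle what Tika can actually report: Content-Type may come back as a list or carry parameters. For instance:

import mimetypes

mimetypes.guess_all_extensions("application/pdf", strict=False)             # ['.pdf']
"application/pdf;charset=UTF-8".split(";")[0]                               # 'application/pdf'
mimetypes.guess_all_extensions("application/x-nonexistent", strict=False)   # []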
@@ -13,7 +13,7 @@ from io import BytesIO
from flask import current_app
from invenio_db import db
from invenio_files_rest.models import Bucket, FileInstance, ObjectVersion
from invenio_records_files.api import FilesIterator
from invenio_records_files.api import FileObject, FilesIterator
from invenio_records_files.models import RecordsBuckets
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
@@ -46,14 +46,20 @@ def persist_file_content(record: CernSearchRecord, file_content: dict, filename:
def delete_previous_record_file_if_exists(obj: ObjectVersion):
"""Delete all previous associated files to record if existing, since only one file per record is allowed."""
record = record_from_object_version(obj) # type: CernSearchRecord
current_app.logger.debug("Cleanup old files: %s, count %s", str(obj), len(record.files))
current_app.logger.debug("Delete previous files: %s", str(obj))
current_app.logger.debug("Delete previous file")
__delete_all_files_except(record.files, obj)
current_app.logger.debug("Delete previous file content")
__delete_all_files_except(record.files_content, obj)
def delete_object_version(obj: ObjectVersion):
"""Delete file on filesystem and soft delete on database."""
if obj.deleted:
return
current_app.logger.debug("Delete Object Version: %s", str(obj))
# Soft delete bucket
@@ -66,27 +72,27 @@ def delete_object_version(obj: ObjectVersion):
def delete_file_instance(obj: ObjectVersion):
"""Delete file on filesystem and mark as not readable."""
current_app.logger.debug("Delete file instance: %s", str(obj))
if obj.file_id:
f = FileInstance.get(str(obj.file_id)) # type: FileInstance
if obj.deleted:
return
is_readable = f.readable
# Mark file not readable
f.readable = False
# Remove the file on disk
if is_readable:
f.storage().delete()
f = FileInstance.get(str(obj.file_id)) # type: FileInstance
if not f.readable:
return
current_app.logger.debug("Delete file instance: object %s - file %s", str(obj), str(f))
# Mark file not readable
f.readable = False
db.session.commit()
# Remove the file on disk
# If the database commit succeeds but the disk deletion below fails,
# a dangling file may be left on disk.
f.storage().delete()
def delete_record_file(obj: ObjectVersion):
def delete_record_file(record: CernSearchRecord, obj: ObjectVersion):
"""Delete associated file to record."""
record = record_from_object_version(obj) # type: CernSearchRecord
current_app.logger.debug("Cleanup file: %s", str(obj))
current_app.logger.debug("Delete file: %s", str(obj))
delete_object_version(obj)
if obj.key in record.files_content:
@@ -95,28 +101,36 @@ def delete_record_file(obj: ObjectVersion):
def delete_all_record_files(record: CernSearchRecord):
"""Delete all associated files to record."""
current_app.logger.debug("Cleanup files: %s", str(record))
current_app.logger.debug("Delete all record files: %s", str(record))
__delete_all_files(record.files)
__delete_all_files(record.files_content)
def __delete_all_files(objects: FilesIterator):
for file in objects:
for file in objects: # type: FileObject
delete_object_version(file.obj)
def __delete_all_files_except(objects: FilesIterator, obj: ObjectVersion):
for file in objects:
if file.obj.key == obj.key:
for file in objects: # type: FileObject
file_obj = file.obj # type: ObjectVersion
if not file_obj.is_head or file_obj.deleted:
continue
# delete previous object versions with the same name
if file_obj.key == obj.key:
__delete_object_versions_except(obj, objects.bucket)
continue
delete_object_version(file.obj)
# if the file has a different name, delete all of its versions
delete_object_version(file_obj)
def __delete_object_versions_except(obj: ObjectVersion, bucket: Bucket):
for version in ObjectVersion.get_versions(bucket, obj.key):
versions = ObjectVersion.get_versions(bucket, obj.key)
for version in versions:
if version.version_id != obj.version_id:
delete_file_instance(version)