Commit d3a9a2df authored by Carina Antunes
Browse files

[SEARCH-115] refactor archives (add discourse and notifications archives)

parent 1f9c09ab
......@@ -9,6 +9,7 @@ CERN_SEARCH_FILES_PROCESSOR_QUEUE_DLX=files_processor_dlx
CERN_SEARCH_FILES_PROCESSOR_EXCHANGE=default
CERN_SEARCH_FILES_PROCESSOR_EXCHANGE_DLX=dlx
CERN_SEARCH_INSTANCE_IMMUTABLE='False'
CERN_SEARCH_FILE_INDEXER='True'
CONTAINER_NAME=cern-search-rest-api
WORKER_APP=invenio_app.celery
......
CERN_SEARCH_INSTANCE=cernsearchqa-*
INVENIO_INDEXER_DEFAULT_DOC_TYPE=""
INVENIO_INDEXER_DEFAULT_INDEX=""
INVENIO_SEARCH_INDEX_PREFIX=""
CERN_SEARCH_PROCESS_FILE_META='False'
CERN_SEARCH_INSTANCE_IMMUTABLE='True'
CERN_SEARCH_INSTANCE=codimd
INVENIO_INDEXER_DEFAULT_DOC_TYPE=notes_v1.0.0
INVENIO_INDEXER_DEFAULT_INDEX=codimd-notes_v1.0.0
CERN_SEARCH_INSTANCE=archives
INVENIO_INDEXER_DEFAULT_DOC_TYPE=archive_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=archives-archive_v3.0.0
INVENIO_SEARCH_INDEX_PREFIX=cernsearch-discourse
CERN_SEARCH_INSTANCE=egroupsarchives
CERN_SEARCH_INSTANCE=archives
INVENIO_INDEXER_DEFAULT_DOC_TYPE=archive_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=egroupsarchives-archive_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=archives-archive_v3.0.0
INVENIO_SEARCH_INDEX_PREFIX=cernsearch-egroups
CERN_SEARCH_INSTANCE=archives
INVENIO_INDEXER_DEFAULT_DOC_TYPE=archive_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=archives-archive_v3.0.0
INVENIO_SEARCH_INDEX_PREFIX=cernsearch-notifications
......@@ -95,6 +95,9 @@ SEARCH_DOC_PIPELINES = ast.literal_eval(os.getenv('CERN_SEARCH_DOC_PIPELINES', '
# Alias instance - don't allow updates, allow only search
SEARCH_INSTANCE_IMMUTABLE = ast.literal_eval(os.getenv('CERN_SEARCH_INSTANCE_IMMUTABLE', 'False'))
# Enable file indexer capabilities (the file/record signal listeners are only registered when this is True)
SEARCH_FILE_INDEXER = ast.literal_eval(os.getenv('CERN_SEARCH_FILE_INDEXER', 'True'))
# Records REST configuration
# ===========================
......@@ -166,34 +169,36 @@ def aggs_filter(field):
return inner
RECORDS_REST_FACETS = {
'webservices': {
'aggs': {
'collection': {
'terms': {'field': 'collection'}
},
'type_format': {
'terms': {'field': 'type_format'}
},
'author': regex_aggregation('_data.authors.exact_match', 'authors_suggest'),
'site': regex_aggregation('_data.site.exact_match', 'sites_suggest'),
'keyword': regex_aggregation('_data.keywords.exact_match', 'keywords_suggest')
cern_rest_facets = {
'aggs': {
'collection': {
'terms': {'field': 'collection'}
},
'filters': {
'collection': terms_filter("collection"),
'type_format': terms_filter("type_format"),
'author': terms_filter("_data.authors.exact_match"),
'site': terms_filter("_data.site.exact_match"),
'keyword': terms_filter("_data.keywords.exact_match"),
'type_format': {
'terms': {'field': 'type_format'}
},
'matches': {
'author_match': simple_query_string("_data.authors"),
'keyword_match': simple_query_string("_data.keywords"),
'site_match': simple_query_string("_data.site"),
'name_match': simple_query_string("_data.name"),
'url_match': simple_query_string("url"),
}
'author': regex_aggregation('_data.authors.exact_match', 'authors_suggest'),
'site': regex_aggregation('_data.site.exact_match', 'sites_suggest'),
'keyword': regex_aggregation('_data.keywords.exact_match', 'keywords_suggest')
},
'filters': {
'collection': terms_filter("collection"),
'type_format': terms_filter("type_format"),
'author': terms_filter("_data.authors.exact_match"),
'site': terms_filter("_data.site.exact_match"),
'keyword': terms_filter("_data.keywords.exact_match"),
},
'matches': {
'author_match': simple_query_string("_data.authors"),
'keyword_match': simple_query_string("_data.keywords"),
'site_match': simple_query_string("_data.site"),
'name_match': simple_query_string("_data.name"),
'url_match': simple_query_string("url"),
}
}
RECORDS_REST_FACETS = {
'cernsearchqa-*': cern_rest_facets,
'webservices': cern_rest_facets,
'indico': {
'aggs': {
'event_type': {
......@@ -209,19 +214,22 @@ RECORDS_REST_FACETS = {
}
}
RECORDS_REST_SORT_OPTIONS = {
'webservices': {
'bestmatch': {
'fields': ['-_score'],
'title': 'Best match',
'default_order': 'asc',
},
'mostrecent': {
'fields': ['_updated'],
'title': 'Newest',
'default_order': 'asc',
}
# Sort options reused for both the 'webservices' and 'cernsearchqa-*' entries
# of RECORDS_REST_SORT_OPTIONS.
# NOTE(review): 'mostrecent' is titled 'Newest' but sorts on '_updated' with a
# default order of 'asc' and no '-' prefix — presumably this returns the oldest
# records first; confirm against invenio-records-rest sort semantics.
cern_sort_options = dict(
    bestmatch=dict(
        fields=['-_score'],
        title='Best match',
        default_order='asc',
    ),
    mostrecent=dict(
        fields=['_updated'],
        title='Newest',
        default_order='asc',
    ),
)
RECORDS_REST_SORT_OPTIONS = {
'webservices': cern_sort_options,
'cernsearchqa-*': cern_sort_options,
'edms': {
'bestmatch': {
'fields': ['-_score'],
......@@ -311,5 +319,5 @@ SEARCH_CLIENT_CONFIG = dict(
maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
)
# FILE
# Whether file metadata is processed
PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", 'False'))
......@@ -9,14 +9,7 @@
"""Cern Search module."""
from celery import current_app as current_celery
from cern_search_rest_api.modules.cernsearch.celery import DeclareDeadletter
from cern_search_rest_api.modules.cernsearch.indexer import index_file_content
from cern_search_rest_api.modules.cernsearch.receivers import (file_deleted_listener, file_processed_listener,
file_uploaded_listener, record_deleted_listener)
from cern_search_rest_api.modules.cernsearch.views import build_blueprint, build_blueprint_record_files_content
from invenio_files_processor.signals import file_processed
from invenio_files_rest.signals import file_deleted, file_uploaded
from invenio_indexer.signals import before_record_index
from invenio_records.signals import after_record_delete
class CERNSearch(object):
......@@ -39,7 +32,7 @@ class CERNSearch(object):
current_celery.steps['worker'].add(DeclareDeadletter)
self.register_signals()
self.register_signals(app)
app.extensions["cern-search"] = self
......@@ -50,10 +43,21 @@ class CERNSearch(object):
if k.startswith('CERN_SEARCH'):
app.config.setdefault(k, getattr(app.config, k))
def register_signals(self, app):
    """Register the file-indexing signal listeners.

    The listeners are connected only when ``app.config['SEARCH_FILE_INDEXER']``
    is truthy; otherwise this method does nothing.

    :param app: Application object exposing a ``config`` mapping that
        contains the ``SEARCH_FILE_INDEXER`` key.
    :raises KeyError: If ``SEARCH_FILE_INDEXER`` is missing from the config.
    """
    if not app.config['SEARCH_FILE_INDEXER']:
        return

    # Imported lazily so the indexer, receiver and signal modules are only
    # loaded when the file indexer is enabled.
    from cern_search_rest_api.modules.cernsearch.indexer import index_file_content
    from cern_search_rest_api.modules.cernsearch.receivers import (file_deleted_listener,
                                                                   file_processed_listener,
                                                                   file_uploaded_listener,
                                                                   record_deleted_listener)
    from invenio_files_processor.signals import file_processed
    from invenio_files_rest.signals import file_deleted, file_uploaded
    from invenio_indexer.signals import before_record_index
    from invenio_records.signals import after_record_delete

    # Each listener is connected exactly once.
    file_uploaded.connect(file_uploaded_listener)
    file_processed.connect(file_processed_listener)
    file_deleted.connect(file_deleted_listener)
    after_record_delete.connect(record_deleted_listener)
    before_record_index.connect(index_file_content)
{
"title": "Archive schema v3.0.0",
"id": "http://<host:port>/schemas/egroupsarchives/archive_v3.0.0.json",
"$schema": "http://<host:port>/schemas/egroupsarchives/archive_v3.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"from": {
"type": "string",
"description": "Mail header from."
},
"subject": {
"type": "string",
"description": "Mail header subject."
},
"attachments": {
"type": "array",
"items": {
"type": "string"
},
"description": "Mail attachments filenames."
},
"body": {
"type": "string",
"description": "Mail body."
}
}
},
"mailid": {
"type": "string",
"description": "Mail Id."
},
"date": {
"type": "string",
"description": "Mail header date."
},
"group": {
"type": "string",
"description": "Group name."
},
"control_number": {
"type": "string"
},
"$schema": {
"type": "string"
}
}
}
{
"title": "Archive schema v3.0.0",
"id": "http://<host:port>/schemas/archives/archive_v3.0.0.json",
"$schema": "http://<host:port>/schemas/archives/archive_v3.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"from": {
"type": "string",
"description": "Mail header from."
},
"subject": {
"type": "string",
"description": "Mail header subject."
},
"attachments": {
"type": "array",
"items": {
"type": "string"
},
"description": "Mail attachments filenames."
},
"body": {
"type": "string",
"description": "Mail body."
}
}
},
"mailid": {
"type": "string",
"description": "Mail Id."
},
"date": {
"type": "string",
"description": "Mail header date."
},
"group": {
"type": "string",
"description": "Group name."
},
"control_number": {
"type": "string"
},
"$schema": {
"type": "string"
}
}
}
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# CERN Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Init file."""
{
"title": "CodiMD Notes schema v1.0.0",
"id": "http://<host:port>/schemas/codimd/notes_v1.0.0.json",
"$schema": "http://<host:port>/schemas/codimd/notes_v1.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"owner": {
"type": "string",
"description": "Note owner."
},
"title": {
"type": "string",
"description": "Note title."
},
"body": {
"type": "string",
"description": "Note body."
}
}
},
"noteid": {
"type": "string",
"description": "Note Id."
},
"date": {
"type": "string",
"description": "Note creation date."
},
"control_number": {
"type": "string"
},
"$schema": {
"type": "string"
}
}
}
{
"title": "Archive schema v1.0.0",
"id": "http://<host:port>/schemas/egroupsarchives/archive_v1.0.0.json",
"$schema": "http://<host:port>/schemas/egroupsarchives/archive_v1.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"from": {
"type": "string",
"description": "Mail header from."
},
"subject": {
"type": "string",
"description": "Mail header subject."
},
"attachments": {
"type": "array",
"items": {
"type": "string"
},
"description": "Mail attachments filenames."
},
"body": {
"type": "string",
"description": "Mail body."
}
}
},
"date": {
"type": "string",
"description": "Mail header date."
},
"group": {
"type": "string",
"description": "Group name."
},
"control_number": {
"type": "string"
},
"$schema": {
"type": "string"
}
}
}
{
"title": "Archive schema v2.0.0",
"id": "http://<host:port>/schemas/egroupsarchives/archive_v2.0.0.json",
"$schema": "http://<host:port>/schemas/egroupsarchives/archive_v2.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"from": {
"type": "string",
"description": "Mail header from."
},
"subject": {
"type": "string",
"description": "Mail header subject."
},
"attachments": {
"type": "array",
"items": {
"type": "string"
},