Commit 78646751 authored by Carina Antunes
Browse files

[SEARCH-66] Add highlight and explain params to search api

parent 056c747f
......@@ -14,6 +14,8 @@ import ast
import copy
import os
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from cern_search_rest_api.modules.cernsearch.indexer import CernSearchRecordIndexer
from cern_search_rest_api.modules.cernsearch.permissions import (record_create_permission_factory,
record_delete_permission_factory,
record_list_permission_factory,
......@@ -103,7 +105,8 @@ RECORDS_REST_ENDPOINTS = dict(
item_route='/record/<{0}:pid_value>'.format(_Record_PID),
list_route='/records/',
links_factory_imp='invenio_records_rest.links:default_links_factory',
record_class='cern_search_rest_api.modules.cernsearch.api:CernSearchRecord',
record_class=CernSearchRecord,
indexer_class=CernSearchRecordIndexer,
record_serializers={
'application/json': ('cern_search_rest_api.modules.cernsearch.serializers'
':json_v1_response'),
......
......@@ -10,6 +10,7 @@
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from flask import current_app
from invenio_files_rest.storage import FileStorage
from invenio_indexer.api import RecordIndexer
READ_MODE_BINARY = 'rb'
ATTACHMENT_KEY = '_attachment'
......@@ -17,9 +18,18 @@ FILE_KEY = '_file'
DATA_KEY = '_data'
class CernSearchRecordIndexer(RecordIndexer):
    """Record indexer bound to ``CernSearchRecord``.

    Subclasses ``RecordIndexer`` and overrides nothing but ``record_cls``.
    """

    # Record class used by this indexer.
    # NOTE(review): presumably RecordIndexer uses this attribute to load
    # records before indexing them -- confirm against invenio-indexer.
    record_cls = CernSearchRecord
def index_file_content(sender, json=None, record: CernSearchRecord = None, index=None, doc_type=None,
arguments=None, **kwargs):
"""Index file content in search."""
if not record.files_content:
return
for file_obj in record.files_content:
current_app.logger.debug(f"Index file content {file_obj.obj.basename} in {record.id}")
......
{
"settings": {
"index.percolator.map_unmapped_fields_as_string": true,
"index.mapping.total_fields.limit": 3000
},
"mappings": {
"file_v0.0.2": {
"dynamic": "strict",
"numeric_detection": true,
"_meta": {
"_owner": "CernSearch-Administrators@cern.ch"
},
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "keyword"
},
"read": {
"type": "keyword"
},
"update": {
"type": "keyword"
},
"delete": {
"type": "keyword"
}
}
},
"_data": {
"type": "object",
"properties": {
"title": {
"type": "keyword"
}
}
},
"_bucket": {
"type": "keyword"
},
"_updated": {
"type": "date"
},
"_created": {
"type": "date"
},
"control_number": {
"type": "keyword"
},
"$schema": {
"enabled": false
}
}
}
}
}
{
"settings": {
"index.percolator.map_unmapped_fields_as_string": true,
"index.mapping.total_fields.limit": 3000
},
"mappings": {
"file_v0.0.3": {
"dynamic": "strict",
"numeric_detection": true,
"_meta": {
"_owner": "CernSearch-Administrators@cern.ch"
},
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "keyword"
},
"read": {
"type": "keyword"
},
"update": {
"type": "keyword"
},
"delete": {
"type": "keyword"
}
}
},
"_data": {
"type": "object",
"properties": {
"title": {
"type": "keyword"
}
}
},
"_bucket": {
"type": "keyword"
},
"_bucket_content": {
"type": "keyword"
},
"_updated": {
"type": "date"
},
"_created": {
"type": "date"
},
"control_number": {
"type": "keyword"
},
"$schema": {
"enabled": false
}
}
}
}
}
......@@ -4,30 +4,22 @@
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# CERN Search is free software; you can redistribute it and/or modify it
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Marshmallow for records and search results."""
from cern_search_rest_api.modules.cernsearch.utils import record_from_index
from flask import current_app
from invenio_indexer.utils import default_record_to_index
from invenio_records_rest.schemas import RecordMetadataSchemaJSONV1
from invenio_records_rest.schemas.json import RecordSchemaJSONV1
from marshmallow import ValidationError, post_dump, validates_schema
def has_and_needs_binary(original_data):
    """Tell whether a record satisfies the ``b64`` requirement of its index.

    The doc type resolved by ``default_record_to_index`` is looked up in the
    ``SEARCH_DOC_PIPELINES`` configuration entry. Only when it is listed
    there is the record's ``_data`` inspected for a ``b64`` value.

    :param original_data: Record data as received; must expose ``.get``.
    :returns: ``False`` when the doc type is listed in
        ``SEARCH_DOC_PIPELINES`` and ``_data.b64`` is missing or falsy,
        ``True`` in every other case.
    """
    _, doc_type = default_record_to_index(original_data)
    pipeline_doc_types = current_app.config['SEARCH_DOC_PIPELINES']

    # Doc types outside the pipeline list have no binary requirement, so
    # ``_data`` is deliberately not touched for them.
    if doc_type not in pipeline_doc_types:
        return True

    return bool(original_data.get("_data").get('b64'))
from marshmallow import ValidationError, fields, validates_schema
class CSASRecordSchemaV1(RecordMetadataSchemaJSONV1):
"""Record schema."""
@validates_schema(pass_original=True)
def validate_record(self, data, original_data):
"""Validate record."""
if not original_data.get('_access'):
raise ValidationError('Missing field _access')
delete = original_data.get('_access').get('delete')
......@@ -44,19 +36,11 @@ class CSASRecordSchemaV1(RecordMetadataSchemaJSONV1):
'field _access.owner')
if not original_data.get('_data'):
raise ValidationError('Missing field _data')
if not has_and_needs_binary(original_data):
raise ValidationError('Record to be index belongs to binary index '
'but does not contain the [b64] field')
return
class CSASRecordSearchSchemaJSONV1(RecordSchemaJSONV1):
    """Record Search schema."""

    @post_dump()
    def remove_base64(self, data):
        """Remove the base64 payload from a dumped record when applicable.

        The doc type is resolved with ``record_from_index``. If it is listed
        in the ``SEARCH_DOC_PIPELINES`` configuration entry, the ``b64`` key
        is popped from ``data['metadata']['_data']``.

        :param data: Dumped record; mutated in place.
        :returns: The same ``data`` object.
        """
        es_index, doc = record_from_index(data)
        binary_index_list = current_app.config['SEARCH_DOC_PIPELINES']
        if doc in binary_index_list:
            # pop() is called without a default, so a missing 'b64' key
            # raises KeyError here.
            data.get('metadata').get("_data").pop('b64')
        return data

    # Extra per-hit entries exposed by the search schema, both declared as
    # marshmallow ``Raw`` fields.
    highlight = fields.Raw()
    explanation = fields.Raw()
......@@ -4,8 +4,9 @@
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# CERN Search is free software; you can redistribute it and/or modify it
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Search utilities."""
from cern_search_rest_api.modules.cernsearch.utils import get_user_provides
from elasticsearch_dsl import Q
......@@ -13,6 +14,7 @@ from flask import current_app, request
from invenio_records_rest.query import default_search_factory
from invenio_search import RecordsSearch
from invenio_search.api import DefaultFilter
from werkzeug.datastructures import MultiDict
"""
......@@ -62,6 +64,7 @@ def cern_search_filter():
def get_egroups():
"""Get egroups from access param, config or authenticated user."""
egroups = request.args.get('access', None)
# If access rights are sent or is a search query
if egroups or (request.path == '/records/' and request.method == 'GET'):
......@@ -76,10 +79,26 @@ def get_egroups():
return get_user_provides()
def search_factory(self, search, query_parser=None):
class RecordCERNSearch(RecordsSearch):
    """CERN search class with Elasticsearch DSL.

    Extends ``RecordsSearch`` and only customises the nested ``Meta``.
    """

    class Meta:
        """Configuration for ``Search`` and ``FacetedSearch`` classes."""

        # No doc types are set on this search class.
        doc_types = None
        # Default filter: ``cern_search_filter`` wrapped in ``DefaultFilter``.
        default_filter = DefaultFilter(cern_search_filter)
def search_factory(self, search: RecordCERNSearch, query_parser=None):
"""Parse query using elasticsearch DSL query.
:param self: REST view.
:param search: Elastic search DSL search instance.
:returns: Tuple with search instance and URL arguments.
"""
def _csas_query_parser(qstr=None):
"""Default parser that uses the Q() from elasticsearch_dsl."""
"""Parse with Q() from elasticsearch_dsl."""
if qstr:
return Q(
'query_string',
......@@ -93,15 +112,15 @@ def search_factory(self, search, query_parser=None):
search = search.params(search_type="dfs_query_then_fetch") # search across all shards
return search, urlkwargs
highlights = request.args.getlist('highlight', None)
if highlights:
search = search.highlight(*highlights)
csas_search_factory = search_factory
explain = request.args.get('explain', None)
if explain:
search = search.extra(explain=explain)
return search, urlkwargs
class RecordCERNSearch(RecordsSearch):
    """CERN search class.

    Extends ``RecordsSearch`` and only customises the nested ``Meta``.
    """

    class Meta:
        """Search configuration: doc types and default filter."""

        # No doc types are set on this search class.
        doc_types = None
        # Default filter: ``cern_search_filter`` wrapped in ``DefaultFilter``.
        default_filter = DefaultFilter(cern_search_filter)
csas_search_factory = search_factory
......@@ -6,19 +6,20 @@
#
# CERN Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Serializers for records and search results."""
from __future__ import absolute_import, print_function
from cern_search_rest_api.modules.cernsearch.marshmallow import CSASRecordSchemaV1, CSASRecordSearchSchemaJSONV1
from invenio_records_rest.serializers.json import JSONSerializer
from cern_search_rest_api.modules.cernsearch.serializers.json import CernJSONSerializer
from invenio_records_rest.serializers.response import record_responsify, search_responsify
# Serializers
# ===========
#: JSON serializer definition.
json_v1 = JSONSerializer(CSASRecordSchemaV1, replace_refs=True)
json_v1_records = JSONSerializer(CSASRecordSearchSchemaJSONV1)
json_v1 = CernJSONSerializer(CSASRecordSchemaV1, replace_refs=True)
json_v1_records = CernJSONSerializer(CSASRecordSearchSchemaJSONV1)
# Records-REST serializers
# ========================
......
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Mixin helper class for preprocessing records and search results."""
from __future__ import absolute_import, print_function
from invenio_records_rest.serializers.base import PreprocessorMixin
class CernPreprocessorMixin(PreprocessorMixin):
    """Preprocessor mixin that adds highlight and explanation data to hits."""

    @staticmethod
    def preprocess_search_hit(pid, record_hit, links_factory=None, **kwargs):
        """Prepare a record hit from Elasticsearch for serialization.

        Delegates to the parent ``preprocess_search_hit`` and then copies the
        hit's ``highlight`` and ``_explanation`` entries onto the result.

        :param pid: Persistent identifier, forwarded to the parent.
        :param record_hit: Search hit; read through ``.get``.
        :param links_factory: Links factory, forwarded to the parent as
            given (previously it was always replaced by ``None``).
        :param kwargs: Extra keyword arguments, forwarded to the parent.
        :returns: The record built by the parent, with ``highlight`` and
            ``explanation`` keys set (an empty dict when the hit has none).
        """
        # Zero-argument super() is not available inside a staticmethod, so
        # the class is named explicitly for both arguments.
        record = super(CernPreprocessorMixin, CernPreprocessorMixin).preprocess_search_hit(
            pid,
            record_hit,
            links_factory=links_factory,
            **kwargs
        )
        record["highlight"] = record_hit.get('highlight', {})
        # Elasticsearch stores the explanation under the underscored key.
        record["explanation"] = record_hit.get('_explanation', {})
        return record
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Marshmallow based JSON serializer for records."""
from __future__ import absolute_import, print_function
from cern_search_rest_api.modules.cernsearch.serializers.base import CernPreprocessorMixin
from invenio_records_rest.serializers import JSONSerializer
class CernJSONSerializer(JSONSerializer, CernPreprocessorMixin):
    """Marshmallow based JSON serializer for records.

    Combines ``JSONSerializer`` with ``CernPreprocessorMixin``; the class
    body adds nothing of its own.

    NOTE(review): ``JSONSerializer`` is listed first in the bases -- confirm
    that the MRO still resolves ``preprocess_search_hit`` to the
    ``CernPreprocessorMixin`` implementation.
    """
......@@ -86,6 +86,22 @@ def test_testclient(app, client, user):
assert description is not None
assert description == 'This contains CernSearch and should appear'
# Test query params
resp = client.get(
'/records/',
headers=get_headers(),
query_string={'q': 'CernSearch', 'explain': 'true', 'highlight': '*'}
)
assert resp.status_code == HTTPStatus.OK
resp_hits = resp.json['hits']
explanation = resp_hits['hits'][0].get('explanation')
print(resp_hits['hits'][0])
assert explanation
highlight = resp_hits['hits'][0].get('highlight')
assert highlight
# Clean the instance. Delete record
resp = client.delete(
'/record/{control_number}'.format(control_number=control_number_one),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment