indexer.py 1.41 KB
Newer Older
1
2
#!/usr/bin/python
# -*- coding: utf-8 -*-
3
4
5
6
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
7
# Citadel Search is free software; you can redistribute it and/or modify it
8
# under the terms of the MIT License; see LICENSE file for more details.
9
10
"""Indexer utilities."""
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
11
from flask import current_app
12
from invenio_files_rest.storage import FileStorage
13
from invenio_indexer.api import RecordIndexer
14
15
16
17
18

# Mode passed to the storage's open() call in index_file_content.
READ_MODE_BINARY = 'rb'
# Key, nested under DATA_KEY, under which the read file content is stored.
ATTACHMENT_KEY = '_attachment'
# Top-level document key that receives the base name of the indexed file.
FILE_KEY = '_file'
# Top-level document key whose value receives the ATTACHMENT_KEY entry.
DATA_KEY = '_data'
19
20


21
22
23
24
25
26
class CernSearchRecordIndexer(RecordIndexer):
    """Record indexer bound to ``CernSearchRecord``.

    The only customisation over ``RecordIndexer`` is the ``record_cls``
    attribute below.
    """

    # NOTE(review): presumably the record class RecordIndexer uses when
    # loading records to index - confirm against invenio-indexer.
    record_cls = CernSearchRecord


27
28
29
def index_file_content(sender, json=None, record: CernSearchRecord = None, index=None, doc_type=None,
                       arguments=None, **kwargs):
    """Copy the content of the record's first file into the document to index.

    Does nothing when ``record.files_content`` is empty. Otherwise the first
    entry of ``record.files_content`` is opened in binary mode, read in full,
    and written into ``json`` in place: the content goes under
    ``json[DATA_KEY][ATTACHMENT_KEY]['_content']`` and the file's base name
    under ``json[FILE_KEY]``. Any further files are ignored.

    :param sender: Unused here.
    :param json: Document being indexed; mutated in place. It must already
        contain a ``DATA_KEY`` entry.
    :param record: Record whose ``files_content`` supplies the file.
    :param index: Unused here.
    :param doc_type: Unused here.
    :param arguments: Unused here.
    :param kwargs: Unused here.
    :returns: ``None``.
    """
    if not record.files_content:
        return

    for content_file in record.files_content:
        file_name = content_file.obj.basename
        current_app.logger.debug(f"Index file content {file_name} in {record.id}")

        storage = content_file.obj.file.storage()  # type: FileStorage
        stream = storage.open(mode=READ_MODE_BINARY)
        try:
            json[DATA_KEY][ATTACHMENT_KEY] = {'_content': stream.read()}
            json[FILE_KEY] = file_name
        finally:
            # Release the storage handle even if reading or assignment fails.
            stream.close()

        # Index first or none: stop after the first file.
        return