Commit 56a80ece authored by Carina Antunes

[SEARCH-96] jacow: scrape metadata + new filters

parent 2a12971d
CERN_SEARCH_INSTANCE=egroupsarchives
INVENIO_INDEXER_DEFAULT_DOC_TYPE=archive_v1.0.0
INVENIO_INDEXER_DEFAULT_INDEX=egroupsarchives-archive_v1.0.0
INVENIO_INDEXER_DEFAULT_DOC_TYPE=archive_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=egroupsarchives-archive_v3.0.0
CERN_SEARCH_INSTANCE=webservices
INVENIO_INDEXER_DEFAULT_DOC_TYPE=generic_website_v2.0.0
INVENIO_INDEXER_DEFAULT_INDEX=webservices-generic_website_v2.0.0
INVENIO_INDEXER_DEFAULT_DOC_TYPE=generic_website_v3.0.0
INVENIO_INDEXER_DEFAULT_INDEX=webservices-generic_website_v3.0.0
CERN_SEARCH_PROCESS_FILE_META='True'
@@ -10,10 +10,15 @@
FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:bfdd86117598a031f427328c9d276f7f1b782520
ARG build_env
# Switch to base once issues with pipenv are fixed
RUN yum update -y && \
yum install -y mailcap
# CERN Search installation
WORKDIR /${WORKING_DIR}/src
ADD . /${WORKING_DIR}/src
RUN pip freeze
# If env is not production, install development dependencies
RUN if [ "$build_env" != "prod" ]; then pipenv install --system --ignore-pipfile --deploy --dev; fi
@@ -15,7 +15,8 @@ RUN yum update -y && \
gcc \
openssl \
openldap-devel \
https://linuxsoft.cern.ch/cern/centos/7/cern/x86_64/Packages/CERN-CA-certs-20180516-1.el7.cern.noarch.rpm
https://linuxsoft.cern.ch/cern/centos/7/cern/x86_64/Packages/CERN-CA-certs-20180516-1.el7.cern.noarch.rpm \
mailcap
# CERN Search installation
WORKDIR /${WORKING_DIR}/src
@@ -15,12 +15,14 @@ import copy
import os
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from cern_search_rest_api.modules.cernsearch.facets import match_filter, regex_aggregation
from cern_search_rest_api.modules.cernsearch.indexer import CernSearchRecordIndexer
from cern_search_rest_api.modules.cernsearch.permissions import (record_create_permission_factory,
record_delete_permission_factory,
record_list_permission_factory,
record_read_permission_factory,
record_update_permission_factory)
from elasticsearch_dsl import A
from flask import request
from invenio_oauthclient.contrib import cern
from invenio_records_rest import config as irr_config
@@ -129,9 +131,28 @@ RECORDS_REST_ENDPOINTS = dict(
create_permission_factory_imp=record_create_permission_factory,
update_permission_factory_imp=record_update_permission_factory,
delete_permission_factory_imp=record_delete_permission_factory,
suggesters={
'phrase': {
'completion': {
'field': 'suggest_keywords',
}
},
},
)
)
def aggs_filter(field):
    """Create a terms aggregation narrowed by the first filter value.

    :param field: Field name.
    :returns: Function that returns the terms aggregation.
    """
    def inner(values):
        return A('terms', field=field, include=f'.*{values[0]}.*')
    return inner
RECORDS_REST_FACETS = {
'webservices': {
'aggs': {
......@@ -141,14 +162,22 @@ RECORDS_REST_FACETS = {
'type_format': {
'terms': {'field': 'type_format'}
},
'authors': {
'terms': {'field': '_data.authors.exact_match'}
}
'author': regex_aggregation('_data.authors.exact_match', 'authors_suggest'),
'site': regex_aggregation('_data.site.exact_match', 'sites_suggest'),
'keyword': regex_aggregation('_data.keywords.exact_match', 'keywords_suggest')
},
'filters': {
'collection': terms_filter("collection"),
'type_format': terms_filter("type_format"),
'authors': terms_filter("_data.authors.exact_match")
'author': terms_filter("_data.authors.exact_match"),
'site': terms_filter("_data.site.exact_match"),
'keyword': terms_filter("_data.keywords.exact_match"),
},
'matches': {
'author_match': match_filter("_data.authors"),
'keyword_match': match_filter("_data.keywords"),
'site_match': match_filter("_data.site"),
'name_match': match_filter("_data.name"),
}
}
}
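For orientation, a request exercising the new facets could look like this sketch; the host, endpoint path and field values are illustrative assumptions, not part of this commit:

import requests

# The plain 'author'/'site'/'keyword' params feed exact-match terms filters,
# the new '*_match' params feed full-text match queries, and '*_suggest'
# params narrow the corresponding regex aggregations.
resp = requests.get(
    "https://search.example.ch/api/webservices/",  # assumed URL
    params={
        "q": "beam dynamics",
        "author": "Smith, J.",
        "keyword_match": "accelerator",
        "authors_suggest": "Smi",
    },
)
author_buckets = resp.json()["aggregations"]["author"]["buckets"]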
@@ -254,3 +283,6 @@ SEARCH_CLIENT_CONFIG = dict(
    # maximum number of connections to each node (default: 5)
    maxsize=int(os.getenv("ELASTICSEARCH_MAX_SIZE", 5)),
)
# FILE
PROCESS_FILE_META = ast.literal_eval(os.getenv("CERN_SEARCH_PROCESS_FILE_META", 'False'))
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Custom Facets and factories for result filtering and aggregation.
See :data:`invenio_records_rest.config.RECORDS_REST_FACETS` for more
information on how to specify aggregations and filters.
"""
from __future__ import absolute_import, print_function
from elasticsearch_dsl import A, Q
from flask import current_app, request
from six import text_type
from werkzeug.datastructures import MultiDict
def regex_aggregation(field, query_param):
    """Create a regex aggregation.

    :param field: Field name.
    :param query_param: Query param name.
    :returns: Function that returns the A query.
    """
    def inner():
        value = request.values.get(query_param, type=text_type)
        if value:
            return A('terms', field=field, include=f'.*{value}.*')
        else:
            return A('terms', field=field)
    return inner
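Concretely, with ?keywords_suggest=acc in the request the inner function yields a terms aggregation whose buckets are restricted by regex; without the parameter it falls back to a plain terms aggregation (a sketch, outside of a real request context):

from elasticsearch_dsl import A

# What regex_aggregation('_data.keywords.exact_match', 'keywords_suggest')()
# evaluates to when the request carries ?keywords_suggest=acc ...
agg = A('terms', field='_data.keywords.exact_match', include='.*acc.*')
# ... and when the parameter is absent:
agg_default = A('terms', field='_data.keywords.exact_match')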
def match_filter(field):
    """Create a match query.

    :param field: Field name.
    :returns: Function that returns the match query.
    """
    def inner(values):
        return Q("match", **{field: ' '.join(values)})
    return inner
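For example, match_filter('_data.authors') called with ['smith', 'doe'] builds (illustrative values):

from elasticsearch_dsl import Q

# Values are space-joined into a single full-text match query:
q = Q("match", **{"_data.authors": "smith doe"})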
def _create_match_dsl(urlkwargs, definitions):
    """Create a match DSL expression."""
    filters = []
    for name, filter_factory in definitions.items():
        values = request.values.getlist(name, type=text_type)
        if values:
            filters.append(filter_factory(values))
            for v in values:
                urlkwargs.add(name, v)
    return (filters, urlkwargs)
def _match_filter(search, urlkwargs, definitions):
    """Ingest match filter in query."""
    matches, urlkwargs = _create_match_dsl(urlkwargs, definitions)
    for match_ in matches:
        search = search.query(match_)
    return (search, urlkwargs)
def saas_facets_factory(search, index):
    """Add custom items to the query.

    Facets added to the query can be selected by passing their names in the
    `facets` parameter.

    :param search: Basic search object.
    :param index: Index name.
    :returns: A tuple containing the new search object and a dictionary with
        all fields and values used.
    """
    urlkwargs = MultiDict()

    facets = current_app.config['RECORDS_REST_FACETS'].get(index)
    if facets is not None:
        # Match filter
        search, urlkwargs = _match_filter(search, urlkwargs, facets.get("matches", {}))

    return (search, urlkwargs)
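A minimal sketch of how this factory could be combined with Invenio's default facets factory inside a custom search factory; the wiring below is an assumption for illustration, not code from this commit:

from invenio_records_rest.facets import default_facets_factory

def combined_facets_factory(search, index):
    # Apply the standard aggs/filters/post_filters first ...
    search, urlkwargs = default_facets_factory(search, index)
    # ... then ingest the custom 'matches' section handled above.
    search, match_kwargs = saas_facets_factory(search, index)
    for key, values in match_kwargs.lists():
        for value in values:
            urlkwargs.add(key, value)
    return search, urlkwargs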
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# This file is part of CERN Search.
# Copyright (C) 2018-2019 CERN.
#
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""File Meta utilities."""
import mimetypes
from cern_search_rest_api.modules.cernsearch.utils import reverse_dict_list
FILE_EXT_COLLECTIONS = {
"Document": ["doc", "docx", "odt", "pages", "rtf", "tex", "wpd", "txt"],
"PDF": ["pdf"],
"Sheet": ["ods", "xlsx", "xlsm", "xls", "numbers"],
"Slides": ["ppt", "pptx", "pps", "odp", "key"]
}
FILE_EXT_DEFAULT_COLLECTION = "Other"
FILE_EXTENSION_MAP = reverse_dict_list(FILE_EXT_COLLECTIONS)
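reverse_dict_list is imported from utils and not shown in this diff; presumably it inverts the collection-to-extensions mapping, along these lines (a sketch, not the actual source):

# Presumed behaviour: {"PDF": ["pdf"], "Sheet": ["ods", ...]}
#                  -> {"pdf": "PDF", "ods": "Sheet", ...}
def reverse_dict_list(dict_of_lists):
    return {item: key for key, items in dict_of_lists.items() for item in items}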
def extract_metadata_from_processor(metadata):
    """Prepare metadata from processor."""
    extracted = {}

    if metadata.get('Author'):
        authors = metadata['Author']
        extracted['authors'] = authors.strip(' ') if isinstance(authors, str) else ', '.join(authors)
    if metadata.get('Content-Type'):
        extracted['content_type'] = mime_type_to_file_collection(metadata['Content-Type'])
    if metadata.get('title'):
        extracted['title'] = metadata['title']
    if metadata.get('Keywords'):
        keywords = metadata['Keywords']
        if not isinstance(keywords, list):
            keywords = keywords.split(",")
        # strip
        keywords = [keyword.strip(' ') for keyword in keywords]
        extracted['keywords'] = keywords
    if metadata.get('Creation-Date'):
        extracted['creation_date'] = metadata['Creation-Date']

    return extracted
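An illustrative round trip, assuming Tika-style metadata keys from the file processor (values are made up):

meta = {
    "Author": ["J. Smith", "A. Jones"],
    "Content-Type": "application/pdf",
    "title": "Beam dynamics",
    "Keywords": "accelerator, linac ",
    "Creation-Date": "2019-05-01T10:00:00Z",
}
extract_metadata_from_processor(meta)
# -> {'authors': 'J. Smith, A. Jones', 'content_type': 'PDF',
#     'title': 'Beam dynamics', 'keywords': ['accelerator', 'linac'],
#     'creation_date': '2019-05-01T10:00:00Z'}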
def mime_type_to_file_collection(mime_type):
    """Convert mime type to a friendly name collection."""
    extensions = mimetypes.guess_all_extensions(mime_type.split(";")[0], strict=False)
    if not extensions:
        return FILE_EXT_DEFAULT_COLLECTION

    def strip_dot(extension):
        return extension.strip(".")

    for ext in extensions:
        collection = FILE_EXTENSION_MAP.get(strip_dot(ext))
        if collection:
            return collection

    return FILE_EXT_DEFAULT_COLLECTION
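Expected behaviour, assuming the standard library's mimetypes tables:

mime_type_to_file_collection("application/msword; charset=utf-8")  # -> "Document"
mime_type_to_file_collection("application/pdf")                    # -> "PDF"
mime_type_to_file_collection("application/x-unknown")              # -> "Other"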
@@ -7,7 +7,7 @@
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""File utilities."""
import json
from io import BytesIO
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
@@ -31,12 +31,14 @@ def record_from_object_version(obj: ObjectVersion):
return record
def persist_file_content(record: CernSearchRecord, file_content: str, filename: str):
def persist_file_content(record: CernSearchRecord, file_content: dict, filename: str):
    """Persist file's extracted content in bucket on filesystem and database."""
    current_app.logger.debug(f"Persist file: {filename} in record {record.id}")

    file_content.pop("attachments", None)

    bucket_content = record.files_content.bucket
    ObjectVersion.create(bucket_content, filename, stream=BytesIO(file_content.encode()))
    ObjectVersion.create(bucket_content, filename, stream=BytesIO(json.dumps(file_content).encode()))
    db.session.commit()
@@ -7,15 +7,26 @@
# Citadel Search is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.
"""Indexer utilities."""
import json as json_lib
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from cern_search_rest_api.modules.cernsearch.file_meta import extract_metadata_from_processor
from flask import current_app
from invenio_files_rest.storage import FileStorage
from invenio_indexer.api import RecordIndexer
READ_MODE_BINARY = 'rb'
ATTACHMENT_KEY = '_attachment'
FILE_KEY = '_file'
READ_WRITE_MODE_BINARY = 'rb+'
CONTENT_KEY = 'content'
FILE_KEY = 'file'
FILE_FORMAT_KEY = 'file_extension'
DATA_KEY = '_data'
AUTHORS_KEY = 'authors'
COLLECTION_KEY = 'collection'
NAME_KEY = 'name'
KEYWORDS_KEY = 'keywords'
CREATION_KEY = 'creation_date'
class CernSearchRecordIndexer(RecordIndexer):
@@ -34,14 +45,55 @@ def index_file_content(sender, json=None, record: CernSearchRecord = None, index
current_app.logger.debug(f"Index file content {file_obj.obj.basename} in {record.id}")
storage = file_obj.obj.file.storage() # type: FileStorage
fp = storage.open(mode=READ_MODE_BINARY)
try:
file_content = fp.read()
json[DATA_KEY][ATTACHMENT_KEY] = dict(_content=file_content)
json[FILE_KEY] = file_obj.obj.basename
finally:
fp.close()
file_content = bc_file_content(storage)
json[DATA_KEY][CONTENT_KEY] = file_content['content']
json[FILE_KEY] = file_obj.obj.basename
if current_app.config.get('PROCESS_FILE_META'):
metadata = extract_metadata_from_processor(file_content['metadata'])
if metadata.get('authors'):
json[DATA_KEY][AUTHORS_KEY] = metadata.get('authors')
if metadata.get('content_type'):
json[COLLECTION_KEY] = metadata['content_type']
if metadata.get('title'):
json[DATA_KEY][NAME_KEY] = metadata['title']
if metadata.get('keywords'):
json[DATA_KEY][KEYWORDS_KEY] = metadata['keywords']
if metadata.get('creation_date'):
json[CREATION_KEY] = metadata['creation_date']
if "." in file_obj.obj.basename:
json[FILE_FORMAT_KEY] = file_obj.obj.basename.split(".")[-1]
# Index first or none
break
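To make the enrichment concrete, a record payload leaving this handler would look roughly like the following (field values are illustrative):

# Rough shape of the JSON sent to Elasticsearch after enrichment:
json = {
    "_data": {
        "content": "Extracted full text ...",   # CONTENT_KEY
        "authors": "J. Smith, A. Jones",        # AUTHORS_KEY
        "name": "Beam dynamics",                # NAME_KEY
        "keywords": ["accelerator", "linac"],   # KEYWORDS_KEY
    },
    "collection": "PDF",                        # COLLECTION_KEY
    "creation_date": "2019-05-01T10:00:00Z",    # CREATION_KEY
    "file": "paper.pdf",                        # FILE_KEY
    "file_extension": "pdf",                    # FILE_FORMAT_KEY
}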
def bc_file_content(storage):
    """Get file content: backward compatible with files without metadata.

    Except clause and write can be removed after:
    https://its.cern.ch/jira/browse/SEARCH-84
    """
    try:
        with storage.open(mode=READ_WRITE_MODE_BINARY) as fp:
            file_content = json_lib.load(fp)
            if isinstance(file_content, dict) and 'content' in file_content:
                return file_content

            file_content = {'content': file_content}
            fp.seek(0)
            fp.write(json_lib.dumps(file_content).encode())
            return file_content
    except ValueError:
        with storage.open(mode=READ_WRITE_MODE_BINARY) as fp:
            file_content = fp.read().decode()
            file_content = {'content': file_content}
            fp.seek(0)
            fp.write(json_lib.dumps(file_content).encode())
            return file_content
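In effect the helper migrates legacy payloads in place; the three cases it distinguishes (illustrative payloads):

import json as json_lib

legacy_json_text = json_lib.dumps("full text")   # old format: JSON-decodable string
legacy_raw_text = b"full text"                   # old format: not JSON, hits the ValueError branch
new_style = json_lib.dumps({"content": "full text",
                            "metadata": {"Author": "J. Smith"}})
# bc_file_content returns {'content': 'full text'} for the first two, rewriting
# the stored file as JSON, and the already-wrapped dict unchanged for the third.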
@@ -166,7 +166,7 @@
"control_number": {
"type": "string"
},
"_url": {
"url": {
"type": "string"
},
"$schema": {
@@ -43,7 +43,7 @@
"control_number": {
"type": "string"
},
"_url": {
"url": {
"type": "string"
},
"$schema": {
{
"title": "Webservice File schema v3.0.0",
"id": "http://0.0.0.0:5000/schemas/webservices/file_v1.0.0.json",
"$schema": "http://0.0.0.0:5000/schemas/webservices/file_v1.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"site": {
"type": "string",
"description": "Base website URL"
},
"origin": {
"type": "string",
"description": "Website origin. Meaning FL or ATT, or SRC if its equals than url"
}
}
},
"suggest": {
"type": "array",
"items": {
"type": "string"
}
},
"promoted": {
"type": "boolean"
},
"promoted_keywords": {
"type": "array",
"items": {
"type": "string"
}
},
"image_source": {
"type": "string"
},
"analytics_relevance": {
"type": "number",
"description": "Matomo analytics based relevance of the site"
},
"last_updated": {
"type": "string",
"description": "Datetime of when the web page content was last updated"
},
"url": {
"type": "string",
"description": "Full path website URL"
},
"_updated": {
"type": "string",
"description": "Internal last updated datetime of the record"
},
"_created": {
"type": "string",
"description": "Internal creation datetime of the record"
},
"control_number": {
"type": "string"
},
"$schema": {
"type": "string"
}
}
}
{
"title": "Webservice Generic Website schema v3.0.0",
"id": "http://0.0.0.0:5000/schemas/webservices/generic_website_v3.0.0.json",
"$schema": "http://0.0.0.0:5000/schemas/webservices/generic_website_v3.0.0.json",
"type": "object",
"properties": {
"_access": {
"type": "object",
"properties": {
"owner": {
"type": "array",
"items": {
"type": "string"
}
},
"read": {
"type": "array",
"items": {
"type": "string"
}
},
"update": {
"type": "array",
"items": {
"type": "string"
}
},
"delete": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"_data": {
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Website name or title"
},
"site": {
"type": "string",
"description": "Base website URL"
},
"origin": {
"type": "string",
"description": "Website origin. Meaning FL or ATT, or SRC if its equals than url"
},
"content": {
"type": "string",
"description": "Website content"
},
"authors": {
"type": "array",
"description": "Authors of the site",
"items": {
"type": "string",
"description": "Author name"
}
},
"keywords": {
"type": "array",
"description": "Keywords of the site",
"items": {
"type": "string",
"description": "Keyword"
}
}
}