Commit 44aca735 authored by Carina Antunes

[#SEARCH-108] Facets: querystring + url filter

parent a84b7fd3
......@@ -33,7 +33,7 @@ build-env:
rebuild-env:
	docker-compose -f $(DOCKER_FILE) up -d --build --remove-orphans
.PHONY: build-env
.PHONY: rebuild-env
es-setup:
	curl -XPUT "http://localhost:9200/_settings" -H 'Content-Type: application/json' -d' \
......
......@@ -15,7 +15,7 @@ import copy
import os
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from cern_search_rest_api.modules.cernsearch.facets import match_filter, regex_aggregation
from cern_search_rest_api.modules.cernsearch.facets import regex_aggregation, simple_query_string
from cern_search_rest_api.modules.cernsearch.indexer import CernSearchRecordIndexer
from cern_search_rest_api.modules.cernsearch.permissions import (record_create_permission_factory,
record_delete_permission_factory,
......@@ -188,10 +188,11 @@ RECORDS_REST_FACETS = {
'keyword': terms_filter("_data.keywords.exact_match"),
},
'matches': {
'author_match': match_filter("_data.authors"),
'keyword_match': match_filter("_data.keywords"),
'site_match': match_filter("_data.site"),
'name_match': match_filter("_data.name"),
'author_match': simple_query_string("_data.authors"),
'keyword_match': simple_query_string("_data.keywords"),
'site_match': simple_query_string("_data.site"),
'name_match': simple_query_string("_data.name"),
'url_match': simple_query_string("url"),
}
}
}
......
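Each entry under the new `matches` group is a filter factory from `cern_search_rest_api.modules.cernsearch.facets`: it receives the list of values collected for the matching querystring parameter (e.g. repeated `?author_match=...` arguments, presumably gathered by `_create_match_dsl` further down in facets.py) and returns an elasticsearch-dsl query scoped to one field. A minimal sketch of what the new `simple_query_string` factory produces; the example values are invented and are not part of this commit:

```python
# Sketch only: example values are invented; elasticsearch_dsl is assumed to be
# the Q implementation used by facets.py.
from cern_search_rest_api.modules.cernsearch.facets import simple_query_string

author_filter = simple_query_string("_data.authors")  # factory wired in the config above
query = author_filter(["doe", "smith"])               # values from ?author_match=doe&author_match=smith

print(query.to_dict())
# {'simple_query_string': {'query': 'doe smith', 'fields': ['_data.authors']}}
```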
......@@ -52,6 +52,79 @@ def match_filter(field):
return inner


def query_string(field):
    """Create a query_string query.

    :param field: Field name.
    :returns: Function that returns the query_string query.
    """
    def inner(values):
        return Q(
            'query_string',
            query=f"{field}:({' '.join(values)})",
            rewrite="top_terms_1000",  # calculates scores for wildcard queries
        )
    return inner


def simple_query_string(field):
    """Create a simple_query_string query.

    :param field: Field name.
    :returns: Function that returns the simple_query_string query.
    """
    def inner(values):
        return Q(
            'simple_query_string',
            query=' '.join(values),
            fields=[field]
        )
    return inner


def match_phrase_filter(field):
    """Create a match_phrase or match query. [WIP: quoted phrases embedded inside a value are not detected yet]

    :param field: Field name.
    :returns: Function that returns the match and/or match_phrase query.
    """
    def inner(values):
        current_app.logger.warning(f"match_phrase_filter: {values}")
        matches = []
        phrase_matches = []
        for value in values:
            current_app.logger.warning(f"value: {value}")
            if not value.startswith("\""):
                matches.append(value)
                continue
            # Keep only properly quoted phrases; an unterminated quote is dropped (WIP).
            if value.endswith("\"") and len(value) > 1:
                phrase_matches.append(value)
        query_match = Q("match", **{field: ' '.join(matches)})
        query_match_phrase = Q("match_phrase", **{field: ' '.join(phrase_matches)})
        current_app.logger.warning(f"match_phrase_filter: {field}: {' '.join(matches)}")
        if matches and phrase_matches:
            return Q('bool', must=[query_match, query_match_phrase])
        if phrase_matches:
            return query_match_phrase
        return query_match
    return inner

def _create_match_dsl(urlkwargs, definitions):
"""Create a match DSL expression."""
filters = []
......
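For reference, the three factories differ in how they treat the user input: `query_string` embeds the field in Lucene syntax and sets `rewrite="top_terms_1000"` so wildcard terms still get scored, `simple_query_string` forwards the raw text and never fails on malformed syntax, and the WIP `match_phrase_filter` splits quoted values from plain ones. A rough walk-through of the latter, with invented values:

```python
# Rough walk-through, not from the commit: values are invented, and the call
# must run inside a Flask application context because the filter logs through
# current_app. Note the surrounding quotes are currently kept inside the
# phrase text (see the WIP note in the docstring).
from cern_search_rest_api.modules.cernsearch.facets import match_phrase_filter

name_filter = match_phrase_filter("_data.name")
query = name_filter(['"higgs boson"', 'atlas'])

print(query.to_dict())
# {'bool': {'must': [{'match': {'_data.name': 'atlas'}},
#                    {'match_phrase': {'_data.name': '"higgs boson"'}}]}}
```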
......@@ -35,8 +35,15 @@
]
}
},
"char_filter": {
"strip_dot_pattern": {
"type": "pattern_replace",
"pattern": "\\.",
"replacement": " "
}
},
"analyzer": {
"url_analyzer": {
"cern_url_analyzer": {
"type": "custom",
"tokenizer": "url_tokenizer",
"filter": [
......@@ -44,6 +51,16 @@
"lowercase"
]
},
"url_analyzer": {
"tokenizer": "standard",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
},
"autocomplete": {
"tokenizer": "autocomplete",
"filter": [
......@@ -52,6 +69,13 @@
},
"autocomplete_search": {
"tokenizer": "lowercase"
},
"case_insensitive_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
}
},
"normalizer": {
......@@ -62,6 +86,16 @@
"lowercase",
"asciifolding"
]
},
"url_normalizer": {
"type": "custom",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
......@@ -108,7 +142,7 @@
},
"site": {
"type": "text",
"analyzer": "url_analyzer",
"analyzer": "cern_url_analyzer",
"fields": {
"exact_match": {
"type": "keyword",
......@@ -119,7 +153,7 @@
},
"origin": {
"type": "text",
"analyzer": "url_analyzer",
"analyzer": "cern_url_analyzer",
"fields": {
"exact_match": {
"type": "keyword"
......@@ -207,7 +241,7 @@
"analyzer": "url_analyzer",
"fields": {
"exact_match": {
"normalizer": "case_insensitive_normalizer",
"normalizer": "url_normalizer",
"type": "keyword"
}
}
......
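The new `strip_dot_pattern` char filter turns literal dots into spaces before tokenization, so the redefined `url_analyzer` (standard tokenizer plus lowercase and asciifolding) breaks hostnames such as `home.cern` into separate searchable tokens, while the previous URL analyzer lives on under the name `cern_url_analyzer`. A quick way to check the behaviour against a running index; the index name and the elasticsearch-py 7.x client usage are assumptions, not part of the commit:

```python
# Assumed setup: elasticsearch-py 7.x client and an index created from this
# mapping; "webservices" is a placeholder name.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

resp = es.indices.analyze(
    index="webservices",
    body={"analyzer": "url_analyzer", "text": "https://Home.CERN/science"},
)
print([t["token"] for t in resp["tokens"]])
# Expected roughly: ['https', 'home', 'cern', 'science'] - strip_dot_pattern
# replaces the dots with spaces before the standard tokenizer splits the text.
```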
......@@ -35,8 +35,15 @@
]
}
},
"char_filter": {
"strip_dot_pattern": {
"type": "pattern_replace",
"pattern": "\\.",
"replacement": " "
}
},
"analyzer": {
"url_analyzer": {
"cern_url_analyzer": {
"type": "custom",
"tokenizer": "url_tokenizer",
"filter": [
......@@ -52,6 +59,23 @@
},
"autocomplete_search": {
"tokenizer": "lowercase"
},
"case_insensitive_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
},
"url_analyzer": {
"tokenizer": "standard",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
}
},
"normalizer": {
......@@ -62,6 +86,16 @@
"lowercase",
"asciifolding"
]
},
"url_normalizer": {
"type": "custom",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
......@@ -108,7 +142,7 @@
},
"site": {
"type": "text",
"analyzer": "url_analyzer",
"analyzer": "cern_url_analyzer",
"fields": {
"exact_match": {
"type": "keyword",
......@@ -119,7 +153,7 @@
},
"origin": {
"type": "text",
"analyzer": "url_analyzer",
"analyzer": "cern_url_analyzer",
"fields": {
"exact_match": {
"type": "keyword"
......@@ -190,7 +224,7 @@
"analyzer": "url_analyzer",
"fields": {
"exact_match": {
"normalizer": "case_insensitive_normalizer",
"normalizer": "url_normalizer",
"type": "keyword"
}
}
......
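Both mapping files also gain a `url_normalizer`, now applied to the `exact_match` keyword subfield in place of `case_insensitive_normalizer`. A normalizer never splits the input, so it only applies the char filter and token filters: dots become spaces and the value is lowercased and ascii-folded. A sketch under the same assumptions as above (placeholder index name, elasticsearch-py 7.x client):

```python
# Same assumptions as the previous sketch: placeholder index name,
# elasticsearch-py 7.x client.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

resp = es.indices.analyze(
    index="webservices",
    body={"normalizer": "url_normalizer", "text": "HTTPS://Home.CERN/Science"},
)
print([t["token"] for t in resp["tokens"]])
# Expected: ['https://home cern/science'] - a single keyword token, with dots
# replaced by spaces, lowercased and ascii-folded.
```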