Commit 44aca735 authored by Carina Antunes
Browse files

[#SEARCH-108] Facets: querystring + url filter

parent a84b7fd3
...@@ -33,7 +33,7 @@ build-env: ...@@ -33,7 +33,7 @@ build-env:
rebuild-env: rebuild-env:
docker-compose -f $(DOCKER_FILE) up -d --build --remove-orphans docker-compose -f $(DOCKER_FILE) up -d --build --remove-orphans
.PHONY: build-env .PHONY: rebuild-env
es-setup: es-setup:
curl -XPUT "http://localhost:9200/_settings" -H 'Content-Type: application/json' -d' \ curl -XPUT "http://localhost:9200/_settings" -H 'Content-Type: application/json' -d' \
......
...@@ -15,7 +15,7 @@ import copy ...@@ -15,7 +15,7 @@ import copy
import os import os
from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord from cern_search_rest_api.modules.cernsearch.api import CernSearchRecord
from cern_search_rest_api.modules.cernsearch.facets import match_filter, regex_aggregation from cern_search_rest_api.modules.cernsearch.facets import regex_aggregation, simple_query_string
from cern_search_rest_api.modules.cernsearch.indexer import CernSearchRecordIndexer from cern_search_rest_api.modules.cernsearch.indexer import CernSearchRecordIndexer
from cern_search_rest_api.modules.cernsearch.permissions import (record_create_permission_factory, from cern_search_rest_api.modules.cernsearch.permissions import (record_create_permission_factory,
record_delete_permission_factory, record_delete_permission_factory,
...@@ -188,10 +188,11 @@ RECORDS_REST_FACETS = { ...@@ -188,10 +188,11 @@ RECORDS_REST_FACETS = {
'keyword': terms_filter("_data.keywords.exact_match"), 'keyword': terms_filter("_data.keywords.exact_match"),
}, },
'matches': { 'matches': {
'author_match': match_filter("_data.authors"), 'author_match': simple_query_string("_data.authors"),
'keyword_match': match_filter("_data.keywords"), 'keyword_match': simple_query_string("_data.keywords"),
'site_match': match_filter("_data.site"), 'site_match': simple_query_string("_data.site"),
'name_match': match_filter("_data.name"), 'name_match': simple_query_string("_data.name"),
'url_match': simple_query_string("url"),
} }
} }
} }
......
...@@ -52,6 +52,79 @@ def match_filter(field): ...@@ -52,6 +52,79 @@ def match_filter(field):
return inner return inner
def query_string(field):
    """Build a factory for ``query_string`` queries scoped to one field.

    :param field: Field name the query expression is prefixed with.
    :returns: Function that takes the list of requested values and returns
        the corresponding ``query_string`` query.
    """
    def build_query(values):
        # All values are space-joined and wrapped in parentheses so the whole
        # group is prefixed by ``field`` in the resulting expression.
        joined_terms = ' '.join(values)
        expression = f"{field}:({joined_terms})"
        return Q(
            'query_string',
            query=expression,
            # Original author's note: calculates score for wildcards queries.
            rewrite="top_terms_1000",
        )

    return build_query
def simple_query_string(field):
    """Build a factory for ``simple_query_string`` queries on one field.

    :param field: Field name the query is restricted to.
    :returns: Function that takes the list of requested values and returns
        the corresponding ``simple_query_string`` query.
    """
    def build_query(values):
        # The individual values are merged into one space-separated query text.
        search_text = ' '.join(values)
        target_fields = [field]
        return Q('simple_query_string', query=search_text, fields=target_fields)

    return build_query
def match_phrase_filter(field):
    """Create a match and/or match_phrase query from the requested values.

    Values wrapped in double quotes (``"like this"``) are treated as phrases
    and each one becomes its own ``match_phrase`` clause, with the surrounding
    quotes removed. Every other value, including one whose opening quote is
    never closed, is treated as a plain term; all plain terms are combined
    into a single ``match`` clause.

    [WIP: quotes appearing inside a value are not detected.]

    :param field: Field name.
    :returns: Function that returns the match query. A single clause is
        returned as is; several clauses are combined in a ``bool`` query
        under ``must``.
    """
    def inner(values):
        # Diagnostic trace only: logged at debug level with lazy formatting.
        current_app.logger.debug("match_phrase_filter: %s", values)

        terms = []
        phrases = []
        for value in values:
            is_quoted = (
                len(value) > 1
                and value.startswith('"')
                and value.endswith('"')
            )
            if not is_quoted:
                # Plain term, or a malformed quote: keep it rather than
                # silently discarding what the caller asked for.
                terms.append(value)
                continue

            phrase = value[1:-1].strip()
            if phrase:  # an empty pair of quotes carries nothing to match
                phrases.append(phrase)

        clauses = []
        # With no usable phrase, the (possibly empty) match clause is still
        # produced so the function always returns a query.
        if terms or not phrases:
            clauses.append(Q("match", **{field: ' '.join(terms)}))
        # One clause per phrase: joining phrases would require all their
        # words to appear consecutively as a single long phrase.
        clauses.extend(Q("match_phrase", **{field: phrase}) for phrase in phrases)

        if len(clauses) == 1:
            return clauses[0]
        return Q('bool', must=clauses)

    return inner
def _create_match_dsl(urlkwargs, definitions): def _create_match_dsl(urlkwargs, definitions):
"""Create a match DSL expression.""" """Create a match DSL expression."""
filters = [] filters = []
......
...@@ -35,8 +35,15 @@ ...@@ -35,8 +35,15 @@
] ]
} }
}, },
"char_filter": {
"strip_dot_pattern": {
"type": "pattern_replace",
"pattern": "\\.",
"replacement": " "
}
},
"analyzer": { "analyzer": {
"url_analyzer": { "cern_url_analyzer": {
"type": "custom", "type": "custom",
"tokenizer": "url_tokenizer", "tokenizer": "url_tokenizer",
"filter": [ "filter": [
...@@ -44,6 +51,16 @@ ...@@ -44,6 +51,16 @@
"lowercase" "lowercase"
] ]
}, },
"url_analyzer": {
"tokenizer": "standard",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
},
"autocomplete": { "autocomplete": {
"tokenizer": "autocomplete", "tokenizer": "autocomplete",
"filter": [ "filter": [
...@@ -52,6 +69,13 @@ ...@@ -52,6 +69,13 @@
}, },
"autocomplete_search": { "autocomplete_search": {
"tokenizer": "lowercase" "tokenizer": "lowercase"
},
"case_insensitive_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -62,6 +86,16 @@ ...@@ -62,6 +86,16 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"url_normalizer": {
"type": "custom",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
} }
} }
} }
...@@ -108,7 +142,7 @@ ...@@ -108,7 +142,7 @@
}, },
"site": { "site": {
"type": "text", "type": "text",
"analyzer": "url_analyzer", "analyzer": "cern_url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"type": "keyword", "type": "keyword",
...@@ -119,7 +153,7 @@ ...@@ -119,7 +153,7 @@
}, },
"origin": { "origin": {
"type": "text", "type": "text",
"analyzer": "url_analyzer", "analyzer": "cern_url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"type": "keyword" "type": "keyword"
...@@ -207,7 +241,7 @@ ...@@ -207,7 +241,7 @@
"analyzer": "url_analyzer", "analyzer": "url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"normalizer": "case_insensitive_normalizer", "normalizer": "url_normalizer",
"type": "keyword" "type": "keyword"
} }
} }
......
...@@ -35,8 +35,15 @@ ...@@ -35,8 +35,15 @@
] ]
} }
}, },
"char_filter": {
"strip_dot_pattern": {
"type": "pattern_replace",
"pattern": "\\.",
"replacement": " "
}
},
"analyzer": { "analyzer": {
"url_analyzer": { "cern_url_analyzer": {
"type": "custom", "type": "custom",
"tokenizer": "url_tokenizer", "tokenizer": "url_tokenizer",
"filter": [ "filter": [
...@@ -52,6 +59,23 @@ ...@@ -52,6 +59,23 @@
}, },
"autocomplete_search": { "autocomplete_search": {
"tokenizer": "lowercase" "tokenizer": "lowercase"
},
"case_insensitive_analyzer": {
"tokenizer": "standard",
"filter": [
"lowercase",
"asciifolding"
]
},
"url_analyzer": {
"tokenizer": "standard",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -62,6 +86,16 @@ ...@@ -62,6 +86,16 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"url_normalizer": {
"type": "custom",
"char_filter": [
"strip_dot_pattern"
],
"filter": [
"lowercase",
"asciifolding"
]
} }
} }
} }
...@@ -108,7 +142,7 @@ ...@@ -108,7 +142,7 @@
}, },
"site": { "site": {
"type": "text", "type": "text",
"analyzer": "url_analyzer", "analyzer": "cern_url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"type": "keyword", "type": "keyword",
...@@ -119,7 +153,7 @@ ...@@ -119,7 +153,7 @@
}, },
"origin": { "origin": {
"type": "text", "type": "text",
"analyzer": "url_analyzer", "analyzer": "cern_url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"type": "keyword" "type": "keyword"
...@@ -190,7 +224,7 @@ ...@@ -190,7 +224,7 @@
"analyzer": "url_analyzer", "analyzer": "url_analyzer",
"fields": { "fields": {
"exact_match": { "exact_match": {
"normalizer": "case_insensitive_normalizer", "normalizer": "url_normalizer",
"type": "keyword" "type": "keyword"
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment