Commit 91eb2779 authored by Carina Antunes
Browse files

[EDMS] Wildcard improvements / Invenio-DB/SQLAlchemy Fix

parent f1d2002b
CERN_SEARCH_INSTANCE=indico
INVENIO_INDEXER_DEFAULT_DOC_TYPE=events_v1.0.0
INVENIO_INDEXER_DEFAULT_INDEX=indico-events_v1.0.0
INVENIO_SEARCH_INDEX_PREFIX=cernsearch-lcagenda_
[settings] [settings]
line_length=120 line_length=120
known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,sqlalchemy,werkzeug known_third_party = celery,click,elasticsearch,elasticsearch_dsl,flask,flask_login,flask_security,invenio_accounts,invenio_app,invenio_db,invenio_files_processor,invenio_files_rest,invenio_indexer,invenio_oauth2server,invenio_oauthclient,invenio_pidstore,invenio_records,invenio_records_files,invenio_records_rest,invenio_rest,invenio_search,kombu,marshmallow,pytest,setuptools,six,sqlalchemy,sqlalchemy_continuum,werkzeug
multi_line_output = 3 multi_line_output = 3
include_trailing_comma = True include_trailing_comma = True
force_grid_wrap = 0 force_grid_wrap = 0
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
# under the terms of the MIT License; see LICENSE file for more details. # under the terms of the MIT License; see LICENSE file for more details.
# Use CentOS7: # Use CentOS7:
FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:6c8529ea9d6817aaab9f64f4c3c55870c5b28d4f FROM gitlab-registry.cern.ch/webservices/cern-search/cern-search-rest-api/cern-search-rest-api-base:795c35db2e0302474454fde0ba453253c8916eef
ARG build_env ARG build_env
# CERN Search installation # CERN Search installation
......
...@@ -2,10 +2,38 @@ ...@@ -2,10 +2,38 @@
"settings": { "settings": {
"index.percolator.map_unmapped_fields_as_text": true, "index.percolator.map_unmapped_fields_as_text": true,
"index.mapping.total_fields.limit": 500, "index.mapping.total_fields.limit": 500,
"index.number_of_shards": 5,
"index.max_ngram_diff": 100,
"index.query.default_field": [ "index.query.default_field": [
"_data.*" "_data.*"
], ],
"analysis": { "analysis": {
"tokenizer": {
"keyword_edge_ngram": {
"type": "edge_ngram",
"min_gram": 2,
"max_gram": 100,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
},
"keyword_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
}
},
"analyzer": { "analyzer": {
"case_accent_analyzer": { "case_accent_analyzer": {
"tokenizer": "standard", "tokenizer": "standard",
...@@ -13,6 +41,27 @@ ...@@ -13,6 +41,27 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"wildcard_edge": {
"tokenizer": "keyword_edge_ngram",
"filter": [
"lowercase",
"asciifolding"
]
},
"wildcard": {
"tokenizer": "keyword_ngram",
"filter": [
"lowercase",
"asciifolding"
]
},
"lowercase_keyword_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -61,8 +110,9 @@ ...@@ -61,8 +110,9 @@
"boost": 5 "boost": 5
}, },
"cern_id": { "cern_id": {
"type": "keyword", "type": "text",
"normalizer": "case_accent_normalizer", "analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 5 "boost": 5
}, },
"cern_id_version": { "cern_id_version": {
...@@ -105,6 +155,12 @@ ...@@ -105,6 +155,12 @@
"type": "keyword", "type": "keyword",
"normalizer": "case_accent_normalizer", "normalizer": "case_accent_normalizer",
"boost": 1.5 "boost": 1.5
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.5
} }
} }
}, },
...@@ -122,6 +178,12 @@ ...@@ -122,6 +178,12 @@
"type": "text", "type": "text",
"boost": 1.25, "boost": 1.25,
"analyzer": "french" "analyzer": "french"
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.25
} }
} }
}, },
...@@ -133,6 +195,12 @@ ...@@ -133,6 +195,12 @@
"analyzer": "case_accent_analyzer", "analyzer": "case_accent_analyzer",
"boost": 1.2 "boost": 1.2
}, },
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.2
},
"email": { "email": {
"type": "text", "type": "text",
"analyzer": "case_accent_analyzer", "analyzer": "case_accent_analyzer",
...@@ -159,6 +227,12 @@ ...@@ -159,6 +227,12 @@
"type": "keyword", "type": "keyword",
"normalizer": "case_accent_normalizer", "normalizer": "case_accent_normalizer",
"boost": 1.4 "boost": 1.4
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.5
} }
} }
}, },
......
...@@ -2,10 +2,27 @@ ...@@ -2,10 +2,27 @@
"settings": { "settings": {
"index.percolator.map_unmapped_fields_as_text": true, "index.percolator.map_unmapped_fields_as_text": true,
"index.mapping.total_fields.limit": 500, "index.mapping.total_fields.limit": 500,
"index.highlight.max_analyzed_offset": 10000000,
"index.number_of_shards": 5,
"index.max_ngram_diff": 100,
"index.query.default_field": [ "index.query.default_field": [
"_data.*" "_data.*"
], ],
"analysis": { "analysis": {
"tokenizer": {
"keyword_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
}
},
"analyzer": { "analyzer": {
"case_accent_analyzer": { "case_accent_analyzer": {
"tokenizer": "standard", "tokenizer": "standard",
...@@ -13,6 +30,20 @@ ...@@ -13,6 +30,20 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"wildcard": {
"tokenizer": "keyword_ngram",
"filter": [
"lowercase",
"asciifolding"
]
},
"lowercase_keyword_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -81,9 +112,17 @@ ...@@ -81,9 +112,17 @@
"boost": 1.9 "boost": 1.9
}, },
"cern_id": { "cern_id": {
"type": "keyword", "type": "text",
"normalizer": "case_accent_normalizer", "analyzer": "wildcard",
"boost": 1.9 "search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.9,
"fields": {
"exact_match": {
"type": "keyword",
"normalizer": "case_accent_normalizer",
"boost": 1.9
}
}
}, },
"cern_id_version": { "cern_id_version": {
"type": "keyword", "type": "keyword",
...@@ -125,6 +164,12 @@ ...@@ -125,6 +164,12 @@
"type": "keyword", "type": "keyword",
"normalizer": "case_accent_normalizer", "normalizer": "case_accent_normalizer",
"boost": 1.4 "boost": 1.4
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.4
} }
} }
}, },
...@@ -142,6 +187,12 @@ ...@@ -142,6 +187,12 @@
"type": "text", "type": "text",
"boost": 1.2, "boost": 1.2,
"analyzer": "french" "analyzer": "french"
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.2
} }
} }
}, },
...@@ -157,6 +208,12 @@ ...@@ -157,6 +208,12 @@
"type": "text", "type": "text",
"analyzer": "case_accent_analyzer", "analyzer": "case_accent_analyzer",
"boost": 1.15 "boost": 1.15
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.15
} }
} }
}, },
...@@ -179,6 +236,12 @@ ...@@ -179,6 +236,12 @@
"type": "keyword", "type": "keyword",
"boost": 1.3, "boost": 1.3,
"normalizer": "case_accent_normalizer" "normalizer": "case_accent_normalizer"
},
"wildcard": {
"type": "text",
"analyzer": "wildcard",
"search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.3
} }
} }
}, },
......
...@@ -2,10 +2,25 @@ ...@@ -2,10 +2,25 @@
"settings": { "settings": {
"index.percolator.map_unmapped_fields_as_text": true, "index.percolator.map_unmapped_fields_as_text": true,
"index.mapping.total_fields.limit": 500, "index.mapping.total_fields.limit": 500,
"index.max_ngram_diff": 100,
"index.query.default_field": [ "index.query.default_field": [
"_data.*" "_data.*"
], ],
"analysis": { "analysis": {
"tokenizer": {
"keyword_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
}
},
"analyzer": { "analyzer": {
"case_accent_analyzer": { "case_accent_analyzer": {
"tokenizer": "standard", "tokenizer": "standard",
...@@ -13,6 +28,20 @@ ...@@ -13,6 +28,20 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"wildcard": {
"tokenizer": "keyword_ngram",
"filter": [
"lowercase",
"asciifolding"
]
},
"lowercase_keyword_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -50,14 +79,30 @@ ...@@ -50,14 +79,30 @@
"type": "object", "type": "object",
"properties": { "properties": {
"code": { "code": {
"type": "keyword", "type": "text",
"normalizer": "case_accent_normalizer", "analyzer": "wildcard",
"boost": 1.5 "search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.5,
"fields": {
"exact_match": {
"type": "keyword",
"normalizer": "case_accent_normalizer",
"boost": 1.5
}
}
}, },
"other_id": { "other_id": {
"type": "keyword", "type": "text",
"normalizer": "case_accent_normalizer", "analyzer": "wildcard",
"boost": 1.4 "search_analyzer": "lowercase_keyword_analyzer",
"boost": 1.4,
"fields": {
"exact_match": {
"type": "keyword",
"normalizer": "case_accent_normalizer",
"boost": 1.4
}
}
}, },
"class": { "class": {
"type": "keyword", "type": "keyword",
......
...@@ -2,10 +2,25 @@ ...@@ -2,10 +2,25 @@
"settings": { "settings": {
"index.percolator.map_unmapped_fields_as_text": true, "index.percolator.map_unmapped_fields_as_text": true,
"index.mapping.total_fields.limit": 500, "index.mapping.total_fields.limit": 500,
"index.max_ngram_diff": 100,
"index.query.default_field": [ "index.query.default_field": [
"_data.*" "_data.*"
], ],
"analysis": { "analysis": {
"tokenizer": {
"keyword_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
}
},
"analyzer": { "analyzer": {
"case_accent_analyzer": { "case_accent_analyzer": {
"tokenizer": "standard", "tokenizer": "standard",
...@@ -13,6 +28,20 @@ ...@@ -13,6 +28,20 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"
] ]
},
"wildcard": {
"tokenizer": "keyword_ngram",
"filter": [
"lowercase",
"asciifolding"
]
},
"lowercase_keyword_analyzer": {
"tokenizer": "keyword",
"filter": [
"lowercase",
"asciifolding"
]
} }
}, },
"normalizer": { "normalizer": {
...@@ -50,9 +79,17 @@ ...@@ -50,9 +79,17 @@
"type": "object", "type": "object",
"properties": { "properties": {
"item_id": { "item_id": {
"type": "keyword", "type": "text",
"normalizer": "case_accent_normalizer", "analyzer": "wildcard",
"boost": 2 "search_analyzer": "lowercase_keyword_analyzer",
"boost": 2,
"fields": {
"exact_match": {
"type": "keyword",
"normalizer": "case_accent_normalizer",
"boost": 2
}
}
}, },
"version": { "version": {
"type": "keyword", "type": "keyword",
......
...@@ -2,10 +2,25 @@ ...@@ -2,10 +2,25 @@
"settings": { "settings": {
"index.percolator.map_unmapped_fields_as_text": true, "index.percolator.map_unmapped_fields_as_text": true,
"index.mapping.total_fields.limit": 500, "index.mapping.total_fields.limit": 500,
"index.max_ngram_diff": 100,
"index.query.default_field": [ "index.query.default_field": [
"_data.*" "_data.*"
], ],
"analysis": { "analysis": {
"tokenizer": {
"keyword_ngram": {
"type": "ngram",
"min_gram": 3,
"max_gram": 50,
"token_chars": [
"letter",
"digit",
"whitespace",
"punctuation",
"symbol"
]
}
},
"analyzer": { "analyzer": {
"case_accent_analyzer": { "case_accent_analyzer": {
"tokenizer": "standard", "tokenizer": "standard",
...@@ -13,6 +28,20 @@ ...@@ -13,6 +28,20 @@
"lowercase", "lowercase",
"asciifolding" "asciifolding"