Commit ea0f4774 authored by Carina Antunes

Limit content files to 100MB

parent e04cbc15
@@ -33,6 +33,7 @@ INVENIO_CERN_APP_CREDENTIALS_CONSUMER_KEY=xxx
 INVENIO_CELERY_BROKER_URL=amqp://guest:password@rabbitmq:5672
 INVENIO_CELERY_RESULT_BACKEND=redis://redis:6379/2
 INVENIO_COLLECT_STORAGE=flask_collect.storage.file
+INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10
 INVENIO_INDEXER_DEFAULT_DOC_TYPE=doc_v0.0.2
 INVENIO_INDEXER_DEFAULT_INDEX=test-doc_v0.0.2
 INVENIO_LOGGING_CONSOLE='True'
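
Note on the new INVENIO_INDEXER_BULK_REQUEST_TIMEOUT line: a minimal sketch of how it reaches the app, assuming the usual invenio-config behaviour (the INVENIO_ prefix is stripped and values are passed through ast.literal_eval where possible), so it ends up as app.config['INDEXER_BULK_REQUEST_TIMEOUT'], the timeout in seconds for each Elasticsearch bulk request:

import ast
import os

# '10' -> 10; fall back to the raw string if the value is not a Python literal.
raw = os.environ.get('INVENIO_INDEXER_BULK_REQUEST_TIMEOUT', '10')
try:
    timeout = ast.literal_eval(raw)
except (ValueError, SyntaxError):
    timeout = raw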
@@ -32,7 +32,10 @@ def utils():
     '--delayed', '-d', is_flag=True, help='Run indexing in background.')
 @click.option(
     '--chunk_size', '-s', default=500, type=int,
-    help='Chunks size.')
+    help='Number of docs in one chunk sent to es (default: 500)')
+@click.option(
+    '--max_chunk_bytes', '-b', default=int(99.9 * 1024 * 1024), type=int,
+    help='The maximum size of the request in bytes (default: 100MB).')
 @click.option(
     '--concurrency', '-c', default=1, type=int,
     help='Number of concurrent indexing tasks to start.')
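
The new option pattern in isolation, as a runnable toy command (hypothetical name `demo`, not part of this repo), to show how click parses and defaults the flag:

import click

@click.command()
@click.option(
    '--max_chunk_bytes', '-b', default=int(99.9 * 1024 * 1024), type=int,
    help='The maximum size of the request in bytes (default: 100MB).')
def demo(max_chunk_bytes):
    """Echo the effective request-size cap."""
    click.echo('cap: {} bytes'.format(max_chunk_bytes))

if __name__ == '__main__':
    demo()  # e.g. `python demo.py -b 52428800` halves the cap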
@@ -43,16 +46,24 @@ def utils():
     '--raise-on-error/--skip-errors', default=True,
     help='Controls if Elasticsearch bulk indexing errors raise an exception.')
 @with_appcontext
-def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
-        raise_on_error=True):
+def run(
+    delayed,
+    chunk_size,
+    max_chunk_bytes,
+    concurrency,
+    queue=None,
+    version_type=None,
+    raise_on_error=True
+):
     """Run bulk record indexing."""
+    es_bulk_kwargs = {
+        'raise_on_error': raise_on_error,
+        'chunk_size': chunk_size,
+        'max_chunk_bytes': max_chunk_bytes
+    }
+
     if delayed:
-        celery_kwargs = {
-            'kwargs': {
-                'version_type': version_type,
-                'es_bulk_kwargs': {'raise_on_error': raise_on_error, 'chunk_size': chunk_size},
-            }
-        }
+        celery_kwargs = {'kwargs': {'version_type': version_type, 'es_bulk_kwargs': es_bulk_kwargs}}
         click.secho(
             'Starting {0} tasks for indexing records...'.format(concurrency),
             fg='green')
@@ -62,8 +73,7 @@ def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
         process_bulk_queue.apply_async(**celery_kwargs)
     else:
         click.secho('Indexing records...', fg='green')
-        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(
-            es_bulk_kwargs={'raise_on_error': raise_on_error, 'chunk_size': chunk_size})
+        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(es_bulk_kwargs=es_bulk_kwargs)


 @utils.command('reindex')
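
For context on where es_bulk_kwargs ends up: invenio-indexer's process_bulk_queue forwards it to elasticsearch.helpers.bulk, whose streaming implementation honours both caps. A minimal sketch, assuming a reachable ES instance (index name borrowed from the env file above; not this project's code):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch()
actions = ({'_index': 'test-doc_v0.0.2', '_source': {'n': i}} for i in range(1000))
success, errors = bulk(
    client,
    actions,
    chunk_size=500,                           # max docs per bulk request
    max_chunk_bytes=int(99.9 * 1024 * 1024),  # max bytes per bulk request
    raise_on_error=True,
)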
@@ -27,6 +27,9 @@ COLLECTION_KEY = 'collection'
 NAME_KEY = 'name'
 KEYWORDS_KEY = 'keywords'
 CREATION_KEY = 'creation_date'
+# Hard limit of 99.9MB on content due to ES limitations
+# Ref: https://www.elastic.co/guide/en/elasticsearch/reference/7.1/general-recommendations.html#maximum-document-size
+CONTENT_HARD_LIMIT = int(99.9 * 1024 * 1024)


 class CernSearchRecordIndexer(RecordIndexer):
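
The numbers behind CONTENT_HARD_LIMIT, for the record: Elasticsearch rejects HTTP requests larger than http.max_content_length (100MB by default), so the cap sits just below it to leave headroom for the rest of the JSON document:

ES_MAX_CONTENT_LENGTH = 100 * 1024 * 1024     # 104857600 bytes
CONTENT_HARD_LIMIT = int(99.9 * 1024 * 1024)  # 104752742 bytes
assert ES_MAX_CONTENT_LENGTH - CONTENT_HARD_LIMIT == 104858  # ~102KB headroom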
@@ -47,6 +50,10 @@ def index_file_content(sender, json=None, record: CernSearchRecord = None, index
     storage = file_obj.obj.file.storage()  # type: FileStorage
     file_content = bc_file_content(storage)

+    if len(str(file_content['content'])) > CONTENT_HARD_LIMIT:
+        current_app.logger.warning(f"Truncated file content: {file_obj.obj.basename} in {record.id}")
+        file_content['content'] = str(file_content['content'])[:CONTENT_HARD_LIMIT]
+
     json[DATA_KEY][CONTENT_KEY] = file_content['content']
     json[FILE_KEY] = file_obj.obj.basename

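
One caveat worth noting about the truncation above: it slices the str() of the content, so the limit counts characters, not bytes; multi-byte UTF-8 text can therefore still serialize to more bytes than CONTENT_HARD_LIMIT. A self-contained illustration with the limit shrunk to 10 for readability:

CONTENT_HARD_LIMIT = 10  # shrunk from 99.9MB for this example
file_content = {'content': 'é' * 25}
if len(str(file_content['content'])) > CONTENT_HARD_LIMIT:
    file_content['content'] = str(file_content['content'])[:CONTENT_HARD_LIMIT]
assert len(file_content['content']) == 10            # 10 characters...
assert len(file_content['content'].encode()) == 20   # ...but 20 bytes in UTF-8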
@@ -22,6 +22,8 @@ http {
         ssl_certificate_key /etc/nginx/tls/tls.key;
         rewrite ^/$ /account/settings/applications/;

+        client_max_body_size 0; # Disable body size limits for testing purposes
+
         # Proxying connections to application servers
         location / {
             include uwsgi_params;