Commit ea0f4774 authored by Carina Antunes

limit content files to 100MB

parent e04cbc15
@@ -33,6 +33,7 @@ INVENIO_CERN_APP_CREDENTIALS_CONSUMER_KEY=xxx
 INVENIO_CELERY_BROKER_URL=amqp://guest:password@rabbitmq:5672
 INVENIO_CELERY_RESULT_BACKEND=redis://redis:6379/2
 INVENIO_COLLECT_STORAGE=flask_collect.storage.file
+INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10
 INVENIO_INDEXER_DEFAULT_DOC_TYPE=doc_v0.0.2
 INVENIO_INDEXER_DEFAULT_INDEX=test-doc_v0.0.2
 INVENIO_LOGGING_CONSOLE='True'
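INDEXER_BULK_REQUEST_TIMEOUT is invenio-indexer's timeout (in seconds) for bulk requests to Elasticsearch; the entry above pins the 10-second value explicitly in the test environment. A minimal sketch of how such a variable typically reaches the app, assuming invenio-config's convention of loading INVENIO_-prefixed environment variables into app.config (a standalone toy app, not the project's actual factory):

import os
from flask import Flask

app = Flask(__name__)
for key, value in os.environ.items():
    if key.startswith('INVENIO_'):
        # INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10 would surface as
        # app.config['INDEXER_BULK_REQUEST_TIMEOUT'] = '10'
        app.config[key[len('INVENIO_'):]] = value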
@@ -32,7 +32,10 @@ def utils():
     '--delayed', '-d', is_flag=True, help='Run indexing in background.')
 @click.option(
     '--chunk_size', '-s', default=500, type=int,
-    help='Chunks size.')
+    help='Number of docs in one chunk sent to ES (default: 500).')
+@click.option(
+    '--max_chunk_bytes', '-b', default=int(99.9 * 1024 * 1024), type=int,
+    help='The maximum size of the request in bytes (default: ~100MB).')
 @click.option(
     '--concurrency', '-c', default=1, type=int,
     help='Number of concurrent indexing tasks to start.')
@@ -43,16 +46,24 @@ def utils():
     '--raise-on-error/--skip-errors', default=True,
     help='Controls if Elasticsearch bulk indexing errors raise an exception.')
 @with_appcontext
-def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
-        raise_on_error=True):
+def run(
+    delayed,
+    chunk_size,
+    max_chunk_bytes,
+    concurrency,
+    queue=None,
+    version_type=None,
+    raise_on_error=True
+):
     """Run bulk record indexing."""
-    if delayed:
-        celery_kwargs = {
-            'kwargs': {
-                'version_type': version_type,
-                'es_bulk_kwargs': {'raise_on_error': raise_on_error, 'chunk_size': chunk_size},
-            }
-        }
+    es_bulk_kwargs = {
+        'raise_on_error': raise_on_error,
+        'chunk_size': chunk_size,
+        'max_chunk_bytes': max_chunk_bytes
+    }
+
+    if delayed:
+        celery_kwargs = {'kwargs': {'version_type': version_type, 'es_bulk_kwargs': es_bulk_kwargs}}
         click.secho(
             'Starting {0} tasks for indexing records...'.format(concurrency),
             fg='green')
@@ -62,8 +73,7 @@ def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
             process_bulk_queue.apply_async(**celery_kwargs)
     else:
         click.secho('Indexing records...', fg='green')
-        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(
-            es_bulk_kwargs={'raise_on_error': raise_on_error, 'chunk_size': chunk_size})
+        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(es_bulk_kwargs=es_bulk_kwargs)


 @utils.command('reindex')
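For context: invenio-indexer's process_bulk_queue forwards es_bulk_kwargs to the elasticsearch-py bulk helper, where chunk_size, max_chunk_bytes and raise_on_error are native parameters. A rough standalone sketch of the equivalent call (the client, index name and actions are made up for illustration):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch()
# Hypothetical actions; in the real flow they are drained from the indexing queue.
actions = ({'_index': 'test-doc_v0.0.2', '_source': {'title': 'doc {0}'.format(i)}} for i in range(10))

bulk(
    client,
    actions,
    chunk_size=500,                           # --chunk_size: docs per request
    max_chunk_bytes=int(99.9 * 1024 * 1024),  # --max_chunk_bytes: bytes per request
    raise_on_error=True,                      # --raise-on-error / --skip-errors
)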
@@ -27,6 +27,9 @@ COLLECTION_KEY = 'collection'
 NAME_KEY = 'name'
 KEYWORDS_KEY = 'keywords'
 CREATION_KEY = 'creation_date'
+# Hard limit of 99.9MB on content due to ES limitations
+# Ref: https://www.elastic.co/guide/en/elasticsearch/reference/7.1/general-recommendations.html#maximum-document-size
+CONTENT_HARD_LIMIT = int(99.9 * 1024 * 1024)


 class CernSearchRecordIndexer(RecordIndexer):
@@ -47,6 +50,10 @@ def index_file_content(sender, json=None, record: CernSearchRecord = None, index
         storage = file_obj.obj.file.storage()  # type: FileStorage
         file_content = bc_file_content(storage)

+        if len(str(file_content['content'])) > CONTENT_HARD_LIMIT:
+            current_app.logger.warning(f"Truncated file content: {file_obj.obj.basename} in {record.id}")
+            file_content['content'] = str(file_content['content'])[:CONTENT_HARD_LIMIT]
+
         json[DATA_KEY][CONTENT_KEY] = file_content['content']
         json[FILE_KEY] = file_obj.obj.basename
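One hedged note on the guard above: len() and slicing count characters, while the ES recommendation concerns request bytes, so ASCII-dominant content stays safely under the limit but multi-byte text can exceed it slightly; the 99.9MB constant presumably leaves headroom under Elasticsearch's default 100MB http.max_content_length. A tiny standalone illustration (LIMIT is a made-up stand-in for CONTENT_HARD_LIMIT):

# Stand-in for CONTENT_HARD_LIMIT so the example stays small.
LIMIT = 10

content = 'é' * 8            # 8 characters, but 16 bytes in UTF-8
truncated = content[:LIMIT]  # slicing counts characters, not bytes
assert len(truncated) == 8
assert len(truncated.encode('utf-8')) == 16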
@@ -22,6 +22,8 @@ http {
     ssl_certificate_key /etc/nginx/tls/tls.key;
     rewrite ^/$ /account/settings/applications/;

+    client_max_body_size 0; # Disable body size limits for testing purposes
+
     # Proxying connections to application servers
     location / {
       include uwsgi_params;