Commit ea0f4774 authored by Carina Antunes

limit content files to 100MB

parent e04cbc15
@@ -33,6 +33,7 @@ INVENIO_CERN_APP_CREDENTIALS_CONSUMER_KEY=xxx
 INVENIO_CELERY_BROKER_URL=amqp://guest:password@rabbitmq:5672
 INVENIO_CELERY_RESULT_BACKEND=redis://redis:6379/2
 INVENIO_COLLECT_STORAGE=flask_collect.storage.file
+INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10
 INVENIO_INDEXER_DEFAULT_DOC_TYPE=doc_v0.0.2
 INVENIO_INDEXER_DEFAULT_INDEX=test-doc_v0.0.2
 INVENIO_LOGGING_CONSOLE='True'
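INDEXER_BULK_REQUEST_TIMEOUT is invenio-indexer's timeout (in seconds) for bulk requests to Elasticsearch; the entry above pins the 10-second value explicitly in the test environment. A minimal sketch of how such a variable typically reaches the app, assuming invenio-config's convention of loading INVENIO_-prefixed environment variables into app.config (a standalone toy app, not the project's actual factory):

import os
from flask import Flask

app = Flask(__name__)
for key, value in os.environ.items():
    if key.startswith('INVENIO_'):
        # INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10 would surface as
        # app.config['INDEXER_BULK_REQUEST_TIMEOUT'] = '10'
        app.config[key[len('INVENIO_'):]] = value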
@@ -32,7 +32,10 @@ def utils():
     '--delayed', '-d', is_flag=True, help='Run indexing in background.')
 @click.option(
     '--chunk_size', '-s', default=500, type=int,
-    help='Chunks size.')
+    help='Number of docs in one chunk sent to ES (default: 500).')
+@click.option(
+    '--max_chunk_bytes', '-b', default=int(99.9 * 1024 * 1024), type=int,
+    help='The maximum size of the request in bytes (default: ~100MB).')
 @click.option(
     '--concurrency', '-c', default=1, type=int,
     help='Number of concurrent indexing tasks to start.')
@@ -43,16 +46,24 @@ def utils():
     '--raise-on-error/--skip-errors', default=True,
     help='Controls if Elasticsearch bulk indexing errors raise an exception.')
 @with_appcontext
-def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
-        raise_on_error=True):
+def run(
+    delayed,
+    chunk_size,
+    max_chunk_bytes,
+    concurrency,
+    queue=None,
+    version_type=None,
+    raise_on_error=True
+):
     """Run bulk record indexing."""
-    if delayed:
-        celery_kwargs = {
-            'kwargs': {
-                'version_type': version_type,
-                'es_bulk_kwargs': {'raise_on_error': raise_on_error, 'chunk_size': chunk_size},
-            }
-        }
+    es_bulk_kwargs = {
+        'raise_on_error': raise_on_error,
+        'chunk_size': chunk_size,
+        'max_chunk_bytes': max_chunk_bytes
+    }
+
+    if delayed:
+        celery_kwargs = {'kwargs': {'version_type': version_type, 'es_bulk_kwargs': es_bulk_kwargs}}
         click.secho(
             'Starting {0} tasks for indexing records...'.format(concurrency),
             fg='green')
@@ -62,8 +73,7 @@ def run(delayed, concurrency, chunk_size, version_type=None, queue=None,
             process_bulk_queue.apply_async(**celery_kwargs)
     else:
         click.secho('Indexing records...', fg='green')
-        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(
-            es_bulk_kwargs={'raise_on_error': raise_on_error, 'chunk_size': chunk_size})
+        CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(es_bulk_kwargs=es_bulk_kwargs)


 @utils.command('reindex')
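For context: invenio-indexer's process_bulk_queue forwards es_bulk_kwargs to the elasticsearch-py bulk helper, where chunk_size, max_chunk_bytes and raise_on_error are native parameters. A rough standalone sketch of the equivalent call (the client, index name and actions are made up for illustration):

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

client = Elasticsearch()
# Hypothetical actions; in the real flow they are drained from the indexing queue.
actions = ({'_index': 'test-doc_v0.0.2', '_source': {'title': 'doc {0}'.format(i)}} for i in range(10))

bulk(
    client,
    actions,
    chunk_size=500,                           # --chunk_size: docs per request
    max_chunk_bytes=int(99.9 * 1024 * 1024),  # --max_chunk_bytes: bytes per request
    raise_on_error=True,                      # --raise-on-error / --skip-errors
)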
@@ -27,6 +27,9 @@ COLLECTION_KEY = 'collection'
 NAME_KEY = 'name'
 KEYWORDS_KEY = 'keywords'
 CREATION_KEY = 'creation_date'
+# Hard limit of 99.9MB on content due to ES limitations
+# Ref: https://www.elastic.co/guide/en/elasticsearch/reference/7.1/general-recommendations.html#maximum-document-size
+CONTENT_HARD_LIMIT = int(99.9 * 1024 * 1024)


 class CernSearchRecordIndexer(RecordIndexer):
@@ -47,6 +50,10 @@ def index_file_content(sender, json=None, record: CernSearchRecord = None, index
         storage = file_obj.obj.file.storage()  # type: FileStorage
         file_content = bc_file_content(storage)

+        if len(str(file_content['content'])) > CONTENT_HARD_LIMIT:
+            current_app.logger.warning(f"Truncated file content: {file_obj.obj.basename} in {record.id}")
+            file_content['content'] = str(file_content['content'])[:CONTENT_HARD_LIMIT]
+
         json[DATA_KEY][CONTENT_KEY] = file_content['content']
         json[FILE_KEY] = file_obj.obj.basename
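One hedged note on the guard above: len() and slicing count characters, while the ES recommendation concerns request bytes, so ASCII-dominant content stays safely under the limit but multi-byte text can exceed it slightly; the 99.9MB constant presumably leaves headroom under Elasticsearch's default 100MB http.max_content_length. A tiny standalone illustration (LIMIT is a made-up stand-in for CONTENT_HARD_LIMIT):

# Stand-in for CONTENT_HARD_LIMIT so the example stays small.
LIMIT = 10

content = 'é' * 8            # 8 characters, but 16 bytes in UTF-8
truncated = content[:LIMIT]  # slicing counts characters, not bytes
assert len(truncated) == 8
assert len(truncated.encode('utf-8')) == 16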
@@ -22,6 +22,8 @@ http {
     ssl_certificate_key /etc/nginx/tls/tls.key;
     rewrite ^/$ /account/settings/applications/;

+    client_max_body_size 0; # Disable body size limits for testing purposes
+
     # Proxying connections to application servers
     location / {
       include uwsgi_params;