diff --git a/.env b/.env index 5c32128bd7bddd53e9fe23eff371c8e565e8c1bd..1a2f3e71d37941ade0391b2cbf419dd95c0bd321 100644 --- a/.env +++ b/.env @@ -33,6 +33,7 @@ INVENIO_CERN_APP_CREDENTIALS_CONSUMER_KEY=xxx INVENIO_CELERY_BROKER_URL=amqp://guest:password@rabbitmq:5672 INVENIO_CELERY_RESULT_BACKEND=redis://redis:6379/2 INVENIO_COLLECT_STORAGE=flask_collect.storage.file +INVENIO_INDEXER_BULK_REQUEST_TIMEOUT=10 INVENIO_INDEXER_DEFAULT_DOC_TYPE=doc_v0.0.2 INVENIO_INDEXER_DEFAULT_INDEX=test-doc_v0.0.2 INVENIO_LOGGING_CONSOLE='True' diff --git a/cern_search_rest_api/modules/cernsearch/cli.py b/cern_search_rest_api/modules/cernsearch/cli.py index 2765742eda9d0c3a58622877acf56b6e6ee4fc60..9234fe0f09058963892d2dd11f8060c207a97922 100644 --- a/cern_search_rest_api/modules/cernsearch/cli.py +++ b/cern_search_rest_api/modules/cernsearch/cli.py @@ -32,7 +32,10 @@ def utils(): '--delayed', '-d', is_flag=True, help='Run indexing in background.') @click.option( '--chunk_size', '-s', default=500, type=int, - help='Chunks size.') + help='Number of docs in one chunk sent to es (default: 500)') +@click.option( + '--max_chunk_bytes', '-b', default=int(99.9 * 1024 * 1024), type=int, + help='The maximum size of the request in bytes (default: 100MB).') @click.option( '--concurrency', '-c', default=1, type=int, help='Number of concurrent indexing tasks to start.') @@ -43,16 +46,24 @@ def utils(): '--raise-on-error/--skip-errors', default=True, help='Controls if Elasticsearch bulk indexing errors raise an exception.') @with_appcontext -def run(delayed, concurrency, chunk_size, version_type=None, queue=None, - raise_on_error=True): +def run( + delayed, + chunk_size, + max_chunk_bytes, + concurrency, + queue=None, + version_type=None, + raise_on_error=True +): """Run bulk record indexing.""" + es_bulk_kwargs = { + 'raise_on_error': raise_on_error, + 'chunk_size': chunk_size, + 'max_chunk_bytes': max_chunk_bytes + } + if delayed: - celery_kwargs = { - 'kwargs': { - 'version_type': 
version_type, - 'es_bulk_kwargs': {'raise_on_error': raise_on_error, 'chunk_size': chunk_size}, - } - } + celery_kwargs = {'kwargs': {'version_type': version_type, 'es_bulk_kwargs': es_bulk_kwargs}} click.secho( 'Starting {0} tasks for indexing records...'.format(concurrency), fg='green') @@ -62,8 +73,7 @@ def run(delayed, concurrency, chunk_size, version_type=None, queue=None, process_bulk_queue.apply_async(**celery_kwargs) else: click.secho('Indexing records...', fg='green') - CernSearchRecordIndexer(version_type=version_type).process_bulk_queue( - es_bulk_kwargs={'raise_on_error': raise_on_error, 'chunk_size': chunk_size}) + CernSearchRecordIndexer(version_type=version_type).process_bulk_queue(es_bulk_kwargs=es_bulk_kwargs) @utils.command('reindex') diff --git a/cern_search_rest_api/modules/cernsearch/indexer.py b/cern_search_rest_api/modules/cernsearch/indexer.py index a8cccf4187dcd075e85f5d9898c456cbdbfb4dcb..17b9253bcd8210549c2d009a9676d1403f599564 100644 --- a/cern_search_rest_api/modules/cernsearch/indexer.py +++ b/cern_search_rest_api/modules/cernsearch/indexer.py @@ -27,6 +27,9 @@ COLLECTION_KEY = 'collection' NAME_KEY = 'name' KEYWORDS_KEY = 'keywords' CREATION_KEY = 'creation_date' +# Hard limit on content of 99.9MB due to ES limitations +# Ref: https://www.elastic.co/guide/en/elasticsearch/reference/7.1/general-recommendations.html#maximum-document-size +CONTENT_HARD_LIMIT = int(99.9 * 1024 * 1024) class CernSearchRecordIndexer(RecordIndexer): @@ -47,6 +50,10 @@ def index_file_content(sender, json=None, record: CernSearchRecord = None, index storage = file_obj.obj.file.storage() # type: FileStorage file_content = bc_file_content(storage) + if len(str(file_content['content'])) > CONTENT_HARD_LIMIT: + current_app.logger.warning(f"Truncated file content: {file_obj.obj.basename} in {record.id}") + file_content['content'] = str(file_content['content'])[:CONTENT_HARD_LIMIT] + json[DATA_KEY][CONTENT_KEY] = file_content['content'] json[FILE_KEY] = 
file_obj.obj.basename diff --git a/nginx/nginx.conf b/nginx/nginx.conf index 29d48e0f61acc816af5b6402b3a5618089a724bd..cb00462899efc0f262ce10b3aead56fb2fb7e9a2 100644 --- a/nginx/nginx.conf +++ b/nginx/nginx.conf @@ -22,6 +22,8 @@ http { ssl_certificate_key /etc/nginx/tls/tls.key; rewrite ^/$ /account/settings/applications/; + client_max_body_size 0; # Disable body size limits for testing purposes + # Proxying connections to application servers location / { include uwsgi_params;