diff --git a/cern_web_crawler/settings.py b/cern_web_crawler/settings.py
index 5a9e11082873d7280e9cbd9d785ea4f305dc0ac6..f34a247e40a83a61dec8bcd3690e31c7e16e6ea9 100644
--- a/cern_web_crawler/settings.py
+++ b/cern_web_crawler/settings.py
@@ -94,6 +94,9 @@ PUBLIC_CRAWLER_SENTRY_DSN = os.getenv('WCRAWLER_PUBLIC_CRAWLER_SENTRY_DSN')
 PUBLIC_CRAWLER_SENTRY_LOGLEVEL = os.getenv('WCRAWLER_PUBLIC_CRAWLER_SENTRY_LOGLEVEL', 'WARNING')
 DENY_LIST = ast.literal_eval(os.getenv('WCRAWLER_DENY_LIST', 'None')) or set()
 DENY_EXTENSIONS = ast.literal_eval(os.getenv('WCRAWLER_DENY_EXTENSIONS', 'None')) or set()
+
+# Extra domains the spider may crawl, given as a Python literal in the env var; empty set when unset.
+WCRAWLER_ALLOW_LIST = ast.literal_eval(os.getenv('WCRAWLER_ALLOW_LIST', 'None')) or set()
 ALLOWED_EXTENSIONS = ast.literal_eval(os.getenv('WCRAWLER_ALLOW_EXTENSIONS', 'None')) or set()
 FORCE = ast.literal_eval(os.getenv('WCRAWLER_FORCE', 'False'))
 CRAWL_WEB = ast.literal_eval(os.getenv('WCRAWLER_CRAWEL_WEB', 'True'))
diff --git a/cern_web_crawler/spiders/public.py b/cern_web_crawler/spiders/public.py
index a92b29ca8b8c81e9b6b093f2e2cee1477385bfdf..a06c6a2b68ad572d26ad4a8fe354a712e73633f5 100644
--- a/cern_web_crawler/spiders/public.py
+++ b/cern_web_crawler/spiders/public.py
@@ -56,6 +56,9 @@ class CERNPublic(CrawlSpider):
         # Offiste Middleware needs protocol but link extractor does not
         self.allowed_domains = [base_url]
 
+        # extend, not append: the allow list is a collection and allowed_domains must stay a flat list
+        self.allowed_domains.extend(settings.WCRAWLER_ALLOW_LIST)
+
         self.custom_settings = {
             'LOG_LEVEL': settings.PUBLIC_CRAWLER_LOGLEVEL
         }