diff --git a/intelmq/bots/collectors/shadowserver/collector_reports_api.py b/intelmq/bots/collectors/shadowserver/collector_reports_api.py index 5e7117bd2..05bffa898 100644 --- a/intelmq/bots/collectors/shadowserver/collector_reports_api.py +++ b/intelmq/bots/collectors/shadowserver/collector_reports_api.py @@ -68,12 +68,24 @@ def init(self): if self.file_format is not None: if not (self.file_format == 'csv'): - raise ValueError('Invalid file_format') + raise ValueError("Invalid file_format '%s'. Must be 'csv'." % self.file_format) else: self.file_format = 'csv' self.preamble = f'{{ "apikey": "{self.api_key}" ' + @staticmethod + def check(parameters: dict): + """ Report every configuration problem as a list of [level, message] pairs, or None when there is none. """ + results = [] + file_format = parameters.get('file_format') + # init() accepts None and falls back to 'csv', so only another explicit value is an error + if file_format is not None and file_format != 'csv': + results.append(["error", "Invalid file_format '%s'. Must be 'csv'." % file_format]) + if 'country' in parameters: + results.append(["warning", "Deprecated parameter 'country' found. Please use 'reports' instead. The backwards-compatibility will be removed in IntelMQ version 4.0.0."]) + return results or None + def _headers(self, data): return {'HMAC2': hmac.new(self.secret.encode(), data.encode('utf-8'), digestmod=hashlib.sha256).hexdigest()} diff --git a/intelmq/bots/parsers/shadowserver/README.md b/intelmq/bots/parsers/shadowserver/README.md index ae38dcb8c..cd750d00b 100644 --- a/intelmq/bots/parsers/shadowserver/README.md +++ b/intelmq/bots/parsers/shadowserver/README.md @@ -7,16 +7,28 @@ This module is maintained by [The Shadowserver Foundation](https://www.shadowser Please contact intelmq@shadowserver.org with any issues or concerns. -The report configuration is now stored in a _schema.json_ file downloaded from https://interchange.shadowserver.org/intelmq/v1/schema. +The report configuration is now stored in a _shadowserver-schema.json_ file downloaded from https://interchange.shadowserver.org/intelmq/v1/schema. -For environments that have internet connectivity the `update_schema.py` script should be called from a cron job to obtain the latest revision. 
-The parser will attempt to download a schema update on startup unless INTELMQ_SKIP_INTERNET is set. +The parser will attempt to download a schema update on startup when the *auto_update* option is enabled. -For air-gapped systems automation will be required to download and copy the _schema.json_ file into this directory. +Schema downloads can also be scheduled as a cron job: + +``` +02 01 * * * intelmq.bots.parsers.shadowserver.parser --update-schema +``` + +For air-gapped systems, automation will be required to download and copy the file to VAR_STATE_PATH/shadowserver-schema.json. The parser will automatically reload the configuration when the file changes. +## Schema contract + +Once set, the `classification.identifier`, `classification.taxonomy`, and `classification.type` fields will remain static. + +Once set, report fields will not be deleted. + + ## Sample configuration: ``` @@ -46,6 +58,7 @@ shadowserver-parser: parameters: destination_queues: _default: [file-output-queue] + auto_update: true run_mode: continuous ``` diff --git a/intelmq/bots/parsers/shadowserver/_config.py b/intelmq/bots/parsers/shadowserver/_config.py index 5219fdb34..afe3a6b11 100644 --- a/intelmq/bots/parsers/shadowserver/_config.py +++ b/intelmq/bots/parsers/shadowserver/_config.py @@ -82,11 +82,12 @@ import base64 import binascii import json -import urllib.request import tempfile from typing import Optional, Dict, Tuple, Any import intelmq.lib.harmonization as harmonization +from intelmq.lib.utils import create_request_session +from intelmq import VAR_STATE_PATH class __Container: @@ -94,8 +95,10 @@ class __Container: __config = __Container() -__config.schema_file = os.path.join(os.path.dirname(__file__), 'schema.json') +__config.schema_file = os.path.join(VAR_STATE_PATH, 'shadowserver-schema.json') +__config.schema_base = os.path.join(os.path.dirname(__file__), 'schema.json.test') __config.schema_mtime = 0.0 +__config.auto_update = False __config.feedname_mapping = {} 
__config.filename_mapping = {} @@ -105,13 +108,16 @@ def set_logger(logger): __config.logger = logger +def enable_auto_update(enable): + """ Enable automatic schema update. """ + __config.auto_update = enable + + def get_feed_by_feedname(given_feedname: str) -> Optional[Dict[str, Any]]: - reload() return __config.feedname_mapping.get(given_feedname, None) def get_feed_by_filename(given_filename: str) -> Optional[Tuple[str, Dict[str, Any]]]: - reload() return __config.filename_mapping.get(given_filename, None) @@ -289,19 +295,18 @@ def reload(): else: __config.logger.info("The schema file does not exist.") - if __config.schema_mtime == 0.0 and mtime == 0.0 and not os.environ.get('INTELMQ_SKIP_INTERNET'): - __config.logger.info("Attempting to download schema.") + if __config.schema_mtime == 0.0 and mtime == 0.0 and __config.auto_update: update_schema() __config.feedname_mapping.clear() __config.filename_mapping.clear() - for schema_file in [__config.schema_file, ".".join([__config.schema_file, 'test'])]: + for schema_file in [__config.schema_file, __config.schema_base]: if os.path.isfile(schema_file): with open(schema_file) as fh: schema = json.load(fh) for report in schema: if report == "_meta": - __config.logger.info("Loading schema %s." % schema[report]['date_created']) + __config.logger.info("Loading schema %r." % schema[report]['date_created']) for msg in schema[report]['change_log']: __config.logger.info(msg) else: @@ -313,37 +318,57 @@ def reload(): def update_schema(): - """ download the latest configuration """ + """ Download the latest schema. Returns True when a new version was installed, otherwise False. """ if os.environ.get('INTELMQ_SKIP_INTERNET'): - return None + return False - (th, tmp) = tempfile.mkstemp(dir=os.path.dirname(__file__)) + # download the schema to a temp file + (th, tmp) = tempfile.mkstemp(dir=VAR_STATE_PATH) + os.close(th) # mkstemp() returns an open descriptor; the file is reopened by path below url = 'https://interchange.shadowserver.org/intelmq/v1/schema' + __config.logger.info("Attempting to download schema from %r" % url) + __config.logger.debug("Using temp file %r for the download." 
% tmp) try: - urllib.request.urlretrieve(url, tmp) + with create_request_session() as session: + with session.get(url, stream=True) as r: + r.raise_for_status() + with open(tmp, 'wb') as f: + for chunk in r.iter_content(chunk_size=8192): + f.write(chunk) - except: - raise ValueError("Failed to download %r" % url) + except Exception: + __config.logger.error("Failed to download %r" % url) + os.unlink(tmp) # a failed download leaves nothing worth keeping + return False + __config.logger.info("Download successful.") new_version = '' old_version = '' try: + # validate the downloaded file with open(tmp) as fh: schema = json.load(fh) new_version = schema['_meta']['date_created'] - except: + except Exception: # leave tempfile behind for diagnosis - raise ValueError("Failed to validate %r" % tmp) + __config.logger.error("Failed to validate %r" % tmp) + return False if os.path.exists(__config.schema_file): + # compare the new version against the old; rename the existing file try: with open(__config.schema_file) as fh: schema = json.load(fh) old_version = schema['_meta']['date_created'] if new_version != old_version: os.replace(__config.schema_file, ".".join([__config.schema_file, 'bak'])) - except: - pass + except Exception as e: + __config.logger.error("Unable to replace schema file: %s" % str(e)) + return False if new_version != old_version: os.replace(tmp, __config.schema_file) + __config.logger.info("New schema version is %r." 
% new_version) + return True else: os.unlink(tmp) + + return False diff --git a/intelmq/bots/parsers/shadowserver/parser.py b/intelmq/bots/parsers/shadowserver/parser.py index 668a81534..2e383a004 100644 --- a/intelmq/bots/parsers/shadowserver/parser.py +++ b/intelmq/bots/parsers/shadowserver/parser.py @@ -26,6 +26,8 @@ from intelmq.lib.bot import ParserBot from intelmq.lib.exceptions import InvalidKey, InvalidValue +from intelmq.bin.intelmqctl import IntelMQController +import intelmq.lib.utils as utils import intelmq.bots.parsers.shadowserver._config as config @@ -34,8 +36,7 @@ class ShadowserverParserBot(ParserBot): Parse all ShadowServer feeds Parameters: - schema_file (str): Path to the report schema file - + auto_update (boolean): Enable automatic schema download """ recover_line = ParserBot.recover_line_csv_dict @@ -45,13 +46,15 @@ class ShadowserverParserBot(ParserBot): feedname = None _mode = None overwrite = False + auto_update = False def init(self): config.set_logger(self.logger) - try: - config.update_schema() - except Exception as e: - self.logger.warning("Schema update failed: %s." 
% e) + config.enable_auto_update(self.auto_update) # pass the value through so a False setting is applied too + if self.auto_update: + self.logger.debug("Feature 'auto_update' is enabled.") + config.reload() + if self.feedname is not None: self._sparser_config = config.get_feed_by_feedname(self.feedname) if self._sparser_config: @@ -228,5 +231,37 @@ def parse_line(self, row, report): def shutdown(self): self.feedname = None + @classmethod + def _create_argparser(cls): + """ Extend the inherited argument parser with the --update-schema and --verbose options. """ + argparser = super()._create_argparser() + argparser.add_argument("--update-schema", action='store_true', help='downloads latest report schema') + argparser.add_argument("--verbose", action='store_true', help='be verbose') + return argparser + + @classmethod + def run(cls, parsed_args=None): + """ With --update-schema, download the schema and reload the matching bots; otherwise run the bot as usual. """ + if not parsed_args: + parsed_args = cls._create_argparser().parse_args() + if parsed_args.update_schema: + logger = utils.log(__name__, log_path=None) + if parsed_args.verbose: + logger.setLevel('INFO') + else: + logger.setLevel('ERROR') + config.set_logger(logger) + if config.update_schema(): + runtime_conf = utils.get_bots_settings() + try: + ctl = IntelMQController() + for bot in runtime_conf: + if runtime_conf[bot].get("module") == __name__ and (runtime_conf[bot].get('parameters') or {}).get('auto_update', True): + ctl.bot_reload(bot) + except Exception as e: + logger.error("Failed to signal bot: %r" % str(e)) + else: + super().run(parsed_args=parsed_args) + BOT = ShadowserverParserBot diff --git a/intelmq/bots/parsers/shadowserver/update_schema.py b/intelmq/bots/parsers/shadowserver/update_schema.py deleted file mode 100644 index a7975147e..000000000 --- a/intelmq/bots/parsers/shadowserver/update_schema.py +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-FileCopyrightText: 2023 The Shadowserver Foundation -# -# SPDX-License-Identifier: AGPL-3.0-or-later - -# -*- coding: utf-8 -*- - -import os -import intelmq.bots.parsers.shadowserver._config as config - -if __name__ == '__main__': # pragma: no cover - config.update_schema() diff --git a/intelmq/tests/bots/parsers/shadowserver/test_download_schema.py 
b/intelmq/tests/bots/parsers/shadowserver/test_download_schema.py new file mode 100644 index 000000000..e68587682 --- /dev/null +++ b/intelmq/tests/bots/parsers/shadowserver/test_download_schema.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2023 The Shadowserver Foundation +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 27 19:44:44 2023 + +""" + +import unittest +import os +import logging +from intelmq import VAR_STATE_PATH +import intelmq.bots.parsers.shadowserver._config as config +import intelmq.lib.utils as utils +import intelmq.lib.test as test + +@test.skip_internet() +class TestShadowserverSchemaDownload(unittest.TestCase): + + def test_download(self): + """ Remove any existing schema file, then expect update_schema() to download and install one. """ + schema_file = os.path.join(VAR_STATE_PATH, 'shadowserver-schema.json') + config.set_logger(utils.log('test-bot', log_path=None)) + if os.path.exists(schema_file): + os.unlink(schema_file) + self.assertTrue(config.update_schema()) + self.assertTrue(os.path.exists(schema_file))