From e0772416e3271b51c325afa1d1d8e3b095297233 Mon Sep 17 00:00:00 2001 From: Mark Cottman-Fields Date: Thu, 16 Mar 2023 21:53:07 +1000 Subject: [PATCH] feat: improve scrapy web data gathering Move to v0.0.4. --- CHANGELOG.md | 7 + CONTRIBUTING.md | 2 +- VERSION | 2 +- src/gather_vision/app.py | 248 ++++++++++++++++-------------- src/gather_vision/cli.py | 34 +++- src/gather_vision/plugin/entry.py | 10 +- src/gather_vision/utils.py | 15 +- tests/conftest.py | 24 ++- tests/example_plugin.py | 30 ++-- tests/test_cli.py | 120 ++++++++------- 10 files changed, 286 insertions(+), 206 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f3e344..59d6fd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Change log +## [v0.0.4](https://github.com/anotherbyte-net/gather-vision/releases/tag/v0.0.4) + +[full change log](https://github.com/anotherbyte-net/gather-vision/compare/v0.0.3...v0.0.4) + +- allow providing data storage path +- improve scrapy web data gathering + ## [v0.0.3](https://github.com/anotherbyte-net/gather-vision/releases/tag/v0.0.3) [full change log](https://github.com/anotherbyte-net/gather-vision/compare/v0.0.2...v0.0.3) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 00a3a9a..7a262bd 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -85,7 +85,7 @@ source .venv-test/bin/activate python -m pip install --upgrade pip setuptools wheel python -m pip install --upgrade -r requirements.txt -GATHER_VISION_VERSION='0.0.3' +GATHER_VISION_VERSION='0.0.4' pip install --index-url https://test.pypi.org/simple/ --no-deps gather-vision==$GATHER_VISION_VERSION # or pip install dist/gather_vision-$GATHER_VISION_VERSION-py3-none-any.whl diff --git a/VERSION b/VERSION index bcab45a..81340c7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.0.3 +0.0.4 diff --git a/src/gather_vision/app.py b/src/gather_vision/app.py index 76a496d..521d825 100644 --- a/src/gather_vision/app.py +++ b/src/gather_vision/app.py @@ -1,19 +1,16 @@ """The main application features.""" + import dataclasses import logging -import pathlib import pickle -import tempfile import typing import scrapy -from importlib_metadata import EntryPoint, entry_points -from scrapy.crawler import CrawlerProcess -from scrapy.exporters import BaseItemExporter -from scrapy.http import Response, TextResponse +import importlib_metadata +from scrapy import crawler, exporters, http from gather_vision import utils -from gather_vision.plugin import data as plugin_data, entry as plugin_entry +from gather_vision.plugin import data, entry logger = logging.getLogger(__name__) @@ -22,12 +19,12 @@ class PluginItem: """Information about a plugin.""" - entry_point: EntryPoint - entry_class: typing.Type[plugin_entry.Entry] - entry_instance: plugin_entry.Entry + entry_point: importlib_metadata.EntryPoint + entry_class: typing.Type[entry.Entry] + entry_instance: entry.Entry -class AppPickleItemExporter(BaseItemExporter): +class AppPickleItemExporter(exporters.BaseItemExporter): def __init__(self, file, protocol=4, **kwargs): super().__init__(**kwargs) self.file = file @@ -62,7 +59,7 @@ def load(self) -> typing.List[PluginItem]: logger.info("Loaded %s plugins.", len(self._available)) return self._available - def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult: + def update(self, args: entry.UpdateArgs) -> entry.UpdateResult: """Execute the update action for all plugins or the plugin with the given name. Args: @@ -71,133 +68,145 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult: Returns: The result of running the plugin's update process. """ + loaded_plugins = list(self.load()) named_plugins = [ i - for i in self.load() + for i in loaded_plugins if args.name is None or i.entry_point.name == args.name ] + available_plugins = ", ".join( + sorted(i.entry_point.name for i in loaded_plugins) + ) - if len(named_plugins) == 0: + named_count = len(named_plugins) + available_count = len(loaded_plugins) + + if named_count == 0: raise utils.GatherVisionException( - f"Could not find plugin named '{args.name}'." + f"Could not find plugin named '{args.name}'. " + f"Available plugins ({available_count}): {available_plugins}." ) - if args.name and len(named_plugins) > 1: + if args.name and named_count > 1: raise utils.GatherVisionException( f"Found multiple plugins named '{args.name}'." + f"Available plugins ({available_count}): {available_plugins}." ) - # load data from local sources first - local_data: typing.List[plugin_data.LocalData] = [] + # get the data sources + local_data: typing.List[data.LocalData] = [] + web_data: typing.List[data.WebData] = [] for named_plugin in named_plugins: plugin_update_result = named_plugin.entry_instance.update(args) + + # load data from local sources for local_data_item in plugin_update_result.local_data: local_data_item.data = list(local_data_item.load_resources()) local_data.extend(plugin_update_result.local_data) + # get the web data sources + web_data.extend(plugin_update_result.web_data) + logger.info("Loaded %s local data sources.", len(local_data)) # allow running multiple plugins at once - # gather WebData subclasses and run the spider - web_data: typing.List[plugin_data.WebData] = [] - for named_plugin in named_plugins: - plugin_update_result = named_plugin.entry_instance.update(args) - web_data.extend(plugin_update_result.web_data) - + # run the spider logger.info("Starting %s web data sources.", len(web_data)) + web_data_map = dict([(self._data_item_id(i), i) for i in web_data]) - # run the web data to get items, using scrapy - # save the feed to a temp file, then read the items back in - feed_items = [] - with tempfile.TemporaryDirectory() as temp_dir: - feed_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.pickle") - feed_path_setting = str(feed_path).replace("\\", "/") - - files_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.files") - files_path_setting = str(files_path).replace("\\", "/") - - process = CrawlerProcess( - settings={ - "USER_AGENT": "gather-vision (+https://github.com/anotherbyte-net/gather-vision)", - # http cache - "HTTPCACHE_ENABLED": True, - "HTTPCACHE_DIR": ".httpcache", - "HTTPCACHE_POLICY": "scrapy.extensions.httpcache.DummyPolicy", - "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage", - "EXTENSIONS": { - "scrapy.extensions.telnet.TelnetConsole": None, - }, - # feed - "FEED_EXPORTERS": { - "pickle_raw": "gather_vision.app.AppPickleItemExporter", - }, - "FEEDS": { - f"file:///{feed_path_setting}": {"format": "pickle_raw"}, - }, - "WEB_DATA_ITEMS": web_data, - # logs - "LOG_ENABLED": True, - "LOG_FILE": None, - "LOG_STDOUT": False, - "LOG_LEVEL": "ERROR", - # throttling requests - "DOWNLOAD_DELAY": 3, - "AUTOTHROTTLE_ENABLED": True, - "AUTOTHROTTLE_START_DELAY": 3, - "AUTOTHROTTLE_MAX_DELAY": 60, - "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, - # pipelines - "ITEM_PIPELINES": { - "scrapy.pipelines.files.FilesPipeline": 1, - }, - "FILES_STORE": files_path_setting, - "MEDIA_ALLOW_REDIRECTS": True, - # Set settings whose default value is deprecated to a future-proof value - "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", - "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", - "FEED_EXPORT_ENCODING": "utf-8", - }, - install_root_handler=True, - ) - - process.crawl(WebDataFetch) + # build the output paths - logging.getLogger("scrapy").setLevel("ERROR") - logging.getLogger("py.warnings").setLevel("CRITICAL") + if not args.data_path: + raise ValueError(f"Invalid data path '{args.data_path}'.") - # the script will block here until the crawling is finished - process.start() + feed_path = args.data_path / "feeds" / "feed_%(name)s_%(time)s.pickle" + feed_path_setting = str(feed_path).replace("\\", "/") - # f = io.BytesIO() - # pickle.dump(items, f) - # - # f.seek(0) - # result = pickle.load(f) + files_dir = args.data_path / "files" + files_dir_setting = str(files_dir).replace("\\", "/") - # load the feed items - for item in feed_path.parent.iterdir(): - if not item.is_file(): - continue - if item.suffix != ".pickle": - continue + http_cache_dir = args.data_path / "http_cache" + http_cache_dir_setting = str(http_cache_dir).replace("\\", "/") - with item.open("rb") as f: - while True: - try: - feed_items.append(pickle.load(f)) - except EOFError: - break + # run the web data to get items, using scrapy + # save the feed to a temp file, then read the items back in + process = crawler.CrawlerProcess( + settings={ + "USER_AGENT": "gather-vision (+https://github.com/anotherbyte-net/gather-vision)", + # http cache + "HTTPCACHE_ENABLED": True, + "HTTPCACHE_DIR": http_cache_dir_setting, + "HTTPCACHE_POLICY": "scrapy.extensions.httpcache.DummyPolicy", + "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage", + "EXTENSIONS": { + "scrapy.extensions.telnet.TelnetConsole": None, + }, + # feed + "FEED_EXPORTERS": { + "pickle_raw": "gather_vision.app.AppPickleItemExporter", + }, + "FEEDS": { + f"file:///{feed_path_setting}": {"format": "pickle_raw"}, + }, + "WEB_DATA_ITEMS": web_data, + # logs + "LOG_ENABLED": True, + "LOG_FILE": None, + "LOG_STDOUT": False, + "LOG_LEVEL": "ERROR", + # throttling requests + "DOWNLOAD_DELAY": 3, + "AUTOTHROTTLE_ENABLED": True, + "AUTOTHROTTLE_START_DELAY": 3, + "AUTOTHROTTLE_MAX_DELAY": 60, + "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0, + # pipelines + "ITEM_PIPELINES": { + "scrapy.pipelines.files.FilesPipeline": 1, + }, + "FILES_STORE": files_dir_setting, + "MEDIA_ALLOW_REDIRECTS": True, + # Set settings whose default value is deprecated to a future-proof value + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "FEED_EXPORT_ENCODING": "utf-8", + }, + install_root_handler=True, + ) - logger.info("Loaded %s data items from web data sources.", len(feed_items)) + process.crawl(WebDataFetch) + + logging.getLogger("scrapy").setLevel("ERROR") + logging.getLogger("py.warnings").setLevel("CRITICAL") + + # the script will block here until the crawling is finished + process.start() + + # load the feed items + feed_item_count = 0 + for item in feed_path.parent.iterdir(): + if not item.is_file(): + continue + if item.suffix != ".pickle": + continue + + with item.open("rb") as f: + while True: + try: + # store PluginDataItems in the related PluginWebData instance + web_data_item = pickle.load(f) + map_id = self._data_item_id(web_data_item) + web_data_map[map_id].data = [web_data_item] + feed_item_count += 1 + except EOFError: + break + + logger.info("Loaded %s data items from web data sources.", feed_item_count) logger.info("Finished update.") - # TODO: still need to do something with the feed_items? - - # TODO: save results? + return entry.UpdateResult(web_data=web_data, local_data=local_data) - return plugin_entry.UpdateResult(web_data=web_data, local_data=local_data) - - def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult: + def list(self, args: entry.ListArgs) -> entry.ListResult: """List all available plugins. Args: @@ -210,12 +219,14 @@ def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult: for plugin_item in self.load(): result = plugin_item.entry_instance.list(args) items.update(result.items) - return plugin_entry.ListResult(items) + return entry.ListResult(items) def _get_entry_points(self, group: str): - return entry_points(group=group) + return importlib_metadata.entry_points(group=group) - def _build_plugin_item(self, entry_point: EntryPoint) -> PluginItem: + def _build_plugin_item( + self, entry_point: importlib_metadata.EntryPoint + ) -> PluginItem: entry_class = entry_point.load() item = PluginItem( entry_point=entry_point, @@ -224,14 +235,15 @@ def _build_plugin_item(self, entry_point: EntryPoint) -> PluginItem: ) return item + def _data_item_id(self, item) -> str: + return "-".join([item.plugin_name, item.plugin_data_source]) + class WebDataFetch(scrapy.Spider): name = "web-data" def start_requests(self): - web_data_items: typing.List[plugin_data.WebData] = self.settings.get( - "WEB_DATA_ITEMS" - ) + web_data_items: typing.List[data.WebData] = self.settings.get("WEB_DATA_ITEMS") for web_data_item in web_data_items: for initial_url in web_data_item.initial_urls(): yield scrapy.Request( @@ -240,19 +252,19 @@ def start_requests(self): cb_kwargs={"web_data_item": web_data_item}, ) - def parse(self, response: Response, **kwargs): - web_data_item: plugin_data.WebData = response.cb_kwargs.get("web_data_item") + def parse(self, response: http.Response, **kwargs): + web_data_item: data.WebData = response.cb_kwargs.get("web_data_item") is_json = "json" in response.headers["Content-Type"].decode("utf-8").lower() - if isinstance(response, TextResponse): + if isinstance(response, http.TextResponse): body_data = response.json() if is_json else None selector = response.selector else: body_data = None selector = None - data = plugin_data.WebDataAvailable( + web_data = data.WebDataAvailable( request_url=response.request.url, request_method=response.request.method, response_url=response.url, @@ -263,7 +275,7 @@ def parse(self, response: Response, **kwargs): headers=response.headers, meta=response.cb_kwargs, ) - for i in web_data_item.parse_response(data): + for i in web_data_item.parse_response(web_data): if isinstance(i, str): yield scrapy.Request( url=i, diff --git a/src/gather_vision/cli.py b/src/gather_vision/cli.py index 8ea5841..07d3a49 100644 --- a/src/gather_vision/cli.py +++ b/src/gather_vision/cli.py @@ -2,11 +2,12 @@ import argparse import logging +import pathlib import sys import typing from gather_vision import app, utils -from gather_vision.plugin import entry as plugin_entry +from gather_vision.plugin import entry def cli_update(args: argparse.Namespace) -> bool: @@ -20,13 +21,31 @@ def cli_update(args: argparse.Namespace) -> bool: """ logger = logging.getLogger(__name__) - app_args = plugin_entry.UpdateArgs(name=args.name) + app_args = entry.UpdateArgs(name=args.name, data_path=args.data_path) main_app = app.App() logger.info("Updating '%s'.", args.name) result = main_app.update(app_args) - # TODO: save result + # cli just logs the plugins and count of data items + available = { + "local": result.local_data, + "web": result.web_data, + } + for group, data_items in available.items(): + logger.info("Updated %s %s data items.", len(data_items), group) + for item_index, data_item in enumerate(data_items): + item_num = item_index + 1 + name = data_item.plugin_name + source = data_item.plugin_data_source + count = len(data_item.data) + logger.info( + " %s) plugin '%s' data source '%s' with %s items", + item_num, + name, + source, + count, + ) return True @@ -44,10 +63,11 @@ def cli_list( """ logger = logging.getLogger(__name__) - app_args = plugin_entry.ListArgs() + app_args = entry.ListArgs() main_app = app.App() result = main_app.list(app_args) + # cli just logs the plugins and data sources logger.info("Listing %s plugins.", len(result.items)) for plugin_index, (plugin_name, data_sources) in enumerate(result.items.items()): plugin_num = plugin_index + 1 @@ -55,6 +75,7 @@ def cli_list( for data_source_index, data_source_name in enumerate(data_sources): data_source_num = data_source_index + 1 logger.info(" %s.%s) %s", plugin_num, data_source_num, data_source_name) + return True @@ -112,6 +133,11 @@ def main(args: typing.Optional[typing.List[str]] = None) -> int: default=None, help="The name of the update to run.", ) + parser_update.add_argument( + "--data-path", + type=pathlib.Path, + help="The path to the data directory for downloads, cache, files.", + ) parser_update.set_defaults(func=cli_update) # create the parser for the "list" command diff --git a/src/gather_vision/plugin/entry.py b/src/gather_vision/plugin/entry.py index 2422155..db3fcb6 100644 --- a/src/gather_vision/plugin/entry.py +++ b/src/gather_vision/plugin/entry.py @@ -1,9 +1,11 @@ """Public api for plugin entry point.""" + import abc import dataclasses +import pathlib import typing -from gather_vision.plugin.data import LocalData, WebData +from gather_vision.plugin import data @dataclasses.dataclass @@ -16,13 +18,15 @@ class UpdateArgs: data_source: typing.Optional[str] = None """The plugin data source name.""" + data_path: typing.Optional[pathlib.Path] = None + @dataclasses.dataclass class UpdateResult: """The result from the update command.""" - web_data: typing.List["WebData"] - local_data: typing.List["LocalData"] + web_data: typing.List["data.WebData"] + local_data: typing.List["data.LocalData"] @dataclasses.dataclass diff --git a/src/gather_vision/utils.py b/src/gather_vision/utils.py index b712eb9..8eb9c3a 100644 --- a/src/gather_vision/utils.py +++ b/src/gather_vision/utils.py @@ -1,8 +1,9 @@ """Small utility functions.""" + import pathlib import typing -from importlib_metadata import distribution, PackageNotFoundError -from importlib_resources import as_file, files +import importlib_metadata +import importlib_resources def get_name_dash() -> str: @@ -18,13 +19,15 @@ def get_name_under() -> str: def get_version() -> typing.Optional[str]: """Get the package version.""" try: - dist = distribution(get_name_dash()) + dist = importlib_metadata.distribution(get_name_dash()) return dist.version - except PackageNotFoundError: + except importlib_metadata.PackageNotFoundError: pass try: - with as_file(files(get_name_under()).joinpath("cli.py")) as file_path: + with importlib_resources.as_file( + importlib_resources.files(get_name_under()).joinpath("cli.py") + ) as file_path: version_path = file_path.parent.parent.parent / "VERSION" return version_path.read_text(encoding="utf-8").strip() except FileNotFoundError: @@ -62,3 +65,5 @@ def validate_path( class GatherVisionException(Exception): """A gather vision error.""" + + pass diff --git a/tests/conftest.py b/tests/conftest.py index 8c63713..0d64a5a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,10 @@ -from importlib_metadata import EntryPoints +import re + +import importlib_metadata import pytest -from example_plugin import ExamplePlugin +import example_plugin @pytest.fixture(autouse=True) @@ -10,6 +12,22 @@ def patch_app_get_entry_points(monkeypatch): """Provide an example plugin for tests.""" def _get_entry_points(self, group: str): - return EntryPoints(ExamplePlugin) + return importlib_metadata.EntryPoints(example_plugin.ExamplePlugin) monkeypatch.setattr("gather_vision.app.App._get_entry_points", _get_entry_points) + + +@pytest.fixture() +def equal_ignore_whitespace(): + def _equal_ignore_whitespace(value1: str, value2: str, ignore_case=False): + # Ignore non-space and non-word characters + whitespace = re.compile(r"\s+") + replace1 = whitespace.sub(" ", value1 or "").strip() + replace2 = whitespace.sub(" ", value2 or "").strip() + + if ignore_case: + assert replace1.casefold() == replace2.casefold() + else: + assert replace1 == replace2 + + return _equal_ignore_whitespace diff --git a/tests/example_plugin.py b/tests/example_plugin.py index c6f3e28..301d9ce 100644 --- a/tests/example_plugin.py +++ b/tests/example_plugin.py @@ -1,25 +1,24 @@ import dataclasses import logging -import re import typing -from gather_vision.plugin import data as plugin_data, entry as plugin_entry +from gather_vision.plugin import data, entry logger = logging.getLogger(__name__) logging.getLogger("example_plugin").setLevel("INFO") -class ExamplePlugin(plugin_entry.Entry): +class ExamplePlugin(entry.Entry): plugin_name = "example-plugin" plugin_value = "example_plugin.ExamplePlugin" _data_source_1 = "example-data-source-1" _data_source_2 = "example-data-source-2" _data_source_names = [_data_source_1, _data_source_2] - def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult: + def update(self, args: entry.UpdateArgs) -> entry.UpdateResult: logger.info(f"Running update for plugin {self.plugin_name}.") - return plugin_entry.UpdateResult( + return entry.UpdateResult( web_data=[ ExamplePluginWebData( plugin_name=self.plugin_name, @@ -34,21 +33,21 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult: ], ) - def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult: + def list(self, args: entry.ListArgs) -> entry.ListResult: logger.info(f"Running list for plugin {self.plugin_name}.") - return plugin_entry.ListResult( + return entry.ListResult( items={self.plugin_name: self._data_source_names}, ) -class ExamplePluginWebData(plugin_data.WebData): +class ExamplePluginWebData(data.WebData): def initial_urls(self) -> typing.Iterable[str]: return ["https://example.com/"] def parse_response( - self, data: plugin_data.WebDataAvailable + self, data: data.WebDataAvailable ) -> typing.Generator[ - typing.Union[str, plugin_data.GatherDataItem], typing.Any, typing.Any + typing.Union[str, data.GatherDataItem], typing.Any, typing.Any ]: yield ExamplePluginDataItem( plugin_name=self.plugin_name, @@ -65,15 +64,15 @@ def parse_response( @dataclasses.dataclass -class ExamplePluginDataItem(plugin_data.GatherDataItem): +class ExamplePluginDataItem(data.GatherDataItem): name: str value: int -class ExamplePluginLocalData(plugin_data.LocalData): +class ExamplePluginLocalData(data.LocalData): def load_resources( self, - ) -> typing.Generator[plugin_data.GatherDataItem, typing.Any, typing.Any]: + ) -> typing.Generator[data.GatherDataItem, typing.Any, typing.Any]: items = [ ExamplePluginDataItem( plugin_name=self.plugin_name, @@ -91,8 +90,3 @@ def load_resources( for item in items: yield item - - -def collapse_whitespace(value: str) -> str: - whitespace = re.compile(r"\s+") - return whitespace.sub(" ", value) diff --git a/tests/test_cli.py b/tests/test_cli.py index da64d38..3f78419 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,15 +1,12 @@ import sys -import logging_tree -from importlib_metadata import EntryPoints, entry_points, EntryPoint - import pytest +import importlib_metadata -from gather_vision.app import PluginItem -from gather_vision.cli import main -from example_plugin import ExamplePlugin, collapse_whitespace +import example_plugin +from gather_vision import app, cli -expected_version = "0.0.3" +expected_version = "0.0.4" if sys.version_info.minor >= 10: help_phrase_options = "options:" @@ -18,9 +15,9 @@ @pytest.mark.parametrize("main_args,exit_code", [([], 1), (["--help"], 0)]) -def test_cli_no_args(capsys, caplog, main_args, exit_code): +def test_cli_no_args(capsys, caplog, main_args, exit_code, equal_ignore_whitespace): with pytest.raises(SystemExit, match=str(exit_code)): - main(main_args) + cli.main(main_args) prog_help = ( "usage: gather-vision [-h] [--version]\n" @@ -46,19 +43,19 @@ def test_cli_no_args(capsys, caplog, main_args, exit_code): stdout, stderr = capsys.readouterr() if main_args == ["--help"]: - assert collapse_whitespace(stdout) == collapse_whitespace(prog_help) + equal_ignore_whitespace(stdout, prog_help) assert stderr == "" assert caplog.record_tuples == [] if main_args == []: assert stdout == "" - assert collapse_whitespace(stderr) == collapse_whitespace(prog_help) + equal_ignore_whitespace(stderr, prog_help) assert caplog.record_tuples == [] def test_cli_version(capsys, caplog): with pytest.raises(SystemExit, match="0"): - main(["--version"]) + cli.main(["--version"]) stdout, stderr = capsys.readouterr() assert stdout == f"gather-vision {expected_version}\n" @@ -66,16 +63,18 @@ def test_cli_version(capsys, caplog): assert caplog.record_tuples == [] -def test_cli_list_help(capsys, caplog): +def test_cli_list_help(capsys, caplog, equal_ignore_whitespace): with pytest.raises(SystemExit, match="0"): - main(["list", "--help"]) + cli.main(["list", "--help"]) stdout, stderr = capsys.readouterr() - assert stdout == ( - "usage: gather-vision list [-h]\n" - "\n" - f"{help_phrase_options}\n" - " -h, --help show this help message and exit\n" + equal_ignore_whitespace( + stdout, + ( + "usage: gather-vision list [-h] " + f"{help_phrase_options} " + " -h, --help show this help message and exit" + ), ) assert stderr == "" assert caplog.record_tuples == [] @@ -89,24 +88,24 @@ def test_cli_list(capsys, caplog, monkeypatch): orig_build_plugin_item = App._build_plugin_item def get_entry_points(self, group): - result = EntryPoints( - [i for i in entry_points(group=group)] + result = importlib_metadata.EntryPoints( + [i for i in importlib_metadata.entry_points(group=group)] + [ - EntryPoint( + importlib_metadata.EntryPoint( group=App.group, - name=ExamplePlugin.plugin_name, - value=ExamplePlugin.plugin_value, + name=example_plugin.ExamplePlugin.plugin_name, + value=example_plugin.ExamplePlugin.plugin_value, ) ] ) return result def build_plugin_item(self, entry_point): - if entry_point.name == ExamplePlugin.plugin_name: - return PluginItem( + if entry_point.name == example_plugin.ExamplePlugin.plugin_name: + return app.PluginItem( entry_point=entry_point, - entry_class=ExamplePlugin, - entry_instance=ExamplePlugin(), + entry_class=example_plugin.ExamplePlugin, + entry_instance=example_plugin.ExamplePlugin(), ) return orig_build_plugin_item(self, entry_point) @@ -115,7 +114,7 @@ def build_plugin_item(self, entry_point): m.setattr(App, "_build_plugin_item", build_plugin_item) with pytest.raises(SystemExit, match="0"): - main(["list"]) + cli.main(["list"]) stdout, stderr = capsys.readouterr() assert stdout == "" @@ -132,17 +131,23 @@ def build_plugin_item(self, entry_point): ] -def test_cli_update_help(capsys, caplog): +def test_cli_update_help(capsys, caplog, equal_ignore_whitespace): with pytest.raises(SystemExit, match="0"): - main(["--log-level", "debug", "update", "--help"]) + cli.main(["--log-level", "debug", "update", "--help"]) stdout, stderr = capsys.readouterr() - assert stdout == ( - "usage: gather-vision update [-h] [--name NAME]\n" - "\n" - f"{help_phrase_options}\n" - " -h, --help show this help message and exit\n" - " --name NAME The name of the update to run.\n" + + equal_ignore_whitespace( + stdout, + ( + "usage: gather-vision update [-h] [--name NAME] [--data-path DATA_PATH]\n" + "\n" + f"{help_phrase_options}\n" + " -h, --help show this help message and exit\n" + " --name NAME The name of the update to run.\n" + " --data-path DATA_PATH\n" + " The path to the data directory for downloads, cache, files.\n" + ), ) assert stderr == "" assert caplog.record_tuples == [] @@ -150,7 +155,7 @@ def test_cli_update_help(capsys, caplog): def test_cli_update_not_available(capsys, caplog): with pytest.raises(SystemExit, match="1"): - main(["update", "--name", "not-available"]) + cli.main(["update", "--name", "not-available"]) stdout, stderr = capsys.readouterr() assert stdout == "" @@ -161,29 +166,38 @@ def test_cli_update_not_available(capsys, caplog): ( "gather_vision.cli", 40, - "Error: GatherVisionException - Could not find plugin named 'not-available'.", + "Error: GatherVisionException - Could not find plugin named 'not-available'. " + "Available plugins (1): example-plugin.", ), ] -def test_cli_update_example_plugin(capsys, caplog): +def test_cli_update_example_plugin(capsys, caplog, tmp_path): with pytest.raises(SystemExit, match="0"): - main(["update", "--name", "example-plugin"]) + cli.main(["update", "--name", "example-plugin", "--data-path", str(tmp_path)]) + + # logging_tree_str = logging_tree.format.build_description() - logging_tree_str = logging_tree.format.build_description() + l_cli = "gather_vision.cli" + l_app = "gather_vision.app" + + prefix = " 1) plugin 'example-plugin' data source 'example-data-source-" stdout, stderr = capsys.readouterr() assert stdout == "" assert stderr == "" - for i in [ - ("gather_vision.cli", 20, "Starting gather-vision."), - ("gather_vision.cli", 20, "Updating 'example-plugin'."), - ("example_plugin", 20, "Running update for plugin example-plugin."), - ("gather_vision.app", 20, "Loaded 1 local data sources."), - ("example_plugin", 20, "Running update for plugin example-plugin."), - ("gather_vision.app", 20, "Starting 1 web data sources."), - ("gather_vision.app", 20, "Loaded 2 data items from web data sources."), - ("gather_vision.app", 20, "Finished update."), - ("gather_vision.cli", 20, "Finished."), + for index, log_item in [ + (0, (l_cli, 20, "Starting gather-vision.")), + (1, (l_cli, 20, "Updating 'example-plugin'.")), + (2, ("example_plugin", 20, "Running update for plugin example-plugin.")), + (3, (l_app, 20, "Loaded 1 local data sources.")), + (4, (l_app, 20, "Starting 1 web data sources.")), + (18, (l_app, 20, "Loaded 2 data items from web data sources.")), + (19, (l_app, 20, "Finished update.")), + (20, (l_cli, 20, "Updated 1 local data items.")), + (21, (l_cli, 20, prefix + "2' with 2 items")), + (22, (l_cli, 20, "Updated 1 web data items.")), + (23, (l_cli, 20, prefix + "1' with 2 items")), + (24, (l_cli, 20, "Finished.")), ]: - assert i in caplog.record_tuples + assert caplog.record_tuples[index] == log_item