From e0772416e3271b51c325afa1d1d8e3b095297233 Mon Sep 17 00:00:00 2001
From: Mark Cottman-Fields <cofiem@gmail.com>
Date: Thu, 16 Mar 2023 21:53:07 +1000
Subject: [PATCH] feat: improve scrapy web data gathering

Move to v0.0.4.
---
 CHANGELOG.md                      |   7 +
 CONTRIBUTING.md                   |   2 +-
 VERSION                           |   2 +-
 src/gather_vision/app.py          | 248 ++++++++++++++++--------------
 src/gather_vision/cli.py          |  34 +++-
 src/gather_vision/plugin/entry.py |  10 +-
 src/gather_vision/utils.py        |  15 +-
 tests/conftest.py                 |  24 ++-
 tests/example_plugin.py           |  30 ++--
 tests/test_cli.py                 | 120 ++++++++-------
 10 files changed, 286 insertions(+), 206 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6f3e344..59d6fd3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Change log
 
+## [v0.0.4](https://github.com/anotherbyte-net/gather-vision/releases/tag/v0.0.4)
+
+[full change log](https://github.com/anotherbyte-net/gather-vision/compare/v0.0.3...v0.0.4)
+
+- allow providing data storage path
+- improve scrapy web data gathering
+
 ## [v0.0.3](https://github.com/anotherbyte-net/gather-vision/releases/tag/v0.0.3)
 
 [full change log](https://github.com/anotherbyte-net/gather-vision/compare/v0.0.2...v0.0.3)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 00a3a9a..7a262bd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -85,7 +85,7 @@ source .venv-test/bin/activate
 python -m pip install --upgrade pip setuptools wheel
 python -m pip install --upgrade -r requirements.txt
 
-GATHER_VISION_VERSION='0.0.3'
+GATHER_VISION_VERSION='0.0.4'
 pip install --index-url https://test.pypi.org/simple/ --no-deps gather-vision==$GATHER_VISION_VERSION
 # or
 pip install dist/gather_vision-$GATHER_VISION_VERSION-py3-none-any.whl
diff --git a/VERSION b/VERSION
index bcab45a..81340c7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.0.3
+0.0.4
diff --git a/src/gather_vision/app.py b/src/gather_vision/app.py
index 76a496d..521d825 100644
--- a/src/gather_vision/app.py
+++ b/src/gather_vision/app.py
@@ -1,19 +1,16 @@
 """The main application features."""
+
 import dataclasses
 import logging
-import pathlib
 import pickle
-import tempfile
 import typing
 
 import scrapy
-from importlib_metadata import EntryPoint, entry_points
-from scrapy.crawler import CrawlerProcess
-from scrapy.exporters import BaseItemExporter
-from scrapy.http import Response, TextResponse
+import importlib_metadata
+from scrapy import crawler, exporters, http
 
 from gather_vision import utils
-from gather_vision.plugin import data as plugin_data, entry as plugin_entry
+from gather_vision.plugin import data, entry
 
 logger = logging.getLogger(__name__)
 
@@ -22,12 +19,12 @@
 class PluginItem:
     """Information about a plugin."""
 
-    entry_point: EntryPoint
-    entry_class: typing.Type[plugin_entry.Entry]
-    entry_instance: plugin_entry.Entry
+    entry_point: importlib_metadata.EntryPoint
+    entry_class: typing.Type[entry.Entry]
+    entry_instance: entry.Entry
 
 
-class AppPickleItemExporter(BaseItemExporter):
+class AppPickleItemExporter(exporters.BaseItemExporter):
     def __init__(self, file, protocol=4, **kwargs):
         super().__init__(**kwargs)
         self.file = file
@@ -62,7 +59,7 @@ def load(self) -> typing.List[PluginItem]:
             logger.info("Loaded %s plugins.", len(self._available))
         return self._available
 
-    def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
+    def update(self, args: entry.UpdateArgs) -> entry.UpdateResult:
         """Execute the update action for all plugins or the plugin with the given name.
 
         Args:
@@ -71,133 +68,145 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
         Returns:
             The result of running the plugin's update process.
         """
+        loaded_plugins = list(self.load())
         named_plugins = [
             i
-            for i in self.load()
+            for i in loaded_plugins
             if args.name is None or i.entry_point.name == args.name
         ]
+        available_plugins = ", ".join(
+            sorted(i.entry_point.name for i in loaded_plugins)
+        )
 
-        if len(named_plugins) == 0:
+        named_count = len(named_plugins)
+        available_count = len(loaded_plugins)
+
+        if named_count == 0:
             raise utils.GatherVisionException(
-                f"Could not find plugin named '{args.name}'."
+                f"Could not find plugin named '{args.name}'. "
+                f"Available plugins ({available_count}): {available_plugins}."
             )
 
-        if args.name and len(named_plugins) > 1:
+        if args.name and named_count > 1:
             raise utils.GatherVisionException(
                 f"Found multiple plugins named '{args.name}'."
+                f" Available plugins ({available_count}): {available_plugins}."
             )
 
-        # load data from local sources first
-        local_data: typing.List[plugin_data.LocalData] = []
+        # get the data sources
+        local_data: typing.List[data.LocalData] = []
+        web_data: typing.List[data.WebData] = []
         for named_plugin in named_plugins:
             plugin_update_result = named_plugin.entry_instance.update(args)
+
+            # load data from local sources
             for local_data_item in plugin_update_result.local_data:
                 local_data_item.data = list(local_data_item.load_resources())
             local_data.extend(plugin_update_result.local_data)
 
+            # get the web data sources
+            web_data.extend(plugin_update_result.web_data)
+
         logger.info("Loaded %s local data sources.", len(local_data))
 
         # allow running multiple plugins at once
-        # gather WebData subclasses and run the spider
-        web_data: typing.List[plugin_data.WebData] = []
-        for named_plugin in named_plugins:
-            plugin_update_result = named_plugin.entry_instance.update(args)
-            web_data.extend(plugin_update_result.web_data)
-
+        # run the spider
         logger.info("Starting %s web data sources.", len(web_data))
+        web_data_map = {self._data_item_id(i): i for i in web_data}
 
-        # run the web data to get items, using scrapy
-        # save the feed to a temp file, then read the items back in
-        feed_items = []
-        with tempfile.TemporaryDirectory() as temp_dir:
-            feed_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.pickle")
-            feed_path_setting = str(feed_path).replace("\\", "/")
-
-            files_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.files")
-            files_path_setting = str(files_path).replace("\\", "/")
-
-            process = CrawlerProcess(
-                settings={
-                    "USER_AGENT": "gather-vision (+https://github.com/anotherbyte-net/gather-vision)",
-                    # http cache
-                    "HTTPCACHE_ENABLED": True,
-                    "HTTPCACHE_DIR": ".httpcache",
-                    "HTTPCACHE_POLICY": "scrapy.extensions.httpcache.DummyPolicy",
-                    "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage",
-                    "EXTENSIONS": {
-                        "scrapy.extensions.telnet.TelnetConsole": None,
-                    },
-                    # feed
-                    "FEED_EXPORTERS": {
-                        "pickle_raw": "gather_vision.app.AppPickleItemExporter",
-                    },
-                    "FEEDS": {
-                        f"file:///{feed_path_setting}": {"format": "pickle_raw"},
-                    },
-                    "WEB_DATA_ITEMS": web_data,
-                    # logs
-                    "LOG_ENABLED": True,
-                    "LOG_FILE": None,
-                    "LOG_STDOUT": False,
-                    "LOG_LEVEL": "ERROR",
-                    # throttling requests
-                    "DOWNLOAD_DELAY": 3,
-                    "AUTOTHROTTLE_ENABLED": True,
-                    "AUTOTHROTTLE_START_DELAY": 3,
-                    "AUTOTHROTTLE_MAX_DELAY": 60,
-                    "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
-                    # pipelines
-                    "ITEM_PIPELINES": {
-                        "scrapy.pipelines.files.FilesPipeline": 1,
-                    },
-                    "FILES_STORE": files_path_setting,
-                    "MEDIA_ALLOW_REDIRECTS": True,
-                    # Set settings whose default value is deprecated to a future-proof value
-                    "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
-                    "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
-                    "FEED_EXPORT_ENCODING": "utf-8",
-                },
-                install_root_handler=True,
-            )
-
-            process.crawl(WebDataFetch)
+        # build the output paths
 
-            logging.getLogger("scrapy").setLevel("ERROR")
-            logging.getLogger("py.warnings").setLevel("CRITICAL")
+        if not args.data_path:
+            raise ValueError(f"Invalid data path '{args.data_path}'.")
 
-            # the script will block here until the crawling is finished
-            process.start()
+        feed_path = args.data_path / "feeds" / "feed_%(name)s_%(time)s.pickle"
+        feed_path_setting = str(feed_path.resolve()).replace("\\", "/")  # absolute path
 
-            # f = io.BytesIO()
-            # pickle.dump(items, f)
-            #
-            # f.seek(0)
-            # result = pickle.load(f)
+        files_dir = args.data_path / "files"
+        files_dir_setting = str(files_dir).replace("\\", "/")
 
-            # load the feed items
-            for item in feed_path.parent.iterdir():
-                if not item.is_file():
-                    continue
-                if item.suffix != ".pickle":
-                    continue
+        http_cache_dir = args.data_path / "http_cache"
+        http_cache_dir_setting = str(http_cache_dir).replace("\\", "/")
 
-                with item.open("rb") as f:
-                    while True:
-                        try:
-                            feed_items.append(pickle.load(f))
-                        except EOFError:
-                            break
+        # run the web data to get items, using scrapy
+        # save the feed under the data path, then read the items back in
+        process = crawler.CrawlerProcess(
+            settings={
+                "USER_AGENT": "gather-vision (+https://github.com/anotherbyte-net/gather-vision)",
+                # http cache
+                "HTTPCACHE_ENABLED": True,
+                "HTTPCACHE_DIR": http_cache_dir_setting,
+                "HTTPCACHE_POLICY": "scrapy.extensions.httpcache.DummyPolicy",
+                "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage",
+                "EXTENSIONS": {
+                    "scrapy.extensions.telnet.TelnetConsole": None,
+                },
+                # feed
+                "FEED_EXPORTERS": {
+                    "pickle_raw": "gather_vision.app.AppPickleItemExporter",
+                },
+                "FEEDS": {
+                    f"file:///{feed_path_setting}": {"format": "pickle_raw"},
+                },
+                "WEB_DATA_ITEMS": web_data,
+                # logs
+                "LOG_ENABLED": True,
+                "LOG_FILE": None,
+                "LOG_STDOUT": False,
+                "LOG_LEVEL": "ERROR",
+                # throttling requests
+                "DOWNLOAD_DELAY": 3,
+                "AUTOTHROTTLE_ENABLED": True,
+                "AUTOTHROTTLE_START_DELAY": 3,
+                "AUTOTHROTTLE_MAX_DELAY": 60,
+                "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
+                # pipelines
+                "ITEM_PIPELINES": {
+                    "scrapy.pipelines.files.FilesPipeline": 1,
+                },
+                "FILES_STORE": files_dir_setting,
+                "MEDIA_ALLOW_REDIRECTS": True,
+                # Set settings whose default value is deprecated to a future-proof value
+                "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
+                "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
+                "FEED_EXPORT_ENCODING": "utf-8",
+            },
+            install_root_handler=True,
+        )
 
-        logger.info("Loaded %s data items from web data sources.", len(feed_items))
+        process.crawl(WebDataFetch)
+
+        logging.getLogger("scrapy").setLevel("ERROR")
+        logging.getLogger("py.warnings").setLevel("CRITICAL")
+
+        # the script will block here until the crawling is finished
+        process.start()
+
+        # load the feed items
+        feed_item_count = 0
+        for item in feed_path.parent.iterdir():
+            if not item.is_file():
+                continue
+            if item.suffix != ".pickle":
+                continue
+
+            with item.open("rb") as f:
+                while True:
+                    try:
+                        web_data_item = pickle.load(f)
+                        target = web_data_map[self._data_item_id(web_data_item)]
+                        target.data = target.data or []  # unset/empty -> fresh list
+                        target.data.append(web_data_item)  # keep all items
+                        feed_item_count += 1
+                    except EOFError:
+                        break
+
+        logger.info("Loaded %s data items from web data sources.", feed_item_count)
         logger.info("Finished update.")
 
-        # TODO: still need to do something with the feed_items?
-
-        # TODO: save results?
+        return entry.UpdateResult(web_data=web_data, local_data=local_data)
 
-        return plugin_entry.UpdateResult(web_data=web_data, local_data=local_data)
-
-    def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult:
+    def list(self, args: entry.ListArgs) -> entry.ListResult:
         """List all available plugins.
 
         Args:
@@ -210,12 +219,14 @@ def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult:
         for plugin_item in self.load():
             result = plugin_item.entry_instance.list(args)
             items.update(result.items)
-        return plugin_entry.ListResult(items)
+        return entry.ListResult(items)
 
     def _get_entry_points(self, group: str):
-        return entry_points(group=group)
+        return importlib_metadata.entry_points(group=group)
 
-    def _build_plugin_item(self, entry_point: EntryPoint) -> PluginItem:
+    def _build_plugin_item(
+        self, entry_point: importlib_metadata.EntryPoint
+    ) -> PluginItem:
         entry_class = entry_point.load()
         item = PluginItem(
             entry_point=entry_point,
@@ -224,14 +235,15 @@ def _build_plugin_item(self, entry_point: EntryPoint) -> PluginItem:
         )
         return item
 
+    def _data_item_id(self, item) -> str:
+        return "-".join([item.plugin_name, item.plugin_data_source])
+
 
 class WebDataFetch(scrapy.Spider):
     name = "web-data"
 
     def start_requests(self):
-        web_data_items: typing.List[plugin_data.WebData] = self.settings.get(
-            "WEB_DATA_ITEMS"
-        )
+        web_data_items: typing.List[data.WebData] = self.settings.get("WEB_DATA_ITEMS")
         for web_data_item in web_data_items:
             for initial_url in web_data_item.initial_urls():
                 yield scrapy.Request(
@@ -240,19 +252,19 @@ def start_requests(self):
                     cb_kwargs={"web_data_item": web_data_item},
                 )
 
-    def parse(self, response: Response, **kwargs):
-        web_data_item: plugin_data.WebData = response.cb_kwargs.get("web_data_item")
+    def parse(self, response: http.Response, **kwargs):
+        web_data_item: data.WebData = response.cb_kwargs.get("web_data_item")
 
         is_json = "json" in response.headers["Content-Type"].decode("utf-8").lower()
 
-        if isinstance(response, TextResponse):
+        if isinstance(response, http.TextResponse):
             body_data = response.json() if is_json else None
             selector = response.selector
         else:
             body_data = None
             selector = None
 
-        data = plugin_data.WebDataAvailable(
+        web_data = data.WebDataAvailable(
             request_url=response.request.url,
             request_method=response.request.method,
             response_url=response.url,
@@ -263,7 +275,7 @@ def parse(self, response: Response, **kwargs):
             headers=response.headers,
             meta=response.cb_kwargs,
         )
-        for i in web_data_item.parse_response(data):
+        for i in web_data_item.parse_response(web_data):
             if isinstance(i, str):
                 yield scrapy.Request(
                     url=i,
diff --git a/src/gather_vision/cli.py b/src/gather_vision/cli.py
index 8ea5841..07d3a49 100644
--- a/src/gather_vision/cli.py
+++ b/src/gather_vision/cli.py
@@ -2,11 +2,12 @@
 
 import argparse
 import logging
+import pathlib
 import sys
 import typing
 
 from gather_vision import app, utils
-from gather_vision.plugin import entry as plugin_entry
+from gather_vision.plugin import entry
 
 
 def cli_update(args: argparse.Namespace) -> bool:
@@ -20,13 +21,31 @@ def cli_update(args: argparse.Namespace) -> bool:
     """
     logger = logging.getLogger(__name__)
 
-    app_args = plugin_entry.UpdateArgs(name=args.name)
+    app_args = entry.UpdateArgs(name=args.name, data_path=args.data_path)
     main_app = app.App()
 
     logger.info("Updating '%s'.", args.name)
     result = main_app.update(app_args)
 
-    # TODO: save result
+    # cli just logs the plugins and count of data items
+    available = {
+        "local": result.local_data,
+        "web": result.web_data,
+    }
+    for group, data_items in available.items():
+        logger.info("Updated %s %s data items.", len(data_items), group)
+        for item_index, data_item in enumerate(data_items):
+            item_num = item_index + 1
+            name = data_item.plugin_name
+            source = data_item.plugin_data_source
+            count = len(data_item.data)
+            logger.info(
+                "  %s) plugin '%s' data source '%s' with %s items",
+                item_num,
+                name,
+                source,
+                count,
+            )
 
     return True
 
@@ -44,10 +63,11 @@ def cli_list(
     """
     logger = logging.getLogger(__name__)
 
-    app_args = plugin_entry.ListArgs()
+    app_args = entry.ListArgs()
     main_app = app.App()
     result = main_app.list(app_args)
 
+    # cli just logs the plugins and data sources
     logger.info("Listing %s plugins.", len(result.items))
     for plugin_index, (plugin_name, data_sources) in enumerate(result.items.items()):
         plugin_num = plugin_index + 1
@@ -55,6 +75,7 @@ def cli_list(
         for data_source_index, data_source_name in enumerate(data_sources):
             data_source_num = data_source_index + 1
             logger.info("    %s.%s) %s", plugin_num, data_source_num, data_source_name)
+
     return True
 
 
@@ -112,6 +133,11 @@ def main(args: typing.Optional[typing.List[str]] = None) -> int:
         default=None,
         help="The name of the update to run.",
     )
+    parser_update.add_argument(
+        "--data-path",
+        type=pathlib.Path,
+        help="The path to the data directory for downloads, cache, files.",
+    )
     parser_update.set_defaults(func=cli_update)
 
     # create the parser for the "list" command
diff --git a/src/gather_vision/plugin/entry.py b/src/gather_vision/plugin/entry.py
index 2422155..db3fcb6 100644
--- a/src/gather_vision/plugin/entry.py
+++ b/src/gather_vision/plugin/entry.py
@@ -1,9 +1,11 @@
 """Public api for plugin entry point."""
+
 import abc
 import dataclasses
+import pathlib
 import typing
 
-from gather_vision.plugin.data import LocalData, WebData
+from gather_vision.plugin import data
 
 
 @dataclasses.dataclass
@@ -16,13 +18,15 @@ class UpdateArgs:
     data_source: typing.Optional[str] = None
     """The plugin data source name."""
 
+    data_path: typing.Optional[pathlib.Path] = None
+
 
 @dataclasses.dataclass
 class UpdateResult:
     """The result from the update command."""
 
-    web_data: typing.List["WebData"]
-    local_data: typing.List["LocalData"]
+    web_data: typing.List["data.WebData"]
+    local_data: typing.List["data.LocalData"]
 
 
 @dataclasses.dataclass
diff --git a/src/gather_vision/utils.py b/src/gather_vision/utils.py
index b712eb9..8eb9c3a 100644
--- a/src/gather_vision/utils.py
+++ b/src/gather_vision/utils.py
@@ -1,8 +1,9 @@
 """Small utility functions."""
+
 import pathlib
 import typing
-from importlib_metadata import distribution, PackageNotFoundError
-from importlib_resources import as_file, files
+import importlib_metadata
+import importlib_resources
 
 
 def get_name_dash() -> str:
@@ -18,13 +19,15 @@ def get_name_under() -> str:
 def get_version() -> typing.Optional[str]:
     """Get the package version."""
     try:
-        dist = distribution(get_name_dash())
+        dist = importlib_metadata.distribution(get_name_dash())
         return dist.version
-    except PackageNotFoundError:
+    except importlib_metadata.PackageNotFoundError:
         pass
 
     try:
-        with as_file(files(get_name_under()).joinpath("cli.py")) as file_path:
+        with importlib_resources.as_file(
+            importlib_resources.files(get_name_under()).joinpath("cli.py")
+        ) as file_path:
             version_path = file_path.parent.parent.parent / "VERSION"
             return version_path.read_text(encoding="utf-8").strip()
     except FileNotFoundError:
@@ -62,3 +65,5 @@ def validate_path(
 
 class GatherVisionException(Exception):
     """A gather vision error."""
+
+    pass
diff --git a/tests/conftest.py b/tests/conftest.py
index 8c63713..0d64a5a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,10 @@
-from importlib_metadata import EntryPoints
+import re
+
+import importlib_metadata
 
 import pytest
 
-from example_plugin import ExamplePlugin
+import example_plugin
 
 
 @pytest.fixture(autouse=True)
@@ -10,6 +12,22 @@ def patch_app_get_entry_points(monkeypatch):
     """Provide an example plugin for tests."""
 
     def _get_entry_points(self, group: str):
-        return EntryPoints(ExamplePlugin)
+        return importlib_metadata.EntryPoints(example_plugin.ExamplePlugin)
 
     monkeypatch.setattr("gather_vision.app.App._get_entry_points", _get_entry_points)
+
+
+@pytest.fixture()
+def equal_ignore_whitespace():
+    def _equal_ignore_whitespace(value1: str, value2: str, ignore_case=False):
+        # Collapse each run of whitespace to one space and trim both ends
+        whitespace = re.compile(r"\s+")
+        replace1 = whitespace.sub(" ", value1 or "").strip()
+        replace2 = whitespace.sub(" ", value2 or "").strip()
+
+        if ignore_case:
+            assert replace1.casefold() == replace2.casefold()
+        else:
+            assert replace1 == replace2
+
+    return _equal_ignore_whitespace
diff --git a/tests/example_plugin.py b/tests/example_plugin.py
index c6f3e28..301d9ce 100644
--- a/tests/example_plugin.py
+++ b/tests/example_plugin.py
@@ -1,25 +1,24 @@
 import dataclasses
 import logging
-import re
 import typing
 
-from gather_vision.plugin import data as plugin_data, entry as plugin_entry
+from gather_vision.plugin import data, entry
 
 logger = logging.getLogger(__name__)
 
 logging.getLogger("example_plugin").setLevel("INFO")
 
 
-class ExamplePlugin(plugin_entry.Entry):
+class ExamplePlugin(entry.Entry):
     plugin_name = "example-plugin"
     plugin_value = "example_plugin.ExamplePlugin"
     _data_source_1 = "example-data-source-1"
     _data_source_2 = "example-data-source-2"
     _data_source_names = [_data_source_1, _data_source_2]
 
-    def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
+    def update(self, args: entry.UpdateArgs) -> entry.UpdateResult:
         logger.info(f"Running update for plugin {self.plugin_name}.")
-        return plugin_entry.UpdateResult(
+        return entry.UpdateResult(
             web_data=[
                 ExamplePluginWebData(
                     plugin_name=self.plugin_name,
@@ -34,21 +33,21 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
             ],
         )
 
-    def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult:
+    def list(self, args: entry.ListArgs) -> entry.ListResult:
         logger.info(f"Running list for plugin {self.plugin_name}.")
-        return plugin_entry.ListResult(
+        return entry.ListResult(
             items={self.plugin_name: self._data_source_names},
         )
 
 
-class ExamplePluginWebData(plugin_data.WebData):
+class ExamplePluginWebData(data.WebData):
     def initial_urls(self) -> typing.Iterable[str]:
         return ["https://example.com/"]
 
     def parse_response(
-        self, data: plugin_data.WebDataAvailable
+        self, data: data.WebDataAvailable
     ) -> typing.Generator[
-        typing.Union[str, plugin_data.GatherDataItem], typing.Any, typing.Any
+        typing.Union[str, data.GatherDataItem], typing.Any, typing.Any
     ]:
         yield ExamplePluginDataItem(
             plugin_name=self.plugin_name,
@@ -65,15 +64,15 @@ def parse_response(
 
 
 @dataclasses.dataclass
-class ExamplePluginDataItem(plugin_data.GatherDataItem):
+class ExamplePluginDataItem(data.GatherDataItem):
     name: str
     value: int
 
 
-class ExamplePluginLocalData(plugin_data.LocalData):
+class ExamplePluginLocalData(data.LocalData):
     def load_resources(
         self,
-    ) -> typing.Generator[plugin_data.GatherDataItem, typing.Any, typing.Any]:
+    ) -> typing.Generator[data.GatherDataItem, typing.Any, typing.Any]:
         items = [
             ExamplePluginDataItem(
                 plugin_name=self.plugin_name,
@@ -91,8 +90,3 @@ def load_resources(
 
         for item in items:
             yield item
-
-
-def collapse_whitespace(value: str) -> str:
-    whitespace = re.compile(r"\s+")
-    return whitespace.sub(" ", value)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index da64d38..3f78419 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,15 +1,12 @@
 import sys
 
-import logging_tree
-from importlib_metadata import EntryPoints, entry_points, EntryPoint
-
 import pytest
+import importlib_metadata
 
-from gather_vision.app import PluginItem
-from gather_vision.cli import main
-from example_plugin import ExamplePlugin, collapse_whitespace
+import example_plugin
+from gather_vision import app, cli
 
-expected_version = "0.0.3"
+expected_version = "0.0.4"
 
 if sys.version_info.minor >= 10:
     help_phrase_options = "options:"
@@ -18,9 +15,9 @@
 
 
 @pytest.mark.parametrize("main_args,exit_code", [([], 1), (["--help"], 0)])
-def test_cli_no_args(capsys, caplog, main_args, exit_code):
+def test_cli_no_args(capsys, caplog, main_args, exit_code, equal_ignore_whitespace):
     with pytest.raises(SystemExit, match=str(exit_code)):
-        main(main_args)
+        cli.main(main_args)
 
     prog_help = (
         "usage: gather-vision [-h] [--version]\n"
@@ -46,19 +43,19 @@ def test_cli_no_args(capsys, caplog, main_args, exit_code):
 
     stdout, stderr = capsys.readouterr()
     if main_args == ["--help"]:
-        assert collapse_whitespace(stdout) == collapse_whitespace(prog_help)
+        equal_ignore_whitespace(stdout, prog_help)
         assert stderr == ""
         assert caplog.record_tuples == []
 
     if main_args == []:
         assert stdout == ""
-        assert collapse_whitespace(stderr) == collapse_whitespace(prog_help)
+        equal_ignore_whitespace(stderr, prog_help)
         assert caplog.record_tuples == []
 
 
 def test_cli_version(capsys, caplog):
     with pytest.raises(SystemExit, match="0"):
-        main(["--version"])
+        cli.main(["--version"])
 
     stdout, stderr = capsys.readouterr()
     assert stdout == f"gather-vision {expected_version}\n"
@@ -66,16 +63,18 @@ def test_cli_version(capsys, caplog):
     assert caplog.record_tuples == []
 
 
-def test_cli_list_help(capsys, caplog):
+def test_cli_list_help(capsys, caplog, equal_ignore_whitespace):
     with pytest.raises(SystemExit, match="0"):
-        main(["list", "--help"])
+        cli.main(["list", "--help"])
 
     stdout, stderr = capsys.readouterr()
-    assert stdout == (
-        "usage: gather-vision list [-h]\n"
-        "\n"
-        f"{help_phrase_options}\n"
-        "  -h, --help  show this help message and exit\n"
+    equal_ignore_whitespace(
+        stdout,
+        (
+            "usage: gather-vision list [-h] "
+            f"{help_phrase_options} "
+            "  -h, --help  show this help message and exit"
+        ),
     )
     assert stderr == ""
     assert caplog.record_tuples == []
@@ -89,24 +88,24 @@ def test_cli_list(capsys, caplog, monkeypatch):
         orig_build_plugin_item = App._build_plugin_item
 
         def get_entry_points(self, group):
-            result = EntryPoints(
-                [i for i in entry_points(group=group)]
+            result = importlib_metadata.EntryPoints(
+                [i for i in importlib_metadata.entry_points(group=group)]
                 + [
-                    EntryPoint(
+                    importlib_metadata.EntryPoint(
                         group=App.group,
-                        name=ExamplePlugin.plugin_name,
-                        value=ExamplePlugin.plugin_value,
+                        name=example_plugin.ExamplePlugin.plugin_name,
+                        value=example_plugin.ExamplePlugin.plugin_value,
                     )
                 ]
             )
             return result
 
         def build_plugin_item(self, entry_point):
-            if entry_point.name == ExamplePlugin.plugin_name:
-                return PluginItem(
+            if entry_point.name == example_plugin.ExamplePlugin.plugin_name:
+                return app.PluginItem(
                     entry_point=entry_point,
-                    entry_class=ExamplePlugin,
-                    entry_instance=ExamplePlugin(),
+                    entry_class=example_plugin.ExamplePlugin,
+                    entry_instance=example_plugin.ExamplePlugin(),
                 )
             return orig_build_plugin_item(self, entry_point)
 
@@ -115,7 +114,7 @@ def build_plugin_item(self, entry_point):
         m.setattr(App, "_build_plugin_item", build_plugin_item)
 
         with pytest.raises(SystemExit, match="0"):
-            main(["list"])
+            cli.main(["list"])
 
     stdout, stderr = capsys.readouterr()
     assert stdout == ""
@@ -132,17 +131,23 @@ def build_plugin_item(self, entry_point):
     ]
 
 
-def test_cli_update_help(capsys, caplog):
+def test_cli_update_help(capsys, caplog, equal_ignore_whitespace):
     with pytest.raises(SystemExit, match="0"):
-        main(["--log-level", "debug", "update", "--help"])
+        cli.main(["--log-level", "debug", "update", "--help"])
 
     stdout, stderr = capsys.readouterr()
-    assert stdout == (
-        "usage: gather-vision update [-h] [--name NAME]\n"
-        "\n"
-        f"{help_phrase_options}\n"
-        "  -h, --help   show this help message and exit\n"
-        "  --name NAME  The name of the update to run.\n"
+
+    equal_ignore_whitespace(
+        stdout,
+        (
+            "usage: gather-vision update [-h] [--name NAME] [--data-path DATA_PATH]\n"
+            "\n"
+            f"{help_phrase_options}\n"
+            "  -h, --help            show this help message and exit\n"
+            "  --name NAME           The name of the update to run.\n"
+            "  --data-path DATA_PATH\n"
+            "                        The path to the data directory for downloads, cache, files.\n"
+        ),
     )
     assert stderr == ""
     assert caplog.record_tuples == []
@@ -150,7 +155,7 @@ def test_cli_update_help(capsys, caplog):
 
 def test_cli_update_not_available(capsys, caplog):
     with pytest.raises(SystemExit, match="1"):
-        main(["update", "--name", "not-available"])
+        cli.main(["update", "--name", "not-available"])
 
     stdout, stderr = capsys.readouterr()
     assert stdout == ""
@@ -161,29 +166,38 @@ def test_cli_update_not_available(capsys, caplog):
         (
             "gather_vision.cli",
             40,
-            "Error: GatherVisionException - Could not find plugin named 'not-available'.",
+            "Error: GatherVisionException - Could not find plugin named 'not-available'. "
+            "Available plugins (1): example-plugin.",
         ),
     ]
 
 
-def test_cli_update_example_plugin(capsys, caplog):
+def test_cli_update_example_plugin(capsys, caplog, tmp_path):
     with pytest.raises(SystemExit, match="0"):
-        main(["update", "--name", "example-plugin"])
+        cli.main(["update", "--name", "example-plugin", "--data-path", str(tmp_path)])
+
+    # logging_tree_str = logging_tree.format.build_description()
 
-    logging_tree_str = logging_tree.format.build_description()
+    l_cli = "gather_vision.cli"
+    l_app = "gather_vision.app"
+
+    prefix = "  1) plugin 'example-plugin' data source 'example-data-source-"
 
     stdout, stderr = capsys.readouterr()
     assert stdout == ""
     assert stderr == ""
-    for i in [
-        ("gather_vision.cli", 20, "Starting gather-vision."),
-        ("gather_vision.cli", 20, "Updating 'example-plugin'."),
-        ("example_plugin", 20, "Running update for plugin example-plugin."),
-        ("gather_vision.app", 20, "Loaded 1 local data sources."),
-        ("example_plugin", 20, "Running update for plugin example-plugin."),
-        ("gather_vision.app", 20, "Starting 1 web data sources."),
-        ("gather_vision.app", 20, "Loaded 2 data items from web data sources."),
-        ("gather_vision.app", 20, "Finished update."),
-        ("gather_vision.cli", 20, "Finished."),
+    for index, log_item in [
+        (0, (l_cli, 20, "Starting gather-vision.")),
+        (1, (l_cli, 20, "Updating 'example-plugin'.")),
+        (2, ("example_plugin", 20, "Running update for plugin example-plugin.")),
+        (3, (l_app, 20, "Loaded 1 local data sources.")),
+        (4, (l_app, 20, "Starting 1 web data sources.")),
+        (18, (l_app, 20, "Loaded 2 data items from web data sources.")),
+        (19, (l_app, 20, "Finished update.")),
+        (20, (l_cli, 20, "Updated 1 local data items.")),
+        (21, (l_cli, 20, prefix + "2' with 2 items")),
+        (22, (l_cli, 20, "Updated 1 web data items.")),
+        (23, (l_cli, 20, prefix + "1' with 2 items")),
+        (24, (l_cli, 20, "Finished.")),
     ]:
-        assert i in caplog.record_tuples
+        assert caplog.record_tuples[index] == log_item