Skip to content

Commit

Permalink
fix: update scrapy settings and pass extra data
Browse files Browse the repository at this point in the history
  • Loading branch information
cofiem committed Mar 14, 2023
1 parent b60190a commit 2d7e303
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 7 deletions.
40 changes: 35 additions & 5 deletions src/gather_vision/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,25 +113,49 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
feed_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.pickle")
feed_path_setting = str(feed_path).replace("\\", "/")

files_path = pathlib.Path(temp_dir, "feed_%(name)s_%(time)s.files")
files_path_setting = str(files_path).replace("\\", "/")

process = CrawlerProcess(
settings={
"USER_AGENT": "gather-vision (+https://github.com/anotherbyte-net/gather-vision)",
# http cache
"HTTPCACHE_ENABLED": True,
"HTTPCACHE_DIR": ".httpcache",
"HTTPCACHE_POLICY": "scrapy.extensions.httpcache.RFC2616Policy",
"HTTPCACHE_POLICY": "scrapy.extensions.httpcache.DummyPolicy",
"HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage",
"EXTENSIONS": {
"scrapy.extensions.telnet.TelnetConsole": None,
},
# feed
"FEED_EXPORTERS": {
"pickle_raw": "gather_vision.app.AppPickleItemExporter",
},
"FEEDS": {
f"file:///{feed_path_setting}": {"format": "pickle_raw"},
},
"WEB_DATA_ITEMS": web_data,
# logs
"LOG_ENABLED": True,
"LOG_FILE": None,
"LOG_STDOUT": False,
"LOG_LEVEL": "ERROR",
# throttling requests
"DOWNLOAD_DELAY": 3,
"AUTOTHROTTLE_ENABLED": True,
"AUTOTHROTTLE_START_DELAY": 3,
"AUTOTHROTTLE_MAX_DELAY": 60,
"AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
# pipelines
"ITEM_PIPELINES": {
"scrapy.pipelines.files.FilesPipeline": 1,
},
"FILES_STORE": files_path_setting,
"MEDIA_ALLOW_REDIRECTS": True,
# Set settings whose default value is deprecated to a future-proof value
"REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7",
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"FEED_EXPORT_ENCODING": "utf-8",
},
install_root_handler=True,
)
Expand Down Expand Up @@ -167,6 +191,10 @@ def update(self, args: plugin_entry.UpdateArgs) -> plugin_entry.UpdateResult:
logger.info("Loaded %s data items from web data sources.", len(feed_items))
logger.info("Finished update.")

# TODO: still need to do something with the feed_items?

# TODO: save results?

return plugin_entry.UpdateResult(web_data=web_data, local_data=local_data)

def list(self, args: plugin_entry.ListArgs) -> plugin_entry.ListResult:
Expand Down Expand Up @@ -209,11 +237,11 @@ def start_requests(self):
yield scrapy.Request(
url=initial_url,
callback=self.parse,
meta={"web_data_item": web_data_item},
cb_kwargs={"web_data_item": web_data_item},
)

def parse(self, response: Response, **kwargs):
web_data_item: plugin_data.WebData = response.meta["web_data_item"]
web_data_item: plugin_data.WebData = response.cb_kwargs.get("web_data_item")

is_json = "json" in response.headers["Content-Type"].decode("utf-8").lower()

Expand All @@ -233,12 +261,14 @@ def parse(self, response: Response, **kwargs):
selector=selector,
status=response.status,
headers=response.headers,
meta=response.meta,
meta=response.cb_kwargs,
)
for i in web_data_item.parse_response(data):
if isinstance(i, str):
yield scrapy.Request(
url=i, callback=self.parse, meta={"web_data_item": web_data_item}
url=i,
callback=self.parse,
cb_kwargs={"web_data_item": web_data_item},
)
else:
yield i
6 changes: 4 additions & 2 deletions src/gather_vision/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import logging
import sys
import typing
from logging.config import dictConfig

from gather_vision import app, utils
from gather_vision.plugin import entry as plugin_entry
Expand All @@ -25,7 +24,10 @@ def cli_update(args: argparse.Namespace) -> bool:
main_app = app.App()

logger.info("Updating '%s'.", args.name)
main_app.update(app_args)
result = main_app.update(app_args)

# TODO: save result

return True


Expand Down

0 comments on commit 2d7e303

Please sign in to comment.