From b58da9740736f62293a2a9d728be04d9776b0e15 Mon Sep 17 00:00:00 2001 From: Fabian Canobra <40277847+fcanobrash@users.noreply.github.com> Date: Mon, 21 Sep 2020 12:37:18 -0300 Subject: [PATCH] Refactoring and improvements (#81) --- .gitignore | 4 + README.md | 152 +++++++++--- scrapy_autounit/cassette.py | 98 ++++++++ scrapy_autounit/cli.py | 219 ++++++++++++------ scrapy_autounit/middleware.py | 132 ++--------- scrapy_autounit/parser.py | 120 ++++++++++ scrapy_autounit/player.py | 226 ++++++++++++++++++ scrapy_autounit/recorder.py | 140 +++++++++++ scrapy_autounit/utils.py | 422 +--------------------------------- setup.py | 8 +- tests/test_middleware.py | 8 +- tests/test_record.py | 52 ++--- tox.ini | 6 +- 13 files changed, 918 insertions(+), 669 deletions(-) create mode 100644 scrapy_autounit/cassette.py create mode 100644 scrapy_autounit/parser.py create mode 100644 scrapy_autounit/player.py create mode 100644 scrapy_autounit/recorder.py diff --git a/.gitignore b/.gitignore index 9bfefce..9db4b78 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ +build/ dist/ *.egg-info/ .python-version __pycache__ +.tox/ +.direnv/ +.envrc diff --git a/README.md b/README.md index 620685a..1f06431 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,25 @@ # Scrapy Autounit [![AppVeyor](https://ci.appveyor.com/api/projects/status/github/scrapinghub/scrapy-autounit?branch=master&svg=true)](https://ci.appveyor.com/project/scrapinghub/scrapy-autounit/branch/master) -[![PyPI Version](https://img.shields.io/pypi/v/scrapy-autounit.svg?color=blue)](https://pypi.python.org/pypi/scrapy-autounit/) +[![PyPI Version](https://img.shields.io/pypi/v/scrapy-autounit.svg?color=blue)](https://pypi.python.org/pypi/scrapy-autounit/) +  +## Documentation +- [Overview](#overview) +- [Installation](#installation) +- [Usage](#usage) +- [Caveats](#caveats) +- [Settings](#settings) +- [Command Line Interface](#command-line-interface) +- [Internals](#internals) +  ## Overview Scrapy-Autounit is an automatic test generation tool for your Scrapy spiders. It generates test fixtures and tests cases as you run your spiders. -The test fixtures are generated from the items and requests that your spider yields, then the test cases evaluate those fixtures against your spiders' callbacks. + +The fixtures are generated from the items and requests that your spider returns, then the test cases evaluate those fixtures against your spiders' callbacks. Scrapy Autounit generates fixtures and tests per spider and callback under the Scrapy project root directory. Here is an example of the directory tree of your project once the fixtures are created: @@ -36,12 +47,14 @@ my_project │   └── my_spider.py └── scrapy.cfg ``` +  ## Installation ``` pip install scrapy_autounit ``` +  ## Usage @@ -62,74 +75,92 @@ To generate your fixtures and tests just run your spiders as usual, Scrapy Autou $ scrapy crawl my_spider ``` When the spider finishes, a directory `autounit` is created in your project root dir, containing all the generated tests/fixtures for the spider you just ran (see the directory tree example above). -If you want to **update** your tests and fixtures you only need to run your spiders again. + +If you want to **update** your tests and fixtures you only need to run your spiders again or use the [`autounit update`](#autounit-update) command line tool. ### Running tests To run your tests you can use `unittest` regular commands. 
###### Test all ``` -$ python -m unittest +$ python -m unittest discover autounit/tests/ ``` ###### Test a specific spider ``` -$ python -m unittest discover -s autounit.tests.my_spider +$ python -m unittest discover autounit/tests/my_spider/ ``` ###### Test a specific callback ``` -$ python -m unittest discover -s autounit.tests.my_spider.my_callback -``` -###### Test a specific fixture -``` -$ python -m unittest autounit.tests.my_spider.my_callback.test_fixture2 +$ python -m unittest discover autounit/tests/my_spider/my_callback/ ``` +  ## Caveats - Keep in mind that as long as `AUTOUNIT_ENABLED` is on, each time you run a spider tests/fixtures are going to be generated for its callbacks. This means that if you have your tests/fixtures ready to go, this setting should be off to prevent undesired overwrites. Each time you want to regenerate your tests (e.g.: due to changes in your spiders), you can turn this on again and run your spiders as usual. +For example, this setting should be off when running your spiders in Scrapy Cloud. -- Autounit uses an internal `_autounit` key in requests' meta dictionaries. Avoid using/overriding this key in your spiders when adding data to meta to prevent unexpected behaviours. +- Autounit uses an internal `_autounit_cassette` key in requests' meta dictionaries. Avoid using/overriding this key in your spiders when adding data to meta to prevent unexpected behaviours. +  ## Settings -**AUTOUNIT_ENABLED** +###### General + +- **AUTOUNIT_ENABLED** Set this to `True` or `False` to enable or disable unit test generation. -**AUTOUNIT_MAX_FIXTURES_PER_CALLBACK** +- **AUTOUNIT_MAX_FIXTURES_PER_CALLBACK** Sets the maximum number of fixtures to store per callback. `Minimum: 10` `Default: 10` -**AUTOUNIT_SKIPPED_FIELDS** +- **AUTOUNIT_EXTRA_PATH** +This is an extra string element to add to the test path and name between the spider name and callback name. You can use this to separate tests from the same spider with different configurations. +`Default: None` + +###### Output + +- **AUTOUNIT_DONT_TEST_OUTPUT_FIELDS** Sets a list of fields to be skipped from testing your callbacks' items. It's useful to bypass fields that return a different value on each run. For example if you have a field that is always set to `datetime.now()` in your spider, you probably want to add that field to this list to be skipped on tests. Otherwise you'll get a different value when you're generating your fixtures than when you're running your tests, making your tests fail. `Default: []` -**AUTOUNIT_REQUEST_SKIPPED_FIELDS** -Sets a list of request fields to be skipped when running your tests. -Similar to AUTOUNIT_SKIPPED_FIELDS but applied to requests instead of items. +###### Requests + +- **AUTOUNIT_DONT_TEST_REQUEST_ATTRS** +Sets a list of request attributes to be skipped when running your tests. `Default: []` -**AUTOUNIT_EXCLUDED_HEADERS** +- **AUTOUNIT_DONT_RECORD_HEADERS** Sets a list of headers to exclude from requests recording. -For security reasons, Autounit already excludes `Authorization` and `Proxy-Authorization` headers by default, if you want to include them in your fixtures see *`AUTOUNIT_INCLUDED_AUTH_HEADERS`*. +For security reasons, Autounit already excludes `Authorization` and `Proxy-Authorization` headers by default, if you want to record them in your fixtures see *`AUTOUNIT_RECORD_AUTH_HEADERS`*. 
`Default: []` -**AUTOUNIT_INCLUDED_AUTH_HEADERS** +- **AUTOUNIT_RECORD_AUTH_HEADERS** If you want to include `Authorization` or `Proxy-Authorization` headers in your fixtures, add one or both of them to this list. `Default: []` -**AUTOUNIT_INCLUDED_SETTINGS** -Sets a list of settings names to be recorded in the generated test case. +###### Spider attributes + +- **AUTOUNIT_DONT_RECORD_SPIDER_ATTRS** +Sets a list of spider attributes that won't be recorded into your fixtures. `Default: []` -**AUTOUNIT_EXTRA_PATH** -This is an extra string element to add to the test path and name between the spider name and callback name. You can use this to separate tests from the same spider with different configurations. -`Default: None` +- **AUTOUNIT_DONT_TEST_SPIDER_ATTRS** +Sets a list of spider attributes to be skipped from testing your callbacks. These attributes will still be recorded. +`Default: []` + +###### Settings + +- **AUTOUNIT_RECORD_SETTINGS** +Sets a list of settings names to be recorded in the generated test case. +`Default: []` --- -**Note**: Remember that you can always apply any of these settings per spider including them in your spider's `custom_settings` class attribute - see https://docs.scrapy.org/en/latest/topics/settings.html#settings-per-spider. +**Note**: Remember that you can always apply any of these settings per spider including them in your spider's `custom_settings` class attribute - see https://docs.scrapy.org/en/latest/topics/settings.html#settings-per-spider. +  ## Command line interface @@ -162,8 +193,9 @@ The original request that triggered the callback. ***`response`*** The response obtained from the original request and passed to the callback. -***`result`*** +***`output_data`*** The callback's output such as items and requests. +_Same as ***`result`*** prior to v0.0.28._ ***`middlewares`*** The relevant middlewares to replicate when running the tests. @@ -171,11 +203,16 @@ The relevant middlewares to replicate when running the tests. ***`settings`*** The settings explicitly recorded by the *`AUTOUNIT_INCLUDED_SETTINGS`* setting. -***`spider_args`*** -The arguments passed to the spider in the crawl. +***`init_attrs`*** +The spider's attributes right after its _\_\_init\_\__ call. + +***`input_attrs`*** +The spider's attributes right before running the callback. +_Same as ***`spider_args`*** or ***`spider_args_in`*** prior to v0.0.28._ -***`python_version`*** -Indicates if the fixture was recorded in python 2 or 3. +***`output_attrs`*** +The spider's attributes right after running the callback. +_Same as ***`spider_args_out`*** prior to v0.0.28._ Then for example, to inspect a fixture's specific request we can do the following: ``` @@ -184,12 +221,53 @@ $ autounit inspect my_spider my_callback 4 | jq '.request' ### `autounit update` -You can update your fixtures to match your latest changes in a particular callback to avoid running the whole spider. -For example, this updates all the fixtures for a specific callback: +This command updates your fixtures to match your latest changes, avoiding to run the whole spider again. +You can update the whole project, an entire spider, just a callback or a single fixture. + +###### Update the whole project +``` +$ autounit update +WARNING: this will update all the existing fixtures from the current project +Do you want to continue? 
(y/n) +``` + +###### Update every callback in a spider +``` +$ autounit update -s my_spider +``` + +###### Update every fixture in a callback +``` +$ autounit update -s my_spider -c my_callback +``` + +###### Update a single fixture ``` -$ autounit update my_spider my_callback +# Update fixture number 5 +$ autounit update -s my_spider -c my_callback -f 5 ``` -Optionally you can specify a particular fixture to update with `-f` or `--fixture`: +  + +## Internals + +The `AutounitMiddleware` uses a [`Recorder`](scrapy_autounit/recorder.py) to record [`Cassettes`](scrapy_autounit/cassette.py) in binary fixtures. + +Then, the tests use a [`Player`](scrapy_autounit/player.py) to playback those `Cassettes` and compare its output against your current callbacks. + +The fixtures contain a pickled and compressed `Cassette` instance that you can get programmatically by doing: +```python +from scrapy_autounit.cassette import Cassette + +cassette = Cassette.from_fixture(path_to_your_fixture) +# cassette.request +# cassette.response +# cassette.output_data +# ... +``` + +If you know what you're doing, you can modify that cassette and re-record it by using: +```python +from scrapy_autounit.recorder import Recorder + +Recorder.update_fixture(cassette, path) ``` -$ autounit update my_spider my_callback --fixture 4 -``` \ No newline at end of file diff --git a/scrapy_autounit/cassette.py b/scrapy_autounit/cassette.py new file mode 100644 index 0000000..6265dc1 --- /dev/null +++ b/scrapy_autounit/cassette.py @@ -0,0 +1,98 @@ +import pickle +import sys +import zlib + +from scrapy.crawler import Crawler +from scrapy.utils.conf import build_component_list +from scrapy.utils.project import get_project_settings + +from .utils import get_spider_class + + +class Cassette: + """ + Helper class to store request, response and output data. 
+ """ + FIXTURE_VERSION = 2 + + def __init__( + self, + spider=None, + spider_name=None, + request=None, + response=None, + init_attrs=None, + input_attrs=None, + output_attrs=None, + output_data=None, + middlewares=None, + included_settings=None, + python_version=None, + filename=None, + ): + self.spider_name = spider_name + self.middlewares = middlewares + self.included_settings = included_settings + if spider: + self.spider_name = spider.name + self.middlewares = self._get_middlewares(spider.settings) + self.included_settings = self._get_included_settings(spider.settings) + + self.request = request + self.response = response + self.init_attrs = init_attrs + self.input_attrs = input_attrs + self.output_attrs = output_attrs + self.output_data = output_data + self.filename = filename + self.python_version = python_version or sys.version_info.major + + @classmethod + def from_fixture(cls, fixture): + with open(fixture, 'rb') as f: + binary = f.read() + cassette = pickle.loads(zlib.decompress(binary)) + return cassette + + def _get_middlewares(self, settings): + full_list = build_component_list(settings.getwithbase('SPIDER_MIDDLEWARES')) + autounit_mw_path = list(filter(lambda x: x.endswith('AutounitMiddleware'), full_list))[0] + start = full_list.index(autounit_mw_path) + mw_paths = [mw for mw in full_list[start:] if mw != autounit_mw_path] + return mw_paths + + def _get_included_settings(self, settings): + # Use the new setting, if empty, try the deprecated one + names = settings.getlist('AUTOUNIT_RECORD_SETTINGS', []) + if not names: + names = settings.getlist('AUTOUNIT_INCLUDED_SETTINGS', []) + included = {name: settings.get(name) for name in names} + return included + + def get_spider(self): + settings = get_project_settings() + spider_cls = get_spider_class(self.spider_name, settings) + + spider_cls.update_settings(settings) + for k, v in self.included_settings.items(): + settings.set(k, v, priority=50) + + crawler = Crawler(spider_cls, settings) + spider = spider_cls.from_crawler(crawler, **self.init_attrs) + return spider + + def pack(self): + return zlib.compress(pickle.dumps(self, protocol=2)) + + def to_dict(self): + return { + 'spider_name': self.spider_name, + 'request': self.request, + 'response': self.response, + 'output_data': self.output_data, + 'middlewares': self.middlewares, + 'settings': self.included_settings, + 'init_attrs': self.init_attrs, + 'input_attrs': self.input_attrs, + 'output_attrs': self.output_attrs, + } diff --git a/scrapy_autounit/cli.py b/scrapy_autounit/cli.py index 407bda4..4c627ee 100644 --- a/scrapy_autounit/cli.py +++ b/scrapy_autounit/cli.py @@ -1,26 +1,21 @@ -import re +import argparse +import json import os +import pickle +import re import sys -import json -import scrapy -import argparse -from glob import glob from datetime import datetime +from glob import glob -from scrapy.utils.python import to_unicode -from scrapy.utils.reqser import request_from_dict -from scrapy.utils.project import inside_project, get_project_settings +import scrapy from scrapy.commands.genspider import sanitize_module_name +from scrapy.utils.project import inside_project, get_project_settings +from scrapy.utils.python import to_unicode -from scrapy_autounit.utils import ( - add_sample, - auto_import, - unpickle_data, - decompress_data, - get_project_dir, - parse_callback_result, - prepare_callback_replay, -) +from .cassette import Cassette +from .player import Player +from .recorder import Recorder, TEST_TEMPLATE +from .utils import get_base_path, get_project_dir class 
CommandLine: @@ -29,11 +24,11 @@ def __init__(self, parser): self.args = parser.parse_args() if not inside_project(): - self.error("No active Scrapy project") + self._error("No active Scrapy project") self.command = self.args.command - self.spider = sanitize_module_name(self.args.spider) + self.spider = self.args.spider self.callback = self.args.callback self.fixture = self.args.fixture @@ -42,36 +37,89 @@ def __init__(self, parser): self.settings = get_project_settings() - base_path = self.settings.get( - 'AUTOUNIT_BASE_PATH', - default=os.path.join(self.project_dir, 'autounit')) + base_path = get_base_path(self.settings) self.tests_dir = os.path.join(base_path, 'tests') - self.spider_dir = os.path.join(self.tests_dir, self.spider) + if self.spider: + self.spider = sanitize_module_name(self.spider) + self.callbacks_dir = self._get_callbacks_dir(self.spider) + if not os.path.isdir(self.callbacks_dir): + self._error("No recorded data found for spider '{}'".format(self.spider)) + + if self.callback: + self.callback_dir = os.path.join(self.callbacks_dir, self.callback) + if not os.path.isdir(self.callback_dir): + self._error( + "No recorded data found for callback " + "'{}' from '{}' spider".format(self.callback, self.spider)) + + if self.fixture: + self.fixture_path = os.path.join(self.callback_dir, self.parse_fixture_arg()) + if not os.path.isfile(self.fixture_path): + self._error("Fixture '{}' not found".format(self.fixture_path)) + + def _error(self, msg): + print(msg) + sys.exit(1) - if not os.path.isdir(self.spider_dir): - self.error( - "No recorded data found " - "for spider '{}'".format(self.spider)) + def _walk(self, root): + for _, subdirs, _ in os.walk(root): + for subdir in subdirs: + if subdir == '__pycache__': + continue + yield subdir + def _get_callbacks_dir(self, spider): extra_path = self.settings.get('AUTOUNIT_EXTRA_PATH') or '' - self.callback_dir = os.path.join( - self.spider_dir, extra_path, self.callback) - - if not os.path.isdir(self.callback_dir): - self.error( - "No recorded data found for callback " - "'{}' from '{}' spider".format(self.callback, self.spider)) - - if self.fixture: - self.fixture_path = os.path.join( - self.callback_dir, self.parse_fixture_arg()) - if not os.path.isfile(self.fixture_path): - self.error("Fixture '{}' not found".format(self.fixture_path)) - - def error(self, msg): - print(msg) - sys.exit(1) + return os.path.join(self.tests_dir, spider, extra_path) + + def _get_spider_fixtures(self, callbacks_dir): + fixtures = [] + for callback in self._walk(callbacks_dir): + target = os.path.join(callbacks_dir, callback, '*.bin') + fixtures.extend(glob(target)) + return fixtures + + def _from_legacy_fixture(self, recorded): + encoding = recorded['encoding'] + old = pickle.loads(recorded['data'], encoding=encoding) + return Cassette( + spider_name=old['spider_name'], + request=old['request'], + response=old['response'], + init_attrs={}, + input_attrs=old.get('spider_args_in') or old.get('spider_args') or {}, + output_attrs=old.get('spider_args_out', {}), + output_data=old['result'], + middlewares=old['middlewares'], + included_settings=old['settings'], + python_version=old.get('python_version', sys.version_info.major), + ) + + def _update_legacy_test(self, path, cassette): + path_dir = os.path.dirname(path) + older_version_test = os.path.join(path_dir, 'test_fixture1.py') + if os.path.isfile(older_version_test): + to_remove = os.path.join(path_dir, 'test_fixture*.py') + for test in glob(to_remove): + if test == older_version_test: + os.rename(test, 
path) + continue + os.remove(test) + test_name = ( + sanitize_module_name(cassette.spider_name) + '__' + + cassette.request['callback'] + ) + with open(path, 'r+') as f: + old = f.read() + command = 'Scrapy Autounit' + command_re = re.search('# Generated by: (.*) # noqa', old) + if command_re: + command = command_re.group(1) + test_code = TEST_TEMPLATE.format(test_name=test_name, command=command) + f.seek(0) + f.write(test_code) + f.truncate() def parse_fixture_arg(self): try: @@ -95,51 +143,66 @@ def parse_data(self, data): return to_unicode(data) elif isinstance(data, datetime): return data.isoformat() - elif isinstance(data, (int, float)): + elif isinstance(data, (int, float, str)): return data - return str(data) - - def get_fixture_data(self): - with open(self.fixture_path, 'rb') as f: - raw_data = f.read() - fixture_info = unpickle_data(decompress_data(raw_data), 'utf-8') - if 'fixture_version' in fixture_info: - encoding = fixture_info['encoding'] - data = unpickle_data(fixture_info['data'], encoding) - else: - data = fixture_info # legacy tests (not all will work, just utf-8) - return data + return repr(data) def inspect(self): - data = self.parse_data(self.get_fixture_data()) + cassette = Cassette.from_fixture(self.fixture_path) + data = self.parse_data(cassette.to_dict()) print(json.dumps(data)) def update(self): + if self.callback and not self.spider: + print("Must specify a spider") + return + + if self.fixture and (not self.spider or not self.callback): + print("Must specify a spider and a callback") + return + + if not self.spider: + print("WARNING: this will update all the existing fixtures from the current project") + confirmation = input("Do you want to continue? (y/n) ") + if confirmation.lower() != 'y': + print("Update cancelled") + return + to_update = [] if self.fixture: to_update.append(self.fixture_path) - else: + elif self.callback: target = os.path.join(self.callback_dir, "*.bin") to_update = glob(target) + elif self.spider: + to_update = self._get_spider_fixtures(self.callbacks_dir) + else: + for spider in self._walk(self.tests_dir): + callbacks_dir = self._get_callbacks_dir(spider) + to_update.extend(self._get_spider_fixtures(callbacks_dir)) for path in to_update: - data, _, spider, _ = prepare_callback_replay(path) + player = Player.from_fixture(path) + + # Convert legacy fixtures to new cassette-based fixtures + if isinstance(player.cassette, dict): + print("Converting legacy fixture: {}".format(path)) + new_cassette = self._from_legacy_fixture(player.cassette) + player.cassette = new_cassette + test_path = os.path.join(os.path.dirname(path), 'test_fixtures.py') + self._update_legacy_test(test_path, new_cassette) - request = request_from_dict(data['request'], spider) + output, attrs = player.playback(compare=False) - response_cls = auto_import( - data['response'].pop('cls', 'scrapy.http.HtmlResponse') - ) - response = response_cls( - request=data["request"], **data['response']) + _, parsed = player.parse_callback_output(output) - data["result"], _ = parse_callback_result( - request.callback(response), spider - ) + cassette = player.cassette + cassette.output_data = parsed + cassette.init_attrs = attrs['init'] + cassette.input_attrs = attrs['input'] + cassette.output_attrs = attrs['output'] - fixture_dir, filename = os.path.split(path) - fixture_index = re.search(r"\d+", filename).group() - add_sample(fixture_index, fixture_dir, filename, data) + Recorder.update_fixture(cassette, path) print("Fixture '{}' successfully updated.".format( os.path.relpath(path))) @@ 
-169,14 +232,18 @@ def main(): update_cmd = subparsers.add_parser( 'update', - description="Updates fixtures to callback changes", + description="Updates fixtures and tests according to library and spider changes.", formatter_class=argparse.RawTextHelpFormatter) - update_cmd.add_argument('spider', help="The spider to update.") - update_cmd.add_argument('callback', help="The callback to update.") + update_cmd.add_argument('-s', '--spider', help=( + "The spider to update.\n" + "If not specified, all the spiders from the current project will be updated.")) + update_cmd.add_argument('-c', '--callback', help=( + "The callback to update.\n" + "If not specified, all the callbacks from the specified spider will be updated.")) update_cmd.add_argument('-f', '--fixture', help=( "The fixture to update.\n" "Can be the fixture number or the fixture name.\n" - "If not specified, all fixtures will be updated.")) + "If not specified, all the fixtures from the specified callback will be updated.")) cli = CommandLine(parser) cli.parse_command() diff --git a/scrapy_autounit/middleware.py b/scrapy_autounit/middleware.py index ef1f636..5c80c5c 100644 --- a/scrapy_autounit/middleware.py +++ b/scrapy_autounit/middleware.py @@ -1,136 +1,50 @@ -import os -import six -import pickle -import random import logging +import pickle +from scrapy import signals from scrapy.exceptions import NotConfigured -from scrapy.commands.genspider import sanitize_module_name -from .utils import ( - add_sample, - write_test, - response_to_dict, - get_or_create_test_dir, - parse_request, - get_project_dir, - get_middlewares, - create_dir, - parse_callback_result, - clear_fixtures, - get_filter_attrs, -) +from .recorder import Recorder -logger = logging.getLogger(__name__) - -def _copy_settings(settings): - out = {} - for name in settings.getlist('AUTOUNIT_INCLUDED_SETTINGS', []): - out[name] = settings.get(name) - return out +logger = logging.getLogger(__name__) class AutounitMiddleware: def __init__(self, crawler): + self.crawler = crawler settings = crawler.settings - spider = crawler.spider - if not any( - self.__class__.__name__ in s - for s in settings.getwithbase('SPIDER_MIDDLEWARES').keys() - ): - raise ValueError( - '%s must be in SPIDER_MIDDLEWARES' % ( - self.__class__.__name__,)) + spider_mw = settings.getwithbase('SPIDER_MIDDLEWARES').keys() + if not any(self.__class__.__name__ in mw for mw in spider_mw): + raise ValueError('{} must be in SPIDER_MIDDLEWARES'.format(self.__class__.__name__)) + if not settings.getbool('AUTOUNIT_ENABLED'): raise NotConfigured('scrapy-autounit is not enabled') + if settings.getint('CONCURRENT_REQUESTS') > 1: logger.warn( 'Recording with concurrency > 1! ' - 'Data races in shared object modification may create broken ' - 'tests.' + 'Data races in shared object modification may create broken tests.' 
) - self.max_fixtures = settings.getint( - 'AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', - default=10 - ) - self.max_fixtures = \ - self.max_fixtures if self.max_fixtures >= 10 else 10 - - self.base_path = settings.get( - 'AUTOUNIT_BASE_PATH', - default=os.path.join(get_project_dir(), 'autounit') - ) - create_dir(self.base_path, exist_ok=True) - clear_fixtures(self.base_path, sanitize_module_name(spider.name)) - - self.fixture_counters = {} - @classmethod def from_crawler(cls, crawler): - return cls(crawler) + mw = cls(crawler) + crawler.signals.connect(mw.engine_started, signal=signals.engine_started) + return mw + + def engine_started(self): + self.recorder = Recorder(self.crawler.spider) + for warning in self.recorder.deprecated_settings(): + logger.warn(warning) def process_spider_input(self, response, spider): - response.meta['_autounit'] = pickle.dumps({ - 'request': parse_request(response.request, spider), - 'response': response_to_dict(response), - 'spider_args': { - k: v for k, v in spider.__dict__.items() - if k not in get_filter_attrs(spider) - }, - 'middlewares': get_middlewares(spider), - }) + cassette = self.recorder.new_cassette(response) + response.meta['_autounit_cassette'] = pickle.dumps(cassette, protocol=2) return None def process_spider_output(self, response, result, spider): - settings = spider.settings - - processed_result, out = parse_callback_result(result, spider) - - input_data = pickle.loads(response.meta.pop('_autounit')) - - request = input_data['request'] - callback_name = request['callback'] - spider_attr_out = { - k: v for k, v in spider.__dict__.items() - if k not in get_filter_attrs(spider) - } - - data = { - 'spider_name': spider.name, - 'request': request, - 'response': input_data['response'], - 'spider_args_out': spider_attr_out, - 'result': processed_result, - 'spider_args_in': input_data['spider_args'], - 'settings': _copy_settings(settings), - 'middlewares': input_data['middlewares'], - 'python_version': 2 if six.PY2 else 3, - } - - callback_counter = self.fixture_counters.setdefault(callback_name, 0) - self.fixture_counters[callback_name] += 1 - - test_dir, test_name = get_or_create_test_dir( - self.base_path, - sanitize_module_name(spider.name), - callback_name, - settings.get('AUTOUNIT_EXTRA_PATH'), - ) - - index = 0 - if callback_counter < self.max_fixtures: - index = callback_counter + 1 - add_sample(index, test_dir, test_name, data) - else: - r = random.randint(0, callback_counter) - if r < self.max_fixtures: - index = r + 1 - add_sample(index, test_dir, test_name, data) - - if index == 1: - write_test(test_dir, test_name, request['url']) - + cassette = pickle.loads(response.meta.pop('_autounit_cassette')) + out = self.recorder.record(cassette, result) return out diff --git a/scrapy_autounit/parser.py b/scrapy_autounit/parser.py new file mode 100644 index 0000000..4c013a1 --- /dev/null +++ b/scrapy_autounit/parser.py @@ -0,0 +1,120 @@ +import copy + +from scrapy.http import Request, Response +from scrapy.spiders import CrawlSpider +from scrapy.utils.reqser import request_to_dict + + +class Parser: + def _clean_headers(self, headers): + # Use the new setting, if empty, try the deprecated one + excluded = self.spider.settings.get('AUTOUNIT_DONT_RECORD_HEADERS', []) + if not excluded: + excluded = self.spider.settings.get('AUTOUNIT_EXCLUDED_HEADERS', []) + auth_headers = ['Authorization', 'Proxy-Authorization'] + # Use the new setting, if empty, try the deprecated one + included = self.spider.settings.get('AUTOUNIT_RECORD_AUTH_HEADERS', []) + if not 
included: + included = self.spider.settings.get('AUTOUNIT_INCLUDED_AUTH_HEADERS', []) + excluded.extend([h for h in auth_headers if h not in included]) + for header in excluded: + headers.pop(header, None) + headers.pop(header.encode(), None) + + def _request_to_dict(self, request): + _request = request_to_dict(request, spider=self.spider) + if not _request['callback']: + _request['callback'] = 'parse' + elif isinstance(self.spider, CrawlSpider): + rule = request.meta.get('rule') + if rule is not None: + _request['callback'] = self.spider.rules[rule].callback + self._clean_headers(_request['headers']) + _meta = {} + for key, value in _request.get('meta').items(): + if key != '_autounit_cassette': + _meta[key] = self.parse_object(value) + _request['meta'] = _meta + return _request + + def _response_to_dict(self, response): + return { + 'cls': '{}.{}'.format( + type(response).__module__, + getattr(type(response), '__qualname__', None) or + getattr(type(response), '__name__', None) + ), + 'url': response.url, + 'status': response.status, + 'body': response.body, + 'headers': dict(response.headers), + 'flags': response.flags, + 'encoding': response.encoding, + } + + def spider_attrs(self): + to_filter = {'crawler', 'settings', 'start_urls'} + + if isinstance(self.spider, CrawlSpider): + to_filter |= {'rules', '_rules'} + + dont_record_attrs = set( + self.spider.settings.get('AUTOUNIT_DONT_RECORD_SPIDER_ATTRS', [])) + to_filter |= dont_record_attrs + + return { + k: v for k, v in self.spider.__dict__.items() + if k not in to_filter + } + + def parse_response(self, response_obj): + request = self._request_to_dict(response_obj.request) + response = self._response_to_dict(response_obj) + return request, response + + def parse_object(self, _object): + if isinstance(_object, Request): + return self._request_to_dict(_object) + elif isinstance(_object, Response): + return self.parse_object(self._response_to_dict(_object)) + elif isinstance(_object, dict): + for k, v in _object.items(): + _object[k] = self.parse_object(v) + elif isinstance(_object, list): + for i, v in enumerate(_object): + _object[i] = self.parse_object(v) + elif isinstance(_object, tuple): + _object = tuple([self.parse_object(o) for o in _object]) + return _object + + def parse_callback_output(self, output): + parsed = [] + original = [] + for elem in output: + original.append(elem) + is_request = isinstance(elem, Request) + if is_request: + data = self._request_to_dict(elem) + else: + data = self.parse_object(copy.deepcopy(elem)) + parsed.append({ + 'type': 'request' if is_request else 'item', + 'data': data + }) + return iter(original), parsed + + def deprecated_settings(self): + mapping = { + 'AUTOUNIT_SKIPPED_FIELDS': 'AUTOUNIT_DONT_TEST_OUTPUT_FIELDS', + 'AUTOUNIT_REQUEST_SKIPPED_FIELDS': 'AUTOUNIT_DONT_TEST_REQUEST_ATTRS', + 'AUTOUNIT_EXCLUDED_HEADERS': 'AUTOUNIT_DONT_RECORD_HEADERS', + 'AUTOUNIT_INCLUDED_AUTH_HEADERS': 'AUTOUNIT_RECORD_AUTH_HEADERS', + 'AUTOUNIT_INCLUDED_SETTINGS': 'AUTOUNIT_RECORD_SETTINGS', + } + message = "DEPRECATED: '{}' is going to be removed soon. Please use '{}' instead." 
+ warnings = [] + for old, new in mapping.items(): + if not self.spider.settings.get(old): + continue + warnings.append(message.format(old, new)) + return warnings diff --git a/scrapy_autounit/player.py b/scrapy_autounit/player.py new file mode 100644 index 0000000..7ff8bc4 --- /dev/null +++ b/scrapy_autounit/player.py @@ -0,0 +1,226 @@ +from importlib import import_module +import sys + +from scrapy import signals +from scrapy.exceptions import NotConfigured +from scrapy.utils.misc import load_object, arg_to_iter +from scrapy.utils.reqser import request_from_dict +from testfixtures import compare + +from .cassette import Cassette +from .parser import Parser + + +class Player(Parser): + def __init__(self, cassette): + self.cassette = cassette + + @classmethod + def from_fixture(cls, path): + cassette = Cassette.from_fixture(path) + player = Player(cassette) + return player + + def _len(self, iterator): + return len(list(iterator)) + 1 + + def _auto_import(self, qualified_name): + mod_name, class_name = qualified_name.rsplit('.', 1) + return getattr(import_module(mod_name), class_name) + + def _create_instance(self, objcls, settings, crawler, *args, **kwargs): + if settings is None: + if crawler is None: + raise ValueError("Specifiy at least one of settings and crawler.") + settings = crawler.settings + if crawler and hasattr(objcls, 'from_crawler'): + return objcls.from_crawler(crawler, *args, **kwargs) + elif hasattr(objcls, 'from_settings'): + return objcls.from_settings(settings, *args, **kwargs) + else: + return objcls(*args, **kwargs) + + def _clean(self, x, y, fields): + for obj in (x, y): + for field in fields: + obj.pop(field, None) + + def _check_python_version(self): + current = sys.version_info.major + recorded = self.cassette.python_version + assert current == recorded, ( + 'Trying to test python {} fixture while running python {}'.format(recorded, current) + ) + + def _init_spider(self): + spider = self.cassette.get_spider() + spider.start_requests() + spider.crawler.signals.send_catch_log(signal=signals.spider_opened, spider=spider) + self.spider = spider + self.crawler = spider.crawler + + def _http_objects(self): + request = request_from_dict(self.cassette.request, self.spider) + response_cls = self._auto_import( + self.cassette.response.pop('cls', 'scrapy.http.HtmlResponse') + ) + response = response_cls(request=request, **self.cassette.response) + return request, response + + def _get_middlewares(self): + middlewares = [] + for mw_path in self.cassette.middlewares: + try: + mw_cls = load_object(mw_path) + mw = self._create_instance(mw_cls, self.spider.settings, self.crawler) + middlewares.append(mw) + except NotConfigured: + continue + return middlewares + + def _compare(self, expected, found, message): + x_label = "expected" + y_label = "found" + compare( + expected=expected, + actual=found, + x_label=x_label, + y_label=y_label, + prefix="{} ({})".format(message, self.cassette.filename), + ) + + def _compare_items(self, index, found, expected): + # Get recorded data and parse callback's output + expected_type = expected['type'] + expected_data = expected['data'] + found_data = self.parse_object(found) + + # Clean both objects using the skipped fields from settings + setting_names = ( + 'AUTOUNIT_DONT_TEST_OUTPUT_FIELDS', + 'AUTOUNIT_SKIPPED_FIELDS' + ) + if expected_type == 'request': + setting_names = ( + 'AUTOUNIT_DONT_TEST_REQUEST_ATTRS', + 'AUTOUNIT_REQUEST_SKIPPED_FIELDS' + ) + # Use the new setting, if empty, try the deprecated one + to_clean = 
self.spider.settings.get(setting_names[0], []) + if not to_clean: + to_clean = self.spider.settings.get(setting_names[1], []) + self._clean(expected_data, found_data, to_clean) + + self._compare( + expected=expected_data, + found=found_data, + message="Callback output #{} doesn't match recorded output".format(index), + ) + + def _compare_outputs(self, found, expected): + out = [] + sentinel = object() + + # Iterate the callback output comparing it with the recorded output + for index, found_item in enumerate(found, start=1): + out.append(found_item) + expected_item = next(expected, sentinel) + if expected_item == sentinel: + raise AssertionError( + "Callback returned {} more item/s than expected ({})".format( + self._len(found), self.cassette.filename)) + self._compare_items(index, found_item, expected_item) + + # Check if we expected more data than the found + expected_more = next(expected, sentinel) + if expected_more != sentinel: + raise AssertionError( + "Expected {} more item/s from callback ({})".format( + self._len(expected), self.cassette.filename)) + + return out + + def _filter_attrs(self, attrs): + dont_test_attrs = self.spider.settings.get( + 'AUTOUNIT_DONT_TEST_SPIDER_ATTRS', []) + for attr in dont_test_attrs: + attrs.pop(attr) + + def _compare_attrs(self, attrs): + # Filter and compare attributes set by spider's init + self._filter_attrs(self.cassette.init_attrs) + self._filter_attrs(attrs['init']) + self._compare( + expected=self.cassette.init_attrs, + found=attrs['init'], + message="Init attributes not equal" + ) + + # Filter and compare spider attributes before the callback + self._filter_attrs(self.cassette.input_attrs) + self._filter_attrs(attrs['input']) + self._compare( + expected=self.cassette.input_attrs, + found=attrs['input'], + message="Input arguments not equal" + ) + + # Filter and compare spider attributes after callback + self._filter_attrs(self.cassette.output_attrs) + self._filter_attrs(attrs['output']) + self._compare( + expected=self.cassette.output_attrs, + found=attrs['output'], + message="Output arguments not equal" + ) + + def playback(self, compare=True): + self._check_python_version() + self._init_spider() + + for warning in self.deprecated_settings(): + print(warning) + + attrs = {} + attrs['init'] = self.spider_attrs() + + # Set spider attributes as they were before the callback + for k, v in self.cassette.input_attrs.items(): + setattr(self.spider, k, v) + + attrs['input'] = self.spider_attrs() + + # Create Request and Response objects + request, response = self._http_objects() + + # Create middlewares instances + middlewares = self._get_middlewares() + + # Run middlewares process_spider_input methods + for mw in middlewares: + if hasattr(mw, 'process_spider_input'): + mw.process_spider_input(response, self.spider) + + # Run the callback + cb_kwargs = getattr(request, "cb_kwargs", {}) + cb_output = arg_to_iter(request.callback(response, **cb_kwargs)) + + # Run middlewares process_spider_output methods + middlewares.reverse() + for mw in middlewares: + if hasattr(mw, 'process_spider_output'): + cb_output = mw.process_spider_output(response, cb_output, self.spider) + + found = iter(cb_output) + expected = iter(self.cassette.output_data) + + if compare: + out = self._compare_outputs(found, expected) + attrs['output'] = self.spider_attrs() + self._compare_attrs(attrs) + else: + # Exhaust the callback output so we can get output attributes + out = [x for x in found] + attrs['output'] = self.spider_attrs() + + return iter(out), attrs diff --git 
a/scrapy_autounit/recorder.py b/scrapy_autounit/recorder.py new file mode 100644 index 0000000..f09003e --- /dev/null +++ b/scrapy_autounit/recorder.py @@ -0,0 +1,140 @@ +import os +import random +import shutil +import sys + +from scrapy.commands.genspider import sanitize_module_name + +from .cassette import Cassette +from .parser import Parser +from .utils import get_base_path + + +TEST_TEMPLATE = """# THIS IS A GENERATED FILE +# Generated by: {command} # noqa: E501 +import os +import unittest +from glob import glob + +from scrapy_autounit.player import Player + + +class AutoUnit(unittest.TestCase): + def test__{test_name}(self): + _dir = os.path.dirname(os.path.abspath(__file__)) + fixtures = glob(os.path.join(_dir, "*.bin")) + for fixture in fixtures: + player = Player.from_fixture(fixture) + player.playback() + + +if __name__ == '__main__': + unittest.main() +""" + + +class Recorder(Parser): + def __init__(self, spider): + self.spider = spider + self.settings = spider.settings + self.spider_name = sanitize_module_name(spider.name) + self.spider_init_attrs = self.spider_attrs() + + self.fixture_counters = {} + self._set_max_fixtures() + + self.base_path = get_base_path(self.settings) + self._create_dir(self.base_path, exist_ok=True) + self._clear_fixtures() + + @classmethod + def update_fixture(cls, cassette, path): + with open(path, 'wb') as outfile: + outfile.write(cassette.pack()) + + def _set_max_fixtures(self): + self.max_fixtures = self.settings.getint('AUTOUNIT_MAX_FIXTURES_PER_CALLBACK', default=10) + if self.max_fixtures < 10: + self.max_fixtures = 10 + + def _get_test_dir(self, callback_name): + components = [self.base_path, 'tests', self.spider_name] + extra = self.settings.get('AUTOUNIT_EXTRA_PATH') + if extra: + components.append(extra) + components.append(callback_name) + test_dir = None + for comp in components: + test_dir = os.path.join(test_dir, comp) if test_dir else comp + self._create_dir(test_dir, parents=True, exist_ok=True) + init_file = os.path.join(test_dir, '__init__.py') + with open(init_file, 'a'): + os.utime(init_file, None) + return test_dir + + def _create_dir(self, path, parents=False, exist_ok=False): + try: + if parents: + os.makedirs(path) + else: + os.mkdir(path) + except OSError: + if not exist_ok: + raise + + def _clear_fixtures(self): + path = os.path.join(self.base_path, 'tests', self.spider_name) + shutil.rmtree(path, ignore_errors=True) + + def _add_sample(self, index, test_dir, cassette): + filename = 'fixture%s.bin' % str(index) + path = os.path.join(test_dir, filename) + cassette.filename = filename + with open(path, 'wb') as outfile: + outfile.write(cassette.pack()) + + def _write_test(self, path, callback_name): + command = 'scrapy {}'.format(' '.join(sys.argv)) + test_path = os.path.join(path, 'test_fixtures.py') + test_name = self.spider_name + '__' + callback_name + test_code = TEST_TEMPLATE.format(test_name=test_name, command=command) + with open(str(test_path), 'w') as f: + f.write(test_code) + + def new_cassette(self, response_obj): + request, response = self.parse_response(response_obj) + return Cassette( + spider=self.spider, + request=request, + response=response, + init_attrs=self.spider_init_attrs, + input_attrs=self.spider_attrs(), + ) + + def record(self, cassette, output): + original, parsed = self.parse_callback_output(output) + + cassette.output_data = parsed + cassette.output_attrs = self.spider_attrs() + + callback_name = cassette.request['callback'] + callback_counter = self.fixture_counters.setdefault(callback_name, 0) + 
self.fixture_counters[callback_name] += 1 + + test_dir = self._get_test_dir(callback_name) + + index = 0 + if callback_counter < self.max_fixtures: + index = callback_counter + 1 + else: + r = random.randint(0, callback_counter) + if r < self.max_fixtures: + index = r + 1 + + if index != 0: + self._add_sample(index, test_dir, cassette) + + if index == 1: + self._write_test(test_dir, callback_name) + + return original diff --git a/scrapy_autounit/utils.py b/scrapy_autounit/utils.py index 82487b3..97be9c3 100644 --- a/scrapy_autounit/utils.py +++ b/scrapy_autounit/utils.py @@ -1,49 +1,17 @@ import os -import sys -import copy -import zlib -import pickle -import shutil from importlib import import_module from itertools import islice -import six -from scrapy import signals -from scrapy.crawler import Crawler -from scrapy.spiders import CrawlSpider -from scrapy.exceptions import NotConfigured -from scrapy.http import Request, Response -from scrapy.item import Item -from scrapy.utils.conf import (build_component_list, closest_scrapy_cfg, - init_env) -from scrapy.utils.misc import arg_to_iter, load_object, walk_modules -from scrapy.utils.project import get_project_settings -from scrapy.utils.python import to_bytes -from scrapy.utils.reqser import request_from_dict, request_to_dict +from scrapy.utils.conf import closest_scrapy_cfg, init_env +from scrapy.utils.misc import walk_modules from scrapy.utils.spider import iter_spider_classes -import datadiff.tools -NO_ITEM_MARKER = object() -FIXTURE_VERSION = 1 - - -def auto_import(qualified_name): - mod_name, class_name = qualified_name.rsplit('.', 1) - return getattr(import_module(mod_name), class_name) - - -def create_instance(objcls, settings, crawler, *args, **kwargs): - if settings is None: - if crawler is None: - raise ValueError("Specifiy at least one of settings and crawler.") - settings = crawler.settings - if crawler and hasattr(objcls, 'from_crawler'): - return objcls.from_crawler(crawler, *args, **kwargs) - elif hasattr(objcls, 'from_settings'): - return objcls.from_settings(settings, *args, **kwargs) - else: - return objcls(*args, **kwargs) +def get_base_path(settings): + return settings.get( + 'AUTOUNIT_BASE_PATH', + default=os.path.join(get_project_dir(), 'autounit') + ) def get_project_dir(): @@ -63,104 +31,6 @@ def get_project_dir(): return None -def get_middlewares(spider): - full_list = build_component_list( - spider.settings.getwithbase('SPIDER_MIDDLEWARES')) - autounit_mw_path = list(filter( - lambda x: x.endswith('AutounitMiddleware'), full_list))[0] - start = full_list.index(autounit_mw_path) - mw_paths = [mw for mw in full_list[start:] if mw != autounit_mw_path] - - return mw_paths - - -def create_dir(path, parents=False, exist_ok=False): - try: - if parents: - os.makedirs(path) - else: - os.mkdir(path) - except OSError: - if not exist_ok: - raise - - -def get_or_create_test_dir(base_path, spider_name, callback_name, extra=None): - components = [base_path, 'tests', spider_name] - if extra: - components.append(extra) - components.append(callback_name) - test_dir = None - for component in components: - test_dir = os.path.join(test_dir, component) if test_dir else component - create_dir(test_dir, parents=True, exist_ok=True) - init_file = os.path.join(test_dir, '__init__.py') - with open(init_file, 'a'): - os.utime(init_file, None) - test_name = '__'.join(components[2:]) - return test_dir, test_name - - -def get_filter_attrs(spider): - attrs = {'crawler', 'settings', 'start_urls'} - if isinstance(spider, CrawlSpider): - attrs |= 
{'rules', '_rules'} - return attrs - - -def add_sample(index, test_dir, test_name, data): - encoding = data['response']['encoding'] - filename = 'fixture%s.bin' % str(index) - path = os.path.join(test_dir, filename) - info = pickle_data({ - 'data': pickle_data(data), - 'encoding': encoding, - 'fixture_version': FIXTURE_VERSION, - }) - data = compress_data(info) - with open(path, 'wb') as outfile: - outfile.write(data) - - -def clear_fixtures(base_path, spider_name): - path = os.path.join(base_path, "tests", spider_name) - shutil.rmtree(path, ignore_errors=True) - - -def compress_data(data): - return zlib.compress(data) - - -def decompress_data(data): - return zlib.decompress(data) - - -def pickle_data(data): - return pickle.dumps(data, protocol=2) - - -def unpickle_data(data, encoding): - if six.PY2: - return pickle.loads(data) - return pickle.loads(data, encoding=encoding) - - -def response_to_dict(response): - return { - 'cls': '{}.{}'.format( - type(response).__module__, - getattr(type(response), '__qualname__', None) or - getattr(type(response), '__name__', None) - ), - 'url': response.url, - 'status': response.status, - 'body': response.body, - 'headers': dict(response.headers), - 'flags': response.flags, - 'encoding': response.encoding, - } - - def get_spider_class(spider_name, project_settings): spider_modules = project_settings.get('SPIDER_MODULES') for spider_module in spider_modules: @@ -172,278 +42,8 @@ def get_spider_class(spider_name, project_settings): return None -def parse_object(_object, spider): - if isinstance(_object, Request): - return parse_request(_object, spider) - elif isinstance(_object, Response): - return parse_object(response_to_dict(_object), spider) - elif isinstance(_object, dict): - for k, v in _object.items(): - _object[k] = parse_object(v, spider) - elif isinstance(_object, list): - for i, v in enumerate(_object): - _object[i] = parse_object(v, spider) - elif isinstance(_object, tuple): - _object = tuple([parse_object(o, spider) for o in _object]) - return _object - - -def parse_request(request, spider): - _request = request_to_dict(request, spider=spider) - if not _request['callback']: - _request['callback'] = 'parse' - elif isinstance(spider, CrawlSpider): - rule = request.meta.get('rule') - if rule is not None: - _request['callback'] = spider.rules[rule].callback - - clean_headers(_request['headers'], spider.settings) - - _meta = {} - for key, value in _request.get('meta').items(): - if key != '_autounit': - _meta[key] = parse_object(value, spider) - _request['meta'] = _meta - - return _request - - -def clean_request(request, settings): - _clean(request, settings, 'AUTOUNIT_REQUEST_SKIPPED_FIELDS') - - -def clean_headers(headers, settings): - excluded = settings.get('AUTOUNIT_EXCLUDED_HEADERS', default=[]) - auth_headers = ['Authorization', 'Proxy-Authorization'] - included = settings.get('AUTOUNIT_INCLUDED_AUTH_HEADERS', default=[]) - excluded.extend([h for h in auth_headers if h not in included]) - for header in excluded: - headers.pop(header, None) - headers.pop(header.encode(), None) - - -def clean_item(item, settings): - _clean(item, settings, 'AUTOUNIT_SKIPPED_FIELDS') - - -def _clean(data, settings, name): - fields = settings.get(name, default=[]) - for field in fields: - data.pop(field, None) - - -def write_test(path, test_name, url): - command = 'scrapy {}'.format(' '.join(sys.argv)) - test_path = os.path.join(path, 'test_fixtures.py') - - test_code = '''# THIS IS A GENERATED FILE -# Generated by: {command} # noqa: E501 -# Request URL: {url} # 
noqa: E501 -import os -import unittest -from scrapy_autounit.utils import generate_test - - -class AutoUnit(unittest.TestCase): - def test__{test_name}(self): - files = os.listdir( - os.path.dirname( - os.path.abspath(__file__) - ) - ) - files = [f for f in files if f.endswith('.bin')] - self.maxDiff = None - for f in files: - file_path = os.path.join(os.path.dirname(__file__), f) - print("Testing fixture '%s'" % (f)) - test = generate_test(os.path.abspath(file_path)) - test(self) - - -if __name__ == '__main__': - unittest.main() -'''.format( - test_name=test_name, - command=command, - url=url, - ) - - with open(str(test_path), 'w') as f: - f.write(test_code) - - -def binary_check(fx_obj, cb_obj, encoding): - if isinstance(cb_obj, (dict, Item)): - fx_obj = { - key: binary_check(value, cb_obj[key], encoding) - for key, value in fx_obj.items() - } - - if isinstance(cb_obj, list): - fx_obj = [ - binary_check(fxitem, cbitem, encoding) - for fxitem, cbitem in zip(fx_obj, cb_obj) - ] - - if isinstance(cb_obj, Request): - headers = {} - for key, value in fx_obj['headers'].items(): - key = to_bytes(key, encoding) - headers[key] = [to_bytes(v, encoding) for v in value] - fx_obj['headers'] = headers - fx_obj['body'] = to_bytes(fx_obj['body'], encoding) - - if isinstance(cb_obj, six.binary_type): - fx_obj = fx_obj.encode(encoding) - - return fx_obj - - -def set_spider_attrs(spider, _args): - for k, v in _args.items(): - setattr(spider, k, v) - - -def parse_callback_result(result, spider): - processed_result = [] - out = [] - for elem in result: - out.append(elem) - is_request = isinstance(elem, Request) - if is_request: - _data = parse_request(elem, spider) - else: - _data = parse_object(copy.deepcopy(elem), spider) - processed_result.append({ - 'type': 'request' if is_request else 'item', - 'data': _data - }) - return processed_result, out - - -def prepare_callback_replay(fixture_path, encoding="utf-8"): - with open(str(fixture_path), 'rb') as f: - raw_data = f.read() - - fixture_info = unpickle_data(decompress_data(raw_data), encoding) - if 'fixture_version' in fixture_info: - encoding = fixture_info['encoding'] - data = unpickle_data(fixture_info['data'], encoding) - else: - data = fixture_info # legacy tests - - settings = get_project_settings() - - spider_name = data.get('spider_name') - if not spider_name: # legacy tests - spider_name = os.path.basename( - os.path.dirname( - os.path.dirname(fixture_path) - ) - ) - - spider_cls = get_spider_class(spider_name, settings) - spider_cls.update_settings(settings) - - for k, v in data.get('settings', {}).items(): - settings.set(k, v, 50) - - crawler = Crawler(spider_cls, settings) - spider_args_in = data.get('spider_args', data.get('spider_args_in', {})) - spider = spider_cls.from_crawler(crawler, **spider_args_in) - crawler.spider = spider - - return data, crawler, spider, settings - - def generate_test(fixture_path, encoding='utf-8'): - data, crawler, spider, settings = prepare_callback_replay( - fixture_path, encoding=encoding - ) - - def test(self): - fx_result = data['result'] - fx_version = data.get('python_version') - - spider_args_in = data.get( - 'spider_args', data.get('spider_args_in', {})) - set_spider_attrs(spider, spider_args_in) - request = request_from_dict(data['request'], spider) - response_cls = auto_import(data['response'].pop( - 'cls', 'scrapy.http.HtmlResponse')) - response = response_cls(request=request, **data['response']) - - middlewares = [] - middleware_paths = data['middlewares'] - for mw_path in middleware_paths: - try: - 
mw_cls = load_object(mw_path) - mw = create_instance(mw_cls, settings, crawler) - middlewares.append(mw) - except NotConfigured: - continue - - crawler.signals.send_catch_log( - signal=signals.spider_opened, - spider=spider - ) - result_attr_in = { - k: v for k, v in spider.__dict__.items() - if k not in get_filter_attrs(spider) - } - self.assertEqual(spider_args_in, result_attr_in, - 'Input arguments not equal!') - - for mw in middlewares: - if hasattr(mw, 'process_spider_input'): - mw.process_spider_input(response, spider) - - result = arg_to_iter(request.callback(response)) - middlewares.reverse() - - for mw in middlewares: - if hasattr(mw, 'process_spider_output'): - result = mw.process_spider_output(response, result, spider) - - for index, (cb_obj, fx_item) in enumerate(six.moves.zip_longest( - result, fx_result, fillvalue=NO_ITEM_MARKER - )): - if any(item == NO_ITEM_MARKER for item in (cb_obj, fx_item)): - raise AssertionError( - "The fixture's data length doesn't match with " - "the current callback's output length. " - "Expected %s elements, found %s" % ( - len(fx_result), index + 1 + len(list(result))) - ) - - cb_obj = parse_object(cb_obj, spider) - - fx_obj = fx_item['data'] - if fx_item['type'] == 'request': - clean_request(fx_obj, settings) - clean_request(cb_obj, settings) - else: - clean_item(fx_obj, settings) - clean_item(cb_obj, settings) - - if fx_version == 2 and six.PY3: - fx_obj = binary_check(fx_obj, cb_obj, encoding) - - try: - datadiff.tools.assert_equal(fx_obj, cb_obj) - except AssertionError as e: - six.raise_from( - AssertionError( - "Callback output #{} doesn't match recorded " - "output:{}".format(index, e)), - None) - - # Spider attributes get updated after the yield - result_attr_out = { - k: v for k, v in spider.__dict__.items() - if k not in get_filter_attrs(spider) - } - - self.assertEqual(data['spider_args_out'], result_attr_out, - 'Output arguments not equal!') - return test + raise AssertionError( + "This spider's tests and fixtures are from an old version and need to be updated. " + "Please update them by using the `autounit` command line utility. 
" + "See `autounit update -h` for more help.") diff --git a/setup.py b/setup.py index c91631d..1ca87a3 100644 --- a/setup.py +++ b/setup.py @@ -1,25 +1,27 @@ import setuptools + with open("README.md", "r") as fh: long_description = fh.read() setuptools.setup( name='scrapy-autounit', - version='0.0.27', + version='0.0.28', author='', author_email='', description='Automatic unit test generation for Scrapy.', long_description=long_description, long_description_content_type='text/markdown', - url='https://github.com/fcanobrash/scrapy-autounit', + url='https://github.com/scrapinghub/scrapy-autounit', packages=setuptools.find_packages(), classifiers=[ + 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', ], install_requires=[ - 'datadiff==2.0.0', + 'testfixtures==6.14.1', ], entry_points = { 'console_scripts': [ diff --git a/tests/test_middleware.py b/tests/test_middleware.py index f43cc80..78b29e4 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -7,15 +7,11 @@ def process_spider_output(self, response, result, spider): if hasattr(spider, 'test_attr'): delattr(spider, 'test_attr') - return super(self.__class__, self).process_spider_output(response, - result, - spider) + return super(self.__class__, self).process_spider_output(response, result, spider) class DelObjectsAutounitMiddleware(AutounitMiddleware, object): def process_spider_output(self, response, result, spider): result = [] - return super(self.__class__, self).process_spider_output(response, - result, - spider) + return super(self.__class__, self).process_spider_output(response, result, spider) diff --git a/tests/test_record.py b/tests/test_record.py index e390bf3..7188b03 100644 --- a/tests/test_record.py +++ b/tests/test_record.py @@ -1,9 +1,9 @@ -import unittest -import tempfile -import subprocess import os -import shutil import re +import shutil +import subprocess +import tempfile +import unittest SPIDER_TEMPLATE = ''' @@ -155,10 +155,7 @@ def record(self, args=None, settings=None, record_verbosity=False): env = os.environ.copy() env['PYTHONPATH'] = self.dir # doesn't work if == cwd env['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings' - command_args = [ - 'scrapy', 'crawl', self._spider_name, - '-s', 'AUTOUNIT_ENABLED=1', - ] + command_args = ['scrapy', 'crawl', self._spider_name, '-s', 'AUTOUNIT_ENABLED=1'] for k, v in (args or {}).items(): command_args.append('-a') command_args.append('{}={}'.format(k, v)) @@ -187,9 +184,7 @@ def test(self, test_verbosity=True): env = os.environ.copy() env['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings' result = run( - [ - 'python', '-m', 'unittest', 'discover', '-v' - ], + ['python', '-m', 'unittest', 'discover', '-v'], env=env, cwd=self.dir, stdout=subprocess.PIPE, @@ -251,8 +246,8 @@ def test_spider_attributes(self): yield {'a': 4} """) spider.record(settings=dict( - AUTOUNIT_EXCLUDED_FIELDS='_base_url', - AUTOUNIT_INCLUDED_SETTINGS='AUTOUNIT_EXCLUDED_FIELDS')) + AUTOUNIT_DONT_TEST_OUTPUT_FIELDS='_base_url', + AUTOUNIT_RECORD_SETTINGS='AUTOUNIT_DONT_TEST_OUTPUT_FIELDS')) spider.test() def test_spider_attributes_recursive(self): @@ -339,7 +334,7 @@ def test_skipped_fields(self): with CaseSpider() as spider: spider.imports('import time') spider.custom_settings(''' - AUTOUNIT_SKIPPED_FIELDS = ['ts'] + AUTOUNIT_DONT_TEST_OUTPUT_FIELDS = ['ts'] ''') spider.start_requests("yield scrapy.Request('data:text/plain,')") spider.parse(''' @@ -353,7 +348,7 @@ def 
test_request_skipped_fields(self): with CaseSpider() as spider: spider.imports('import random') spider.custom_settings(''' - AUTOUNIT_REQUEST_SKIPPED_FIELDS = ['url'] + AUTOUNIT_DONT_TEST_REQUEST_ATTRS = ['url'] ''') spider.start_requests("yield scrapy.Request('data:text/plain,')") spider.parse(''' @@ -385,9 +380,18 @@ def test_request_parsing_types_meta_in_output(self): spider.start_requests(''' yield scrapy.Request( 'data:text/plain,', - meta={'metadata': [ - ('tuples', {'dicts': 'and'}, ['lists', 'in'], 'meta', 1, 20.5) - ]} + meta={ + 'metadata': [ + ( + 'tuples', + {'dicts': 'and'}, + ['lists', 'in'], + 'meta', + 1, + 20.5 + ) + ] + } ) ''') spider.parse(''' @@ -489,11 +493,8 @@ def template(self): yield {'a': 5} """) spider.record() - expected_message = "AssertionError: The fixture's data length "\ - "doesn't match with the current callback's "\ - "output length." - with self.assertRaisesRegexp(AssertionError, - re.escape(expected_message)): + expected_message = "more item/s than expected" + with self.assertRaisesRegexp(AssertionError, re.escape(expected_message)): spider.test(test_verbosity=True) def test_attribute_change_raises_error(self): @@ -519,9 +520,8 @@ def template(self): yield scrapy.Request('data:text/plain,', dont_filter=True) """) spider.record() - expected_message = "Output arguments not equal!" - with self.assertRaisesRegexp(AssertionError, - re.escape(expected_message)): + expected_message = "Output arguments not equal" + with self.assertRaisesRegexp(AssertionError, re.escape(expected_message)): spider.test(test_verbosity=True) def test_missing_parse_method_raises_assertionerror(self): diff --git a/tox.ini b/tox.ini index d36347f..803bb0d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,10 @@ [tox] envlist = py27,py35,py36 +[flake8] +max-line-length = 100 +exclude = __init__.py + [testenv] deps = scrapy @@ -8,5 +12,5 @@ deps = flake8 commands = pip install -e . - flake8 --exclude=__init__.py scrapy_autounit tests + flake8 scrapy_autounit tests python -m unittest -v tests.test_record
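
---

The README changes above describe the recording flow (the middleware registered in `SPIDER_MIDDLEWARES`, the `AUTOUNIT_ENABLED` toggle, the per-callback fixture cap, and the renamed `AUTOUNIT_DONT_TEST_*` settings). As a minimal sketch of how that configuration can look — not part of the patch; the priority value and the example skipped field are illustrative assumptions, and the middleware path simply follows the module location introduced here — a project `settings.py` might contain:

```python
# settings.py (sketch) -- enables fixture recording with scrapy-autounit.
# The priority (950) and the 'ts' field are illustrative assumptions.
SPIDER_MIDDLEWARES = {
    'scrapy_autounit.middleware.AutounitMiddleware': 950,
}

# Recording is opt-in; turn this off once fixtures are final so reruns
# don't overwrite them (see the Caveats section in the README diff).
AUTOUNIT_ENABLED = True

# Values below 10 are raised to 10 by Recorder._set_max_fixtures().
AUTOUNIT_MAX_FIXTURES_PER_CALLBACK = 10

# Skip volatile output fields (e.g. timestamps) when tests compare items.
AUTOUNIT_DONT_TEST_OUTPUT_FIELDS = ['ts']
```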
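The README's Internals section and the new `cli.py` `update()` command show how `Player`, `Cassette` and `Recorder` fit together. The sketch below mirrors that single-fixture update logic using only the APIs introduced in this patch; the fixture path is a hypothetical placeholder, and the script is assumed to run from inside a Scrapy project so the spider and settings can be resolved.

```python
from scrapy_autounit.player import Player
from scrapy_autounit.recorder import Recorder

# Hypothetical fixture path; point this at an existing recorded fixture.
FIXTURE = 'autounit/tests/my_spider/my_callback/fixture1.bin'

# Load the pickled, zlib-compressed Cassette and prepare the spider.
player = Player.from_fixture(FIXTURE)

# Replay the recorded request/response against the current callback code
# without asserting, collecting the fresh output and spider attributes.
output, attrs = player.playback(compare=False)

# Serialize the callback output into the form stored in fixtures.
_, parsed = player.parse_callback_output(output)

# Refresh the cassette with the new output and attribute snapshots.
cassette = player.cassette
cassette.output_data = parsed
cassette.init_attrs = attrs['init']
cassette.input_attrs = attrs['input']
cassette.output_attrs = attrs['output']

# Re-pack and overwrite the fixture on disk.
Recorder.update_fixture(cassette, FIXTURE)
```

This is essentially what `autounit update -s my_spider -c my_callback -f 1` does for one fixture, minus the legacy-fixture conversion handled by `_from_legacy_fixture` in `cli.py`.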