From 5f94cb079c7b03720438b1d5096fc8859b3ff131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dorian=20Oszcz=C4=99da?= Date: Sun, 8 Jan 2023 15:07:51 +0000 Subject: [PATCH] misc: Restructure the API. (#5) * misc!: Remove non-developer dependencies. (#3) * misc!: Remove `sprint` dependency. * misc!: Remove `web_scraper` dependency. * misc!: Restructure the API. * test: Add tests. --- CHANGELOG.md | 22 ++ README.md | 27 ++- analysis_options.yaml | 6 +- example/example.dart | 55 +++-- lib/robots_txt.dart | 5 +- lib/src/parser.dart | 319 +++++++++++++++++++------- lib/src/rule.dart | 87 +++++-- lib/src/ruleset.dart | 85 ++++--- lib/src/utils.dart | 3 - pubspec.yaml | 14 +- test/contents_definitions.dart | 126 +++++++++++ test/parser_test.dart | 402 +++++++++++++++++++++++++++++++++ 12 files changed, 972 insertions(+), 179 deletions(-) delete mode 100644 lib/src/utils.dart create mode 100644 test/contents_definitions.dart create mode 100644 test/parser_test.dart diff --git a/CHANGELOG.md b/CHANGELOG.md index 0142b83..fb1ea73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,25 @@ +## 2.0.0 + +- Additions: + - Added developer dependencies: + - `meta` for static analysis. + - `test` for testing. + - Added support for the 'Sitemap' field. + - Added support for specifying: + - The precedent rule type for determining whether a certain user-agent can + or cannot access a certain path. (`PrecedentRuleType`) + - The comparison strategy to use for comparing rule precedence. + (`PrecedenceStrategy`) + - Added tests. +- Changes: + - Bumped the minimum SDK version to `2.17.0` for enhanced enum support. +- Improvements: + - Made all structs `const` and marked them as `@sealed` and `@immutable`. +- Deletions: + - Removed dependencies: + - `sprint` + - `web_scraper` + ## 1.1.1 - Updated project description. diff --git a/README.md b/README.md index 0cead0c..fec6a52 100644 --- a/README.md +++ b/README.md @@ -1,24 +1,23 @@ -## A lightweight `robots.txt` ruleset parser to ensure your application follows the standard protocol. +## A complete, dependency-less and fully documented `robots.txt` ruleset parser. ### Usage The following code gets the `robots.txt` robot exclusion ruleset of a website. -`quietMode` determines whether or not the library should print warning messages in the case of the `robots.txt` not being valid or other errors. - ```dart -// Create an instance of the `robots.txt` parser -final robots = Robots(host: 'https://github.com/'); -// Read the ruleset of the website -await robots.read(); +// Get the contents of the `robots.txt` file. +final contents = /* Your method of obtaining the contents of a `robots.txt` file. */; +// Parse the contents. +final robots = Robots.parse(contents); ``` -Now that the `robots.txt` file has been read, we can verify whether we can visit a certain path or not: +Now that the `robots.txt` file has been read, we can verify whether we can visit +a certain path or not: ```dart -final userAgent = '*'; -print("Can '$userAgent' visit '/gist/'?"); -print(robots.canVisitPath('/gist/', userAgent: '*')); // It cannot -print("Can '$userAgent' visit '/wordcollector/robots_txt'?"); -print(robots.canVisitPath('/wordcollector/robots_txt', userAgent: '*')); // It can -``` \ No newline at end of file +final userAgent = /* Your user agent. */; +// False: it cannot. +print(robots.verifyCanAccess('/gist/', userAgent: userAgent)); +// True: it can. 
+print(robots.verifyCanAccess('/wordcollector/robots_txt', userAgent: userAgent)); +``` diff --git a/analysis_options.yaml b/analysis_options.yaml index e925772..4996fe7 100644 --- a/analysis_options.yaml +++ b/analysis_options.yaml @@ -1 +1,5 @@ -include: package:words/core.yaml \ No newline at end of file +include: package:words/core.yaml + +linter: + rules: + directives_ordering: false \ No newline at end of file diff --git a/example/example.dart b/example/example.dart index b39b2c0..05052d7 100644 --- a/example/example.dart +++ b/example/example.dart @@ -1,32 +1,53 @@ +import 'dart:convert'; +import 'dart:io'; + import 'package:robots_txt/robots_txt.dart'; -Future main() async { - // Create an instance of the `robots.txt` parser. - final robots = Robots(host: 'https://github.com/'); - // Read the ruleset of the website. - await robots.read(); - // Print the ruleset. +Future main() async { + // Get the contents of the `robots.txt` file. + final contents = await fetchFileContents(host: 'github.com'); + // Parse the contents. + final robots = Robots.parse(contents); + + // Print the rulesets. for (final ruleset in robots.rulesets) { - // Print the user-agent the ruleset applies to. - print(ruleset.appliesTo); + // Print the user-agent this ruleset applies to. + print(ruleset.userAgent); + if (ruleset.allows.isNotEmpty) { - print('Allows:'); + print('Allowed:'); } - // Print the path expressions allowed by this ruleset. + // Print the regular expressions that match to paths allowed by this + // ruleset. for (final rule in ruleset.allows) { - print(' - ${rule.expression}'); + print(' - ${rule.pattern}'); } + if (ruleset.disallows.isNotEmpty) { - print('Disallows:'); + print('Disallowed:'); } - // Print the path expressions disallowed by this ruleset. + // Print the regular expressions that match to paths disallowed by this + // ruleset. for (final rule in ruleset.disallows) { - print(' - ${rule.expression}'); + print(' - ${rule.pattern}'); } } + // False: it cannot. - print(robots.canVisitPath('/gist/', userAgent: '*')); + print(robots.verifyCanAccess('/gist/', userAgent: '*')); // True: it can. - print(robots.canVisitPath('/wordcollector/robots_txt', userAgent: '*')); - return; + print(robots.verifyCanAccess('/wordcollector/robots_txt', userAgent: '*')); +} + +Future fetchFileContents({required String host}) async { + final client = HttpClient(); + + final contents = await client + .get(host, 80, '/robots.txt') + .then((request) => request.close()) + .then((response) => response.transform(utf8.decoder).join()); + + client.close(); + + return contents; } diff --git a/lib/robots_txt.dart b/lib/robots_txt.dart index 7a934c4..59b7c52 100644 --- a/lib/robots_txt.dart +++ b/lib/robots_txt.dart @@ -1,5 +1,6 @@ /// Lightweight, fully documented `robots.txt` file parser. 
library robots_txt; -export 'src/parser.dart'; -export 'src/rule.dart'; +export 'src/parser.dart' show Robots, PrecedentRuleType, FieldType; +export 'src/rule.dart' show Rule, FindRule, Precedence, PrecedenceStrategy; +export 'src/ruleset.dart' show Ruleset, FindRuleInRuleset; diff --git a/lib/src/parser.dart b/lib/src/parser.dart index cb918a1..32af43e 100644 --- a/lib/src/parser.dart +++ b/lib/src/parser.dart @@ -1,101 +1,163 @@ +import 'package:meta/meta.dart'; + import 'package:robots_txt/src/rule.dart'; import 'package:robots_txt/src/ruleset.dart'; -import 'package:sprint/sprint.dart'; -import 'package:web_scraper/web_scraper.dart'; -/// Abstracts away the rather convoluted declaration for an element with two -/// fields; 'title' and 'attributes'. 'attributes' is a map containing the -/// attributes of the element. -typedef Element = Map>; +/// Defines a Regex pattern that matches to comments. +final commentPattern = RegExp('#.*'); -/// Allows for parsing of a host's `robots.txt` to get information about which -/// of its resources may or may not be accessed, as well as which of its pages -/// cannot be traversed. +/// Stores information about a `robots.txt` file, exposing a simple and concise +/// API for working with the file and validating if a certain path can be +/// accessed by a given user-agent. +@immutable +@sealed class Robots { - /// Instance of `Sprint` message logger for the `robots.txt` parser. - final Sprint log; - - /// The host of this `robots.txt` file. - final String host; - - /// Stores an instance of the scraper for a given URL. - final WebScraper scraper; - - /// Stores expressions for both paths which may or may not be traversed. - final List rulesets = []; - - /// Creates an instance of a `robots.txt` parser for the provided [host]. - Robots({ - required this.host, - bool quietMode = false, - bool productionMode = true, - }) : scraper = WebScraper(host), - log = Sprint( - 'Robots', - quietMode: quietMode, - productionMode: productionMode, - ); + /// Stores information about the rules specified for given user-agents. + final List rulesets; + + /// Stores links to the website's sitemaps. + final List sitemaps; + + /// Defines an instance of `Robots` with no rulesets. + static const _empty = Robots._construct(rulesets: [], sitemaps: []); - /// Reads and parses the `robots.txt` file of the [host]. - Future read({String? onlyRelevantTo}) async { - await scraper.loadWebPage('/robots.txt'); - final body = scraper.getElement('body', [])[0]; + /// Creates an instance of `Robots`. + const Robots._construct({required this.rulesets, required this.sitemaps}); - final invalidRobotsFileError = "'$host' has an invalid `robots.txt`:"; + /// Parses the contents of a `robots.txt` file, creating an instance of + /// `Robots`. If [onlyApplicableTo] is specified, the parser will ignore any + /// rulesets that do not apply to it. + /// + /// This function will never throw an exception. + factory Robots.parse(String contents, {String? 
onlyApplicableTo}) { + contents = contents.replaceAll(commentPattern, ''); - if (body.isEmpty) { - log.warn('$invalidRobotsFileError No text elements found'); - return; + if (contents.trim().isEmpty) { + return Robots._empty; } - final content = body['title'] as String; - final lines = content.split('\n').where((line) => line.isNotEmpty); - parseRulesets(lines, onlyRelevantTo: onlyRelevantTo); + final lines = contents.split('\n').where((line) => line.isNotEmpty); + + return Robots._fromLines(lines, onlyFor: onlyApplicableTo); } - /// Iterates over [lines] and parses each ruleset, additionally ignoring - /// those rulesets which are not relevant to [onlyRelevantTo]. - void parseRulesets(Iterable lines, {String? onlyRelevantTo}) { - Ruleset? ruleset; + /// Iterates over [lines] and sequentially parses each ruleset, optionally + /// ignoring those rulesets which are not relevant to [onlyFor]. + factory Robots._fromLines( + Iterable lines, { + String? onlyFor, + }) { + final rulesets = []; + final sitemaps = []; + + // Temporary data used for parsing rulesets. + final userAgents = []; + final allows = []; + final disallows = []; + + bool isReadingRuleset() => userAgents.isNotEmpty; + + void saveRulesets() { + for (final userAgent in userAgents) { + rulesets.add( + Ruleset( + userAgent: userAgent, + allows: List.from(allows), + disallows: List.from(disallows), + ), + ); + } + } + + void reset() { + userAgents.clear(); + allows.clear(); + disallows.clear(); + } + + late FieldType previousType; for (var index = 0; index < lines.length; index++) { - final field = getRobotsFieldFromLine(lines.elementAt(index)); + final field = _getFieldFromLine(lines.elementAt(index)); + if (field == null) { + continue; + } + + final type = FieldType.byKey(field.key); + if (type == null) { + continue; + } - switch (field.key) { - case 'user-agent': - if (ruleset != null) { - rulesets.add(ruleset); + switch (type) { + case FieldType.userAgent: + if (userAgents.isNotEmpty && previousType != FieldType.userAgent) { + saveRulesets(); + reset(); } - if (onlyRelevantTo != null && field.key != onlyRelevantTo) { - ruleset = null; + + if (onlyFor != null && field.key != onlyFor) { break; } - ruleset = Ruleset(field.value); + + userAgents.add(field.value); break; + case FieldType.disallow: + if (!isReadingRuleset()) { + break; + } - case 'allow': - if (ruleset != null) { - final expression = convertFieldPathToExpression(field.value); - ruleset.allows.add(Rule(expression, index)); + final RegExp pattern; + try { + pattern = _convertPathToRegExp(field.value); + } on FormatException { + break; } + disallows.add( + Rule( + pattern: pattern, + precedence: lines.length - (index + 1), + ), + ); + break; - case 'disallow': - if (ruleset != null) { - final expression = convertFieldPathToExpression(field.value); - ruleset.disallows.add(Rule(expression, index)); + case FieldType.allow: + if (!isReadingRuleset()) { + break; } + + final RegExp pattern; + try { + pattern = _convertPathToRegExp(field.value); + } on FormatException { + break; + } + allows.add( + Rule( + pattern: pattern, + precedence: lines.length - (index + 1), + ), + ); + + break; + case FieldType.sitemap: + sitemaps.add(field.value); break; } + + previousType = type; } - if (ruleset != null) { - rulesets.add(ruleset); + if (isReadingRuleset()) { + saveRulesets(); + reset(); } + + return Robots._construct(rulesets: rulesets, sitemaps: sitemaps); } /// Reads a path declaration from within `robots.txt` and converts it to a /// regular expression for later matching. 
- RegExp convertFieldPathToExpression(String pathDeclaration) { - // Collapse duplicate slashes and wildcards into singles. + static RegExp _convertPathToRegExp(String pathDeclaration) { + // Collapse duplicate slashes and wildcards into single ones. final collapsed = pathDeclaration.replaceAll('/+', '/').replaceAll('*+', '*'); final normalised = collapsed.endsWith('*') @@ -104,35 +166,126 @@ class Robots { final withWildcardsReplaced = normalised.replaceAll('.', r'\.').replaceAll('*', '.*'); final withTrailingText = withWildcardsReplaced.contains(r'$') - ? withWildcardsReplaced.split(r'$')[0] + ? withWildcardsReplaced.split(r'$').first : '$withWildcardsReplaced.*'; return RegExp(withTrailingText, caseSensitive: false, dotAll: true); } /// Extracts the key and value from [target] and puts it into a `MapEntry`. - MapEntry getRobotsFieldFromLine(String target) { + static MapEntry? _getFieldFromLine(String target) { final keyValuePair = target.split(':'); - final key = keyValuePair[0].toLowerCase(); + if (keyValuePair.length < 2) { + return null; + } + + final key = keyValuePair.first.trim(); final value = keyValuePair.sublist(1).join(':').trim(); return MapEntry(key, value); } - /// Determines whether or not [path] may be traversed. - bool canVisitPath(String path, {required String userAgent}) { - final explicitAllowance = rulesets.getRule( - appliesTo: userAgent, - concernsPath: path, - andAllowsIt: true, + /// Checks if the `robots.txt` file allows [userAgent] to access [path]. + bool verifyCanAccess( + String path, { + required String userAgent, + PrecedentRuleType typePrecedence = PrecedentRuleType.defaultPrecedentType, + PrecedenceStrategy comparisonMethod = PrecedenceStrategy.defaultStrategy, + }) { + final allowedBy = rulesets.findApplicableRule( + userAgent: userAgent, + path: path, + type: RuleType.allow, + comparisonMethod: comparisonMethod, ); - final explicitDisallowance = rulesets.getRule( - appliesTo: userAgent, - concernsPath: path, - andAllowsIt: false, + final disallowedBy = rulesets.findApplicableRule( + userAgent: userAgent, + path: path, + type: RuleType.disallow, + comparisonMethod: comparisonMethod, ); - final allowancePriority = explicitAllowance?.priority ?? -1; - final disallowancePriority = explicitDisallowance?.priority ?? -1; + switch (typePrecedence) { + case PrecedentRuleType.defaultPrecedentType: + // TODO(vxern): Below is a fix for an issue in Dart 2.18 with the enhanced + // enums. This issue is fixed in 2.19, which is still on the beta + // channel. Refer to: https://github.com/dart-lang/sdk/issues/49188 + // ignore: no_duplicate_case_values + case PrecedentRuleType.allow: + return allowedBy != null || disallowedBy == null; + case PrecedentRuleType.disallow: + return disallowedBy != null || allowedBy == null; + } + } +} + +/// Describes the type of a rule. +@internal +enum RuleType { + /// A rule explicitly allows a given path. + allow, + + /// A rule explicitly disallows a given path. + disallow, +} + +/// Defines the method used to decide whether rules that explicitly allow a +/// user-agent to access a path take precedence over ones that disallow it to do +/// so, or the other way around. +enum PrecedentRuleType { + /// The rule that explicitly allows a user-agent to access a path takes + /// precedence over rules that explicitly disallow it. + allow, + + /// The rule that explicitly disallows a user-agent to access a path takes + /// precedence over rules that explicitly allow it. + disallow; + + /// Defines the default precedent rule type. 
+ static const defaultPrecedentType = PrecedentRuleType.allow; +} + +/// Defines a key-value field of a `robots.txt` file specifying a rule. +@visibleForTesting +enum FieldType { + /// A field specifying the user-agent the following fields apply to. + userAgent(key: 'User-agent', example: '*'), + + /// A field explicitly disallowing a user-agent to visit a path. + disallow(key: 'Disallow', example: '/'), + + /// A field explicitly allowing a user-agent to visit a path. + allow(key: 'Allow', example: '/file.txt'), + + /// A field specifying the location of a sitemap of a website. + sitemap(key: 'Sitemap', example: 'https://example.com/sitemap.xml'); + + /// The name of the field key. + final String key; + + /// An example of a field definition. Used for testing. + final String example; - return allowancePriority >= disallowancePriority; + /// Contains the field types that introduce rules. + static const rules = [FieldType.allow, FieldType.disallow]; + + /// Constructs a `FieldType`. + const FieldType({required this.key, required this.example}); + + /// Converts a `FieldType` to a `robots.txt` field. + String toField([String? value]) => '$key: ${value ?? example}'; + + /// Attempts to resolve [key] to a `FieldKey` corresponding to that [key]. + /// Returns `null` if not found. + static FieldType? byKey(String key) { + for (final value in FieldType.values) { + if (key == value.key) { + return value; + } + } + + return null; } + + @override + @Deprecated('Use `toField()` instead') + String toString(); } diff --git a/lib/src/rule.dart b/lib/src/rule.dart index 3320ce6..df9c2a1 100644 --- a/lib/src/rule.dart +++ b/lib/src/rule.dart @@ -1,30 +1,79 @@ +import 'package:meta/meta.dart'; + /// A single rule (either `Allow` or `Disallow`) inside the `robots.txt` file. +@immutable +@sealed class Rule { - /// An expression which a path may be matched against to determine whether - /// this rule applies to the path. - final RegExp expression; + /// A regular expression matching to a particular path. + final RegExp pattern; - /// The priority of this rule based on its position inside the `robots.txt` - /// file. If the path is determined to be relevant to two rules, the rule - /// with the higher priority *overrides* the ruling of the other. - final int priority; + /// The precedence of this rule based on its position inside the `robots.txt` + /// file. The rule with the higher precedence is used to decide whether or not + /// a path may be visited. + final int _precedence; - /// Instantiates a rule with an [expression] and the [priority] it has over + /// Instantiates a rule with an [pattern] and the [precedence] it has over /// other rules. - const Rule(this.expression, this.priority); + const Rule({required this.pattern, required int precedence}) + : _precedence = precedence; } -/// Extends `List` with a method for getting the `Rule` with the highest -/// [Rule.priority]. -extension RulingOnPath on List { - /// Taking [path], checks which `Rule`s' expressions match [path], and - /// returns the `Rule` with the highest priority. - Rule? getRulingOnPath(String path) { - final relevantRules = where((rule) => rule.expression.hasMatch(path)); - if (relevantRules.isEmpty) { +/// Extends `List` with methods used to find rule that pertain to a +/// certain path. +extension FindRule on List { + /// Taking a [path], returns the `Rule`s that pertain to it. 
+ List findApplicable({required String path}) => + where((rule) => rule.pattern.hasMatch(path)).toList(); + + /// Taking a [path], gets the `Rule`s that pertain to it, and returns the + /// `Rule` that has precedence over the other rules. + Rule? findMostApplicable({ + required String path, + PrecedenceStrategy comparisonMethod = PrecedenceStrategy.defaultStrategy, + }) { + final comparisonFunction = _ruleComparisonFunctions[comparisonMethod]!; + + final applicableRules = findApplicable(path: path); + if (applicableRules.isEmpty) { return null; } - // Get the relevant rule with the highest priority - return relevantRules.reduce((a, b) => a.priority > b.priority ? a : b); + + return applicableRules.reduce(comparisonFunction); } } + +/// Extends `Rule?` with a getter `precedence` to avoid having to explicitly +/// default to `-1` whenever attempting to access the hidden property +/// `_precedence` on a nullish value. +extension Precedence on Rule? { + /// Gets the precedence of this rule. Defaults to `-1` if `null`. + int get precedence => this?._precedence ?? -1; +} + +/// The signature of a method that compares two variables of type `T` and +/// returns the one supposed 'greater'. +@internal +typedef ComparisonFunction = T Function(T a, T b); + +/// `ComparisonFunction`s matched to `PrecedenceStrategy`s. +final _ruleComparisonFunctions = + Map>.unmodifiable( + >{ + PrecedenceStrategy.higherTakesPrecedence: (a, b) => + a.precedence > b.precedence ? a : b, + PrecedenceStrategy.lowerTakesPrecedence: (a, b) => + a.precedence < b.precedence ? a : b, + }, +); + +/// Defines the strategy to use to compare rules as per their `precedence`. +enum PrecedenceStrategy { + /// The rule defined higher up in the `robots.txt` file takes precedence. + higherTakesPrecedence, + + /// The rule defines lower down in the `robots.txt` file takes precedence. + lowerTakesPrecedence; + + /// Defines the default strategy to use to compare rules. + static const defaultStrategy = PrecedenceStrategy.higherTakesPrecedence; +} diff --git a/lib/src/ruleset.dart b/lib/src/ruleset.dart index de1230a..99d3c72 100644 --- a/lib/src/ruleset.dart +++ b/lib/src/ruleset.dart @@ -1,46 +1,67 @@ +import 'package:meta/meta.dart'; + +import 'package:robots_txt/src/parser.dart'; import 'package:robots_txt/src/rule.dart'; /// A collection of `Rule`s, and the `user-agent` they are relevant to inside /// the `robots.txt` file. +@immutable +@sealed class Ruleset { - /// The `user-agent` which this ruleset applies to. - final String appliesTo; + /// The user-agent which this ruleset applies to. + final String userAgent; + + /// List of `Rule`s which state that a path may not be traversed. + final List disallows; - /// List of `Rule`s which explicitly state that a path may be traversed. - final List allows = []; + /// List of `Rule`s which state that a path may be traversed. + final List allows; - /// List of `Rule`s which explicitly state that a path may not be traversed. - final List disallows = []; + /// Whether this ruleset applies to all user-agents. + final bool appliesToAll; /// Instantiates a ruleset with the `user-agent`. - Ruleset(this.appliesTo); + const Ruleset({ + required this.userAgent, + required this.allows, + required this.disallows, + }) : appliesToAll = userAgent == '*'; /// Checks whether this ruleset applies to [userAgent]. 
- bool doesConcern(String userAgent) => - appliesTo == '*' || appliesTo == userAgent; + bool appliesTo(String userAgent) => + appliesToAll || this.userAgent == userAgent; } -/// Extends `List` with a method for getting a single `Rule` from the -/// list of `Rulesets` -extension RulingOfRulesets on List { - /// Gets the rule which [appliesTo], [concernsPath] [andAllowsIt]. - Rule? getRule({ - required String appliesTo, - required String concernsPath, - required bool andAllowsIt, - }) => - fold(null, (current, next) { - if (!next.doesConcern(appliesTo)) { - return current; - } - - final currentPriority = current?.priority ?? -1; - final relevantRules = andAllowsIt ? next.allows : next.disallows; - final nextRule = relevantRules.getRulingOnPath(concernsPath); - - if (nextRule == null || nextRule.priority < currentPriority) { - return current; - } - return nextRule; - }); +/// Extends `List` with a method used to find a rule that matches +/// the supplied filters. +extension FindRuleInRuleset on List { + /// Gets the rule that applies to [userAgent], pertains to [path] and is of + /// type [type]. + Rule? findApplicableRule({ + required String userAgent, + required String path, + required RuleType type, + PrecedenceStrategy comparisonMethod = PrecedenceStrategy.defaultStrategy, + }) { + for (final ruleset in this) { + final rules = type == RuleType.allow ? ruleset.allows : ruleset.disallows; + if (rules.isEmpty) { + continue; + } + + if (!ruleset.appliesTo(userAgent)) { + continue; + } + + final rule = rules.findMostApplicable( + path: path, + comparisonMethod: comparisonMethod, + ); + if (rule != null) { + return rule; + } + } + + return null; + } } diff --git a/lib/src/utils.dart b/lib/src/utils.dart deleted file mode 100644 index 1813084..0000000 --- a/lib/src/utils.dart +++ /dev/null @@ -1,3 +0,0 @@ -/// Taking the singular form of [word], morphs it according to [count]. -String pluralise(String word, int count) => '${count == 0 ? 'no' : count} ' - '${count == 0 || count > 1 ? '${word}s' : word}'; diff --git a/pubspec.yaml b/pubspec.yaml index ed61e4a..b3e5038 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,20 +1,18 @@ name: robots_txt -version: 1.1.1 +version: 2.0.0 -description: >- - A lightweight `robots.txt` ruleset parser to ensure your application adheres - the de facto standard. +description: A complete, dependency-less and fully documented `robots.txt` ruleset parser. homepage: https://github.com/wordcollector/robots_txt repository: https://github.com/wordcollector/robots_txt issue_tracker: https://github.com/wordcollector/robots_txt/issues environment: - sdk: '>=2.13.0 <3.0.0' + sdk: '>=2.17.0 <3.0.0' dependencies: - sprint: ^1.0.4 - web_scraper: ^0.1.4 + meta: ^1.8.0 # Used for static analysis. dev_dependencies: - words: ^0.1.1 + test: ^1.22.1 # Testing. + words: ^0.2.0 # Stricter lints. diff --git a/test/contents_definitions.dart b/test/contents_definitions.dart new file mode 100644 index 0000000..e57b35a --- /dev/null +++ b/test/contents_definitions.dart @@ -0,0 +1,126 @@ +import 'package:robots_txt/robots_txt.dart'; + +/// Empty file contents. +const emptyContents = ''; + +/// Invalid `robots.txt` contents. +const invalidContents = 'This is an invalid robots.txt file.'; + +/// Valid `robots.txt` file with an invalid disallow field. +final validContentsInvalidPattern = ''' +${FieldType.userAgent.toField('A')} +${FieldType.disallow.toField(r'/\$')} +'''; + +/// Valid `robots.txt` file with all supported fields with example values. 
+final validContentsValidPattern = + FieldType.values.map((value) => value.toField()).join('\n'); + +/// Example rule fields without a user-agent. +final rulesWithoutUserAgent = + FieldType.rules.map((value) => value.toField()).join('\n'); + +/// Example rule fields defined before a user-agent. +final rulesDefinedBeforeUserAgent = [...FieldType.rules, FieldType.userAgent] + .map((value) => value.toField()) + .join(); + +/// Example sitemap field. +final sitemap = FieldType.sitemap.toField(); + +/// File disallowed for user-agent 'A'. +final fileDisallowedForA = ''' +${FieldType.userAgent.toField('A')} +${FieldType.disallow.toField('/file.txt')} +'''; + +/// File disallowed for user-agents 'A' and 'B'. +final fileDisallowedForAAndB = ''' +${FieldType.userAgent.toField('A')} +${FieldType.userAgent.toField('B')} +${FieldType.disallow.toField('/file.txt')} +'''; + +/// File disallowed for all user-agents. +final fileDisallowedForAll = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/file.txt')} +'''; + +/// File disallowed for all user-agents except 'A'. +final fileDisallowedForAllExceptA = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/file.txt')} +${FieldType.userAgent.toField('A')} +${FieldType.allow.toField('/file.txt')} +'''; + +/// Directory disallowed. +final directoryDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/directory/')} +'''; + +/// Directory disallowed, but not a certain file. +final directoryDisallowedButNotFile = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/directory/')} +${FieldType.allow.toField('/directory/file.txt')} +'''; + +/// Directory disallowed, but not its subdirectory. +final directoryDisallowedButNotSubdirectory = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/directory/')} +${FieldType.allow.toField('/directory/subdirectory/')} +'''; + +/// Nested directory disallowed. +final nestedDirectoryDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*/directory/')} +'''; + +/// Nested directory disallowed, but not its subdirectory. +final nestedDirectoryDisallowedButNotSubdirectory = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*/directory/')} +${FieldType.allow.toField('/*/directory/subdirectory/')} +'''; + +/// Nested file disallowed. +final nestedFileDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*/file.txt')} +'''; + +/// All files disallowed. +final allFilesDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*.*')} +'''; + +/// All directories disallowed. +final directoriesDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*/')} +'''; + +/// All text files disallowed, but not other files. +final textFilesDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*.txt')} +'''; + +/// Files containing a certain string disallowed. +final filesContainingStringDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('*/*string*.*')} +${FieldType.allow.toField('/*string*/')} +'''; + +/// Directories containing a certain string disallowed. 
+final directoriesContainingStringDisallowed = ''' +${FieldType.userAgent.toField('*')} +${FieldType.disallow.toField('/*string*/')} +'''; diff --git a/test/parser_test.dart b/test/parser_test.dart new file mode 100644 index 0000000..2b90577 --- /dev/null +++ b/test/parser_test.dart @@ -0,0 +1,402 @@ +import 'package:test/test.dart'; + +import 'package:robots_txt/robots_txt.dart'; + +import 'contents_definitions.dart'; + +void main() { + late Robots robots; + group('The parser correctly parses', () { + group('file contents', () { + test('that are empty.', () { + expect(() => robots = Robots.parse(emptyContents), returnsNormally); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(true)); + }); + + test('that are not valid.', () { + expect(() => robots = Robots.parse(invalidContents), returnsNormally); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(true)); + }); + + test('that are valid, but have an invalid pattern.', () { + expect( + () => robots = Robots.parse(validContentsInvalidPattern), + returnsNormally, + ); + }); + + test('that are valid.', () { + expect( + () => robots = Robots.parse(validContentsValidPattern), + returnsNormally, + ); + expect(robots.rulesets.length, equals(1)); + final ruleset = robots.rulesets.first; + expect(ruleset.disallows.length, equals(1)); + expect(ruleset.allows.length, equals(1)); + expect(robots.sitemaps.length, equals(1)); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(false)); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(true), + ); + }); + + test('that define a sitemap.', () { + expect(() => robots = Robots.parse(sitemap), returnsNormally); + expect(robots.sitemaps, equals([FieldType.sitemap.example])); + }); + }); + + group('logical rules', () { + test('defined without a user agent.', () { + expect( + () => robots = Robots.parse(rulesWithoutUserAgent), + returnsNormally, + ); + expect(robots.rulesets, equals([])); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(true)); + }); + + test('defined before a user agent.', () { + expect( + () => robots = Robots.parse(rulesDefinedBeforeUserAgent), + returnsNormally, + ); + expect(robots.rulesets, equals([])); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(true)); + }); + + test('that disallow a file for A.', () { + expect( + () => robots = Robots.parse(fileDisallowedForA), + returnsNormally, + ); + expect(robots.verifyCanAccess('/', userAgent: 'A'), equals(true)); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(false), + ); + }); + + test('that disallow a file for both A and B.', () { + expect( + () => robots = Robots.parse(fileDisallowedForAAndB), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'B'), + equals(false), + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'C'), + equals(true), + ); + }); + + test('that disallow a file for all user-agents.', () { + expect( + () => robots = Robots.parse(fileDisallowedForAll), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'B'), + equals(false), + ); + }); + + test('that disallow a file for all user-agents except A.', () { + expect( + () => robots = Robots.parse(fileDisallowedForAllExceptA), + returnsNormally, + ); + 
expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'B'), + equals(false), + ); + }); + }); + + group('rules', () { + test('that disallow a directory.', () { + expect( + () => robots = Robots.parse(directoryDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(false), + ); + }); + + test('that disallow a directory, but allow a file from within it.', () { + expect( + () => robots = Robots.parse(directoryDisallowedButNotFile), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'), + equals(true), + ); + }); + + test('that disallow a directory, but allow its subdirectory.', () { + expect( + () => robots = Robots.parse(directoryDisallowedButNotSubdirectory), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/directory/subdirectory/', userAgent: 'A'), + equals(true), + ); + }); + + test('that disallow a nested directory.', () { + expect( + () => robots = Robots.parse(nestedDirectoryDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/one/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/one/two/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/one/two/three/', userAgent: 'A'), + equals(true), + ); + }); + + test('that disallow a nested directory, but allow its subdirectory.', () { + expect( + () => robots = Robots.parse( + nestedDirectoryDisallowedButNotSubdirectory, + ), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/nest/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/nest/directory/subdirectory/', + userAgent: 'A', + ), + equals(true), + ); + }); + + test('that disallow a nested file.', () { + expect( + () => robots = Robots.parse(nestedFileDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/file.txt', + userAgent: 'A', + ), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/file_2.txt', + userAgent: 'A', + ), + equals(true), + ); + }); + + test('that disallow files.', () { + expect( + () => robots = Robots.parse(allFilesDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/path', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/file.txt', + userAgent: 'A', + ), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/', + userAgent: 'A', + ), + equals(true), + ); + }); + + test('that disallow directories.', () { + expect( + () => robots = 
Robots.parse(directoriesDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/file', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/directory/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/file.txt', + userAgent: 'A', + ), + equals(false), + ); + expect( + robots.verifyCanAccess( + '/directory/subdirectory/', + userAgent: 'A', + ), + equals(false), + ); + }); + + test('that disallow only text files.', () { + expect( + () => robots = Robots.parse(textFilesDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/file.pdf', userAgent: 'A'), + equals(true), + ); + }); + + test('that disallow files that contain a certain string.', () { + expect( + () => robots = Robots.parse(filesContainingStringDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/file.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/string.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/abc|string.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/string|abc.txt', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/string/file.txt', userAgent: 'A'), + equals(true), + ); + }); + + test('that disallow directories that contain a certain string.', () { + expect( + () => robots = Robots.parse(directoriesContainingStringDisallowed), + returnsNormally, + ); + expect( + robots.verifyCanAccess('/string.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/directory/string.txt', userAgent: 'A'), + equals(true), + ); + expect( + robots.verifyCanAccess('/string/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/abc|string/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/string|abc/', userAgent: 'A'), + equals(false), + ); + expect( + robots.verifyCanAccess('/one/two/three/string/five/', userAgent: 'A'), + equals(false), + ); + }); + }); + }); +}
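
For quick reference, below is a minimal, self-contained sketch of the 2.0.0 API this patch introduces (`Robots.parse`, `verifyCanAccess`, `sitemaps`). The `robots.txt` contents and the user-agent used here are illustrative only; the expected results mirror the behaviour exercised by the new tests (`directoryDisallowedButNotFile` and the sitemap test). `verifyCanAccess` also accepts the optional `typePrecedence` and `comparisonMethod` parameters defined in `lib/src/parser.dart` and `lib/src/rule.dart`, which are not demonstrated here.

```dart
import 'package:robots_txt/robots_txt.dart';

void main() {
  // Illustrative file contents; in practice these would be fetched from the
  // host's `/robots.txt`, as shown in `example/example.dart`.
  const contents = '''
User-agent: *
Disallow: /directory/
Allow: /directory/file.txt
Sitemap: https://example.com/sitemap.xml
''';

  // Parse the contents into rulesets and sitemap links.
  final robots = Robots.parse(contents);

  // Prints: [https://example.com/sitemap.xml]
  print(robots.sitemaps);

  // False: the directory is disallowed for all user-agents.
  print(robots.verifyCanAccess('/directory/', userAgent: 'A'));

  // True: the file is explicitly allowed, and allow rules take precedence
  // under the default `PrecedentRuleType.allow`.
  print(robots.verifyCanAccess('/directory/file.txt', userAgent: 'A'));
}
```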