From 0455273a3601dba19d748c0e33d50bee70d29492 Mon Sep 17 00:00:00 2001
From: Aras Abbasi
Date: Fri, 26 Sep 2025 17:52:02 +0200
Subject: [PATCH 1/2] docs: add crawling best practices

---
 docs/docs/best-practices/crawling.md | 56 ++++++++++++++++++++++++++++
 docs/docsify/sidebar.md              |  1 +
 test/fetch/user-agent.js             | 20 ++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 docs/docs/best-practices/crawling.md

diff --git a/docs/docs/best-practices/crawling.md b/docs/docs/best-practices/crawling.md
new file mode 100644
index 00000000000..6646ca33afb
--- /dev/null
+++ b/docs/docs/best-practices/crawling.md
@@ -0,0 +1,56 @@
# Crawling

[RFC 9309](https://datatracker.ietf.org/doc/html/rfc9309) defines crawlers as automated clients.

Some web servers may reject requests that omit the `User-Agent` header or that use common defaults such as `'curl/7.79.1'`.

In **undici**, the default user agent is `'undici'`. Because undici is bundled into Node.js core as the implementation of the global `fetch()`, requests made through the built-in `fetch()` use `'node'` as the default user agent instead.

Specify a **custom `User-Agent` header** when implementing a crawler. A descriptive user agent allows servers to identify the client correctly and reduces the likelihood of requests being denied.

A user agent string should include sufficient detail to identify the crawler and provide contact information. For example:

```
AcmeCo Crawler - acme.co - contact@acme.co
```

If a crawler behaves unexpectedly—for example, due to misconfiguration or implementation errors—server administrators can use the information in the user agent to contact the operator and coordinate an appropriate resolution.

The `User-Agent` header can be set on individual requests or applied globally by configuring a custom dispatcher.

**Example: setting a `User-Agent` per request**

```js
import { fetch } from 'undici'

const headers = {
  'User-Agent': 'AcmeCo Crawler - acme.co - contact@acme.co'
}

const res = await fetch('https://example.com', { headers })
```

## Best Practices for Crawlers

When developing a crawler, the following practices are recommended in addition to setting a descriptive `User-Agent` header:

* **Respect `robots.txt`**
  Follow the directives defined in the target site's `robots.txt` file, including disallowed paths and optional crawl-delay settings (see [W3C guidelines](https://www.w3.org/wiki/Write_Web_Crawler)).

* **Rate limiting**
  Regulate request frequency to avoid imposing excessive load on servers. Introduce delays between requests or limit the number of concurrent requests. The W3C suggests waiting at least one second between requests (the crawl-loop sketch after this list shows one way to combine this with backoff).

* **Error handling**
  Implement retry logic with exponential backoff for transient failures (such as HTTP 429 or 5xx responses), and stop sending requests when errors persist (for example, repeated HTTP 403 responses).

* **Monitoring and logging**
  Track request volume, response codes, and error rates to detect misbehavior and address issues proactively.

* **Contact information**
  Always include valid and current contact details in the `User-Agent` string so that administrators can reach the crawler operator if necessary.
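
**Example (sketch): applying a `User-Agent` globally with a custom dispatcher**

As noted above, the header can also be applied globally by configuring a custom dispatcher. The following is a minimal sketch of that approach using `Agent`, `Dispatcher#compose`, and `setGlobalDispatcher`. The `withUserAgent` interceptor is illustrative: it only rewrites the plain-object headers passed by `undici.request()`, so treat it as a starting point rather than a production-ready implementation.

```js
import { Agent, setGlobalDispatcher, request } from 'undici'

const userAgent = 'AcmeCo Crawler - acme.co - contact@acme.co'

// Interceptor: add a User-Agent when the caller passed plain-object headers
// (or none at all), which is what undici.request() provides. Other header
// shapes (for example the header list used internally by fetch) are left
// untouched. A real implementation would also check for an existing header
// under any casing.
function withUserAgent (dispatch) {
  return function (opts, handler) {
    if (opts.headers == null || Object.getPrototypeOf(opts.headers) === Object.prototype) {
      opts.headers = { 'user-agent': userAgent, ...opts.headers }
    }
    return dispatch(opts, handler)
  }
}

setGlobalDispatcher(new Agent().compose(withUserAgent))

// Subsequent undici.request() calls carry the crawler identity automatically.
const { statusCode } = await request('https://example.com')
console.log(statusCode)
```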
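
**Example (sketch): rate limiting with retries and exponential backoff**

The snippet below illustrates the rate-limiting and error-handling recommendations from the list above. The `politeFetch` and `crawl` helpers, the retry count, and the one-second delay are illustrative choices for this guide, not undici APIs; adjust them to the target site's `robots.txt` and response behaviour.

```js
import { fetch } from 'undici'
import { setTimeout as sleep } from 'node:timers/promises'

const headers = { 'User-Agent': 'AcmeCo Crawler - acme.co - contact@acme.co' }

// Fetch one URL, backing off exponentially on responses that are usually transient.
async function politeFetch (url, retries = 3) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    const res = await fetch(url, { headers })
    if (res.status === 403) {
      // Persistent refusal: stop instead of hammering the server.
      throw new Error(`Access denied for ${url}`)
    }
    if (res.status === 429 || res.status >= 500) {
      await res.body?.cancel() // discard the failed body before retrying
      await sleep(1000 * 2 ** attempt) // back off: 1s, 2s, 4s, ...
      continue
    }
    return res
  }
  throw new Error(`Giving up on ${url} after ${retries + 1} attempts`)
}

// Crawl sequentially and wait at least one second between requests.
async function crawl (urls) {
  for (const url of urls) {
    const res = await politeFetch(url)
    console.log(url, res.status, (await res.text()).length)
    await sleep(1000)
  }
}

await crawl(['https://example.com/'])
```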

## References and Further Reading

* [RFC 9309: The Robots Exclusion Protocol](https://datatracker.ietf.org/doc/html/rfc9309)
* [W3C Wiki: Write Web Crawler](https://www.w3.org/wiki/Write_Web_Crawler)
* [Ethical Web Crawling (WWW 2010 Conference Paper)](https://archives.iw3c2.org/www2010/proceedings/www/p1101.pdf)

diff --git a/docs/docsify/sidebar.md b/docs/docsify/sidebar.md
index 9b5ad541d21..8e5e7fc08f5 100644
--- a/docs/docsify/sidebar.md
+++ b/docs/docsify/sidebar.md
@@ -42,3 +42,4 @@
 * [Client Certificate](/docs/best-practices/client-certificate.md "Connect using a client certificate")
 * [Writing Tests](/docs/best-practices/writing-tests.md "Using Undici inside tests")
 * [Mocking Request](/docs/best-practices/mocking-request.md "Using Undici inside tests")
+ * [Crawling](/docs/best-practices/crawling.md "Crawling")

diff --git a/test/fetch/user-agent.js b/test/fetch/user-agent.js
index ad3749dd53d..119bff175de 100644
--- a/test/fetch/user-agent.js
+++ b/test/fetch/user-agent.js
@@ -25,3 +25,23 @@ test('user-agent defaults correctly', async (t) => {
   t.assert.strictEqual(nodeBuildJSON.userAgentHeader, 'node')
   t.assert.strictEqual(undiciJSON.userAgentHeader, 'undici')
 })
+
+test('set user-agent for fetch', async (t) => {
+  const server = http.createServer({ joinDuplicateHeaders: true }, (req, res) => {
+    res.end(JSON.stringify({ userAgentHeader: req.headers['user-agent'] }))
+  })
+  t.after(closeServerAsPromise(server))
+
+  server.listen(0)
+  await events.once(server, 'listening')
+  const url = `http://localhost:${server.address().port}`
+  const [nodeBuildJSON, undiciJSON] = await Promise.all([
+    nodeBuild.fetch(url, { headers: { 'user-agent': 'AcmeCo Crawler - acme.co - node@acme.co' } }).then((body) => body.json()),
+    undici.fetch(url, {
+      headers: { 'user-agent': 'AcmeCo Crawler - acme.co - undici@acme.co' }
+    }).then((body) => body.json())
+  ])
+
+  t.assert.strictEqual(nodeBuildJSON.userAgentHeader, 'AcmeCo Crawler - acme.co - node@acme.co')
+  t.assert.strictEqual(undiciJSON.userAgentHeader, 'AcmeCo Crawler - acme.co - undici@acme.co')
+})

From 41804ee7a48be948e4cfa057db232f653848c6b7 Mon Sep 17 00:00:00 2001
From: Aras Abbasi
Date: Fri, 26 Sep 2025 18:01:04 +0200
Subject: [PATCH 2/2] add part about gdpr aspects

---
 docs/docs/best-practices/crawling.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/docs/best-practices/crawling.md b/docs/docs/best-practices/crawling.md
index 6646ca33afb..96d67b2f0cc 100644
--- a/docs/docs/best-practices/crawling.md
+++ b/docs/docs/best-practices/crawling.md
@@ -14,6 +14,8 @@ A user agent string should include sufficient detail to identify the crawler and
 AcmeCo Crawler - acme.co - contact@acme.co
 ```

+When adding contact details, avoid using personal identifiers such as your own name or a private email address—especially in a professional or employment context. Instead, use a role-based or organizational contact (e.g., crawler-team@company.com) to protect individual privacy while still enabling communication.
+
 If a crawler behaves unexpectedly—for example, due to misconfiguration or implementation errors—server administrators can use the information in the user agent to contact the operator and coordinate an appropriate resolution.

 The `User-Agent` header can be set on individual requests or applied globally by configuring a custom dispatcher.