From 0455273a3601dba19d748c0e33d50bee70d29492 Mon Sep 17 00:00:00 2001
From: Aras Abbasi
Date: Fri, 26 Sep 2025 17:52:02 +0200
Subject: [PATCH 1/2] docs: add crawling best practices

---
 docs/docs/best-practices/crawling.md | 56 ++++++++++++++++++++++++++++
 docs/docsify/sidebar.md              |  1 +
 test/fetch/user-agent.js             | 20 ++++++++++
 3 files changed, 77 insertions(+)
 create mode 100644 docs/docs/best-practices/crawling.md

diff --git a/docs/docs/best-practices/crawling.md b/docs/docs/best-practices/crawling.md
new file mode 100644
index 00000000000..6646ca33afb
--- /dev/null
+++ b/docs/docs/best-practices/crawling.md
@@ -0,0 +1,56 @@
# Crawling

[RFC 9309](https://datatracker.ietf.org/doc/html/rfc9309) defines crawlers as automated clients.

Some web servers may reject requests that omit the `User-Agent` header or that use common defaults such as `'curl/7.79.1'`.

In **undici**, the default user agent is `'undici'`. Because undici is bundled into Node.js core as the implementation of the global `fetch()`, requests made through the built-in `fetch()` use `'node'` as the default user agent instead.

Specify a **custom `User-Agent` header** when implementing a crawler. A descriptive user agent allows servers to identify the client correctly and reduces the likelihood of requests being denied.

A user agent string should include sufficient detail to identify the crawler and provide contact information. For example:

```
AcmeCo Crawler - acme.co - contact@acme.co
```

If a crawler behaves unexpectedly—for example, due to misconfiguration or implementation errors—server administrators can use the information in the user agent to contact the operator and coordinate an appropriate resolution.

The `User-Agent` header can be set on individual requests or applied globally by configuring a custom dispatcher.

**Example: setting a `User-Agent` per request**

```js
import { fetch } from 'undici'

const headers = {
  'User-Agent': 'AcmeCo Crawler - acme.co - contact@acme.co'
}

const res = await fetch('https://example.com', { headers })
```

## Best Practices for Crawlers

When developing a crawler, the following practices are recommended in addition to setting a descriptive `User-Agent` header:

* **Respect `robots.txt`**
  Follow the directives defined in the target site's `robots.txt` file, including disallowed paths and optional crawl-delay settings (see [W3C guidelines](https://www.w3.org/wiki/Write_Web_Crawler)).

* **Rate limiting**
  Regulate request frequency to avoid imposing excessive load on servers. Introduce delays between requests or limit the number of concurrent requests. The W3C suggests waiting at least one second between requests (the crawl-loop sketch after this list shows one way to combine this with backoff).

* **Error handling**
  Implement retry logic with exponential backoff for transient failures (such as HTTP 429 or 5xx responses), and stop sending requests when errors persist (for example, repeated HTTP 403 responses).

* **Monitoring and logging**
  Track request volume, response codes, and error rates to detect misbehavior and address issues proactively.

* **Contact information**
  Always include valid and current contact details in the `User-Agent` string so that administrators can reach the crawler operator if necessary.
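
**Example (sketch): applying a `User-Agent` globally with a custom dispatcher**

As noted above, the header can also be applied globally by configuring a custom dispatcher. The following is a minimal sketch of that approach using `Agent`, `Dispatcher#compose`, and `setGlobalDispatcher`. The `withUserAgent` interceptor is illustrative: it only rewrites the plain-object headers passed by `undici.request()`, so treat it as a starting point rather than a production-ready implementation.

```js
import { Agent, setGlobalDispatcher, request } from 'undici'

const userAgent = 'AcmeCo Crawler - acme.co - contact@acme.co'

// Interceptor: add a User-Agent when the caller passed plain-object headers
// (or none at all), which is what undici.request() provides. Other header
// shapes (for example the header list used internally by fetch) are left
// untouched. A real implementation would also check for an existing header
// under any casing.
function withUserAgent (dispatch) {
  return function (opts, handler) {
    if (opts.headers == null || Object.getPrototypeOf(opts.headers) === Object.prototype) {
      opts.headers = { 'user-agent': userAgent, ...opts.headers }
    }
    return dispatch(opts, handler)
  }
}

setGlobalDispatcher(new Agent().compose(withUserAgent))

// Subsequent undici.request() calls carry the crawler identity automatically.
const { statusCode } = await request('https://example.com')
console.log(statusCode)
```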
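
**Example (sketch): rate limiting with retries and exponential backoff**

The snippet below illustrates the rate-limiting and error-handling recommendations from the list above. The `politeFetch` and `crawl` helpers, the retry count, and the one-second delay are illustrative choices for this guide, not undici APIs; adjust them to the target site's `robots.txt` and response behaviour.

```js
import { fetch } from 'undici'
import { setTimeout as sleep } from 'node:timers/promises'

const headers = { 'User-Agent': 'AcmeCo Crawler - acme.co - contact@acme.co' }

// Fetch one URL, backing off exponentially on responses that are usually transient.
async function politeFetch (url, retries = 3) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    const res = await fetch(url, { headers })
    if (res.status === 403) {
      // Persistent refusal: stop instead of hammering the server.
      throw new Error(`Access denied for ${url}`)
    }
    if (res.status === 429 || res.status >= 500) {
      await res.body?.cancel() // discard the failed body before retrying
      await sleep(1000 * 2 ** attempt) // back off: 1s, 2s, 4s, ...
      continue
    }
    return res
  }
  throw new Error(`Giving up on ${url} after ${retries + 1} attempts`)
}

// Crawl sequentially and wait at least one second between requests.
async function crawl (urls) {
  for (const url of urls) {
    const res = await politeFetch(url)
    console.log(url, res.status, (await res.text()).length)
    await sleep(1000)
  }
}

await crawl(['https://example.com/'])
```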

## References and Further Reading

* [RFC 9309: The Robots Exclusion Protocol](https://datatracker.ietf.org/doc/html/rfc9309)
* [W3C Wiki: Write Web Crawler](https://www.w3.org/wiki/Write_Web_Crawler)
* [Ethical Web Crawling (WWW 2010 Conference Paper)](https://archives.iw3c2.org/www2010/proceedings/www/p1101.pdf)

diff --git a/docs/docsify/sidebar.md b/docs/docsify/sidebar.md
index 9b5ad541d21..8e5e7fc08f5 100644
--- a/docs/docsify/sidebar.md
+++ b/docs/docsify/sidebar.md
@@ -42,3 +42,4 @@
 * [Client Certificate](/docs/best-practices/client-certificate.md "Connect using a client certificate")
 * [Writing Tests](/docs/best-practices/writing-tests.md "Using Undici inside tests")
 * [Mocking Request](/docs/best-practices/mocking-request.md "Using Undici inside tests")
+ * [Crawling](/docs/best-practices/crawling.md "Crawling")

diff --git a/test/fetch/user-agent.js b/test/fetch/user-agent.js
index ad3749dd53d..119bff175de 100644
--- a/test/fetch/user-agent.js
+++ b/test/fetch/user-agent.js
@@ -25,3 +25,23 @@ test('user-agent defaults correctly', async (t) => {
   t.assert.strictEqual(nodeBuildJSON.userAgentHeader, 'node')
   t.assert.strictEqual(undiciJSON.userAgentHeader, 'undici')
 })
+
+test('set user-agent for fetch', async (t) => {
+  const server = http.createServer({ joinDuplicateHeaders: true }, (req, res) => {
+    res.end(JSON.stringify({ userAgentHeader: req.headers['user-agent'] }))
+  })
+  t.after(closeServerAsPromise(server))
+
+  server.listen(0)
+  await events.once(server, 'listening')
+  const url = `http://localhost:${server.address().port}`
+  const [nodeBuildJSON, undiciJSON] = await Promise.all([
+    nodeBuild.fetch(url, { headers: { 'user-agent': 'AcmeCo Crawler - acme.co - node@acme.co' } }).then((body) => body.json()),
+    undici.fetch(url, {
+      headers: { 'user-agent': 'AcmeCo Crawler - acme.co - undici@acme.co' }
+    }).then((body) => body.json())
+  ])
+
+  t.assert.strictEqual(nodeBuildJSON.userAgentHeader, 'AcmeCo Crawler - acme.co - node@acme.co')
+  t.assert.strictEqual(undiciJSON.userAgentHeader, 'AcmeCo Crawler - acme.co - undici@acme.co')
+})

From 41804ee7a48be948e4cfa057db232f653848c6b7 Mon Sep 17 00:00:00 2001
From: Aras Abbasi
Date: Fri, 26 Sep 2025 18:01:04 +0200
Subject: [PATCH 2/2] add part about gdpr aspects

---
 docs/docs/best-practices/crawling.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/docs/best-practices/crawling.md b/docs/docs/best-practices/crawling.md
index 6646ca33afb..96d67b2f0cc 100644
--- a/docs/docs/best-practices/crawling.md
+++ b/docs/docs/best-practices/crawling.md
@@ -14,6 +14,8 @@ A user agent string should include sufficient detail to identify the crawler and
 AcmeCo Crawler - acme.co - contact@acme.co
 ```

+When adding contact details, avoid using personal identifiers such as your own name or a private email address—especially in a professional or employment context. Instead, use a role-based or organizational contact (e.g., crawler-team@company.com) to protect individual privacy while still enabling communication.
+
 If a crawler behaves unexpectedly—for example, due to misconfiguration or implementation errors—server administrators can use the information in the user agent to contact the operator and coordinate an appropriate resolution.

 The `User-Agent` header can be set on individual requests or applied globally by configuring a custom dispatcher.