From 77695922e739bd5b97e7fdca1afcff1dd3d168a8 Mon Sep 17 00:00:00 2001
From: yan3ku
Date: Tue, 1 Apr 2025 17:51:49 +0200
Subject: [PATCH 1/2] Add request-dont-filter

---
 src/core.lisp    |  1 +
 src/engine.lisp  | 17 ++++++++++-------
 src/request.lisp |  8 +++++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/core.lisp b/src/core.lisp
index fdd2edf..36efc3c 100644
--- a/src/core.lisp
+++ b/src/core.lisp
@@ -18,6 +18,7 @@
            #:response-body
            #:response-headers)
   (:export #:request-url
+           #:request-dont-filter
            #:request)
   (:export #:typed-output)
   (:export #:json-lines
diff --git a/src/engine.lisp b/src/engine.lisp
index 5954a2e..4d9db36 100644
--- a/src/engine.lisp
+++ b/src/engine.lisp
@@ -7,7 +7,8 @@
                 #:start
                 #:process
                 #:enqueue
-                #:request-url)
+                #:request-url
+                #:request-dont-filter)
   (:import-from #:scrapycl/spider)
   (:import-from #:scrapycl/task
                 #:task
@@ -50,7 +51,7 @@
                  (setf (gethash url seen-urls) t)))
           (declare (dynamic-extent #'seen-url-p))
-          
+
           (loop for task = (get-next-task spider)
                 while task
                 do (let* (;; It is important to have this var
@@ -71,9 +72,11 @@
                            ;; We need this block to not visit same URL twice and
                            ;; to break link loops:
                            (scrapycl/core:request
-                             (unless (seen-url-p (request-url object))
-                               (register-url (request-url object))
-                               (enqueue spider object)))
+                             (cond ((request-dont-filter object)
+                                    (enqueue spider object))
+                                   ((not (seen-url-p (request-url object)))
+                                    (register-url (request-url object))
+                                    (enqueue spider object))))
                            (t
                             (enqueue spider object)))))
                 (declare (dynamic-extent #'walk))
@@ -138,7 +141,7 @@
                  (unless results
                    (log:debug "Process didn't return new objects"))
                  results)))
-  
+
   (:method ((spider t) (object t))
     (cond
       (*output-func*
@@ -153,7 +156,7 @@
   (bt2:with-lock-held ((scrapycl/spider::%spider-queue-lock spider))
     (clear-queue (scrapycl/spider::%spider-queue spider))
     (values))
-  
+
   (uiop:while-collecting (collect-item)
     (let* ((output-is-function
              (and output
diff --git a/src/request.lisp b/src/request.lisp
index 81196cd..c53c3a1 100644
--- a/src/request.lisp
+++ b/src/request.lisp
@@ -3,6 +3,7 @@
   (:import-from #:scrapycl/core
                 #:request
                 #:request-url
+                #:request-dont-filter
                 #:url))
 
 (in-package #:scrapycl/request)
@@ -12,7 +13,12 @@
         :type url
         :initform (error "Please, provide :URL argument.")
         :reader request-url
-        :documentation "URL to fetch data from.")))
+        :documentation "URL to fetch data from.")
+   (dont-filter :initarg :dont-filter
+                :type (member t nil)
+                :initform nil
+                :reader request-dont-filter
+                :documentation "Exclude request from filtering.")))
 
 
 (defmethod print-object ((obj request) stream)

From 021778b43c59432902968101364dc68561a98729 Mon Sep 17 00:00:00 2001
From: yan3ku
Date: Thu, 10 Apr 2025 11:09:25 +0200
Subject: [PATCH 2/2] Update changelog

---
 docs/changelog.lisp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/changelog.lisp b/docs/changelog.lisp
index acd76ed..3471cac 100644
--- a/docs/changelog.lisp
+++ b/docs/changelog.lisp
@@ -14,6 +14,14 @@
                              "DOWNLOADER"
                              "SCRAPYCL"
                              "HTTP"))
+  (0.2.1 2025-04-10
+         "
+Changed
+=======
+
+Added `request-dont-filter` slot to `scrapycl:request`, allowing repeated scraping of the
+same URL when set to T.
+")
   (0.2.0 2025-02-07
          "
 Changed
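
Usage sketch (not part of the patches): with these changes applied, a spider could
return a request built with :DONT-FILTER T to re-fetch a URL the engine has already
marked as seen. The PAGE-REQUEST subclass and the literal URL below are hypothetical;
only the :URL and :DONT-FILTER initargs and the REQUEST-DONT-FILTER reader come from
patch 1/2.

  ;; A request class for pages we may want to revisit.
  (defclass page-request (scrapycl:request)
    ())

  ;; Built with :DONT-FILTER T, this request is enqueued even if its URL was
  ;; visited before, instead of being dropped by the seen-URL check.
  (make-instance 'page-request
                 :url "https://example.com/news"
                 :dont-filter t)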