Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/changelog.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@
"DOWNLOADER"
"SCRAPYCL"
"HTTP"))
(0.2.1 2025-04-10
"
Changed
=======

Added `dont-filter` slot (initarg `:dont-filter`, reader `request-dont-filter`) to `scrapycl:request`.
When it is set to T, the request bypasses the seen-URL filter, allowing repeated scraping of the same URL.
")
(0.2.0 2025-02-07
"
Changed
Expand Down
1 change: 1 addition & 0 deletions src/core.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#:response-body
#:response-headers)
(:export #:request-url
#:request-dont-filter
#:request)
(:export #:typed-output)
(:export #:json-lines
Expand Down
17 changes: 10 additions & 7 deletions src/engine.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#:start
#:process
#:enqueue
#:request-url)
#:request-url
#:request-dont-filter)
(:import-from #:scrapycl/spider)
(:import-from #:scrapycl/task
#:task
Expand Down Expand Up @@ -50,7 +51,7 @@
(setf (gethash url seen-urls)
t)))
(declare (dynamic-extent #'seen-url-p))

(loop for task = (get-next-task spider)
while task
do (let* (;; It is important to have this var
Expand All @@ -71,9 +72,11 @@
;; We need this block to avoid visiting the same URL twice and
;; to break link loops. Requests with DONT-FILTER set bypass this check:
(scrapycl/core:request
(unless (seen-url-p (request-url object))
(register-url (request-url object))
(enqueue spider object)))
(cond ((request-dont-filter object)
(enqueue spider object))
((not (seen-url-p (request-url object)))
(register-url (request-url object))
(enqueue spider object))))
(t
(enqueue spider object)))))
(declare (dynamic-extent #'walk))
Expand Down Expand Up @@ -138,7 +141,7 @@
(unless results
(log:debug "Process didn't return new objects"))
results)))

(:method ((spider t) (object t))
(cond
(*output-func*
Expand All @@ -153,7 +156,7 @@
(bt2:with-lock-held ((scrapycl/spider::%spider-queue-lock spider))
(clear-queue (scrapycl/spider::%spider-queue spider))
(values))

(uiop:while-collecting (collect-item)
(let* ((output-is-function
(and output
Expand Down
8 changes: 7 additions & 1 deletion src/request.lisp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
(:import-from #:scrapycl/core
#:request
#:request-url
#:request-dont-filter
#:url))
(in-package #:scrapycl/request)

Expand All @@ -12,7 +13,12 @@
:type url
:initform (error "Please, provide :URL argument.")
:reader request-url
:documentation "URL to fetch data from.")))
:documentation "URL to fetch data from.")
(dont-filter :initarg :dont-filter
:type (member t nil)
:initform nil
:reader request-dont-filter
:documentation "Exclude request from filtering.")))


(defmethod print-object ((obj request) stream)
Expand Down