
Commit 8f7d712

document main crawler elements
1 parent 93ddf47 commit 8f7d712

File tree

7 files changed (+23 additions, -9 deletions):

  crates/core/src/crawler/mod.rs
  crates/core/src/crawler/planner.rs
  crates/core/src/crawler/robot_client.rs
  crates/core/src/crawler/{wander_prirotiser.rs → wander_prioritiser.rs}
  crates/core/src/crawler/warc_writer.rs
  crates/core/src/crawler/worker.rs
  crates/core/src/live_index/crawler/crawlable_site.rs


crates/core/src/crawler/mod.rs

Lines changed: 11 additions & 2 deletions
@@ -14,6 +14,15 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <https://www.gnu.org/licenses/>.

+//! # Crawler
+//!
+//! The crawler is responsible for fetching webpages and storing them in WARC files
+//! for later processing.
+//!
+//! Before starting a crawl, a plan needs to be created. This plan is then used by
+//! the crawler coordinator to assign sites to crawl to different workers.
+//! A site is only assigned to one worker at a time for politeness.
+
 use std::{collections::VecDeque, future::Future, net::SocketAddr, sync::Arc};

 type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;

@@ -35,7 +44,7 @@ pub use router::Router;
 mod file_queue;
 pub mod planner;
 pub mod robot_client;
-mod wander_prirotiser;
+mod wander_prioritiser;
 mod warc_writer;
 mod worker;

@@ -304,7 +313,7 @@ impl Crawler {
     }
 }

-pub trait DatumStream: Send + Sync {
+pub trait DatumSink: Send + Sync {
     fn write(&self, crawl_datum: CrawlDatum) -> impl Future<Output = Result<()>> + Send;
     fn finish(&self) -> impl Future<Output = Result<()>> + Send;
 }
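The rename from DatumStream to DatumSink makes the role of the trait clearer: it is the write-only contract between the crawl loop and whatever stores the results. As a rough illustration of how a consumer plugs in, here is a minimal, self-contained sketch; CrawlDatum and the Result alias are stand-ins for the crawler module's real types, and CountingSink is hypothetical, not something in the codebase.

use std::future::Future;
use std::sync::atomic::{AtomicUsize, Ordering};

struct CrawlDatum; // stand-in for crawler::CrawlDatum
type Result<T> = anyhow::Result<T>; // stand-in for the crawler's Result alias

pub trait DatumSink: Send + Sync {
    fn write(&self, crawl_datum: CrawlDatum) -> impl Future<Output = Result<()>> + Send;
    fn finish(&self) -> impl Future<Output = Result<()>> + Send;
}

/// Hypothetical sink that just counts the datums it receives.
#[derive(Default)]
struct CountingSink {
    count: AtomicUsize,
}

impl DatumSink for CountingSink {
    async fn write(&self, _crawl_datum: CrawlDatum) -> Result<()> {
        self.count.fetch_add(1, Ordering::Relaxed);
        Ok(())
    }

    async fn finish(&self) -> Result<()> {
        Ok(())
    }
}

The same shape appears twice later in this commit: WarcWriter implements DatumSink by forwarding datums to a writer task, and the live-index crawler implements it for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> to buffer datums in memory.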

crates/core/src/crawler/planner.rs

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,7 @@
 //
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <https://www.gnu.org/licenses/>.
+
 use anyhow::{anyhow, Result};
 use futures::stream::FuturesOrdered;
 use futures::StreamExt;

@@ -71,6 +72,7 @@ impl From<StoredUrl> for Url {
     }
 }

+/// Store urls in groups on disk based on their harmonic rank.
 struct UrlGrouper {
     groups: Vec<speedy_kv::Db<StoredUrl, ()>>,
     folder: std::path::PathBuf,

@@ -169,6 +171,7 @@ struct Budget {
     remaining_schedulable: u64,
 }

+/// Create a crawl plan based on the harmonic rank of the hosts.
 pub struct CrawlPlanner {
     host_centrality: Arc<speedy_kv::Db<NodeID, f64>>,
     host_centrality_rank: Arc<speedy_kv::Db<NodeID, u64>>,
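The commit only documents UrlGrouper and CrawlPlanner; the grouping policy itself is not part of this diff. Purely as a hypothetical sketch of what "groups based on harmonic rank" can look like, a logarithmic bucketing function keeps the best-ranked hosts in the smallest, earliest groups:

// Hypothetical: group_for_rank is not from the codebase, just an illustration
// of rank-based bucketing. Lower (better) harmonic ranks map to earlier groups.
fn group_for_rank(rank: u64, num_groups: usize) -> usize {
    let group = ((rank + 1) as f64).log10().floor() as usize;
    group.min(num_groups - 1)
}

fn main() {
    assert_eq!(group_for_rank(7, 4), 0);      // top-ranked hosts land in group 0
    assert_eq!(group_for_rank(12_345, 4), 3); // the long tail is capped at the last group
}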

crates/core/src/crawler/robot_client.rs

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ pub(super) fn reqwest_client(config: &CrawlerConfig) -> Result<reqwest::Client>
         .map_err(|e| Error::from(anyhow!(e)))
 }

+/// Reqwest client that respects robots.txt for each request.
 #[derive(Clone)]
 pub struct RobotClient {
     robots_txt_manager: RobotsTxtManager,
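RobotClient centralises the rule that no request goes out without consulting robots.txt. The sketch below shows the gating idea only, with a deliberately naive parser; the real client goes through RobotsTxtManager, which caches and properly parses the rules. fetch_if_allowed and disallowed_prefixes are illustrative names, not crate APIs.

use anyhow::Result;
use url::Url;

// Naive robots.txt handling for illustration: only `Disallow:` prefixes in the
// `User-agent: *` group are honoured.
fn disallowed_prefixes(robots_txt: &str) -> Vec<String> {
    let mut in_star_group = false;
    let mut prefixes = Vec::new();
    for line in robots_txt.lines() {
        let line = line.trim();
        if let Some(agent) = line.strip_prefix("User-agent:") {
            in_star_group = agent.trim() == "*";
        } else if in_star_group {
            if let Some(path) = line.strip_prefix("Disallow:") {
                let path = path.trim();
                if !path.is_empty() {
                    prefixes.push(path.to_string());
                }
            }
        }
    }
    prefixes
}

async fn fetch_if_allowed(client: &reqwest::Client, url: &Url) -> Result<Option<String>> {
    let robots_url = format!(
        "{}://{}/robots.txt",
        url.scheme(),
        url.host_str().unwrap_or_default()
    );
    let robots_txt = match client.get(&robots_url).send().await {
        Ok(resp) => resp.text().await.unwrap_or_default(),
        Err(_) => String::new(), // treat an unreachable robots.txt as "allow" in this sketch
    };

    if disallowed_prefixes(&robots_txt)
        .iter()
        .any(|prefix| url.path().starts_with(prefix.as_str()))
    {
        return Ok(None); // politely skip the disallowed path
    }

    Ok(Some(client.get(url.as_str()).send().await?.text().await?))
}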

crates/core/src/crawler/{wander_prirotiser.rs → wander_prioritiser.rs}

File renamed without changes.

crates/core/src/crawler/warc_writer.rs

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ use crate::{
     warc,
 };

-use super::{CrawlDatum, DatumStream, Error, Result};
+use super::{CrawlDatum, DatumSink, Error, Result};
 use anyhow::anyhow;

 /// The WarcWriter is responsible for storing the crawl datums

@@ -30,7 +30,7 @@ pub struct WarcWriter {
     tx: tokio::sync::mpsc::Sender<WarcWriterMessage>,
 }

-impl DatumStream for WarcWriter {
+impl DatumSink for WarcWriter {
     async fn write(&self, crawl_datum: CrawlDatum) -> Result<()> {
         self.tx
             .send(WarcWriterMessage::Crawl(crawl_datum))
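WarcWriter's DatumSink impl is just a forwarder: write sends the datum over an mpsc channel, and a background task owns the actual WARC file handling. A stripped-down sketch of that pattern follows; Message and ChannelSink are stand-ins, not the crate's types.

use tokio::sync::mpsc;

// Stand-in message type; the real writer sends CrawlDatum plus a finish signal.
enum Message {
    Crawl(String),
    Finish,
}

struct ChannelSink {
    tx: mpsc::Sender<Message>,
}

impl ChannelSink {
    // Must be called from within a tokio runtime, since it spawns the consumer task.
    fn new() -> Self {
        let (tx, mut rx) = mpsc::channel::<Message>(128);
        tokio::spawn(async move {
            while let Some(msg) = rx.recv().await {
                match msg {
                    // This is where the real WarcWriter appends the datum to a WARC file.
                    Message::Crawl(body) => println!("writing {} bytes", body.len()),
                    Message::Finish => break,
                }
            }
        });
        Self { tx }
    }

    async fn write(&self, body: String) -> anyhow::Result<()> {
        self.tx
            .send(Message::Crawl(body))
            .await
            .map_err(|e| anyhow::anyhow!("writer task is gone: {e}"))
    }

    async fn finish(self) -> anyhow::Result<()> {
        self.tx
            .send(Message::Finish)
            .await
            .map_err(|e| anyhow::anyhow!("writer task is gone: {e}"))
    }
}

Keeping the file I/O behind a channel means every worker can share one cloned sender while a single task serialises writes to the WARC output.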

crates/core/src/crawler/worker.rs

Lines changed: 5 additions & 4 deletions
@@ -39,8 +39,8 @@ use crate::{
 };

 use super::{
-    encoded_body, robot_client::RobotClient, wander_prirotiser::WanderPrioritiser, CrawlDatum,
-    DatumStream, Domain, Error, Result, RetrieableUrl, Site, WarcWriter, WeightedUrl, WorkerJob,
+    encoded_body, robot_client::RobotClient, wander_prioritiser::WanderPrioritiser, CrawlDatum,
+    DatumSink, Domain, Error, Result, RetrieableUrl, Site, WarcWriter, WeightedUrl, WorkerJob,
     MAX_CONTENT_LENGTH, MAX_OUTGOING_URLS_PER_PAGE,
 };

@@ -126,7 +126,8 @@ impl WorkerThread {
     }
 }

-pub struct JobExecutor<S: DatumStream> {
+/// JobExecutor receives a job from the coordinator and crawls the urls in the job.
+pub struct JobExecutor<S: DatumSink> {
     writer: Arc<S>,
     client: RobotClient,
     has_gotten_429_response: bool,

@@ -144,7 +145,7 @@ pub struct JobExecutor<S: DatumStream> {
     job: WorkerJob,
 }

-impl<S: DatumStream> JobExecutor<S> {
+impl<S: DatumSink> JobExecutor<S> {
     pub fn new(
         job: WorkerJob,
         config: Arc<CrawlerConfig>,
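The new doc comment pins down JobExecutor's role: one job, one site, crawled by a single worker. A hedged sketch of what that loop roughly amounts to is below; the real executor also handles retries, 429 back-off and the wander prioritiser, and crawl_job is an illustrative name, not a function in the codebase.

use std::time::Duration;

// Illustrative only: sequential fetching of a job's urls with a fixed delay.
// Because a site is assigned to exactly one worker at a time, a sequential
// loop like this is what keeps the crawler polite towards the site.
async fn crawl_job(urls: Vec<String>, client: &reqwest::Client) -> anyhow::Result<()> {
    for url in urls {
        let resp = client.get(&url).send().await?;
        let _body = resp.text().await?;
        // ...turn the response into a CrawlDatum and hand it to the DatumSink...
        tokio::time::sleep(Duration::from_secs(1)).await; // politeness delay
    }
    Ok(())
}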

crates/core/src/live_index/crawler/crawlable_site.rs

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ impl CrawlableSite {
     }
 }

-impl crawler::DatumStream for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> {
+impl crawler::DatumSink for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> {
     async fn write(&self, crawl_datum: crawler::CrawlDatum) -> Result<(), crawler::Error> {
         self.lock().await.push(crawl_datum);
         Ok(())
