
Commit 8f7d712

document main crawler elements
1 parent 93ddf47 commit 8f7d712

File tree

7 files changed (+23 additions, -9 deletions):

  crates/core/src/crawler/mod.rs
  crates/core/src/crawler/planner.rs
  crates/core/src/crawler/robot_client.rs
  crates/core/src/crawler/{wander_prirotiser.rs → wander_prioritiser.rs}
  crates/core/src/crawler/warc_writer.rs
  crates/core/src/crawler/worker.rs
  crates/core/src/live_index/crawler/crawlable_site.rs


crates/core/src/crawler/mod.rs

Lines changed: 11 additions & 2 deletions
@@ -14,6 +14,15 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <https://www.gnu.org/licenses/>.

+//! # Crawler
+//!
+//! The crawler is responsible for fetching webpages and storing them in WARC files
+//! for later processing.
+//!
+//! Before starting a crawl, a plan needs to be created. This plan is then used by
+//! the crawler coordinator to assign sites to crawl to different workers.
+//! A site is only assigned to one worker at a time for politeness.
+
 use std::{collections::VecDeque, future::Future, net::SocketAddr, sync::Arc};

 type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;

@@ -35,7 +44,7 @@ pub use router::Router;
 mod file_queue;
 pub mod planner;
 pub mod robot_client;
-mod wander_prirotiser;
+mod wander_prioritiser;
 mod warc_writer;
 mod worker;

@@ -304,7 +313,7 @@ impl Crawler {
     }
 }

-pub trait DatumStream: Send + Sync {
+pub trait DatumSink: Send + Sync {
     fn write(&self, crawl_datum: CrawlDatum) -> impl Future<Output = Result<()>> + Send;
     fn finish(&self) -> impl Future<Output = Result<()>> + Send;
 }
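The rename from DatumStream to DatumSink makes the role of the trait clearer: it is the write-only contract between the crawl loop and whatever stores the results. As a rough illustration of how a consumer plugs in, here is a minimal, self-contained sketch; CrawlDatum and the Result alias are stand-ins for the crawler module's real types, and CountingSink is hypothetical, not something in the codebase.

use std::future::Future;
use std::sync::atomic::{AtomicUsize, Ordering};

struct CrawlDatum; // stand-in for crawler::CrawlDatum
type Result<T> = anyhow::Result<T>; // stand-in for the crawler's Result alias

pub trait DatumSink: Send + Sync {
    fn write(&self, crawl_datum: CrawlDatum) -> impl Future<Output = Result<()>> + Send;
    fn finish(&self) -> impl Future<Output = Result<()>> + Send;
}

/// Hypothetical sink that just counts the datums it receives.
#[derive(Default)]
struct CountingSink {
    count: AtomicUsize,
}

impl DatumSink for CountingSink {
    async fn write(&self, _crawl_datum: CrawlDatum) -> Result<()> {
        self.count.fetch_add(1, Ordering::Relaxed);
        Ok(())
    }

    async fn finish(&self) -> Result<()> {
        Ok(())
    }
}

The same shape appears twice later in this commit: WarcWriter implements DatumSink by forwarding datums to a writer task, and the live-index crawler implements it for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> to buffer datums in memory.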

crates/core/src/crawler/planner.rs

Lines changed: 3 additions & 0 deletions
@@ -13,6 +13,7 @@
 //
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <https://www.gnu.org/licenses/>.
+
 use anyhow::{anyhow, Result};
 use futures::stream::FuturesOrdered;
 use futures::StreamExt;

@@ -71,6 +72,7 @@ impl From<StoredUrl> for Url {
     }
 }

+/// Store urls in groups on disk based on their harmonic rank.
 struct UrlGrouper {
     groups: Vec<speedy_kv::Db<StoredUrl, ()>>,
     folder: std::path::PathBuf,

@@ -169,6 +171,7 @@ struct Budget {
     remaining_schedulable: u64,
 }

+/// Create a crawl plan based on the harmonic rank of the hosts.
 pub struct CrawlPlanner {
     host_centrality: Arc<speedy_kv::Db<NodeID, f64>>,
     host_centrality_rank: Arc<speedy_kv::Db<NodeID, u64>>,
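The commit only documents UrlGrouper and CrawlPlanner; the grouping policy itself is not part of this diff. Purely as a hypothetical sketch of what "groups based on harmonic rank" can look like, a logarithmic bucketing function keeps the best-ranked hosts in the smallest, earliest groups:

// Hypothetical: group_for_rank is not from the codebase, just an illustration
// of rank-based bucketing. Lower (better) harmonic ranks map to earlier groups.
fn group_for_rank(rank: u64, num_groups: usize) -> usize {
    let group = ((rank + 1) as f64).log10().floor() as usize;
    group.min(num_groups - 1)
}

fn main() {
    assert_eq!(group_for_rank(7, 4), 0);      // top-ranked hosts land in group 0
    assert_eq!(group_for_rank(12_345, 4), 3); // the long tail is capped at the last group
}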

crates/core/src/crawler/robot_client.rs

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ pub(super) fn reqwest_client(config: &CrawlerConfig) -> Result<reqwest::Client>
         .map_err(|e| Error::from(anyhow!(e)))
 }

+/// Reqwest client that respects robots.txt for each request.
 #[derive(Clone)]
 pub struct RobotClient {
     robots_txt_manager: RobotsTxtManager,
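RobotClient centralises the rule that no request goes out without consulting robots.txt. The sketch below shows the gating idea only, with a deliberately naive parser; the real client goes through RobotsTxtManager, which caches and properly parses the rules. fetch_if_allowed and disallowed_prefixes are illustrative names, not crate APIs.

use anyhow::Result;
use url::Url;

// Naive robots.txt handling for illustration: only `Disallow:` prefixes in the
// `User-agent: *` group are honoured.
fn disallowed_prefixes(robots_txt: &str) -> Vec<String> {
    let mut in_star_group = false;
    let mut prefixes = Vec::new();
    for line in robots_txt.lines() {
        let line = line.trim();
        if let Some(agent) = line.strip_prefix("User-agent:") {
            in_star_group = agent.trim() == "*";
        } else if in_star_group {
            if let Some(path) = line.strip_prefix("Disallow:") {
                let path = path.trim();
                if !path.is_empty() {
                    prefixes.push(path.to_string());
                }
            }
        }
    }
    prefixes
}

async fn fetch_if_allowed(client: &reqwest::Client, url: &Url) -> Result<Option<String>> {
    let robots_url = format!(
        "{}://{}/robots.txt",
        url.scheme(),
        url.host_str().unwrap_or_default()
    );
    let robots_txt = match client.get(&robots_url).send().await {
        Ok(resp) => resp.text().await.unwrap_or_default(),
        Err(_) => String::new(), // treat an unreachable robots.txt as "allow" in this sketch
    };

    if disallowed_prefixes(&robots_txt)
        .iter()
        .any(|prefix| url.path().starts_with(prefix.as_str()))
    {
        return Ok(None); // politely skip the disallowed path
    }

    Ok(Some(client.get(url.as_str()).send().await?.text().await?))
}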

crates/core/src/crawler/{wander_prirotiser.rs → wander_prioritiser.rs}

File renamed without changes.

crates/core/src/crawler/warc_writer.rs

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ use crate::{
     warc,
 };

-use super::{CrawlDatum, DatumStream, Error, Result};
+use super::{CrawlDatum, DatumSink, Error, Result};
 use anyhow::anyhow;

 /// The WarcWriter is responsible for storing the crawl datums

@@ -30,7 +30,7 @@ pub struct WarcWriter {
     tx: tokio::sync::mpsc::Sender<WarcWriterMessage>,
 }

-impl DatumStream for WarcWriter {
+impl DatumSink for WarcWriter {
     async fn write(&self, crawl_datum: CrawlDatum) -> Result<()> {
         self.tx
             .send(WarcWriterMessage::Crawl(crawl_datum))
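WarcWriter's DatumSink impl is just a forwarder: write sends the datum over an mpsc channel, and a background task owns the actual WARC file handling. A stripped-down sketch of that pattern follows; Message and ChannelSink are stand-ins, not the crate's types.

use tokio::sync::mpsc;

// Stand-in message type; the real writer sends CrawlDatum plus a finish signal.
enum Message {
    Crawl(String),
    Finish,
}

struct ChannelSink {
    tx: mpsc::Sender<Message>,
}

impl ChannelSink {
    // Must be called from within a tokio runtime, since it spawns the consumer task.
    fn new() -> Self {
        let (tx, mut rx) = mpsc::channel::<Message>(128);
        tokio::spawn(async move {
            while let Some(msg) = rx.recv().await {
                match msg {
                    // This is where the real WarcWriter appends the datum to a WARC file.
                    Message::Crawl(body) => println!("writing {} bytes", body.len()),
                    Message::Finish => break,
                }
            }
        });
        Self { tx }
    }

    async fn write(&self, body: String) -> anyhow::Result<()> {
        self.tx
            .send(Message::Crawl(body))
            .await
            .map_err(|e| anyhow::anyhow!("writer task is gone: {e}"))
    }

    async fn finish(self) -> anyhow::Result<()> {
        self.tx
            .send(Message::Finish)
            .await
            .map_err(|e| anyhow::anyhow!("writer task is gone: {e}"))
    }
}

Keeping the file I/O behind a channel means every worker can share one cloned sender while a single task serialises writes to the WARC output.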

crates/core/src/crawler/worker.rs

Lines changed: 5 additions & 4 deletions
@@ -39,8 +39,8 @@ use crate::{
 };

 use super::{
-    encoded_body, robot_client::RobotClient, wander_prirotiser::WanderPrioritiser, CrawlDatum,
-    DatumStream, Domain, Error, Result, RetrieableUrl, Site, WarcWriter, WeightedUrl, WorkerJob,
+    encoded_body, robot_client::RobotClient, wander_prioritiser::WanderPrioritiser, CrawlDatum,
+    DatumSink, Domain, Error, Result, RetrieableUrl, Site, WarcWriter, WeightedUrl, WorkerJob,
     MAX_CONTENT_LENGTH, MAX_OUTGOING_URLS_PER_PAGE,
 };

@@ -126,7 +126,8 @@ impl WorkerThread {
     }
 }

-pub struct JobExecutor<S: DatumStream> {
+/// JobExecutor receives a job from the coordinator and crawls the urls in the job.
+pub struct JobExecutor<S: DatumSink> {
     writer: Arc<S>,
     client: RobotClient,
     has_gotten_429_response: bool,

@@ -144,7 +145,7 @@ pub struct JobExecutor<S: DatumStream> {
     job: WorkerJob,
 }

-impl<S: DatumStream> JobExecutor<S> {
+impl<S: DatumSink> JobExecutor<S> {
     pub fn new(
         job: WorkerJob,
         config: Arc<CrawlerConfig>,
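The new doc comment pins down JobExecutor's role: one job, one site, crawled by a single worker. A hedged sketch of what that loop roughly amounts to is below; the real executor also handles retries, 429 back-off and the wander prioritiser, and crawl_job is an illustrative name, not a function in the codebase.

use std::time::Duration;

// Illustrative only: sequential fetching of a job's urls with a fixed delay.
// Because a site is assigned to exactly one worker at a time, a sequential
// loop like this is what keeps the crawler polite towards the site.
async fn crawl_job(urls: Vec<String>, client: &reqwest::Client) -> anyhow::Result<()> {
    for url in urls {
        let resp = client.get(&url).send().await?;
        let _body = resp.text().await?;
        // ...turn the response into a CrawlDatum and hand it to the DatumSink...
        tokio::time::sleep(Duration::from_secs(1)).await; // politeness delay
    }
    Ok(())
}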

crates/core/src/live_index/crawler/crawlable_site.rs

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ impl CrawlableSite {
     }
 }

-impl crawler::DatumStream for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> {
+impl crawler::DatumSink for tokio::sync::Mutex<Vec<crawler::CrawlDatum>> {
     async fn write(&self, crawl_datum: crawler::CrawlDatum) -> Result<(), crawler::Error> {
         self.lock().await.push(crawl_datum);
         Ok(())
