Add DomainLocks to serialize requests to each domain

Every async task must now wait at least one second after the most recent
request to a domain before making another request to that same domain.
2023-07-14 00:52:36 -04:00
parent b7efc61cfc
commit 923776d7a1
7 changed files with 157 additions and 28 deletions
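
The `DomainLocks` type itself lives in one of the changed files not shown on this page. Below is a minimal sketch of a type that would satisfy the call sites in the two diffs: a cheaply cloneable handle whose `run_request(domain, future)` serializes callers per domain and spaces their requests one second apart. Everything beyond the `DomainLocks` name, the `run_request` shape, and the one-second interval is an assumption.

    use std::collections::HashMap;
    use std::future::Future;
    use std::sync::Arc;
    use std::time::Duration;

    use tokio::sync::Mutex;
    use tokio::time::{sleep, Instant};

    const DOMAIN_REQUEST_INTERVAL: Duration = Duration::from_secs(1);

    /// Assumed shape: a shared map from domain name to a mutex guarding the
    /// instant of the last request to that domain. Holding the per-domain
    /// mutex across the request serializes every task targeting that domain.
    #[derive(Clone, Default)]
    pub struct DomainLocks {
        domains: Arc<Mutex<HashMap<String, Arc<Mutex<Option<Instant>>>>>>,
    }

    impl DomainLocks {
        pub fn new() -> Self {
            Self::default()
        }

        /// Waits until at least one second has passed since the previous
        /// request to `domain`, runs `f`, then records the new request time.
        pub async fn run_request<F, T>(&self, domain: &str, f: F) -> T
        where
            F: Future<Output = T>,
        {
            // Fetch (or create) this domain's lock, releasing the map lock
            // right away so tasks for other domains are never blocked on it.
            let domain_lock = {
                let mut domains = self.domains.lock().await;
                Arc::clone(domains.entry(domain.to_owned()).or_default())
            };
            let mut last_request = domain_lock.lock().await;
            if let Some(instant) = *last_request {
                let elapsed = instant.elapsed();
                if elapsed < DOMAIN_REQUEST_INTERVAL {
                    sleep(DOMAIN_REQUEST_INTERVAL - elapsed).await;
                }
            }
            let result = f.await;
            *last_request = Some(Instant::now());
            result
        }
    }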

File: the `EntryCrawler` actor

@@ -1,20 +1,17 @@
 use std::fmt::{self, Display, Formatter};
 use std::fs;
 use std::path::Path;
-use std::sync::Arc;
 
 use bytes::Buf;
-use feed_rs::parser;
 use readability::extractor;
 use reqwest::Client;
 use sqlx::PgPool;
-use tokio::sync::{broadcast, mpsc, Mutex};
+use tokio::sync::{broadcast, mpsc};
 use tracing::{info, instrument};
 use url::Url;
 
-use crate::config::Config;
-use crate::models::entry::{update_entry, CreateEntry, Entry};
-use crate::models::feed::{upsert_feed, CreateFeed, Feed};
+use crate::domain_locks::DomainLocks;
+use crate::models::entry::Entry;
 
 /// The `EntryCrawler` actor fetches an entry url, extracts the content, and saves the content to
 /// the file system and any associated metadata to the database.
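
For orientation, the surrounding actor pattern (unchanged by this commit): the handle side owns an mpsc sender, and the spawned task drains the receiver. A sketch of that loop, with `handle_message` as an assumed name for the per-message work, since the real `run` lies outside the hunks shown:

    impl EntryCrawler {
        // Sketch of the actor's receive loop; the task ends once every
        // handle (and thus every sender) has been dropped.
        async fn run(&mut self) {
            while let Some(msg) = self.receiver.recv().await {
                self.handle_message(msg).await;
            }
        }
    }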
@@ -27,6 +24,7 @@ struct EntryCrawler {
     receiver: mpsc::Receiver<EntryCrawlerMessage>,
     pool: PgPool,
     client: Client,
+    domain_locks: DomainLocks,
    content_dir: String,
 }
 
@@ -68,12 +66,14 @@ impl EntryCrawler {
         receiver: mpsc::Receiver<EntryCrawlerMessage>,
         pool: PgPool,
         client: Client,
+        domain_locks: DomainLocks,
         content_dir: String,
     ) -> Self {
         EntryCrawler {
             receiver,
             pool,
             client,
+            domain_locks,
             content_dir,
         }
     }
@@ -84,17 +84,26 @@ impl EntryCrawler {
         let content_dir = Path::new(&self.content_dir);
         let url =
             Url::parse(&entry.url).map_err(|_| EntryCrawlerError::InvalidUrl(entry.url.clone()))?;
+        let domain = url
+            .domain()
+            .ok_or(EntryCrawlerError::InvalidUrl(entry.url.clone()))?;
         let bytes = self
-            .client
-            .get(url.clone())
-            .send()
-            .await
-            .map_err(|_| EntryCrawlerError::FetchError(entry.url.clone()))?
-            .bytes()
-            .await
-            .map_err(|_| EntryCrawlerError::FetchError(entry.url.clone()))?;
+            .domain_locks
+            .run_request(domain, async {
+                self.client
+                    .get(url.clone())
+                    .send()
+                    .await
+                    .map_err(|_| EntryCrawlerError::FetchError(entry.url.clone()))?
+                    .bytes()
+                    .await
+                    .map_err(|_| EntryCrawlerError::FetchError(entry.url.clone()))
+            })
+            .await?;
         info!("fetched entry");
         let article = extractor::extract(&mut bytes.reader(), &url)
             .map_err(|_| EntryCrawlerError::ExtractError(entry.url.clone()))?;
         info!("extracted content");
         let id = entry.entry_id;
+        // TODO: update entry with scraped data
+        // if let Some(date) = article.date {
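
Note the shape of the new fetch: the final `map_err` inside the async block carries no `?`, so the block itself evaluates to a `Result`, and the single `.await?` after `run_request` propagates any fetch error to the caller. The same shape in a standalone toy:

    // Toy version of the pattern above: `?` inside the async block
    // short-circuits to the block's own `Result`; one `.await?` outside
    // unwraps it after the future resolves.
    async fn fetch_like() -> Result<u32, String> {
        let guarded = async {
            let n: u32 = "42".parse().map_err(|_| "not a number".to_string())?;
            Ok::<u32, String>(n)
        };
        let n = guarded.await?;
        Ok(n)
    }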
@@ -109,6 +118,7 @@ impl EntryCrawler {
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
         fs::write(content_dir.join(format!("{}.txt", id)), article.text)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
+        info!("saved content to filesystem");
         Ok(entry)
     }
 
@@ -153,9 +163,14 @@ pub enum EntryCrawlerHandleMessage {
 
 impl EntryCrawlerHandle {
     /// Creates an async actor task that will listen for messages on the `sender` channel.
-    pub fn new(pool: PgPool, client: Client, content_dir: String) -> Self {
+    pub fn new(
+        pool: PgPool,
+        client: Client,
+        domain_locks: DomainLocks,
+        content_dir: String,
+    ) -> Self {
         let (sender, receiver) = mpsc::channel(8);
-        let mut crawler = EntryCrawler::new(receiver, pool, client, content_dir);
+        let mut crawler = EntryCrawler::new(receiver, pool, client, domain_locks, content_dir);
         tokio::spawn(async move { crawler.run().await });
         Self { sender }
     }
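
Since both crawler handles now take a `DomainLocks`, the startup code (in another of the seven changed files, not shown here) presumably builds one instance and clones it into each handle so the one-second rule holds across all crawlers. A hypothetical wiring, with `spawn_crawlers` an invented name:

    use reqwest::Client;
    use sqlx::PgPool;

    // Hypothetical wiring (invented function; the real call site is not in
    // this diff): a single DomainLocks cloned into each crawler handle.
    fn spawn_crawlers(pool: PgPool, client: Client, content_dir: String) -> FeedCrawlerHandle {
        let domain_locks = DomainLocks::new();
        FeedCrawlerHandle::new(pool, client, domain_locks, content_dir)
    }

The `FeedCrawler` then forwards its own clone when it spawns an `EntryCrawlerHandle`, as the next file shows.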

File: the `FeedCrawler` actor

@@ -10,6 +10,7 @@ use tracing::{info, info_span, instrument};
 use url::Url;
 
 use crate::actors::entry_crawler::EntryCrawlerHandle;
+use crate::domain_locks::DomainLocks;
 use crate::models::entry::{upsert_entries, CreateEntry, Entry};
 use crate::models::feed::{upsert_feed, CreateFeed, Feed};
 
@@ -23,6 +24,7 @@ struct FeedCrawler {
     receiver: mpsc::Receiver<FeedCrawlerMessage>,
     pool: PgPool,
     client: Client,
+    domain_locks: DomainLocks,
     content_dir: String,
 }
 
@@ -46,6 +48,8 @@ impl Display for FeedCrawlerMessage {
 /// across threads (does not reference the originating Errors which are usually not cloneable).
 #[derive(thiserror::Error, Debug, Clone)]
 pub enum FeedCrawlerError {
+    #[error("invalid feed url: {0}")]
+    InvalidUrl(Url),
     #[error("failed to fetch feed: {0}")]
     FetchError(Url),
     #[error("failed to parse feed: {0}")]
@@ -62,27 +66,36 @@ impl FeedCrawler {
         receiver: mpsc::Receiver<FeedCrawlerMessage>,
         pool: PgPool,
         client: Client,
+        domain_locks: DomainLocks,
         content_dir: String,
     ) -> Self {
         FeedCrawler {
             receiver,
             pool,
             client,
+            domain_locks,
             content_dir,
         }
     }
 
     #[instrument(skip_all, fields(url = %url))]
     async fn crawl_feed(&self, url: Url) -> FeedCrawlerResult<Feed> {
+        let domain = url
+            .domain()
+            .ok_or(FeedCrawlerError::InvalidUrl(url.clone()))?;
         let bytes = self
-            .client
-            .get(url.clone())
-            .send()
-            .await
-            .map_err(|_| FeedCrawlerError::FetchError(url.clone()))?
-            .bytes()
-            .await
-            .map_err(|_| FeedCrawlerError::FetchError(url.clone()))?;
+            .domain_locks
+            .run_request(domain, async {
+                self.client
+                    .get(url.clone())
+                    .send()
+                    .await
+                    .map_err(|_| FeedCrawlerError::FetchError(url.clone()))?
+                    .bytes()
+                    .await
+                    .map_err(|_| FeedCrawlerError::FetchError(url.clone()))
+            })
+            .await?;
         info!("fetched feed");
         let parsed_feed =
             parser::parse(&bytes[..]).map_err(|_| FeedCrawlerError::ParseError(url.clone()))?;
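
The new `InvalidUrl` variant exists because `Url::domain()` returns `None` when the host is an IP address (or the URL has no host at all), and the per-domain lock needs a domain string as its key:

    use url::Url;

    fn main() {
        // A named host yields a domain the lock can key on...
        let named = Url::parse("https://example.com/feed.xml").unwrap();
        assert_eq!(named.domain(), Some("example.com"));

        // ...but an IP-address host does not, hence InvalidUrl.
        let by_ip = Url::parse("https://127.0.0.1/feed.xml").unwrap();
        assert_eq!(by_ip.domain(), None);
    }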
@@ -128,6 +141,7 @@ impl FeedCrawler {
         let entry_crawler = EntryCrawlerHandle::new(
             self.pool.clone(),
             self.client.clone(),
+            self.domain_locks.clone(),
             self.content_dir.clone(),
         );
         // TODO: ignoring this receiver for the time being, pipe through events eventually
@@ -179,9 +193,14 @@ pub enum FeedCrawlerHandleMessage {
 
 impl FeedCrawlerHandle {
     /// Creates an async actor task that will listen for messages on the `sender` channel.
-    pub fn new(pool: PgPool, client: Client, content_dir: String) -> Self {
+    pub fn new(
+        pool: PgPool,
+        client: Client,
+        domain_locks: DomainLocks,
+        content_dir: String,
+    ) -> Self {
         let (sender, receiver) = mpsc::channel(8);
-        let mut crawler = FeedCrawler::new(receiver, pool, client, content_dir);
+        let mut crawler = FeedCrawler::new(receiver, pool, client, domain_locks, content_dir);
         tokio::spawn(async move { crawler.run().await });
         Self { sender }
     }
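
Under the semantics sketched at the top (per-domain mutex plus last-request timestamp), two concurrent requests to one domain should complete at least a second apart, while requests to different domains proceed in parallel. An illustrative check against that sketch, not a test from this repository:

    use std::time::Duration;
    use tokio::time::Instant;

    #[tokio::test]
    async fn second_request_to_a_domain_waits() {
        // Both tasks contend for the same per-domain lock; whichever runs
        // second must sleep out the remainder of the one-second interval.
        let locks = DomainLocks::new();
        let start = Instant::now();
        let (a, b) = tokio::join!(
            locks.run_request("example.com", async { Instant::now() }),
            locks.run_request("example.com", async { Instant::now() }),
        );
        let later = if a > b { a } else { b };
        assert!(later.duration_since(start) >= Duration::from_secs(1));
    }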