Move feed fetching to crawl_feed job, DomainRequestLimiter
`DomainRequestLimiter` is a distributed version of `DomainLocks` based on redis.
This commit is contained in:
parent
9c75a88c69
commit
65eac1975c
53
Cargo.lock
generated
53
Cargo.lock
generated
@ -605,12 +605,9 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cookie-factory"
|
name = "cookie-factory"
|
||||||
version = "0.3.3"
|
version = "0.3.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2"
|
checksum = "396de984970346b0d9e93d1415082923c679e5ae5c3ee3dcbd104f5610af126b"
|
||||||
dependencies = [
|
|
||||||
"futures",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "core-foundation"
|
name = "core-foundation"
|
||||||
@ -658,6 +655,7 @@ dependencies = [
|
|||||||
"clap",
|
"clap",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"feed-rs",
|
"feed-rs",
|
||||||
|
"fred 9.1.2",
|
||||||
"futures",
|
"futures",
|
||||||
"headers",
|
"headers",
|
||||||
"http 1.1.0",
|
"http 1.1.0",
|
||||||
@ -668,6 +666,7 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"opml",
|
"opml",
|
||||||
"password-auth",
|
"password-auth",
|
||||||
|
"rand",
|
||||||
"readability",
|
"readability",
|
||||||
"reqwest 0.12.4",
|
"reqwest 0.12.4",
|
||||||
"serde",
|
"serde",
|
||||||
@ -1014,7 +1013,33 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
"parking_lot",
|
"parking_lot",
|
||||||
"rand",
|
"rand",
|
||||||
"redis-protocol",
|
"redis-protocol 4.1.0",
|
||||||
|
"semver",
|
||||||
|
"socket2",
|
||||||
|
"tokio",
|
||||||
|
"tokio-stream",
|
||||||
|
"tokio-util",
|
||||||
|
"url",
|
||||||
|
"urlencoding",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fred"
|
||||||
|
version = "9.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "152397076bd317aa06bca9666e954ad15cde1a8f17b6ea4b007cf0bfc074d1d0"
|
||||||
|
dependencies = [
|
||||||
|
"arc-swap",
|
||||||
|
"async-trait",
|
||||||
|
"bytes",
|
||||||
|
"bytes-utils",
|
||||||
|
"crossbeam-queue",
|
||||||
|
"float-cmp",
|
||||||
|
"futures",
|
||||||
|
"log",
|
||||||
|
"parking_lot",
|
||||||
|
"rand",
|
||||||
|
"redis-protocol 5.0.1",
|
||||||
"semver",
|
"semver",
|
||||||
"socket2",
|
"socket2",
|
||||||
"tokio",
|
"tokio",
|
||||||
@ -2533,6 +2558,20 @@ dependencies = [
|
|||||||
"nom",
|
"nom",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "redis-protocol"
|
||||||
|
version = "5.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "65deb7c9501fbb2b6f812a30d59c0253779480853545153a51d8e9e444ddc99f"
|
||||||
|
dependencies = [
|
||||||
|
"bytes",
|
||||||
|
"bytes-utils",
|
||||||
|
"cookie-factory",
|
||||||
|
"crc16",
|
||||||
|
"log",
|
||||||
|
"nom",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redox_syscall"
|
name = "redox_syscall"
|
||||||
version = "0.4.1"
|
version = "0.4.1"
|
||||||
@ -3684,7 +3723,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "0460effd120251714fae61067fa3e8b2a3c8202501cdcae375d4bd14194c85cf"
|
checksum = "0460effd120251714fae61067fa3e8b2a3c8202501cdcae375d4bd14194c85cf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"async-trait",
|
"async-trait",
|
||||||
"fred",
|
"fred 8.0.6",
|
||||||
"rmp-serde",
|
"rmp-serde",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
"time",
|
"time",
|
||||||
|
@ -31,6 +31,7 @@ chrono = { version = "0.4", features = ["serde"] }
|
|||||||
clap = { version = "4.4", features = ["derive", "env"] }
|
clap = { version = "4.4", features = ["derive", "env"] }
|
||||||
dotenvy = "0.15"
|
dotenvy = "0.15"
|
||||||
feed-rs = "1.3"
|
feed-rs = "1.3"
|
||||||
|
fred = "9"
|
||||||
futures = "0.3"
|
futures = "0.3"
|
||||||
headers = "0.4"
|
headers = "0.4"
|
||||||
http = "1.0.0"
|
http = "1.0.0"
|
||||||
@ -44,6 +45,7 @@ notify = "6"
|
|||||||
once_cell = "1.18"
|
once_cell = "1.18"
|
||||||
opml = "1.1"
|
opml = "1.1"
|
||||||
password-auth = "1.0"
|
password-auth = "1.0"
|
||||||
|
rand = { version = "0.8", features = ["small_rng"] }
|
||||||
readability = "0.3"
|
readability = "0.3"
|
||||||
reqwest = { version = "0.12", features = ["json"] }
|
reqwest = { version = "0.12", features = ["json"] }
|
||||||
serde = { version = "1", features = ["derive"] }
|
serde = { version = "1", features = ["derive"] }
|
||||||
|
@ -66,7 +66,9 @@ pub async fn crawl_fn(job: Crawl, state: Data<Arc<State>>) -> Result<(), CrawlEr
|
|||||||
// self.spawn_crawler_loop(feed, respond_to.clone());
|
// self.spawn_crawler_loop(feed, respond_to.clone());
|
||||||
// TODO: implement uniqueness on jobs per feed for ~1 minute
|
// TODO: implement uniqueness on jobs per feed for ~1 minute
|
||||||
apalis
|
apalis
|
||||||
.push(AsyncJob::CrawlFeed(CrawlFeedJob { feed }))
|
.push(AsyncJob::CrawlFeed(CrawlFeedJob {
|
||||||
|
feed_id: feed.feed_id,
|
||||||
|
}))
|
||||||
.await
|
.await
|
||||||
.map_err(|err| CrawlError::QueueJobError(err.to_string()))?;
|
.map_err(|err| CrawlError::QueueJobError(err.to_string()))?;
|
||||||
}
|
}
|
||||||
|
@ -5,11 +5,16 @@ use apalis::prelude::*;
|
|||||||
use apalis_redis::RedisStorage;
|
use apalis_redis::RedisStorage;
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
use dotenvy::dotenv;
|
use dotenvy::dotenv;
|
||||||
|
use fred::prelude::*;
|
||||||
|
use reqwest::Client;
|
||||||
|
use sqlx::postgres::PgPoolOptions;
|
||||||
use tower::retry::RetryLayer;
|
use tower::retry::RetryLayer;
|
||||||
|
|
||||||
use lib::config::Config;
|
use lib::config::Config;
|
||||||
|
use lib::domain_request_limiter::DomainRequestLimiter;
|
||||||
use lib::jobs::{handle_async_job, AsyncJob};
|
use lib::jobs::{handle_async_job, AsyncJob};
|
||||||
use lib::log::init_worker_tracing;
|
use lib::log::init_worker_tracing;
|
||||||
|
use lib::USER_AGENT;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
@ -23,11 +28,27 @@ async fn main() -> Result<()> {
|
|||||||
let apalis_storage: RedisStorage<AsyncJob> =
|
let apalis_storage: RedisStorage<AsyncJob> =
|
||||||
RedisStorage::new_with_config(redis_conn, apalis_config);
|
RedisStorage::new_with_config(redis_conn, apalis_config);
|
||||||
|
|
||||||
|
let redis_config = RedisConfig::from_url(&config.redis_url)?;
|
||||||
|
let redis_pool = RedisPool::new(redis_config, None, None, None, 5)?;
|
||||||
|
redis_pool.connect();
|
||||||
|
redis_pool.wait_for_connect().await?;
|
||||||
|
let domain_request_limiter = DomainRequestLimiter::new(redis_pool, 10, 5, 100, 0.5);
|
||||||
|
|
||||||
|
let http_client = Client::builder().user_agent(USER_AGENT).build()?;
|
||||||
|
let db = PgPoolOptions::new()
|
||||||
|
.max_connections(config.database_max_connections)
|
||||||
|
.acquire_timeout(std::time::Duration::from_secs(3))
|
||||||
|
.connect(&config.database_url)
|
||||||
|
.await?;
|
||||||
|
|
||||||
Monitor::<TokioExecutor>::new()
|
Monitor::<TokioExecutor>::new()
|
||||||
.register_with_count(2, {
|
.register_with_count(2, {
|
||||||
WorkerBuilder::new("worker")
|
WorkerBuilder::new("worker")
|
||||||
.layer(RetryLayer::new(RetryPolicy::default()))
|
.layer(RetryLayer::new(RetryPolicy::default()))
|
||||||
.layer(TraceLayer::new())
|
.layer(TraceLayer::new())
|
||||||
|
.data(http_client)
|
||||||
|
.data(db)
|
||||||
|
.data(domain_request_limiter)
|
||||||
.backend(apalis_storage)
|
.backend(apalis_storage)
|
||||||
.build_fn(handle_async_job)
|
.build_fn(handle_async_job)
|
||||||
})
|
})
|
||||||
|
123
src/domain_request_limiter.rs
Normal file
123
src/domain_request_limiter.rs
Normal file
@ -0,0 +1,123 @@
|
|||||||
|
use anyhow::{anyhow, Result};
|
||||||
|
use fred::{clients::RedisPool, interfaces::KeysInterface, prelude::*};
|
||||||
|
use rand::{rngs::SmallRng, Rng, SeedableRng};
|
||||||
|
use std::{sync::Arc, time::Duration};
|
||||||
|
use tokio::{sync::Mutex, time::sleep};
|
||||||
|
|
||||||
|
/// A Redis-based rate limiter for domain-specific requests with jittered retry delay.
|
||||||
|
///
|
||||||
|
/// This limiter uses a fixed window algorithm with a 1-second window and applies
|
||||||
|
/// jitter to the retry delay to help prevent synchronized retries in distributed systems.
|
||||||
|
/// It uses fred's RedisPool for efficient connection management.
|
||||||
|
///
|
||||||
|
/// Limitations:
|
||||||
|
/// 1. Fixed window: The limit resets every second, potentially allowing short traffic bursts
|
||||||
|
/// at window boundaries.
|
||||||
|
/// 2. No token bucket: Doesn't accumulate unused capacity from quiet periods.
|
||||||
|
/// 3. Potential overcounting: In distributed systems, there's a small chance of overcounting
|
||||||
|
/// near window ends due to race conditions.
|
||||||
|
/// 4. Redis dependency: Rate limiting fails open if Redis is unavailable.
|
||||||
|
/// 5. Blocking: The acquire method will block until a request is allowed or max_retries is reached.
|
||||||
|
///
|
||||||
|
/// Usage example:
|
||||||
|
/// ```
|
||||||
|
/// use fred::prelude::*;
|
||||||
|
///
|
||||||
|
/// #[tokio::main]
|
||||||
|
/// async fn main() -> Result<()> {
|
||||||
|
/// let config = RedisConfig::default();
|
||||||
|
/// let pool = RedisPool::new(config, None, None, 5)?;
|
||||||
|
/// pool.connect();
|
||||||
|
/// pool.wait_for_connect().await?;
|
||||||
|
///
|
||||||
|
/// let limiter = DomainRequestLimiter::new(pool, 10, 5, 100, 0.5);
|
||||||
|
/// let domain = "example.com";
|
||||||
|
///
|
||||||
|
/// for _ in 0..15 {
|
||||||
|
/// match limiter.acquire(domain).await {
|
||||||
|
/// Ok(()) => println!("Request allowed"),
|
||||||
|
/// Err(_) => println!("Max retries reached, request denied"),
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
///
|
||||||
|
/// Ok(())
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct DomainRequestLimiter {
|
||||||
|
redis_pool: RedisPool,
|
||||||
|
requests_per_second: u32,
|
||||||
|
max_retries: u32,
|
||||||
|
base_retry_delay_ms: u64,
|
||||||
|
jitter_factor: f64,
|
||||||
|
// TODO: I think I can get rid of this if I instantiate a DomainRequestLimiter per-worker, but
|
||||||
|
// I'm not sure how to do that in apalis (then I could just use thread_rng)
|
||||||
|
rng: Arc<Mutex<SmallRng>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DomainRequestLimiter {
|
||||||
|
/// Create a new DomainRequestLimiter.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `redis_pool` - A fred RedisPool.
|
||||||
|
/// * `requests_per_second` - Maximum allowed requests per second per domain.
|
||||||
|
/// * `max_retries` - Maximum number of retries before giving up.
|
||||||
|
/// * `base_retry_delay_ms` - Base delay between retries in milliseconds.
|
||||||
|
/// * `jitter_factor` - Factor to determine the maximum jitter (0.0 to 1.0).
|
||||||
|
pub fn new(
|
||||||
|
redis_pool: RedisPool,
|
||||||
|
requests_per_second: u32,
|
||||||
|
max_retries: u32,
|
||||||
|
base_retry_delay_ms: u64,
|
||||||
|
jitter_factor: f64,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
redis_pool,
|
||||||
|
requests_per_second,
|
||||||
|
max_retries,
|
||||||
|
base_retry_delay_ms,
|
||||||
|
jitter_factor: jitter_factor.clamp(0.0, 1.0),
|
||||||
|
rng: Arc::new(Mutex::new(SmallRng::from_entropy())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Attempt to acquire permission for a request, retrying if necessary.
|
||||||
|
///
|
||||||
|
/// This method will attempt to acquire permission up to max_retries times,
|
||||||
|
/// sleeping for a jittered delay between each attempt.
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
/// * `domain` - The domain for which to check the rate limit.
|
||||||
|
///
|
||||||
|
/// # Returns
|
||||||
|
/// Ok(()) if permission is granted, or an error if max retries are exceeded.
|
||||||
|
pub async fn acquire(&self, domain: &str) -> Result<()> {
|
||||||
|
for attempt in 0..=self.max_retries {
|
||||||
|
if self.try_acquire(domain).await? {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
if attempt < self.max_retries {
|
||||||
|
let mut rng = self.rng.lock().await;
|
||||||
|
let jitter =
|
||||||
|
rng.gen::<f64>() * self.jitter_factor * self.base_retry_delay_ms as f64;
|
||||||
|
let delay = self.base_retry_delay_ms + jitter as u64;
|
||||||
|
sleep(Duration::from_millis(delay)).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(anyhow!(
|
||||||
|
"Max retries exceeded for domain: {:?}, request denied",
|
||||||
|
domain
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn try_acquire(&self, domain: &str) -> Result<bool, RedisError> {
|
||||||
|
let key = format!("rate_limit:{}", domain);
|
||||||
|
|
||||||
|
let count: u32 = self.redis_pool.incr(&key).await?;
|
||||||
|
if count == 1 {
|
||||||
|
self.redis_pool.expire(&key, 1).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(count <= self.requests_per_second)
|
||||||
|
}
|
||||||
|
}
|
@ -1,16 +1,188 @@
|
|||||||
use anyhow::Result;
|
use std::cmp::Ordering;
|
||||||
use serde::{Deserialize, Serialize};
|
|
||||||
use tracing::{info, instrument};
|
|
||||||
|
|
||||||
use crate::models::feed::Feed;
|
use anyhow::{anyhow, Result};
|
||||||
|
use apalis::prelude::*;
|
||||||
|
use chrono::{Duration, Utc};
|
||||||
|
use feed_rs::parser;
|
||||||
|
use http::{header, HeaderMap, StatusCode};
|
||||||
|
use reqwest::Client;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use tracing::{info, info_span, instrument, warn};
|
||||||
|
use url::Url;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::domain_request_limiter::DomainRequestLimiter;
|
||||||
|
use crate::models::entry::{CreateEntry, Entry};
|
||||||
|
use crate::models::feed::{Feed, MAX_CRAWL_INTERVAL_MINUTES, MIN_CRAWL_INTERVAL_MINUTES};
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize, Clone)]
|
#[derive(Debug, Deserialize, Serialize, Clone)]
|
||||||
pub struct CrawlFeedJob {
|
pub struct CrawlFeedJob {
|
||||||
pub feed: Feed,
|
pub feed_id: Uuid,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip_all, fields(feed_id = %feed.feed_id))]
|
#[instrument(skip_all, fields(feed_id = %feed_id))]
|
||||||
pub async fn crawl_feed(CrawlFeedJob { feed }: CrawlFeedJob) -> Result<()> {
|
pub async fn crawl_feed(
|
||||||
info!("Crawling feed: {:?}", feed.feed_id);
|
CrawlFeedJob { feed_id }: CrawlFeedJob,
|
||||||
Err(anyhow::anyhow!("Not implemented"))
|
http_client: Data<Client>,
|
||||||
|
db: Data<PgPool>,
|
||||||
|
domain_request_limiter: Data<DomainRequestLimiter>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut feed = Feed::get(&*db, feed_id).await?;
|
||||||
|
info!("got feed from db");
|
||||||
|
let url = Url::parse(&feed.url)?;
|
||||||
|
let domain = url
|
||||||
|
.domain()
|
||||||
|
.ok_or(anyhow!("invalid url: {:?}", feed.url.clone()))?;
|
||||||
|
let mut headers = HeaderMap::new();
|
||||||
|
if let Some(etag) = &feed.etag_header {
|
||||||
|
if let Ok(etag) = etag.parse() {
|
||||||
|
headers.insert(header::IF_NONE_MATCH, etag);
|
||||||
|
} else {
|
||||||
|
warn!(%etag, "failed to parse saved etag header");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(last_modified) = &feed.last_modified_header {
|
||||||
|
if let Ok(last_modified) = last_modified.parse() {
|
||||||
|
headers.insert(header::IF_MODIFIED_SINCE, last_modified);
|
||||||
|
} else {
|
||||||
|
warn!(
|
||||||
|
%last_modified,
|
||||||
|
"failed to parse saved last_modified header",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(url=%url, "starting fetch");
|
||||||
|
domain_request_limiter.acquire(domain).await?;
|
||||||
|
let resp = http_client.get(url.clone()).headers(headers).send().await?;
|
||||||
|
let headers = resp.headers();
|
||||||
|
if let Some(etag) = headers.get(header::ETAG) {
|
||||||
|
if let Ok(etag) = etag.to_str() {
|
||||||
|
feed.etag_header = Some(etag.to_string());
|
||||||
|
} else {
|
||||||
|
warn!(?etag, "failed to convert response etag header to string");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(last_modified) = headers.get(header::LAST_MODIFIED) {
|
||||||
|
if let Ok(last_modified) = last_modified.to_str() {
|
||||||
|
feed.last_modified_header = Some(last_modified.to_string());
|
||||||
|
} else {
|
||||||
|
warn!(
|
||||||
|
?last_modified,
|
||||||
|
"failed to convert response last_modified header to string",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
info!(url=%url, "fetched feed");
|
||||||
|
if resp.status() == StatusCode::NOT_MODIFIED {
|
||||||
|
info!("feed returned not modified status");
|
||||||
|
feed.last_crawled_at = Some(Utc::now());
|
||||||
|
feed.last_crawl_error = None;
|
||||||
|
feed.save(&*db).await?;
|
||||||
|
info!("updated feed in db");
|
||||||
|
return Ok(());
|
||||||
|
} else if !resp.status().is_success() {
|
||||||
|
warn!("feed returned non-successful status");
|
||||||
|
feed.last_crawled_at = Some(Utc::now());
|
||||||
|
feed.last_crawl_error = resp.status().canonical_reason().map(|s| s.to_string());
|
||||||
|
feed.save(&*db).await?;
|
||||||
|
info!("updated feed in db");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let bytes = resp.bytes().await?;
|
||||||
|
|
||||||
|
let parsed_feed = parser::parse(&bytes[..])?;
|
||||||
|
info!("parsed feed");
|
||||||
|
feed.url = url.to_string();
|
||||||
|
feed.feed_type = parsed_feed.feed_type.into();
|
||||||
|
feed.last_crawled_at = Some(Utc::now());
|
||||||
|
feed.last_crawl_error = None;
|
||||||
|
if let Some(title) = parsed_feed.title {
|
||||||
|
feed.title = Some(title.content);
|
||||||
|
}
|
||||||
|
if let Some(description) = parsed_feed.description {
|
||||||
|
feed.description = Some(description.content);
|
||||||
|
}
|
||||||
|
let last_entry_published_at = parsed_feed.entries.iter().filter_map(|e| e.published).max();
|
||||||
|
if let Some(prev_last_entry_published_at) = feed.last_entry_published_at {
|
||||||
|
if let Some(published_at) = last_entry_published_at {
|
||||||
|
let time_since_last_entry = if published_at == prev_last_entry_published_at {
|
||||||
|
// No new entry since last crawl, compare current time to last publish instead
|
||||||
|
Utc::now() - prev_last_entry_published_at
|
||||||
|
} else {
|
||||||
|
// Compare new entry publish time to previous publish time
|
||||||
|
published_at - prev_last_entry_published_at
|
||||||
|
};
|
||||||
|
match time_since_last_entry.cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
|
||||||
|
{
|
||||||
|
Ordering::Greater => {
|
||||||
|
feed.crawl_interval_minutes = i32::max(
|
||||||
|
(feed.crawl_interval_minutes as f32 * 1.2).ceil() as i32,
|
||||||
|
MAX_CRAWL_INTERVAL_MINUTES,
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
interval = feed.crawl_interval_minutes,
|
||||||
|
"increased crawl interval"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ordering::Less => {
|
||||||
|
feed.crawl_interval_minutes = i32::max(
|
||||||
|
(feed.crawl_interval_minutes as f32 / 1.2).ceil() as i32,
|
||||||
|
MIN_CRAWL_INTERVAL_MINUTES,
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
interval = feed.crawl_interval_minutes,
|
||||||
|
"decreased crawl interval"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ordering::Equal => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
feed.last_entry_published_at = last_entry_published_at;
|
||||||
|
let feed = feed.save(&*db).await?;
|
||||||
|
info!("updated feed in db");
|
||||||
|
|
||||||
|
let mut payload = Vec::with_capacity(parsed_feed.entries.len());
|
||||||
|
for entry in parsed_feed.entries {
|
||||||
|
let entry_span = info_span!("entry", id = entry.id);
|
||||||
|
let _entry_span_guard = entry_span.enter();
|
||||||
|
if let Some(link) = entry.links.get(0) {
|
||||||
|
// if no scraped or feed date is available, fallback to the current time
|
||||||
|
let published_at = entry.published.unwrap_or_else(Utc::now);
|
||||||
|
let entry = CreateEntry {
|
||||||
|
title: entry.title.map(|t| t.content),
|
||||||
|
url: link.href.clone(),
|
||||||
|
description: entry.summary.map(|s| s.content),
|
||||||
|
feed_id: feed.feed_id,
|
||||||
|
published_at,
|
||||||
|
};
|
||||||
|
payload.push(entry);
|
||||||
|
} else {
|
||||||
|
warn!("skipping feed entry with no links");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let entries = Entry::bulk_upsert(&*db, payload).await?;
|
||||||
|
let (new, updated) = entries
|
||||||
|
.into_iter()
|
||||||
|
.partition::<Vec<_>, _>(|entry| entry.updated_at.is_none());
|
||||||
|
info!(new = new.len(), updated = updated.len(), "saved entries");
|
||||||
|
|
||||||
|
for entry in new {
|
||||||
|
// TODO: queue the entry crawls
|
||||||
|
//
|
||||||
|
// let entry_crawler = EntryCrawlerHandle::new(
|
||||||
|
// self.pool.clone(),
|
||||||
|
// self.client.clone(),
|
||||||
|
// self.domain_locks.clone(),
|
||||||
|
// self.content_dir.clone(),
|
||||||
|
// );
|
||||||
|
// let mut entry_receiver = entry_crawler.crawl(entry).await;
|
||||||
|
// while let Ok(EntryCrawlerHandleMessage::Entry(result)) = entry_receiver.recv().await {
|
||||||
|
// let _ = respond_to.send(FeedCrawlerHandleMessage::Entry(result));
|
||||||
|
// }
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use apalis::prelude::*;
|
use apalis::prelude::*;
|
||||||
use apalis_redis::RedisStorage;
|
use reqwest::Client;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::PgPool;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tracing::{error, info, instrument};
|
use tracing::{error, info, instrument};
|
||||||
|
|
||||||
@ -8,6 +9,8 @@ mod crawl_feed;
|
|||||||
|
|
||||||
pub use crawl_feed::CrawlFeedJob;
|
pub use crawl_feed::CrawlFeedJob;
|
||||||
|
|
||||||
|
use crate::domain_request_limiter::DomainRequestLimiter;
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Serialize, Clone)]
|
#[derive(Debug, Deserialize, Serialize, Clone)]
|
||||||
pub enum AsyncJob {
|
pub enum AsyncJob {
|
||||||
HelloWorld(String),
|
HelloWorld(String),
|
||||||
@ -27,13 +30,18 @@ pub async fn handle_async_job(
|
|||||||
// TODO: add task_id to tracing instrumentation context
|
// TODO: add task_id to tracing instrumentation context
|
||||||
// it casuses a panic in 0.6.0 currently, see: https://github.com/geofmureithi/apalis/issues/398
|
// it casuses a panic in 0.6.0 currently, see: https://github.com/geofmureithi/apalis/issues/398
|
||||||
// task_id: Data<TaskId>,
|
// task_id: Data<TaskId>,
|
||||||
|
http_client: Data<Client>,
|
||||||
|
db: Data<PgPool>,
|
||||||
|
domain_request_limiter: Data<DomainRequestLimiter>,
|
||||||
) -> Result<(), AsyncJobError> {
|
) -> Result<(), AsyncJobError> {
|
||||||
let result = match job {
|
let result = match job {
|
||||||
AsyncJob::HelloWorld(name) => {
|
AsyncJob::HelloWorld(name) => {
|
||||||
info!("Hello, {}!", name);
|
info!("Hello, {}!", name);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
AsyncJob::CrawlFeed(job) => crawl_feed::crawl_feed(job).await,
|
AsyncJob::CrawlFeed(job) => {
|
||||||
|
crawl_feed::crawl_feed(job, http_client, db, domain_request_limiter).await
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
|
@ -3,6 +3,7 @@ pub mod api_response;
|
|||||||
pub mod auth;
|
pub mod auth;
|
||||||
pub mod config;
|
pub mod config;
|
||||||
pub mod domain_locks;
|
pub mod domain_locks;
|
||||||
|
pub mod domain_request_limiter;
|
||||||
pub mod error;
|
pub mod error;
|
||||||
pub mod handlers;
|
pub mod handlers;
|
||||||
pub mod headers;
|
pub mod headers;
|
||||||
|
Loading…
Reference in New Issue
Block a user