Update feed last_crawl_error & crawl_interval_minutes
parent 4837cbb903
commit ae95921966
@@ -1,19 +1,20 @@
+use std::cmp::Ordering;
 use std::fmt::{self, Display, Formatter};
 
-use chrono::Utc;
+use chrono::{Duration, Utc};
 use feed_rs::parser;
 use reqwest::Client;
 use sqlx::PgPool;
 use tokio::sync::{broadcast, mpsc};
 use tracing::log::warn;
-use tracing::{info, info_span, instrument};
+use tracing::{error, info, info_span, instrument};
 use url::Url;
 use uuid::Uuid;
 
 use crate::actors::entry_crawler::EntryCrawlerHandle;
 use crate::domain_locks::DomainLocks;
 use crate::models::entry::{CreateEntry, Entry};
-use crate::models::feed::Feed;
+use crate::models::feed::{Feed, MAX_CRAWL_INTERVAL_MINUTES, MIN_CRAWL_INTERVAL_MINUTES};
 use crate::uuid::Base62Uuid;
 
 /// The `FeedCrawler` actor fetches a feed url, parses it, and saves it to the database.
@@ -88,8 +89,8 @@ impl FeedCrawler {
             .await
             .map_err(|_| FeedCrawlerError::GetFeedError(Base62Uuid::from(feed_id)))?;
         info!("got feed from db");
-        let url = Url::parse(&feed.url)
-            .map_err(|_| FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
+        let url =
+            Url::parse(&feed.url).map_err(|_| FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
         let domain = url
             .domain()
             .ok_or(FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
@@ -113,12 +114,32 @@ impl FeedCrawler {
         feed.url = url.to_string();
         feed.feed_type = parsed_feed.feed_type.into();
         feed.last_crawled_at = Some(Utc::now());
+        feed.last_crawl_error = None;
         if let Some(title) = parsed_feed.title {
             feed.title = Some(title.content);
         }
         if let Some(description) = parsed_feed.description {
             feed.description = Some(description.content);
         }
+        let last_entry_published_at = parsed_feed.entries.iter().filter_map(|e| e.published).max();
+        if let Some(prev_last_entry_published_at) = feed.last_entry_published_at {
+            if let Some(published_at) = last_entry_published_at {
+                let time_since_last_entry = published_at - prev_last_entry_published_at;
+                match time_since_last_entry
+                    .cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
+                {
+                    Ordering::Greater => {
+                        feed.crawl_interval_minutes =
+                            i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
+                    },
+                    Ordering::Less => {
+                        feed.crawl_interval_minutes =
+                            i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
+                    },
+                    Ordering::Equal => {},
+                }
+            }
+        }
         let feed = feed
             .save(&self.pool)
             .await
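For reference, a minimal standalone sketch of the adaptive backoff this hunk introduces: the crawl interval doubles when the newest entry arrived after a gap longer than the current interval, halves when it arrived sooner, and is assumed to be kept within [MIN_CRAWL_INTERVAL_MINUTES, MAX_CRAWL_INTERVAL_MINUTES]. The next_crawl_interval helper and the checks in main are hypothetical illustrations, not part of the commit.

// Hypothetical sketch of the interval adjustment above; the clamping to the
// MIN/MAX bounds is an assumption about the intended behavior.
use std::cmp::Ordering;

use chrono::Duration;

const MIN_CRAWL_INTERVAL_MINUTES: i32 = 1;
const MAX_CRAWL_INTERVAL_MINUTES: i32 = 5040;

/// Double the interval when entries arrive slower than we currently poll,
/// halve it when they arrive faster, and leave it unchanged otherwise.
fn next_crawl_interval(current_minutes: i32, time_since_last_entry: Duration) -> i32 {
    match time_since_last_entry.cmp(&Duration::minutes(current_minutes.into())) {
        Ordering::Greater => i32::min(current_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES),
        Ordering::Less => i32::max(current_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES),
        Ordering::Equal => current_minutes,
    }
}

fn main() {
    // A feed polled every 60 minutes whose newest entry came 3 hours after the
    // previous one backs off to a 120-minute interval.
    assert_eq!(next_crawl_interval(60, Duration::hours(3)), 120);
    // A feed whose newest entry came only 10 minutes after the previous one is
    // polled twice as often.
    assert_eq!(next_crawl_interval(60, Duration::minutes(10)), 30);
}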
@@ -173,6 +194,13 @@ impl FeedCrawler {
                 respond_to,
             } => {
                 let result = self.crawl_feed(feed_id).await;
+                if let Err(error) = &result {
+                    match Feed::update_crawl_error(&self.pool, feed_id, format!("{}", error)).await {
+                        Ok(_) => info!("updated feed last_crawl_error"),
+                        Err(e) => error!("failed to update feed last_crawl_error: {}", e),
+                    }
+                }
+
                 // ignore the result since the initiator may have cancelled waiting for the
                 // response, and that is ok
                 let _ = respond_to.send(FeedCrawlerHandleMessage::Feed(result));
@@ -227,10 +255,7 @@ impl FeedCrawlerHandle {
     /// Sends a `FeedCrawlerMessage::Crawl` message to the running `FeedCrawler` actor.
     ///
     /// Listen to the result of the crawl via the returned `broadcast::Receiver`.
-    pub async fn crawl(
-        &self,
-        feed_id: Uuid,
-    ) -> broadcast::Receiver<FeedCrawlerHandleMessage> {
+    pub async fn crawl(&self, feed_id: Uuid) -> broadcast::Receiver<FeedCrawlerHandleMessage> {
         let (sender, receiver) = broadcast::channel(8);
         let msg = FeedCrawlerMessage::Crawl {
             feed_id,
@@ -2,7 +2,7 @@ use std::str::FromStr;
 
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
-use sqlx::{FromRow, PgPool};
+use sqlx::{FromRow, PgPool, postgres::PgQueryResult};
 use uuid::Uuid;
 use validator::Validate;
 
@@ -47,6 +47,9 @@ impl From<feed_rs::model::FeedType> for FeedType {
     }
 }
 
+pub const MIN_CRAWL_INTERVAL_MINUTES: i32 = 1;
+pub const MAX_CRAWL_INTERVAL_MINUTES: i32 = 5040;
+
 #[derive(Debug, Serialize, Deserialize, Clone, FromRow)]
 pub struct Feed {
     pub feed_id: Uuid,
@@ -276,6 +279,18 @@ impl Feed {
         Ok(())
     }
 
+    pub async fn update_crawl_error(pool: &PgPool, feed_id: Uuid, last_crawl_error: String) -> Result<PgQueryResult> {
+        Ok(sqlx::query!(
+            r#"update feed set
+                last_crawl_error = $2
+            where feed_id = $1"#,
+            feed_id,
+            last_crawl_error,
+        )
+        .execute(pool)
+        .await?)
+    }
+
     pub async fn save(&self, pool: &PgPool) -> Result<Feed> {
         Ok(sqlx::query_as!(
             Feed,