Add CrawlScheduler actor, shared client w/ last modified headers
This commit is contained in:
parent
02d5cb9976
commit
d17f909312
@ -22,7 +22,6 @@ begin
|
||||
BEFORE UPDATE
|
||||
ON %s
|
||||
FOR EACH ROW
|
||||
WHEN (OLD is distinct from NEW)
|
||||
EXECUTE FUNCTION set_updated_at();', tablename);
|
||||
end;
|
||||
$$ language plpgsql;
|
||||
@ -41,6 +40,8 @@ create table if not exists "feed" (
|
||||
description text default null,
|
||||
crawl_interval_minutes int not null default 180,
|
||||
last_crawl_error text default null,
|
||||
etag_header text default null,
|
||||
last_modified_header text default null,
|
||||
last_crawled_at timestamptz default null,
|
||||
last_entry_published_at timestamptz default null,
|
||||
created_at timestamptz not null default now(),
|
||||
@ -57,6 +58,8 @@ create table if not exists "entry" (
|
||||
url varchar(2048) not null,
|
||||
description text,
|
||||
feed_id uuid not null references "feed" (feed_id) on delete cascade,
|
||||
etag_header text default null,
|
||||
last_modified_header text default null,
|
||||
published_at timestamptz not null,
|
||||
created_at timestamptz not null default now(),
|
||||
updated_at timestamptz,
|
||||
|
284
src/actors/crawl_scheduler.rs
Normal file
284
src/actors/crawl_scheduler.rs
Normal file
@ -0,0 +1,284 @@
|
||||
use std::fmt::{self, Display, Formatter};
|
||||
use std::time::Duration;
|
||||
|
||||
use chrono::Utc;
|
||||
use reqwest::Client;
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tokio::time::{interval_at, Instant};
|
||||
use tracing::{debug, error, info, instrument};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::actors::feed_crawler::{FeedCrawlerError, FeedCrawlerHandle, FeedCrawlerHandleMessage};
|
||||
use crate::domain_locks::DomainLocks;
|
||||
use crate::models::feed::{Feed, GetFeedsOptions};
|
||||
|
||||
/// Internal state for the crawl-scheduler actor.
///
/// Owned by a single spawned task; the outside world talks to it only through
/// the `receiver` channel (messages are sent by `CrawlSchedulerHandle`).
struct CrawlScheduler {
    // Inbox of scheduling commands from the handle.
    receiver: mpsc::Receiver<CrawlSchedulerMessage>,
    // Database pool used to load feeds; also cloned into each spawned crawler.
    pool: PgPool,
    // Shared HTTP client, cloned into each spawned `FeedCrawlerHandle`.
    client: Client,
    // Per-domain request locks, cloned into each spawned crawler so
    // same-domain requests can be serialized.
    domain_locks: DomainLocks,
    // Directory passed to crawlers for storing fetched content.
    content_dir: String,
}
|
||||
|
||||
/// Commands accepted by the `CrawlScheduler` actor.
#[derive(Debug)]
enum CrawlSchedulerMessage {
    /// Start a recurring crawl loop for the single feed identified by `feed_id`.
    Schedule {
        feed_id: Uuid,
        // Channel on which scheduling/crawl results are broadcast back to the caller.
        respond_to: broadcast::Sender<CrawlSchedulerHandleMessage>,
    },
    /// Start recurring crawl loops for every feed currently in the database.
    Bootstrap {
        // Channel on which the bootstrap result and crawl results are broadcast back.
        respond_to: broadcast::Sender<CrawlSchedulerHandleMessage>,
    },
}
|
||||
|
||||
impl Display for CrawlSchedulerMessage {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
CrawlSchedulerMessage::Schedule { feed_id, .. } => write!(f, "Schedule({})", feed_id),
|
||||
CrawlSchedulerMessage::Bootstrap { .. } => write!(f, "Bootstrap"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An error type that enumerates possible failures during a crawl and is cloneable and can be sent
/// across threads (does not reference the originating Errors which are usually not cloneable).
#[derive(thiserror::Error, Debug, Clone)]
pub enum CrawlSchedulerError {
    /// Loading a single feed (by id) from the database failed; payload is the
    /// stringified database error.
    #[error("failed to fetch feed from database: {0}")]
    FetchFeedError(String),
    /// Loading a page of feeds during bootstrap failed; payload is the
    /// stringified database error.
    #[error("failed to fetch feeds from database: {0}")]
    FetchFeedsError(String),
    /// Wrapper for an error reported by the feed-crawler actor.
    #[error("failed to crawl feed: {0}")]
    FeedCrawlerError(FeedCrawlerError),
}

/// Result alias defaulting the error type to `CrawlSchedulerError`.
pub type CrawlSchedulerResult<T, E = CrawlSchedulerError> = ::std::result::Result<T, E>;
|
||||
|
||||
impl CrawlScheduler {
|
||||
fn new(
|
||||
receiver: mpsc::Receiver<CrawlSchedulerMessage>,
|
||||
pool: PgPool,
|
||||
client: Client,
|
||||
domain_locks: DomainLocks,
|
||||
content_dir: String,
|
||||
) -> Self {
|
||||
CrawlScheduler {
|
||||
receiver,
|
||||
pool,
|
||||
client,
|
||||
domain_locks,
|
||||
content_dir,
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn bootstrap(
|
||||
&self,
|
||||
respond_to: broadcast::Sender<CrawlSchedulerHandleMessage>,
|
||||
) -> CrawlSchedulerResult<()> {
|
||||
debug!("scheduling crawlers");
|
||||
let mut options = GetFeedsOptions::default();
|
||||
loop {
|
||||
info!("fetching feeds before: {:?}", options.before);
|
||||
let feeds = match Feed::get_all(&self.pool, options.clone()).await {
|
||||
Err(err) => {
|
||||
return Err(CrawlSchedulerError::FetchFeedsError(err.to_string()));
|
||||
}
|
||||
Ok(feeds) if feeds.is_empty() => {
|
||||
info!("no more feeds found");
|
||||
break;
|
||||
}
|
||||
Ok(feeds) => feeds,
|
||||
};
|
||||
info!("found {} feeds", feeds.len());
|
||||
options.before = feeds.last().map(|f| f.created_at);
|
||||
|
||||
for feed in feeds.into_iter() {
|
||||
self.spawn_crawler_loop(feed, respond_to.clone());
|
||||
}
|
||||
}
|
||||
debug!("done scheduling crawlers");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(feed_id = %feed_id))]
|
||||
async fn schedule(
|
||||
&self,
|
||||
feed_id: Uuid,
|
||||
respond_to: broadcast::Sender<CrawlSchedulerHandleMessage>,
|
||||
) -> CrawlSchedulerResult<()> {
|
||||
let feed = Feed::get(&self.pool, feed_id)
|
||||
.await
|
||||
.map_err(|err| CrawlSchedulerError::FetchFeedError(err.to_string()))?;
|
||||
self.spawn_crawler_loop(feed, respond_to);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(feed_id = %feed.feed_id))]
|
||||
fn spawn_crawler_loop(
|
||||
&self,
|
||||
feed: Feed,
|
||||
respond_to: broadcast::Sender<CrawlSchedulerHandleMessage>,
|
||||
) {
|
||||
let crawl_interval = Duration::from_secs(feed.crawl_interval_minutes as u64 * 60);
|
||||
let mut interval = tokio::time::interval(crawl_interval);
|
||||
if let Some(last_crawled_at) = feed.last_crawled_at {
|
||||
if let Ok(duration_since_last_crawl) = (Utc::now() - last_crawled_at).to_std() {
|
||||
if duration_since_last_crawl < crawl_interval {
|
||||
info!(
|
||||
"last crawled at {:?}, crawling again in {:?}",
|
||||
last_crawled_at,
|
||||
crawl_interval - duration_since_last_crawl
|
||||
);
|
||||
interval = interval_at(
|
||||
Instant::now() + (crawl_interval - duration_since_last_crawl),
|
||||
crawl_interval,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let feed_crawler = FeedCrawlerHandle::new(
|
||||
self.pool.clone(),
|
||||
self.client.clone(),
|
||||
self.domain_locks.clone(),
|
||||
self.content_dir.clone(),
|
||||
);
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
debug!("spawned crawler for feed");
|
||||
interval.tick().await;
|
||||
debug!("tick!");
|
||||
let mut receiver = feed_crawler.crawl(feed.feed_id).await;
|
||||
match receiver.recv().await {
|
||||
Ok(FeedCrawlerHandleMessage::Feed(Ok(feed))) => {
|
||||
let crawl_interval =
|
||||
Duration::from_secs(feed.crawl_interval_minutes as u64 * 60);
|
||||
interval = interval_at(Instant::now() + crawl_interval, crawl_interval);
|
||||
info!(
|
||||
minutes = feed.crawl_interval_minutes,
|
||||
"updated crawl interval"
|
||||
);
|
||||
let _ = respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(
|
||||
FeedCrawlerHandleMessage::Feed(Ok(feed)),
|
||||
));
|
||||
}
|
||||
Ok(result) => {
|
||||
let _ = respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(result));
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(msg = %msg))]
|
||||
async fn handle_message(&mut self, msg: CrawlSchedulerMessage) {
|
||||
match msg {
|
||||
CrawlSchedulerMessage::Bootstrap { respond_to } => {
|
||||
let result = self.bootstrap(respond_to.clone()).await;
|
||||
if let Err(err) = &result {
|
||||
error!("failed to bootstrap: {}", err);
|
||||
}
|
||||
|
||||
// ignore the result since the initiator may have cancelled waiting for the
|
||||
// response, and that is ok
|
||||
let _ = respond_to.send(CrawlSchedulerHandleMessage::Bootstrap(result));
|
||||
}
|
||||
CrawlSchedulerMessage::Schedule {
|
||||
feed_id,
|
||||
respond_to,
|
||||
} => {
|
||||
let result = self.schedule(feed_id, respond_to.clone()).await;
|
||||
if let Err(err) = &result {
|
||||
error!("failed to schedule: {}", err);
|
||||
}
|
||||
|
||||
// ignore the result since the initiator may have cancelled waiting for the
|
||||
// response, and that is ok
|
||||
let _ = respond_to.send(CrawlSchedulerHandleMessage::Schedule(result));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn run(&mut self) {
|
||||
debug!("starting crawl scheduler");
|
||||
while let Some(msg) = self.receiver.recv().await {
|
||||
self.handle_message(msg).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The `CrawlSchedulerHandle` is used to initialize and communicate with a `CrawlScheduler` actor.
///
/// Spawns an async task separate from the main web server that fetches all feeds from the database
/// and then spawns a long-lived async task for each feed that repeatedly crawls the feed at the
/// interval specified by each feeds' `crawl_interval_minutes`.
///
/// Initially, all feeds will immediately be crawled unless the `last_crawled_at` timestamp in the
/// database is more recent than the current time minus the feed's `crawl_interval`, in which case
/// the first crawl is deferred until a full interval has elapsed since `last_crawled_at`.
///
/// After each crawl, the interval may be updated based on the result of the crawl.
#[derive(Clone)]
pub struct CrawlSchedulerHandle {
    // Cloneable sender side of the actor's inbox; the actor exits once all
    // handles (senders) are dropped.
    sender: mpsc::Sender<CrawlSchedulerMessage>,
}
|
||||
|
||||
/// The `CrawlSchedulerHandleMessage` is the response to a `CrawlSchedulerMessage` sent to the
/// `CrawlSchedulerHandle`.
///
/// `CrawlSchedulerHandleMessage::FeedCrawler` wraps a result forwarded from the `FeedCrawler`
/// actor (e.g. the outcome of crawling a feed url).
#[derive(Clone)]
pub enum CrawlSchedulerHandleMessage {
    /// Outcome of a `Bootstrap` request (scheduling only, not the crawls themselves).
    Bootstrap(CrawlSchedulerResult<()>),
    /// Outcome of a `Schedule` request for a single feed.
    Schedule(CrawlSchedulerResult<()>),
    /// A message forwarded from the per-feed `FeedCrawler`.
    FeedCrawler(FeedCrawlerHandleMessage),
}
|
||||
|
||||
impl CrawlSchedulerHandle {
|
||||
/// Creates an async actor task that will listen for messages on the `sender` channel.
|
||||
pub fn new(
|
||||
pool: PgPool,
|
||||
client: Client,
|
||||
domain_locks: DomainLocks,
|
||||
content_dir: String,
|
||||
) -> Self {
|
||||
let (sender, receiver) = mpsc::channel(8);
|
||||
let mut scheduler = CrawlScheduler::new(receiver, pool, client, domain_locks, content_dir);
|
||||
tokio::spawn(async move { scheduler.run().await });
|
||||
|
||||
Self { sender }
|
||||
}
|
||||
|
||||
/// Sends a `CrawlSchedulerMessage::Bootstrap` message to the running `CrawlScheduler` actor.
|
||||
///
|
||||
/// Listen to the result of the scheduling via the returned `broadcast::Receiver`.
|
||||
pub async fn bootstrap(&self) -> broadcast::Receiver<CrawlSchedulerHandleMessage> {
|
||||
let (sender, receiver) = broadcast::channel(8);
|
||||
let msg = CrawlSchedulerMessage::Bootstrap { respond_to: sender };
|
||||
|
||||
self.sender
|
||||
.send(msg)
|
||||
.await
|
||||
.expect("crawl scheduler task has died");
|
||||
receiver
|
||||
}
|
||||
|
||||
/// Sends a `CrawlSchedulerMessage::Schedule` message to the running `CrawlScheduler` actor.
|
||||
///
|
||||
/// Listen to the result of the scheduling via the returned `broadcast::Receiver`.
|
||||
pub async fn schedule(&self, feed_id: Uuid) -> broadcast::Receiver<CrawlSchedulerHandleMessage> {
|
||||
let (sender, receiver) = broadcast::channel(8);
|
||||
let msg = CrawlSchedulerMessage::Schedule {
|
||||
feed_id,
|
||||
respond_to: sender,
|
||||
};
|
||||
|
||||
self.sender
|
||||
.send(msg)
|
||||
.await
|
||||
.expect("crawl scheduler task has died");
|
||||
receiver
|
||||
}
|
||||
}
|
@ -7,7 +7,7 @@ use readability::extractor;
|
||||
use reqwest::Client;
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tracing::{info, instrument};
|
||||
use tracing::{debug, info, instrument};
|
||||
use url::Url;
|
||||
|
||||
use crate::domain_locks::DomainLocks;
|
||||
@ -80,7 +80,7 @@ impl EntryCrawler {
|
||||
|
||||
#[instrument(skip_all, fields(entry = %entry.url))]
|
||||
async fn crawl_entry(&self, entry: Entry) -> EntryCrawlerResult<Entry> {
|
||||
info!("Fetching and parsing entry");
|
||||
info!("starting fetch");
|
||||
let content_dir = Path::new(&self.content_dir);
|
||||
let url =
|
||||
Url::parse(&entry.url).map_err(|_| EntryCrawlerError::InvalidUrl(entry.url.clone()))?;
|
||||
@ -136,7 +136,7 @@ impl EntryCrawler {
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn run(&mut self) {
|
||||
info!("starting entry crawler");
|
||||
debug!("starting entry crawler");
|
||||
while let Some(msg) = self.receiver.recv().await {
|
||||
self.handle_message(msg).await;
|
||||
}
|
||||
|
@ -3,11 +3,14 @@ use std::fmt::{self, Display, Formatter};
|
||||
|
||||
use chrono::{Duration, Utc};
|
||||
use feed_rs::parser;
|
||||
use reqwest::Client;
|
||||
use reqwest::StatusCode;
|
||||
use reqwest::{
|
||||
header::{self, HeaderMap},
|
||||
Client,
|
||||
};
|
||||
use sqlx::PgPool;
|
||||
use tokio::sync::{broadcast, mpsc};
|
||||
use tracing::log::warn;
|
||||
use tracing::{error, info, info_span, instrument};
|
||||
use tracing::{debug, error, info, info_span, instrument, warn};
|
||||
use url::Url;
|
||||
use uuid::Uuid;
|
||||
|
||||
@ -94,20 +97,82 @@ impl FeedCrawler {
|
||||
let domain = url
|
||||
.domain()
|
||||
.ok_or(FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
|
||||
let bytes = self
|
||||
let mut headers = HeaderMap::new();
|
||||
if let Some(etag) = &feed.etag_header {
|
||||
if let Ok(etag) = etag.parse() {
|
||||
headers.insert(header::IF_NONE_MATCH, etag);
|
||||
} else {
|
||||
warn!(%etag, "failed to parse saved etag header");
|
||||
}
|
||||
}
|
||||
if let Some(last_modified) = &feed.last_modified_header {
|
||||
if let Ok(last_modified) = last_modified.parse() {
|
||||
headers.insert(header::IF_MODIFIED_SINCE, last_modified);
|
||||
} else {
|
||||
warn!(
|
||||
%last_modified,
|
||||
"failed to parse saved last_modified header",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
info!(url=%url, "starting fetch");
|
||||
let resp = self
|
||||
.domain_locks
|
||||
.run_request(domain, async {
|
||||
self.client
|
||||
.get(url.clone())
|
||||
.headers(headers)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|_| FeedCrawlerError::FetchError(url.clone()))?
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| FeedCrawlerError::FetchError(url.clone()))
|
||||
})
|
||||
.await?;
|
||||
let headers = resp.headers();
|
||||
if let Some(etag) = headers.get(header::ETAG) {
|
||||
if let Ok(etag) = etag.to_str() {
|
||||
feed.etag_header = Some(etag.to_string());
|
||||
} else {
|
||||
warn!(?etag, "failed to convert response etag header to string");
|
||||
}
|
||||
}
|
||||
if let Some(last_modified) = headers.get(header::LAST_MODIFIED) {
|
||||
if let Ok(last_modified) = last_modified.to_str() {
|
||||
feed.last_modified_header = Some(last_modified.to_string());
|
||||
} else {
|
||||
warn!(
|
||||
?last_modified,
|
||||
"failed to convert response last_modified header to string",
|
||||
);
|
||||
}
|
||||
}
|
||||
info!(url=%url, "fetched feed");
|
||||
if resp.status() == StatusCode::NOT_MODIFIED {
|
||||
info!("feed returned not modified status");
|
||||
feed.last_crawled_at = Some(Utc::now());
|
||||
feed.last_crawl_error = None;
|
||||
let feed = feed
|
||||
.save(&self.pool)
|
||||
.await
|
||||
.map_err(|_| FeedCrawlerError::CreateFeedError(url.clone()))?;
|
||||
info!("updated feed in db");
|
||||
return Ok(feed);
|
||||
} else if !resp.status().is_success() {
|
||||
warn!("feed returned non-successful status");
|
||||
feed.last_crawl_error = resp.status().canonical_reason().map(|s| s.to_string());
|
||||
let feed = feed
|
||||
.save(&self.pool)
|
||||
.await
|
||||
.map_err(|_| FeedCrawlerError::CreateFeedError(url.clone()))?;
|
||||
info!("updated feed in db");
|
||||
return Ok(feed);
|
||||
}
|
||||
|
||||
let bytes = resp
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| FeedCrawlerError::FetchError(url.clone()))?;
|
||||
|
||||
let parsed_feed =
|
||||
parser::parse(&bytes[..]).map_err(|_| FeedCrawlerError::ParseError(url.clone()))?;
|
||||
info!("parsed feed");
|
||||
@ -131,15 +196,16 @@ impl FeedCrawler {
|
||||
Ordering::Greater => {
|
||||
feed.crawl_interval_minutes =
|
||||
i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
|
||||
},
|
||||
}
|
||||
Ordering::Less => {
|
||||
feed.crawl_interval_minutes =
|
||||
i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
|
||||
},
|
||||
Ordering::Equal => {},
|
||||
}
|
||||
Ordering::Equal => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
feed.last_entry_published_at = last_entry_published_at;
|
||||
let feed = feed
|
||||
.save(&self.pool)
|
||||
.await
|
||||
@ -162,7 +228,7 @@ impl FeedCrawler {
|
||||
};
|
||||
payload.push(entry);
|
||||
} else {
|
||||
warn!("Skipping feed entry with no links");
|
||||
warn!("skipping feed entry with no links");
|
||||
}
|
||||
}
|
||||
let entries = Entry::bulk_upsert(&self.pool, payload)
|
||||
@ -195,7 +261,8 @@ impl FeedCrawler {
|
||||
} => {
|
||||
let result = self.crawl_feed(feed_id).await;
|
||||
if let Err(error) = &result {
|
||||
match Feed::update_crawl_error(&self.pool, feed_id, format!("{}", error)).await {
|
||||
match Feed::update_crawl_error(&self.pool, feed_id, format!("{}", error)).await
|
||||
{
|
||||
Ok(_) => info!("updated feed last_crawl_error"),
|
||||
Err(e) => error!("failed to update feed last_crawl_error: {}", e),
|
||||
}
|
||||
@ -210,7 +277,7 @@ impl FeedCrawler {
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn run(&mut self) {
|
||||
info!("starting feed crawler");
|
||||
debug!("starting feed crawler");
|
||||
while let Some(msg) = self.receiver.recv().await {
|
||||
self.handle_message(msg).await;
|
||||
}
|
||||
|
@ -1,2 +1,3 @@
|
||||
pub mod crawl_scheduler;
|
||||
pub mod entry_crawler;
|
||||
pub mod feed_crawler;
|
||||
|
@ -10,9 +10,10 @@ use std::env;
|
||||
use tracing::info;
|
||||
use uuid::Uuid;
|
||||
|
||||
use lib::models::entry::{Entry, CreateEntry};
|
||||
use lib::models::entry::{CreateEntry, Entry};
|
||||
use lib::models::feed::{CreateFeed, Feed, FeedType};
|
||||
use lib::uuid::Base62Uuid;
|
||||
use lib::USER_AGENT;
|
||||
|
||||
/// CLI for crawlnicle
|
||||
#[derive(Parser)]
|
||||
@ -137,7 +138,7 @@ pub async fn main() -> Result<()> {
|
||||
}
|
||||
Commands::Crawl(CrawlFeed { id }) => {
|
||||
info!("Crawling feed {}...", Base62Uuid::from(id));
|
||||
let client = Client::new();
|
||||
let client = Client::builder().user_agent(USER_AGENT).build()?;
|
||||
// NOTE: this is not the same DomainLocks as the one used in the server so, if the
|
||||
// server is running, it will *not* serialize same-domain requests with it.
|
||||
let domain_locks = DomainLocks::new();
|
||||
|
@ -7,17 +7,14 @@ use axum::response::{IntoResponse, Redirect, Response, Sse};
|
||||
use axum::Form;
|
||||
use feed_rs::parser;
|
||||
use maud::html;
|
||||
use reqwest::Client;
|
||||
use serde::Deserialize;
|
||||
use serde_with::{serde_as, NoneAsEmptyString};
|
||||
use sqlx::PgPool;
|
||||
use tokio_stream::wrappers::errors::BroadcastStreamRecvError;
|
||||
use tokio_stream::wrappers::BroadcastStream;
|
||||
use tokio_stream::StreamExt;
|
||||
|
||||
use crate::actors::feed_crawler::{FeedCrawlerHandle, FeedCrawlerHandleMessage};
|
||||
use crate::config::Config;
|
||||
use crate::domain_locks::DomainLocks;
|
||||
use crate::actors::crawl_scheduler::{CrawlSchedulerHandle, CrawlSchedulerHandleMessage};
|
||||
use crate::actors::feed_crawler::FeedCrawlerHandleMessage;
|
||||
use crate::error::{Error, Result};
|
||||
use crate::models::entry::Entry;
|
||||
use crate::models::feed::{CreateFeed, Feed};
|
||||
@ -109,19 +106,9 @@ impl IntoResponse for AddFeedError {
|
||||
pub async fn post(
|
||||
State(pool): State<PgPool>,
|
||||
State(crawls): State<Crawls>,
|
||||
State(domain_locks): State<DomainLocks>,
|
||||
State(config): State<Config>,
|
||||
State(crawl_scheduler): State<CrawlSchedulerHandle>,
|
||||
Form(add_feed): Form<AddFeed>,
|
||||
) -> AddFeedResult<Response> {
|
||||
// TODO: store the client in axum state (as long as it can be used concurrently?)
|
||||
let client = Client::new();
|
||||
let feed_crawler = FeedCrawlerHandle::new(
|
||||
pool.clone(),
|
||||
client.clone(),
|
||||
domain_locks.clone(),
|
||||
config.content_dir.clone(),
|
||||
);
|
||||
|
||||
let feed = Feed::create(
|
||||
&pool,
|
||||
CreateFeed {
|
||||
@ -144,7 +131,7 @@ pub async fn post(
|
||||
AddFeedError::CreateFeedError(add_feed.url.clone(), err)
|
||||
})?;
|
||||
|
||||
let receiver = feed_crawler.crawl(feed.feed_id).await;
|
||||
let receiver = crawl_scheduler.schedule(feed.feed_id).await;
|
||||
{
|
||||
let mut crawls = crawls.lock().map_err(|_| {
|
||||
AddFeedError::CreateFeedError(add_feed.url.clone(), Error::InternalServerError)
|
||||
@ -185,20 +172,24 @@ pub async fn stream(
|
||||
let stream = BroadcastStream::new(receiver);
|
||||
let feed_id = format!("feed-{}", id);
|
||||
let stream = stream.map(move |msg| match msg {
|
||||
Ok(FeedCrawlerHandleMessage::Feed(Ok(feed))) => Ok::<Event, String>(
|
||||
Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
template {
|
||||
li id=(feed_id) { (feed_link(&feed, false)) }
|
||||
Ok(CrawlSchedulerHandleMessage::FeedCrawler(FeedCrawlerHandleMessage::Feed(Ok(feed)))) => {
|
||||
Ok::<Event, String>(
|
||||
Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
template {
|
||||
li id=(feed_id) { (feed_link(&feed, false)) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.into_string(),
|
||||
),
|
||||
),
|
||||
Ok(FeedCrawlerHandleMessage::Feed(Err(error))) => Ok(Event::default().data(
|
||||
.into_string(),
|
||||
),
|
||||
)
|
||||
}
|
||||
Ok(CrawlSchedulerHandleMessage::FeedCrawler(FeedCrawlerHandleMessage::Feed(Err(
|
||||
error,
|
||||
)))) => Ok(Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
@ -210,18 +201,22 @@ pub async fn stream(
|
||||
.into_string(),
|
||||
)),
|
||||
// TODO: these Entry messages are not yet sent, need to handle them better
|
||||
Ok(FeedCrawlerHandleMessage::Entry(Ok(_))) => Ok(Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
template {
|
||||
li id=(feed_id) { "fetched entry" }
|
||||
Ok(CrawlSchedulerHandleMessage::FeedCrawler(FeedCrawlerHandleMessage::Entry(Ok(_)))) => {
|
||||
Ok(Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
template {
|
||||
li id=(feed_id) { "fetched entry" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
.into_string(),
|
||||
)),
|
||||
Ok(FeedCrawlerHandleMessage::Entry(Err(error))) => Ok(Event::default().data(
|
||||
.into_string(),
|
||||
))
|
||||
}
|
||||
Ok(CrawlSchedulerHandleMessage::FeedCrawler(FeedCrawlerHandleMessage::Entry(Err(
|
||||
error,
|
||||
)))) => Ok(Event::default().data(
|
||||
html! {
|
||||
turbo-stream action="remove" target="feed-stream" {}
|
||||
turbo-stream action="replace" target=(feed_id) {
|
||||
@ -232,7 +227,7 @@ pub async fn stream(
|
||||
}
|
||||
.into_string(),
|
||||
)),
|
||||
Err(BroadcastStreamRecvError::Lagged(_)) => Ok(Event::default()),
|
||||
_ => Ok(Event::default()),
|
||||
});
|
||||
Ok(Sse::new(stream).keep_alive(
|
||||
KeepAlive::new()
|
||||
|
@ -11,5 +11,6 @@ pub mod turbo_stream;
|
||||
pub mod utils;
|
||||
pub mod uuid;
|
||||
|
||||
pub const USER_AGENT: &str = "crawlnicle/0.1.0";
|
||||
pub const JS_BUNDLES: &str = include_str!("../static/js/manifest.txt");
|
||||
pub const CSS_BUNDLES: &str = include_str!("../static/css/manifest.txt");
|
||||
|
54
src/main.rs
54
src/main.rs
@ -11,7 +11,6 @@ use axum::{
|
||||
Router,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use chrono::{Duration, Utc};
|
||||
use clap::Parser;
|
||||
use dotenvy::dotenv;
|
||||
use notify::Watcher;
|
||||
@ -21,13 +20,15 @@ use tokio::sync::watch::channel;
|
||||
use tower::ServiceBuilder;
|
||||
use tower_http::{services::ServeDir, trace::TraceLayer};
|
||||
use tower_livereload::LiveReloadLayer;
|
||||
use tracing::{debug, info};
|
||||
use tracing::debug;
|
||||
|
||||
use lib::actors::crawl_scheduler::CrawlSchedulerHandle;
|
||||
use lib::config::Config;
|
||||
use lib::domain_locks::DomainLocks;
|
||||
use lib::handlers;
|
||||
use lib::log::init_tracing;
|
||||
use lib::state::AppState;
|
||||
use lib::{actors::feed_crawler::FeedCrawlerHandle, config::Config, models::feed::Feed};
|
||||
use lib::{domain_locks::DomainLocks, models::feed::GetFeedsOptions};
|
||||
use lib::USER_AGENT;
|
||||
|
||||
async fn serve(app: Router, addr: SocketAddr) -> Result<()> {
|
||||
debug!("listening on {}", addr);
|
||||
@ -48,6 +49,7 @@ async fn main() -> Result<()> {
|
||||
|
||||
let crawls = Arc::new(Mutex::new(HashMap::new()));
|
||||
let domain_locks = DomainLocks::new();
|
||||
let client = Client::builder().user_agent(USER_AGENT).build()?;
|
||||
|
||||
let pool = PgPoolOptions::new()
|
||||
.max_connections(config.database_max_connections)
|
||||
@ -56,6 +58,14 @@ async fn main() -> Result<()> {
|
||||
|
||||
sqlx::migrate!().run(&pool).await?;
|
||||
|
||||
let crawl_scheduler = CrawlSchedulerHandle::new(
|
||||
pool.clone(),
|
||||
client.clone(),
|
||||
domain_locks.clone(),
|
||||
config.content_dir.clone(),
|
||||
);
|
||||
let _ = crawl_scheduler.bootstrap().await;
|
||||
|
||||
let addr = format!("{}:{}", &config.host, &config.port).parse()?;
|
||||
let mut app = Router::new()
|
||||
.route("/api/v1/feeds", get(handlers::api::feeds::get))
|
||||
@ -75,42 +85,16 @@ async fn main() -> Result<()> {
|
||||
.route("/log/stream", get(handlers::log::stream))
|
||||
.nest_service("/static", ServeDir::new("static"))
|
||||
.with_state(AppState {
|
||||
pool: pool.clone(),
|
||||
config: config.clone(),
|
||||
pool,
|
||||
config,
|
||||
log_receiver,
|
||||
crawls,
|
||||
domain_locks: domain_locks.clone(),
|
||||
domain_locks,
|
||||
client,
|
||||
crawl_scheduler,
|
||||
})
|
||||
.layer(ServiceBuilder::new().layer(TraceLayer::new_for_http()));
|
||||
|
||||
info!("starting crawlers");
|
||||
let mut options = GetFeedsOptions::default();
|
||||
loop {
|
||||
let feeds = Feed::get_all(&pool, options.clone()).await?;
|
||||
if feeds.is_empty() {
|
||||
break;
|
||||
}
|
||||
for feed in feeds.iter() {
|
||||
let client = Client::new(); // TODO: store in state and reuse
|
||||
if let Some(last_crawled_at) = feed.last_crawled_at {
|
||||
if last_crawled_at
|
||||
>= Utc::now() - Duration::minutes(feed.crawl_interval_minutes.into())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let feed_crawler = FeedCrawlerHandle::new(
|
||||
pool.clone(),
|
||||
client.clone(),
|
||||
domain_locks.clone(),
|
||||
config.content_dir.clone(),
|
||||
);
|
||||
let _ = feed_crawler.crawl(feed.feed_id).await;
|
||||
}
|
||||
options.before = feeds.last().map(|f| f.created_at);
|
||||
}
|
||||
info!("done starting crawlers");
|
||||
|
||||
if cfg!(debug_assertions) {
|
||||
debug!("starting livereload");
|
||||
let livereload = LiveReloadLayer::new();
|
||||
|
@ -15,6 +15,8 @@ pub struct Entry {
|
||||
pub url: String,
|
||||
pub description: Option<String>,
|
||||
pub feed_id: Uuid,
|
||||
pub etag_header: Option<String>,
|
||||
pub last_modified_header: Option<String>,
|
||||
pub published_at: DateTime<Utc>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub updated_at: Option<DateTime<Utc>>,
|
||||
@ -272,7 +274,9 @@ impl Entry {
|
||||
url = $3,
|
||||
description = $4,
|
||||
feed_id = $5,
|
||||
published_at = $6
|
||||
etag_header = $6,
|
||||
last_modified_header = $7,
|
||||
published_at = $8
|
||||
where entry_id = $1
|
||||
returning *
|
||||
",
|
||||
@ -281,6 +285,8 @@ impl Entry {
|
||||
payload.url,
|
||||
payload.description,
|
||||
payload.feed_id,
|
||||
payload.etag_header,
|
||||
payload.last_modified_header,
|
||||
payload.published_at,
|
||||
)
|
||||
.fetch_one(pool)
|
||||
|
@ -62,6 +62,8 @@ pub struct Feed {
|
||||
pub description: Option<String>,
|
||||
pub crawl_interval_minutes: i32,
|
||||
pub last_crawl_error: Option<String>,
|
||||
pub etag_header: Option<String>,
|
||||
pub last_modified_header: Option<String>,
|
||||
pub last_crawled_at: Option<DateTime<Utc>>,
|
||||
pub last_entry_published_at: Option<DateTime<Utc>>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
@ -137,6 +139,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -170,6 +174,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -197,6 +203,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -226,6 +234,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -253,6 +263,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -282,6 +294,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -309,6 +323,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -338,6 +354,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -365,6 +383,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -401,6 +421,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -436,6 +458,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -526,8 +550,10 @@ impl Feed {
|
||||
description = $5,
|
||||
crawl_interval_minutes = $6,
|
||||
last_crawl_error = $7,
|
||||
last_crawled_at = $8,
|
||||
last_entry_published_at = $9
|
||||
etag_header = $8,
|
||||
last_modified_header = $9,
|
||||
last_crawled_at = $10,
|
||||
last_entry_published_at = $11
|
||||
where feed_id = $1
|
||||
returning
|
||||
feed_id,
|
||||
@ -537,6 +563,8 @@ impl Feed {
|
||||
description,
|
||||
crawl_interval_minutes,
|
||||
last_crawl_error,
|
||||
etag_header,
|
||||
last_modified_header,
|
||||
last_crawled_at,
|
||||
last_entry_published_at,
|
||||
created_at,
|
||||
@ -550,6 +578,8 @@ impl Feed {
|
||||
self.description,
|
||||
self.crawl_interval_minutes,
|
||||
self.last_crawl_error,
|
||||
self.etag_header,
|
||||
self.last_modified_header,
|
||||
self.last_crawled_at,
|
||||
self.last_entry_published_at,
|
||||
)
|
||||
|
@ -4,10 +4,10 @@ pub fn header(title: &str) -> Markup {
|
||||
html! {
|
||||
header class="header" {
|
||||
nav {
|
||||
h1 { a href="/" data-turbo-frame="main" { (title) } }
|
||||
h1 { a href="/" { (title) } }
|
||||
ul {
|
||||
li { a href="/feeds" data-turbo-frame="main" { "feeds" } }
|
||||
li { a href="/log" data-turbo-frame="main" { "log" } }
|
||||
li { a href="/feeds" { "feeds" } }
|
||||
li { a href="/log" { "log" } }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -107,9 +107,7 @@ impl Layout {
|
||||
}
|
||||
body {
|
||||
(header(&self.title))
|
||||
turbo-frame id="main" data-turbo-action="advance" {
|
||||
(template)
|
||||
}
|
||||
(template)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
23
src/state.rs
23
src/state.rs
@ -7,13 +7,14 @@ use axum::extract::FromRef;
|
||||
use bytes::Bytes;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
use reqwest::Client;
|
||||
|
||||
use crate::actors::feed_crawler::FeedCrawlerHandleMessage;
|
||||
use crate::actors::crawl_scheduler::{CrawlSchedulerHandle, CrawlSchedulerHandleMessage};
|
||||
use crate::config::Config;
|
||||
use crate::domain_locks::DomainLocks;
|
||||
|
||||
/// A map of feed IDs to a channel receiver for the active `FeedCrawler` running a crawl for that
|
||||
/// feed.
|
||||
/// A map of feed IDs to a channel receiver for the active `CrawlScheduler` running a feed crawl
|
||||
/// for that feed.
|
||||
///
|
||||
/// Currently, the only purpose of this is to keep track of active crawls so that axum handlers can
|
||||
/// subscribe to the result of the crawl via the receiver channel which are then sent to end-users
|
||||
@ -22,7 +23,7 @@ use crate::domain_locks::DomainLocks;
|
||||
/// This map should only contain crawls that have just been created but not yet subscribed to.
|
||||
/// Entries are only added when a user adds a feed in the UI and entries are removed by the same
|
||||
/// user once a server-sent event connection is established.
|
||||
pub type Crawls = Arc<Mutex<HashMap<Uuid, broadcast::Receiver<FeedCrawlerHandleMessage>>>>;
|
||||
pub type Crawls = Arc<Mutex<HashMap<Uuid, broadcast::Receiver<CrawlSchedulerHandleMessage>>>>;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AppState {
|
||||
@ -31,6 +32,8 @@ pub struct AppState {
|
||||
pub log_receiver: watch::Receiver<Bytes>,
|
||||
pub crawls: Crawls,
|
||||
pub domain_locks: DomainLocks,
|
||||
pub client: Client,
|
||||
pub crawl_scheduler: CrawlSchedulerHandle,
|
||||
}
|
||||
|
||||
impl FromRef<AppState> for PgPool {
|
||||
@ -62,3 +65,15 @@ impl FromRef<AppState> for DomainLocks {
|
||||
state.domain_locks.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromRef<AppState> for Client {
|
||||
fn from_ref(state: &AppState) -> Self {
|
||||
state.client.clone()
|
||||
}
|
||||
}
|
||||
|
||||
impl FromRef<AppState> for CrawlSchedulerHandle {
|
||||
fn from_ref(state: &AppState) -> Self {
|
||||
state.crawl_scheduler.clone()
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user