Add CrawlScheduler actor, shared client with Last-Modified headers

2023-07-15 21:40:31 -04:00
parent 02d5cb9976
commit d17f909312
14 changed files with 494 additions and 109 deletions
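The headline change is the new CrawlSchedulerHandle, which follows the same actor-handle pattern as the existing FeedCrawlerHandle: a cheaply cloneable handle holds the sending half of a channel, while a spawned task owns the state and reacts to messages. Below is a minimal sketch of that pattern, assuming a tokio mpsc channel and an illustrative message enum; the real constructor (visible in the diff) also takes DomainLocks and a content directory, which are omitted here.

```rust
// Minimal sketch of the actor-handle pattern used by CrawlSchedulerHandle.
// The message enum, field names, and channel size are assumptions for
// illustration, not the crate's actual definitions.
use sqlx::PgPool;
use tokio::sync::mpsc;

enum CrawlSchedulerMessage {
    /// Schedule every feed in the database on startup.
    Bootstrap,
}

#[derive(Clone)]
pub struct CrawlSchedulerHandle {
    sender: mpsc::Sender<CrawlSchedulerMessage>,
}

impl CrawlSchedulerHandle {
    pub fn new(pool: PgPool, client: reqwest::Client) -> Self {
        let (sender, receiver) = mpsc::channel(32);
        // The spawned task owns the pool and the shared HTTP client; the
        // handle only holds a Sender, so cloning it into AppState is cheap.
        tokio::spawn(run_crawl_scheduler(receiver, pool, client));
        Self { sender }
    }

    pub async fn bootstrap(&self) {
        // Fire-and-forget, mirroring `let _ = crawl_scheduler.bootstrap().await;`
        // in main below.
        let _ = self.sender.send(CrawlSchedulerMessage::Bootstrap).await;
    }
}

async fn run_crawl_scheduler(
    mut receiver: mpsc::Receiver<CrawlSchedulerMessage>,
    _pool: PgPool,
    _client: reqwest::Client,
) {
    while let Some(msg) = receiver.recv().await {
        match msg {
            CrawlSchedulerMessage::Bootstrap => {
                // Page through feeds, skip ones crawled within their
                // crawl_interval_minutes, and hand each one to a feed
                // crawler -- the logic that used to live in main's loop.
            }
        }
    }
}
```

In the diff, the handle is also stored in AppState alongside the pool and client, so request handlers can reach the scheduler without owning any of its state.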


@@ -11,7 +11,6 @@ use axum::{
Router,
};
use bytes::Bytes;
use chrono::{Duration, Utc};
use clap::Parser;
use dotenvy::dotenv;
use notify::Watcher;
@@ -21,13 +20,15 @@ use tokio::sync::watch::channel;
use tower::ServiceBuilder;
use tower_http::{services::ServeDir, trace::TraceLayer};
use tower_livereload::LiveReloadLayer;
use tracing::{debug, info};
use tracing::debug;
use lib::actors::crawl_scheduler::CrawlSchedulerHandle;
use lib::config::Config;
use lib::domain_locks::DomainLocks;
use lib::handlers;
use lib::log::init_tracing;
use lib::state::AppState;
use lib::{actors::feed_crawler::FeedCrawlerHandle, config::Config, models::feed::Feed};
use lib::{domain_locks::DomainLocks, models::feed::GetFeedsOptions};
use lib::USER_AGENT;
async fn serve(app: Router, addr: SocketAddr) -> Result<()> {
debug!("listening on {}", addr);
@@ -48,6 +49,7 @@ async fn main() -> Result<()> {
let crawls = Arc::new(Mutex::new(HashMap::new()));
let domain_locks = DomainLocks::new();
let client = Client::builder().user_agent(USER_AGENT).build()?;
let pool = PgPoolOptions::new()
.max_connections(config.database_max_connections)
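The other half of the commit title is the shared client: main now builds one reqwest::Client with the crate's USER_AGENT (in the hunk above) and hands clones of it to the scheduler, replacing the per-feed Client::new() in the removed loop further down. The "Last-Modified headers" part presumably refers to conditional requests made with that client; the sketch below shows what such a fetch could look like, with the helper name and surrounding flow being assumptions rather than the crate's actual code.

```rust
// Hedged sketch of a conditional GET using a stored Last-Modified value.
// `fetch_if_modified` and its flow are illustrative; only the header names
// and the 304 status code are standard HTTP.
use reqwest::{header, Client, StatusCode};

async fn fetch_if_modified(
    client: &Client,
    url: &str,
    last_modified: Option<&str>, // value saved from a previous crawl
) -> reqwest::Result<Option<String>> {
    let mut request = client.get(url);
    if let Some(value) = last_modified {
        request = request.header(header::IF_MODIFIED_SINCE, value);
    }
    let response = request.send().await?;
    if response.status() == StatusCode::NOT_MODIFIED {
        // 304: the feed has not changed since the last crawl, skip it.
        return Ok(None);
    }
    // Remember the new Last-Modified value (if any) for the next crawl.
    let _new_last_modified = response
        .headers()
        .get(header::LAST_MODIFIED)
        .and_then(|v| v.to_str().ok())
        .map(str::to_owned);
    Ok(Some(response.text().await?))
}
```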
@@ -56,6 +58,14 @@ async fn main() -> Result<()> {
sqlx::migrate!().run(&pool).await?;
let crawl_scheduler = CrawlSchedulerHandle::new(
pool.clone(),
client.clone(),
domain_locks.clone(),
config.content_dir.clone(),
);
let _ = crawl_scheduler.bootstrap().await;
let addr = format!("{}:{}", &config.host, &config.port).parse()?;
let mut app = Router::new()
.route("/api/v1/feeds", get(handlers::api::feeds::get))
@@ -75,42 +85,16 @@ async fn main() -> Result<()> {
.route("/log/stream", get(handlers::log::stream))
.nest_service("/static", ServeDir::new("static"))
.with_state(AppState {
pool: pool.clone(),
config: config.clone(),
pool,
config,
log_receiver,
crawls,
domain_locks: domain_locks.clone(),
domain_locks,
client,
crawl_scheduler,
})
.layer(ServiceBuilder::new().layer(TraceLayer::new_for_http()));
info!("starting crawlers");
let mut options = GetFeedsOptions::default();
loop {
let feeds = Feed::get_all(&pool, options.clone()).await?;
if feeds.is_empty() {
break;
}
for feed in feeds.iter() {
let client = Client::new(); // TODO: store in state and reuse
if let Some(last_crawled_at) = feed.last_crawled_at {
if last_crawled_at
>= Utc::now() - Duration::minutes(feed.crawl_interval_minutes.into())
{
continue;
}
}
let feed_crawler = FeedCrawlerHandle::new(
pool.clone(),
client.clone(),
domain_locks.clone(),
config.content_dir.clone(),
);
let _ = feed_crawler.crawl(feed.feed_id).await;
}
options.before = feeds.last().map(|f| f.created_at);
}
info!("done starting crawlers");
if cfg!(debug_assertions) {
debug!("starting livereload");
let livereload = LiveReloadLayer::new();