Finish implementing OPML importer

Now with progress messages!
This commit is contained in:
Tyler Hallada 2023-08-29 00:35:19 -04:00
parent eddf39b62e
commit 2f39be4152
10 changed files with 148 additions and 58 deletions
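
Note: the progress messages flow through the actor-handle pattern already used in this codebase. Each handle call returns a tokio::sync::broadcast::Receiver, and each layer forwards what it receives upward (entry crawler → feed crawler → crawl scheduler → importer → SSE stream). A minimal sketch of one forwarding step, with hypothetical message types standing in for the real handle enums:

    use tokio::sync::broadcast;

    // Hypothetical message types standing in for the real handle enums.
    #[derive(Debug, Clone)]
    enum Upstream {
        Progress(String),
    }

    #[derive(Debug, Clone)]
    enum Downstream {
        Forwarded(Upstream),
    }

    // Forward every upstream message to downstream subscribers. `recv`
    // returns Err once the upstream sender is dropped, which ends the
    // loop. This is the same shape as the `listen_to_crawl` helper
    // added in the importer diff below.
    async fn forward(
        mut upstream: broadcast::Receiver<Upstream>,
        downstream: broadcast::Sender<Downstream>,
    ) {
        while let Ok(msg) = upstream.recv().await {
            let _ = downstream.send(Downstream::Forwarded(msg));
        }
    }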

View File

@@ -43,15 +43,15 @@ ul.entries {
   font-size: 16px;
 }
 
-ul.entries li {
+li.entry {
   margin-bottom: 8px;
 }
 
-ul.entries li a {
+a.entry-link {
   text-decoration: none;
 }
 
-ul.entries li em.domain {
+em.entry-link-domain {
   margin-left: 8px;
   color: rgba(0, 0, 0, 0.75);
 }
@@ -152,6 +152,20 @@ form.feed-form .form-grid button {
   grid-column: 3 / 4;
 }
 
+ul#add-feed-messages {
+  list-style: none;
+  padding: 0;
+  margin: 0;
+  overflow-x: hidden;
+  white-space: nowrap;
+}
+
+ul#add-feed-messages li {
+  overflow: hidden;
+  white-space: nowrap;
+  text-overflow: ellipsis;
+}
+
 /* Feed */
 
 header.feed-header {

View File

@@ -123,7 +123,11 @@ impl CrawlScheduler {
         let crawl_interval = Duration::from_secs(feed.crawl_interval_minutes as u64 * 60);
         let mut interval = tokio::time::interval(crawl_interval);
         if let Some(last_crawled_at) = feed.last_crawled_at {
+            dbg!(last_crawled_at);
+            dbg!(Utc::now());
             if let Ok(duration_since_last_crawl) = (Utc::now() - last_crawled_at).to_std() {
+                dbg!(duration_since_last_crawl);
+                dbg!(crawl_interval);
                 if duration_since_last_crawl < crawl_interval {
                     info!(
                         "last crawled at {:?}, crawling again in {:?}",
@@ -145,27 +149,27 @@
         );
         tokio::spawn(async move {
             loop {
-                debug!("spawned crawler for feed");
                 interval.tick().await;
-                debug!("tick!");
                 let mut receiver = feed_crawler.crawl(feed.feed_id).await;
-                match receiver.recv().await {
-                    Ok(FeedCrawlerHandleMessage::Feed(Ok(feed))) => {
-                        let crawl_interval =
-                            Duration::from_secs(feed.crawl_interval_minutes as u64 * 60);
-                        interval = interval_at(Instant::now() + crawl_interval, crawl_interval);
-                        info!(
-                            minutes = feed.crawl_interval_minutes,
-                            "updated crawl interval"
-                        );
-                        let _ = respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(
-                            FeedCrawlerHandleMessage::Feed(Ok(feed)),
-                        ));
-                    }
-                    Ok(result) => {
-                        let _ = respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(result));
-                    }
-                    _ => {}
+                while let Ok(msg) = receiver.recv().await {
+                    match msg {
+                        FeedCrawlerHandleMessage::Feed(Ok(feed)) => {
+                            let crawl_interval =
+                                Duration::from_secs(feed.crawl_interval_minutes as u64 * 60);
+                            interval = interval_at(Instant::now() + crawl_interval, crawl_interval);
+                            info!(
+                                minutes = feed.crawl_interval_minutes,
+                                "updated crawl interval"
+                            );
+                            let _ = respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(
+                                FeedCrawlerHandleMessage::Feed(Ok(feed)),
+                            ));
+                        }
+                        result => {
+                            let _ =
+                                respond_to.send(CrawlSchedulerHandleMessage::FeedCrawler(result));
+                        }
+                    }
                 }
             }
         });
@@ -229,7 +233,7 @@ pub struct CrawlSchedulerHandle {
 /// `CrawlSchedulerHandle`.
 ///
 /// `CrawlSchedulerHandleMessage::Feed` contains the result of crawling a feed url.
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub enum CrawlSchedulerHandleMessage {
     Bootstrap(CrawlSchedulerResult<()>),
     Schedule(CrawlSchedulerResult<()>),
@@ -268,7 +272,10 @@ impl CrawlSchedulerHandle {
    /// Sends a `CrawlSchedulerMessage::Schedule` message to the running `CrawlScheduler` actor.
    ///
    /// Listen to the result of the scheduling via the returned `broadcast::Receiver`.
-    pub async fn schedule(&self, feed_id: Uuid) -> broadcast::Receiver<CrawlSchedulerHandleMessage> {
+    pub async fn schedule(
+        &self,
+        feed_id: Uuid,
+    ) -> broadcast::Receiver<CrawlSchedulerHandleMessage> {
         let (sender, receiver) = broadcast::channel(8);
         let msg = CrawlSchedulerMessage::Schedule {
             feed_id,
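
Note on the rescheduling above: a plain tokio::time::interval fires its first tick immediately, so when a crawl reports a new crawl_interval_minutes the scheduler swaps in interval_at to push the next tick out by the fresh interval instead. A small sketch of the difference (the 60-second value is illustrative):

    use std::time::Duration;
    use tokio::time::{interval, interval_at, Instant};

    #[tokio::main]
    async fn main() {
        let crawl_interval = Duration::from_secs(60); // illustrative value

        // A plain interval's first tick completes immediately:
        let mut eager = interval(crawl_interval);
        eager.tick().await;

        // interval_at's first tick completes one full interval from now,
        // as used when the scheduler resets its timer after a crawl:
        let mut deferred = interval_at(Instant::now() + crawl_interval, crawl_interval);
        deferred.tick().await; // completes ~60s later, then every 60s
    }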

View File

@@ -14,7 +14,9 @@ use tracing::{debug, error, info, info_span, instrument, warn};
 use url::Url;
 use uuid::Uuid;
 
-use crate::actors::entry_crawler::EntryCrawlerHandle;
+use crate::actors::entry_crawler::{
+    EntryCrawlerHandle, EntryCrawlerHandleMessage, EntryCrawlerResult,
+};
 use crate::domain_locks::DomainLocks;
 use crate::models::entry::{CreateEntry, Entry};
 use crate::models::feed::{Feed, MAX_CRAWL_INTERVAL_MINUTES, MIN_CRAWL_INTERVAL_MINUTES};
@@ -87,7 +89,11 @@ impl FeedCrawler {
     }
 
     #[instrument(skip_all, fields(feed_id = %feed_id))]
-    async fn crawl_feed(&self, feed_id: Uuid) -> FeedCrawlerResult<Feed> {
+    async fn crawl_feed(
+        &self,
+        feed_id: Uuid,
+        respond_to: broadcast::Sender<FeedCrawlerHandleMessage>,
+    ) -> FeedCrawlerResult<Feed> {
         let mut feed = Feed::get(&self.pool, feed_id)
             .await
             .map_err(|_| FeedCrawlerError::GetFeedError(Base62Uuid::from(feed_id)))?;
@@ -159,6 +165,7 @@
             return Ok(feed);
         } else if !resp.status().is_success() {
             warn!("feed returned non-successful status");
+            feed.last_crawled_at = Some(Utc::now());
             feed.last_crawl_error = resp.status().canonical_reason().map(|s| s.to_string());
             let feed = feed
                 .save(&self.pool)
@@ -246,8 +253,10 @@
                 self.domain_locks.clone(),
                 self.content_dir.clone(),
             );
-            // TODO: ignoring this receiver for the time being, pipe through events eventually
-            let _ = entry_crawler.crawl(entry).await;
+            let mut entry_receiver = entry_crawler.crawl(entry).await;
+            while let Ok(EntryCrawlerHandleMessage::Entry(result)) = entry_receiver.recv().await {
+                let _ = respond_to.send(FeedCrawlerHandleMessage::Entry(result));
+            }
         }
         Ok(feed)
     }
@@ -259,7 +268,7 @@
                 feed_id,
                 respond_to,
             } => {
-                let result = self.crawl_feed(feed_id).await;
+                let result = self.crawl_feed(feed_id, respond_to.clone()).await;
                 if let Err(error) = &result {
                     match Feed::update_crawl_error(&self.pool, feed_id, format!("{}", error)).await
                     {
@@ -298,10 +307,10 @@ pub struct FeedCrawlerHandle {
 ///
 /// `FeedCrawlerHandleMessage::Feed` contains the result of crawling a feed url.
 /// `FeedCrawlerHandleMessage::Entry` contains the result of crawling an entry url within the feed.
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub enum FeedCrawlerHandleMessage {
     Feed(FeedCrawlerResult<Feed>),
-    Entry(FeedCrawlerResult<Entry>),
+    Entry(EntryCrawlerResult<Entry>),
 }
 
 impl FeedCrawlerHandle {
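
The signature change above is what lets per-entry progress escape mid-crawl: crawl_feed now receives the broadcast::Sender that the handle's callers are subscribed to, so it can emit one Entry message per crawled entry before the final Feed result is sent. In simplified form (hypothetical types, not the real ones):

    use tokio::sync::broadcast;

    #[derive(Debug, Clone)]
    enum Msg {
        Entry(String), // one per crawled entry, sent mid-crawl
        Feed(String),  // final result, sent by the caller afterward
    }

    // Simplified shape of the change: the long-running worker takes the
    // sender so intermediate progress is broadcast as it happens, rather
    // than only a single result arriving at the end.
    async fn crawl_feed(respond_to: broadcast::Sender<Msg>) -> String {
        for entry in ["entry-a", "entry-b"] {
            let _ = respond_to.send(Msg::Entry(entry.to_string()));
        }
        "feed".to_string()
    }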

View File

@@ -8,8 +8,10 @@ use opml::OPML;
 use sqlx::PgPool;
 use tokio::sync::{broadcast, mpsc};
 use tracing::{debug, error, instrument};
+use uuid::Uuid;
 
 use crate::actors::crawl_scheduler::{CrawlSchedulerHandle, CrawlSchedulerHandleMessage};
+use crate::actors::feed_crawler::FeedCrawlerHandleMessage;
 use crate::models::feed::{Feed, UpsertFeed};
 use crate::uuid::Base62Uuid;
 
@@ -46,6 +48,17 @@ impl Display for ImporterMessage {
     }
 }
 
+async fn listen_to_crawl(
+    feed_id: Uuid,
+    crawl_scheduler: CrawlSchedulerHandle,
+    respond_to: broadcast::Sender<ImporterHandleMessage>,
+) {
+    let mut receiver = crawl_scheduler.schedule(feed_id).await;
+    while let Ok(msg) = receiver.recv().await {
+        let _ = respond_to.send(ImporterHandleMessage::CrawlScheduler(msg));
+    }
+}
+
 /// An error type that enumerates possible failures during a crawl and is cloneable and can be sent
 /// across threads (does not reference the originating Errors which are usually not cloneable).
 #[derive(thiserror::Error, Debug, Clone)]
@@ -80,7 +93,6 @@ impl Importer {
     ) -> ImporterResult<()> {
         let document = OPML::from_reader(&mut Cursor::new(bytes))
             .map_err(|_| ImporterError::InvalidOPML(file_name.unwrap_or(import_id.to_string())))?;
-        let mut receivers = Vec::new();
         for url in Self::gather_feed_urls(document.body.outlines) {
             let feed = Feed::upsert(
                 &self.pool,
@@ -91,19 +103,15 @@
             )
             .await
             .map_err(|_| ImporterError::CreateFeedError(url))?;
-            if feed.updated_at.is_some() {
-                receivers.push(self.crawl_scheduler.schedule(feed.feed_id).await);
+            if feed.updated_at.is_none() {
+                tokio::spawn(listen_to_crawl(
+                    feed.feed_id,
+                    self.crawl_scheduler.clone(),
+                    respond_to.clone(),
+                ));
             }
         }
 
-        let mut future_recvs: FuturesUnordered<_> =
-            receivers.iter_mut().map(|rx| rx.recv()).collect();
-
-        while let Some(result) = future_recvs.next().await {
-            if let Ok(crawl_scheduler_msg) = result {
-                let _ = respond_to.send(ImporterHandleMessage::CrawlScheduler(crawl_scheduler_msg));
-            }
-        }
 
         Ok(())
     }
@@ -161,7 +169,7 @@ pub struct ImporterHandle {
 ///
 /// `ImporterHandleMessage::Import` contains the result of importing the OPML file.
 #[allow(clippy::large_enum_variant)]
-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub enum ImporterHandleMessage {
     // TODO: send stats of import or forward crawler messages?
     Import(ImporterResult<()>),
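
The removed FuturesUnordered block awaited only a single recv() future per receiver, so at most the first message from each scheduled crawl was forwarded, and import itself stayed alive until every crawl had reported. Spawning a detached listen_to_crawl task per feed drains each receiver to completion, forwarding every message as it arrives, and lets import return immediately. A reduced sketch of the pattern (generic String messages in place of the real enums):

    use tokio::sync::broadcast;

    // Fan-in: one detached task per upstream receiver, each draining
    // its receiver to completion and forwarding into a shared sender.
    fn fan_in(
        receivers: Vec<broadcast::Receiver<String>>,
        respond_to: broadcast::Sender<String>,
    ) {
        for mut rx in receivers {
            let tx = respond_to.clone();
            tokio::spawn(async move {
                // Loops until the upstream sender is dropped,
                // not just for the first message.
                while let Ok(msg) = rx.recv().await {
                    let _ = tx.send(msg);
                }
            });
        }
    }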

View File

@@ -38,13 +38,14 @@ pub async fn get(State(pool): State<PgPool>, layout: Layout) -> Result<Response>
                     button type="submit" { "Add Feed" }
                 }
             }
-            form action="/import/opml" method="post" enctype="mulipart/form-data" class="feed-form" {
+            form action="/import/opml" method="post" enctype="multipart/form-data" class="feed-form" {
                 div class="form-grid" {
                     label for="opml" { "OPML: " }
                     input type="file" id="opml" name="opml" required="true" accept="text/x-opml,application/xml,text/xml";
                     button type="submit" { "Import Feeds" }
                 }
             }
+            ul id="add-feed-messages" {}
         }
     }
 }))

View File

@@ -12,6 +12,7 @@ use crate::actors::crawl_scheduler::CrawlSchedulerHandleMessage;
 use crate::actors::feed_crawler::FeedCrawlerHandleMessage;
 use crate::actors::importer::{ImporterHandle, ImporterHandleMessage};
 use crate::error::{Error, Result};
+use crate::partials::entry_link::entry_link;
 use crate::partials::feed_link::feed_link;
 use crate::state::Imports;
 use crate::turbo_stream::TurboStream;
@@ -23,7 +24,10 @@ pub async fn opml(
     mut multipart: Multipart,
 ) -> Result<Response> {
     dbg!("opml handler");
-    if let Some(field) = multipart.next_field().await.map_err(|err| { dbg!(&err); err })? {
+    if let Some(field) = multipart.next_field().await.map_err(|err| {
+        dbg!(&err);
+        err
+    })? {
         let import_id = Base62Uuid::new();
         dbg!(&import_id);
         let file_name = field.file_name().map(|s| s.to_string());
@@ -43,9 +47,9 @@
         TurboStream(
             html! {
                 turbo-stream-source src=(import_stream) id="import-stream" {}
-                turbo-stream action="append" target="feeds" {
+                turbo-stream action="append" target="add-feed-messages" {
                     template {
-                        li id=(import_html_id) { "Importing..." }
+                        li { "Uploading file..." }
                     }
                 }
                 turbo-stream action="remove" target="no-feeds";
@@ -75,10 +79,21 @@
         Ok(ImporterHandleMessage::Import(Ok(_))) => Ok::<Event, String>(
             Event::default().data(
                 html! {
-                    turbo-stream action="remove" target="import-stream" {}
-                    turbo-stream action="replace" target=(import_html_id) {
+                    turbo-stream action="append" target="add-feed-messages" {
+                        template { li { "Importing...." } }
+                    }
+                }
+                .into_string(),
+            ),
+        ),
+        Ok(ImporterHandleMessage::CrawlScheduler(CrawlSchedulerHandleMessage::FeedCrawler(
+            FeedCrawlerHandleMessage::Entry(Ok(entry)),
+        ))) => Ok::<Event, String>(
+            Event::default().data(
+                html! {
+                    turbo-stream action="append" target="add-feed-messages" {
                         template {
-                            li id=(import_html_id) { "Done importing" }
+                            li { "Imported: " (entry_link(entry)) }
                         }
                     }
                 }
@@ -90,6 +105,12 @@
         ))) => Ok::<Event, String>(
             Event::default().data(
                 html! {
+                    turbo-stream action="remove" target="import-stream" {}
+                    turbo-stream action="append" target="add-feed-messages" {
+                        template {
+                            li { "Finished import." }
+                        }
+                    }
                     turbo-stream action="prepend" target="feeds" {
                         template {
                             li id=(format!("feed-{}", feed.feed_id)) { (feed_link(&feed, false)) }
@@ -104,7 +125,21 @@
         ))) => Ok::<Event, String>(
             Event::default().data(
                 html! {
-                    turbo-stream action="prepend" target="feeds" {
+                    turbo-stream action="append" target="add-feed-messages" {
+                        template {
+                            li { span class="error" { (error) } }
+                        }
+                    }
+                }
+                .into_string(),
+            ),
+        ),
+        Ok(ImporterHandleMessage::CrawlScheduler(CrawlSchedulerHandleMessage::FeedCrawler(
+            FeedCrawlerHandleMessage::Entry(Err(error)),
+        ))) => Ok::<Event, String>(
+            Event::default().data(
+                html! {
+                    turbo-stream action="append" target="add-feed-messages" {
                         template {
                             li { span class="error" { (error) } }
                         }
@@ -116,6 +151,11 @@
         Ok(ImporterHandleMessage::Import(Err(error))) => Ok(Event::default().data(
             html! {
                 turbo-stream action="remove" target="import-stream" {}
+                turbo-stream action="append" target="add-feed-messages" {
+                    template {
+                        li { span class="error" { (error) } }
+                    }
+                }
                 turbo-stream action="replace" target=(import_html_id) {
                     template {
                         li id=(import_html_id) { span class="error" { (error) } }
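
Each broadcast message the stream handler receives becomes one SSE event whose data is a turbo-stream fragment; Turbo then appends the li inside the template to the ul#add-feed-messages list added on the home page (and styled in the CSS above). A minimal sketch of rendering one such progress line with maud, matching the fragments in the handler above:

    use maud::html;

    // Render one progress line as a Turbo Stream "append" fragment;
    // this is the shape of the data carried by each SSE event.
    fn progress_fragment(text: &str) -> String {
        html! {
            turbo-stream action="append" target="add-feed-messages" {
                template {
                    li { (text) }
                }
            }
        }
        .into_string()
    }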

View File

@@ -444,7 +444,7 @@ impl Feed {
             r#"insert into feed (
                 title, url, type, description
             ) values (
-                $1, $2, $3, $4
+                $1, $2, COALESCE($3, 'unknown'::feed_type), $4
             ) on conflict (url) do update set
                 title = excluded.title,
                 url = excluded.url,

View File

@@ -0,0 +1,14 @@
+use maud::{html, Markup};
+
+use crate::models::entry::Entry;
+use crate::utils::get_domain;
+use crate::uuid::Base62Uuid;
+
+pub fn entry_link(entry: Entry) -> Markup {
+    let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
+    let url = format!("/entry/{}", Base62Uuid::from(entry.entry_id));
+    let domain = get_domain(&entry.url).unwrap_or_default();
+    html! {
+        a href=(url) class="entry-link" { (title) } em class="entry-link-domain" { (domain) }
+    }
+}

View File

@@ -1,17 +1,13 @@
 use maud::{html, Markup};
 
 use crate::models::entry::Entry;
-use crate::utils::get_domain;
-use crate::uuid::Base62Uuid;
+use crate::partials::entry_link::entry_link;
 
 pub fn entry_list(entries: Vec<Entry>) -> Markup {
     html! {
         ul class="entries" {
             @for entry in entries {
-                @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
-                @let url = format!("/entry/{}", Base62Uuid::from(entry.entry_id));
-                @let domain = get_domain(&entry.url).unwrap_or_default();
-                li { a href=(url) { (title) } em class="domain" { (domain) }}
+                li class="entry" { (entry_link(entry)) }
             }
         }
     }

View File

@@ -1,3 +1,4 @@
+pub mod entry_link;
 pub mod entry_list;
 pub mod feed_link;
 pub mod header;