Add crawl metadata to feed & improve model interface

Tyler Hallada 2023-07-15 00:36:44 -04:00
parent 0fa0cfc508
commit 4837cbb903
18 changed files with 621 additions and 522 deletions

View File

@@ -88,8 +88,3 @@ You can also build the binary in release mode for running in production with the
 This project also comes with a CLI binary which allows you to manipulate the
 database directly without needing to go through the REST API server. Run
 `cli --help` to see all of the available commands.
-
-## Running jobs
-
-To periodically fetch new items from all of the feeds execute the `cli crawl`
-command in a cronjob.

View File

@@ -31,14 +31,18 @@ $$ language plpgsql;
 -- over things like usernames and emails, ithout needing to remember to do case-conversion.
 create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
 
-create type feed_type as enum ('atom', 'rss');
+create type feed_type as enum ('atom', 'json', 'rss0', 'rss1', 'rss2', 'unknown');
 
 create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,
-    type feed_type not null,
-    description text,
+    type feed_type not null default 'unknown',
+    description text default null,
+    crawl_interval_minutes int not null default 180,
+    last_crawl_error text default null,
+    last_crawled_at timestamptz default null,
+    last_entry_published_at timestamptz default null,
     created_at timestamptz not null default now(),
     updated_at timestamptz,
     deleted_at timestamptz
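
The new columns give a future scheduler everything it needs to decide when a feed is due for another crawl. A minimal sketch of that query, assuming the runtime `query_scalar` API and a hypothetical `feed_ids_due_for_crawl` helper (neither is part of this commit):

```rust
use sqlx::PgPool;
use uuid::Uuid;

/// Hypothetical helper: ids of feeds whose last crawl is older than their
/// per-feed interval, or that have never been crawled at all.
pub async fn feed_ids_due_for_crawl(pool: &PgPool) -> sqlx::Result<Vec<Uuid>> {
    sqlx::query_scalar(
        "select feed_id from feed
         where deleted_at is null
           and (last_crawled_at is null
                or last_crawled_at + make_interval(mins => crawl_interval_minutes) <= now())",
    )
    .fetch_all(pool)
    .await
}
```

The returned ids could then be handed to the `cli crawl` command or to `FeedCrawlerHandle::crawl` directly.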

View File

@@ -8,11 +8,13 @@ use tokio::sync::{broadcast, mpsc};
 use tracing::log::warn;
 use tracing::{info, info_span, instrument};
 use url::Url;
+use uuid::Uuid;
 
 use crate::actors::entry_crawler::EntryCrawlerHandle;
 use crate::domain_locks::DomainLocks;
-use crate::models::entry::{upsert_entries, CreateEntry, Entry};
-use crate::models::feed::{upsert_feed, CreateFeed, Feed};
+use crate::models::entry::{CreateEntry, Entry};
+use crate::models::feed::Feed;
+use crate::uuid::Base62Uuid;
 
 /// The `FeedCrawler` actor fetches a feed url, parses it, and saves it to the database.
 ///
@@ -31,7 +33,7 @@ struct FeedCrawler {
 #[derive(Debug)]
 enum FeedCrawlerMessage {
     Crawl {
-        url: Url,
+        feed_id: Uuid,
         respond_to: broadcast::Sender<FeedCrawlerHandleMessage>,
     },
 }
@@ -39,7 +41,7 @@ enum FeedCrawlerMessage {
 impl Display for FeedCrawlerMessage {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         match self {
-            FeedCrawlerMessage::Crawl { url, .. } => write!(f, "Crawl({})", url),
+            FeedCrawlerMessage::Crawl { feed_id, .. } => write!(f, "Crawl({})", feed_id),
         }
     }
 }
@@ -49,11 +51,13 @@ impl Display for FeedCrawlerMessage {
 #[derive(thiserror::Error, Debug, Clone)]
 pub enum FeedCrawlerError {
     #[error("invalid feed url: {0}")]
-    InvalidUrl(Url),
+    InvalidUrl(String),
     #[error("failed to fetch feed: {0}")]
     FetchError(Url),
     #[error("failed to parse feed: {0}")]
     ParseError(Url),
+    #[error("failed to find feed in database: {0}")]
+    GetFeedError(Base62Uuid),
     #[error("failed to create feed: {0}")]
     CreateFeedError(Url),
     #[error("failed to create feed entries: {0}")]
@@ -78,11 +82,17 @@ impl FeedCrawler {
         }
     }
 
-    #[instrument(skip_all, fields(url = %url))]
-    async fn crawl_feed(&self, url: Url) -> FeedCrawlerResult<Feed> {
+    #[instrument(skip_all, fields(feed_id = %feed_id))]
+    async fn crawl_feed(&self, feed_id: Uuid) -> FeedCrawlerResult<Feed> {
+        let mut feed = Feed::get(&self.pool, feed_id)
+            .await
+            .map_err(|_| FeedCrawlerError::GetFeedError(Base62Uuid::from(feed_id)))?;
+        info!("got feed from db");
+        let url = Url::parse(&feed.url)
+            .map_err(|_| FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
         let domain = url
             .domain()
-            .ok_or(FeedCrawlerError::InvalidUrl(url.clone()))?;
+            .ok_or(FeedCrawlerError::InvalidUrl(feed.url.clone()))?;
         let bytes = self
             .domain_locks
             .run_request(domain, async {
@@ -96,22 +106,24 @@ impl FeedCrawler {
                     .map_err(|_| FeedCrawlerError::FetchError(url.clone()))
             })
             .await?;
-        info!("fetched feed");
+        info!(url=%url, "fetched feed");
         let parsed_feed =
             parser::parse(&bytes[..]).map_err(|_| FeedCrawlerError::ParseError(url.clone()))?;
         info!("parsed feed");
-        let feed = upsert_feed(
-            &self.pool,
-            CreateFeed {
-                title: parsed_feed.title.map(|text| text.content),
-                url: url.to_string(),
-                feed_type: parsed_feed.feed_type.into(),
-                description: parsed_feed.description.map(|text| text.content),
-            },
-        )
+        feed.url = url.to_string();
+        feed.feed_type = parsed_feed.feed_type.into();
+        feed.last_crawled_at = Some(Utc::now());
+        if let Some(title) = parsed_feed.title {
+            feed.title = Some(title.content);
+        }
+        if let Some(description) = parsed_feed.description {
+            feed.description = Some(description.content);
+        }
+        let feed = feed
+            .save(&self.pool)
             .await
             .map_err(|_| FeedCrawlerError::CreateFeedError(url.clone()))?;
-        info!(%feed.feed_id, "upserted feed");
+        info!("updated feed in db");
 
         let mut payload = Vec::with_capacity(parsed_feed.entries.len());
         for entry in parsed_feed.entries {
@@ -132,7 +144,7 @@ impl FeedCrawler {
                 warn!("Skipping feed entry with no links");
             }
         }
-        let entries = upsert_entries(&self.pool, payload)
+        let entries = Entry::bulk_upsert(&self.pool, payload)
            .await
            .map_err(|_| FeedCrawlerError::CreateFeedEntriesError(url.clone()))?;
        let (new, updated) = entries
@@ -156,8 +168,11 @@ impl FeedCrawler {
     #[instrument(skip_all, fields(msg = %msg))]
     async fn handle_message(&mut self, msg: FeedCrawlerMessage) {
         match msg {
-            FeedCrawlerMessage::Crawl { url, respond_to } => {
-                let result = self.crawl_feed(url).await;
+            FeedCrawlerMessage::Crawl {
+                feed_id,
+                respond_to,
+            } => {
+                let result = self.crawl_feed(feed_id).await;
                 // ignore the result since the initiator may have cancelled waiting for the
                 // response, and that is ok
                 let _ = respond_to.send(FeedCrawlerHandleMessage::Feed(result));
@@ -212,10 +227,13 @@ impl FeedCrawlerHandle {
     /// Sends a `FeedCrawlerMessage::Crawl` message to the running `FeedCrawler` actor.
     ///
     /// Listen to the result of the crawl via the returned `broadcast::Receiver`.
-    pub async fn crawl(&self, url: Url) -> broadcast::Receiver<FeedCrawlerHandleMessage> {
+    pub async fn crawl(
+        &self,
+        feed_id: Uuid,
+    ) -> broadcast::Receiver<FeedCrawlerHandleMessage> {
         let (sender, receiver) = broadcast::channel(8);
         let msg = FeedCrawlerMessage::Crawl {
-            url,
+            feed_id,
             respond_to: sender,
         };
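
With the handle now keyed by `feed_id`, a caller kicks off a crawl and listens on the returned channel for the outcome. A sketch of that flow, assuming a `handle: FeedCrawlerHandle` and a `feed_id: Uuid` are already in scope; only the `Feed` variant of `FeedCrawlerHandleMessage` appears in this diff, so any other variants are ignored with a wildcard arm:

```rust
let mut receiver = handle.crawl(feed_id).await;
while let Ok(msg) = receiver.recv().await {
    match msg {
        // Sent once the actor has fetched, parsed, and saved the feed row.
        FeedCrawlerHandleMessage::Feed(Ok(feed)) => {
            println!("crawled {} ({})", feed.url, feed.feed_id);
            break;
        }
        FeedCrawlerHandleMessage::Feed(Err(err)) => {
            eprintln!("crawl failed: {}", err);
            break;
        }
        // Other handle messages (if any) are not relevant to this sketch.
        _ => {}
    }
}
```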

View File

@@ -1,15 +1,17 @@
 use anyhow::Result;
-use clap::{Args, Parser, Subcommand};
 use chrono::Utc;
+use clap::{Args, Parser, Subcommand};
 use dotenvy::dotenv;
+use lib::actors::feed_crawler::FeedCrawlerHandle;
+use lib::domain_locks::DomainLocks;
+use reqwest::Client;
 use sqlx::postgres::PgPoolOptions;
 use std::env;
 use tracing::info;
 use uuid::Uuid;
 
-use lib::jobs::crawl::crawl;
-use lib::models::feed::{create_feed, delete_feed, CreateFeed, FeedType};
-use lib::models::entry::{create_entry, delete_entry, CreateEntry};
+use lib::models::entry::{Entry, CreateEntry};
+use lib::models::feed::{CreateFeed, Feed, FeedType};
 use lib::uuid::Base62Uuid;
 
 /// CLI for crawlnicle
@@ -23,14 +25,20 @@ struct Cli {
 #[derive(Subcommand)]
 enum Commands {
-    /// Fetches new entries from all feeds in the database
-    Crawl,
+    Crawl(CrawlFeed),
     AddFeed(AddFeed),
     DeleteFeed(DeleteFeed),
     AddEntry(AddEntry),
     DeleteEntry(DeleteEntry),
 }
 
+#[derive(Args)]
+/// Crawl a feed (get new entries)
+struct CrawlFeed {
+    /// id of the feed to crawl
+    id: Uuid,
+}
+
 /// Add a feed to the database
 #[derive(Args)]
 struct AddFeed {
@@ -94,12 +102,11 @@ pub async fn main() -> Result<()> {
     match cli.commands {
         Commands::AddFeed(args) => {
-            let feed = create_feed(
+            let feed = Feed::create(
                 &pool,
                 CreateFeed {
                     title: args.title,
                     url: args.url,
-                    feed_type: args.feed_type,
                     description: args.description,
                 },
             )
@@ -107,11 +114,11 @@ pub async fn main() -> Result<()> {
             info!("Created feed with id {}", Base62Uuid::from(feed.feed_id));
         }
         Commands::DeleteFeed(args) => {
-            delete_feed(&pool, args.id).await?;
+            Feed::delete(&pool, args.id).await?;
             info!("Deleted feed with id {}", Base62Uuid::from(args.id));
         }
         Commands::AddEntry(args) => {
-            let entry = create_entry(
+            let entry = Entry::create(
                 &pool,
                 CreateEntry {
                     title: args.title,
@@ -125,12 +132,22 @@ pub async fn main() -> Result<()> {
             info!("Created entry with id {}", Base62Uuid::from(entry.entry_id));
         }
         Commands::DeleteEntry(args) => {
-            delete_entry(&pool, args.id).await?;
+            Entry::delete(&pool, args.id).await?;
             info!("Deleted entry with id {}", Base62Uuid::from(args.id));
         }
-        Commands::Crawl => {
-            info!("Crawling...");
-            crawl(&pool).await?;
+        Commands::Crawl(CrawlFeed { id }) => {
+            info!("Crawling feed {}...", Base62Uuid::from(id));
+            let client = Client::new();
+            // NOTE: this is not the same DomainLocks as the one used in the server so, if the
+            // server is running, it will *not* serialize same-domain requests with it.
+            let domain_locks = DomainLocks::new();
+            let feed_crawler = FeedCrawlerHandle::new(
+                pool.clone(),
+                client.clone(),
+                domain_locks.clone(),
+                env::var("CONTENT_DIR")?,
+            );
+            let _ = feed_crawler.crawl(id).await;
         }
     }

View File

@@ -2,8 +2,8 @@ use axum::{extract::State, Json};
 use sqlx::PgPool;
 
 use crate::error::Error;
-use crate::models::entry::{get_entries, Entry, GetEntriesOptions};
+use crate::models::entry::Entry;
 
 pub async fn get(State(pool): State<PgPool>) -> Result<Json<Vec<Entry>>, Error> {
-    Ok(Json(get_entries(&pool, GetEntriesOptions::default()).await?))
+    Ok(Json(Entry::get_all(&pool, Default::default()).await?))
 }

View File

@@ -5,19 +5,19 @@ use axum::{
 use sqlx::PgPool;
 
 use crate::error::Error;
-use crate::models::entry::{create_entry, get_entry, CreateEntry, Entry};
+use crate::models::entry::{CreateEntry, Entry};
 use crate::uuid::Base62Uuid;
 
 pub async fn get(
     State(pool): State<PgPool>,
     Path(id): Path<Base62Uuid>,
 ) -> Result<Json<Entry>, Error> {
-    Ok(Json(get_entry(&pool, id.as_uuid()).await?))
+    Ok(Json(Entry::get(&pool, id.as_uuid()).await?))
 }
 
 pub async fn post(
     State(pool): State<PgPool>,
     Json(payload): Json<CreateEntry>,
 ) -> Result<Json<Entry>, Error> {
-    Ok(Json(create_entry(&pool, payload).await?))
+    Ok(Json(Entry::create(&pool, payload).await?))
 }

View File

@@ -5,20 +5,20 @@ use axum::{
 use sqlx::PgPool;
 
 use crate::error::{Error, Result};
-use crate::models::feed::{create_feed, delete_feed, get_feed, CreateFeed, Feed};
+use crate::models::feed::{CreateFeed, Feed};
 use crate::uuid::Base62Uuid;
 
 pub async fn get(State(pool): State<PgPool>, Path(id): Path<Base62Uuid>) -> Result<Json<Feed>> {
-    Ok(Json(get_feed(&pool, id.as_uuid()).await?))
+    Ok(Json(Feed::get(&pool, id.as_uuid()).await?))
 }
 
 pub async fn post(
     State(pool): State<PgPool>,
     Json(payload): Json<CreateFeed>,
 ) -> Result<Json<Feed>, Error> {
-    Ok(Json(create_feed(&pool, payload).await?))
+    Ok(Json(Feed::create(&pool, payload).await?))
 }
 
 pub async fn delete(State(pool): State<PgPool>, Path(id): Path<Base62Uuid>) -> Result<()> {
-    delete_feed(&pool, id.as_uuid()).await
+    Feed::delete(&pool, id.as_uuid()).await
 }

View File

@@ -2,8 +2,9 @@ use axum::{extract::State, Json};
 use sqlx::PgPool;
 
 use crate::error::Error;
-use crate::models::feed::{get_feeds, Feed};
+use crate::models::feed::Feed;
 
 pub async fn get(State(pool): State<PgPool>) -> Result<Json<Vec<Feed>>, Error> {
-    Ok(Json(get_feeds(&pool).await?))
+    // TODO: pagination
+    Ok(Json(Feed::get_all(&pool).await?))
 }

View File

@@ -7,7 +7,7 @@ use sqlx::PgPool;
 use crate::config::Config;
 use crate::error::Result;
-use crate::models::entry::get_entry;
+use crate::models::entry::Entry;
 use crate::partials::layout::Layout;
 use crate::uuid::Base62Uuid;
@@ -17,7 +17,7 @@ pub async fn get(
     State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
-    let entry = get_entry(&pool, id.as_uuid()).await?;
+    let entry = Entry::get(&pool, id.as_uuid()).await?;
     let content_dir = std::path::Path::new(&config.content_dir);
     let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {

View File

@@ -20,8 +20,8 @@ use crate::actors::feed_crawler::{FeedCrawlerHandle, FeedCrawlerHandleMessage};
 use crate::config::Config;
 use crate::domain_locks::DomainLocks;
 use crate::error::{Error, Result};
-use crate::models::entry::get_entries_for_feed;
-use crate::models::feed::{create_feed, delete_feed, get_feed, CreateFeed, FeedType};
+use crate::models::entry::Entry;
+use crate::models::feed::{CreateFeed, Feed};
 use crate::partials::{entry_list::entry_list, feed_link::feed_link, layout::Layout};
 use crate::state::Crawls;
 use crate::turbo_stream::TurboStream;
@@ -32,8 +32,8 @@ pub async fn get(
     State(pool): State<PgPool>,
     layout: Layout,
 ) -> Result<Response> {
-    let feed = get_feed(&pool, id.as_uuid()).await?;
-    let entries = get_entries_for_feed(&pool, feed.feed_id, Default::default()).await?;
+    let feed = Feed::get(&pool, id.as_uuid()).await?;
+    let entries = Entry::get_all_for_feed(&pool, feed.feed_id, Default::default()).await?;
     let delete_url = format!("/feed/{}/delete", id);
     Ok(layout.render(html! {
         header class="feed-header" {
@@ -123,12 +123,11 @@ pub async fn post(
         config.content_dir.clone(),
     );
-    let feed = create_feed(
+    let feed = Feed::create(
         &pool,
         CreateFeed {
             title: add_feed.title,
             url: add_feed.url.clone(),
-            feed_type: FeedType::Rss, // eh, get rid of this
             description: add_feed.description,
         },
     )
@@ -148,7 +147,7 @@ pub async fn post(
     let url: Url = Url::parse(&add_feed.url)
         .map_err(|err| AddFeedError::InvalidUrl(add_feed.url.clone(), err))?;
-    let receiver = feed_crawler.crawl(url).await;
+    let receiver = feed_crawler.crawl(feed.feed_id).await;
     {
         let mut crawls = crawls.lock().map_err(|_| {
             AddFeedError::CreateFeedError(add_feed.url.clone(), Error::InternalServerError)
@@ -245,6 +244,6 @@ pub async fn stream(
 }
 
 pub async fn delete(State(pool): State<PgPool>, Path(id): Path<Base62Uuid>) -> Result<Redirect> {
-    delete_feed(&pool, id.as_uuid()).await?;
+    Feed::delete(&pool, id.as_uuid()).await?;
     Ok(Redirect::to("/feeds"))
 }

View File

@@ -4,11 +4,12 @@ use maud::html;
 use sqlx::PgPool;
 
 use crate::error::Result;
-use crate::models::feed::get_feeds;
+use crate::models::feed::Feed;
 use crate::partials::{feed_link::feed_link, layout::Layout};
 
 pub async fn get(State(pool): State<PgPool>, layout: Layout) -> Result<Response> {
-    let feeds = get_feeds(&pool).await?;
+    // TODO: pagination
+    let feeds = Feed::get_all(&pool).await?;
     Ok(layout.render(html! {
         h2 { "Feeds" }
         div class="feeds" {

View File

@@ -3,10 +3,10 @@ use axum::response::Response;
 use sqlx::PgPool;
 
 use crate::error::Result;
-use crate::models::entry::{get_entries, GetEntriesOptions};
+use crate::models::entry::Entry;
 use crate::partials::{layout::Layout, entry_list::entry_list};
 
 pub async fn get(State(pool): State<PgPool>, layout: Layout) -> Result<Response> {
-    let entries = get_entries(&pool, GetEntriesOptions::default()).await?;
+    let entries = Entry::get_all(&pool, Default::default()).await?;
     Ok(layout.render(entry_list(entries)))
 }

View File

@@ -1,77 +0,0 @@
-use std::fs;
-use std::env;
-use std::path::Path;
-
-use article_scraper::ArticleScraper;
-use chrono::Utc;
-use feed_rs::parser;
-use reqwest::{Client, Url};
-use sqlx::PgPool;
-use tracing::{info, info_span, warn};
-
-use crate::models::feed::get_feeds;
-use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
-use crate::uuid::Base62Uuid;
-
-/// DEPRECATED: Use FeedCrawler instead, keeping this for reference until I set up scheduled jobs.
-/// For every feed in the database, fetches the feed, parses it, and saves new entries to the
-/// database.
-pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
-    let scraper = ArticleScraper::new(None).await;
-    let client = Client::new();
-    let content_dir = env::var("CONTENT_DIR")?;
-    let content_dir = Path::new(&content_dir);
-    let feeds = get_feeds(pool).await?;
-    for feed in feeds {
-        let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
-        let feed_span = info_span!("feed", id = feed_id_str, url = feed.url.as_str());
-        let _feed_span_guard = feed_span.enter();
-        info!("Fetching feed");
-        // TODO: handle these results
-        let bytes = client.get(feed.url).send().await?.bytes().await?;
-        info!("Parsing feed");
-        let parsed_feed = parser::parse(&bytes[..])?;
-        let mut payload = Vec::with_capacity(parsed_feed.entries.len());
-        for entry in parsed_feed.entries {
-            let entry_span = info_span!("entry", id = entry.id, title = entry.title.clone().map(|t| t.content));
-            let _entry_span_guard = entry_span.enter();
-            if let Some(link) = entry.links.get(0) {
-                // if no scraped or feed date is available, fallback to the current time
-                let published_at = entry.published.unwrap_or_else(Utc::now);
-                let entry = CreateEntry {
-                    title: entry.title.map(|t| t.content),
-                    url: link.href.clone(),
-                    description: entry.summary.map(|s| s.content),
-                    feed_id: feed.feed_id,
-                    published_at,
-                };
-                payload.push(entry);
-            } else {
-                warn!("Skipping feed entry with no links");
-            }
-        }
-        let entries = upsert_entries(pool, payload).await?;
-        info!("Created {} entries", entries.len());
-
-        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
-        // doesn't implement Send so this isn't trivial.
-        for mut entry in entries {
-            info!("Fetching and parsing entry link: {}", entry.url);
-            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
-                let id = entry.entry_id;
-                if let Some(date) = article.date {
-                    // prefer scraped date over rss feed date
-                    entry.published_at = date;
-                    update_entry(pool, entry).await?;
-                };
-                let html_content = article.get_content();
-                if let Some(content) = html_content {
-                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
-                }
-            } else {
-                warn!("Failed to fetch article for entry: {:?}", &entry.url);
-            }
-        }
-    }
-    Ok(())
-}

View File

@@ -1 +0,0 @@
-pub mod crawl;

View File

@@ -3,7 +3,6 @@ pub mod config;
 pub mod domain_locks;
 pub mod error;
 pub mod handlers;
-pub mod jobs;
 pub mod log;
 pub mod models;
 pub mod partials;

View File

@@ -33,7 +33,14 @@ pub struct CreateEntry {
     pub published_at: DateTime<Utc>,
 }
 
-pub async fn get_entry(pool: &PgPool, entry_id: Uuid) -> Result<Entry> {
+#[derive(Default)]
+pub struct GetEntriesOptions {
+    pub published_before: Option<DateTime<Utc>>,
+    pub limit: Option<i64>,
+}
+
+impl Entry {
+    pub async fn get(pool: &PgPool, entry_id: Uuid) -> Result<Entry> {
     sqlx::query_as!(Entry, "select * from entry where entry_id = $1", entry_id)
         .fetch_one(pool)
         .await
@@ -45,13 +52,7 @@ pub async fn get_entry(pool: &PgPool, entry_id: Uuid) -> Result<Entry> {
         })
 }
 
-#[derive(Default)]
-pub struct GetEntriesOptions {
-    pub published_before: Option<DateTime<Utc>>,
-    pub limit: Option<i64>,
-}
-
-pub async fn get_entries(pool: &PgPool, options: GetEntriesOptions) -> sqlx::Result<Vec<Entry>> {
+    pub async fn get_all(pool: &PgPool, options: GetEntriesOptions) -> sqlx::Result<Vec<Entry>> {
     if let Some(published_before) = options.published_before {
         sqlx::query_as!(
             Entry,
@@ -81,7 +82,7 @@ pub async fn get_entries(pool: &PgPool, options: GetEntriesOptions) -> sqlx::Result<Vec<Entry>> {
     }
 }
 
-pub async fn get_entries_for_feed(
+    pub async fn get_all_for_feed(
     pool: &PgPool,
     feed_id: Uuid,
     options: GetEntriesOptions,
@@ -119,7 +120,7 @@ pub async fn get_entries_for_feed(
     }
 }
 
-pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
+    pub async fn create(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
     payload.validate()?;
     sqlx::query_as!(
         Entry,
@@ -146,7 +147,7 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
         })
 }
 
-pub async fn upsert_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
+    pub async fn upsert(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
     payload.validate()?;
     sqlx::query_as!(
         Entry,
@@ -177,7 +178,7 @@ pub async fn upsert_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
         })
 }
 
-pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
+    pub async fn bulk_create(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
@@ -218,7 +219,7 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
         })
 }
 
-pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
+    pub async fn bulk_upsert(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
@@ -263,7 +264,7 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
         })
 }
 
-pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    pub async fn update(pool: &PgPool, payload: Entry) -> Result<Entry> {
     sqlx::query_as!(
         Entry,
         "update entry set
@@ -294,7 +295,7 @@ pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
         })
 }
 
-pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
+    pub async fn delete(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!(
         "update entry set deleted_at = now() where entry_id = $1",
         entry_id
@@ -303,3 +304,4 @@ pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     .await?;
     Ok(())
 }
+}
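
With the free functions folded into `impl Entry`, listing goes through `GetEntriesOptions`. A usage sketch, assuming a `pool: PgPool` in scope and that `get_all` orders entries by `published_at` descending (which is what the `published_before` cursor implies):

```rust
// Page through entries 50 at a time, using published_before as a keyset cursor.
let mut published_before = None;
loop {
    let page = Entry::get_all(
        &pool,
        GetEntriesOptions {
            published_before,
            limit: Some(50),
        },
    )
    .await?;
    if page.is_empty() {
        break;
    }
    // The next page starts before the oldest entry seen so far.
    published_before = page.last().map(|entry| entry.published_at);
    for entry in &page {
        println!("{} {}", entry.published_at, entry.url);
    }
}
```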

View File

@@ -2,18 +2,22 @@ use std::str::FromStr;
 use chrono::{DateTime, Utc};
 use serde::{Deserialize, Serialize};
-use sqlx::PgPool;
+use sqlx::{FromRow, PgPool};
 use uuid::Uuid;
 use validator::Validate;
 
 use crate::error::{Error, Result};
 
-#[derive(Debug, Serialize, Deserialize, sqlx::Type, Clone)]
+#[derive(Debug, Serialize, Deserialize, sqlx::Type, Clone, Copy)]
 #[sqlx(type_name = "feed_type", rename_all = "lowercase")]
 #[serde(rename_all = "lowercase")]
 pub enum FeedType {
     Atom,
-    Rss,
+    JSON,
+    RSS0,
+    RSS1,
+    RSS2,
+    Unknown,
 }
 
 impl FromStr for FeedType {
@@ -21,7 +25,11 @@ impl FromStr for FeedType {
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         match s {
             "atom" => Ok(FeedType::Atom),
-            "rss" => Ok(FeedType::Rss),
+            "json" => Ok(FeedType::JSON),
+            "rss0" => Ok(FeedType::RSS0),
+            "rss1" => Ok(FeedType::RSS1),
+            "rss2" => Ok(FeedType::RSS2),
+            "unknown" => Ok(FeedType::Unknown),
             _ => Err(format!("invalid feed type: {}", s)),
         }
     }
@@ -31,13 +39,15 @@ impl From<feed_rs::model::FeedType> for FeedType {
     fn from(value: feed_rs::model::FeedType) -> Self {
         match value {
             feed_rs::model::FeedType::Atom => FeedType::Atom,
-            // TODO: this isn't really accurate
-            _ => FeedType::Rss,
+            feed_rs::model::FeedType::JSON => FeedType::JSON,
+            feed_rs::model::FeedType::RSS0 => FeedType::RSS0,
+            feed_rs::model::FeedType::RSS1 => FeedType::RSS1,
+            feed_rs::model::FeedType::RSS2 => FeedType::RSS2,
         }
     }
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
+#[derive(Debug, Serialize, Deserialize, Clone, FromRow)]
 pub struct Feed {
     pub feed_id: Uuid,
     pub title: Option<String>,
@@ -45,6 +55,10 @@ pub struct Feed {
     #[serde(rename = "type")]
     pub feed_type: FeedType,
     pub description: Option<String>,
+    pub crawl_interval_minutes: i32,
+    pub last_crawl_error: Option<String>,
+    pub last_crawled_at: Option<DateTime<Utc>>,
+    pub last_entry_published_at: Option<DateTime<Utc>>,
     pub created_at: DateTime<Utc>,
     pub updated_at: Option<DateTime<Utc>>,
     pub deleted_at: Option<DateTime<Utc>>,
@@ -56,13 +70,40 @@ pub struct CreateFeed {
     pub title: Option<String>,
     #[validate(url)]
     pub url: String,
-    #[serde(rename = "type")]
-    pub feed_type: FeedType,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
 }
 
-pub async fn get_feed(pool: &PgPool, feed_id: Uuid) -> Result<Feed> {
+#[derive(Debug, Deserialize, Validate)]
+pub struct UpsertFeed {
+    #[validate(length(max = 255))]
+    pub title: Option<String>,
+    #[validate(url)]
+    pub url: String,
+    pub feed_type: Option<FeedType>,
+    #[validate(length(max = 524288))]
+    pub description: Option<String>,
+    pub last_crawl_error: Option<String>,
+    pub last_crawled_at: Option<DateTime<Utc>>,
+}
+
+#[derive(Debug, Deserialize, Default, Validate)]
+pub struct UpdateFeed {
+    #[validate(length(max = 255))]
+    pub title: Option<Option<String>>,
+    #[validate(url)]
+    pub url: Option<String>,
+    pub feed_type: Option<FeedType>,
+    #[validate(length(max = 524288))]
+    pub description: Option<Option<String>>,
+    pub crawl_interval_minutes: Option<i32>,
+    pub last_crawl_error: Option<Option<String>>,
+    pub last_crawled_at: Option<Option<DateTime<Utc>>>,
+    pub last_entry_published_at: Option<Option<DateTime<Utc>>>,
+}
+
+impl Feed {
+    pub async fn get(pool: &PgPool, feed_id: Uuid) -> Result<Feed> {
     sqlx::query_as!(
         Feed,
         // Unable to SELECT * here due to https://github.com/launchbadge/sqlx/issues/1004
@@ -73,6 +114,10 @@ pub async fn get_feed(pool: &PgPool, feed_id: Uuid) -> Result<Feed> {
             url,
             type as "feed_type: FeedType",
             description,
+            crawl_interval_minutes,
+            last_crawl_error,
+            last_crawled_at,
+            last_entry_published_at,
             created_at,
             updated_at,
             deleted_at
@@ -89,7 +134,7 @@ pub async fn get_feed(pool: &PgPool, feed_id: Uuid) -> Result<Feed> {
         })
 }
 
-pub async fn get_feeds(pool: &PgPool) -> sqlx::Result<Vec<Feed>> {
+    pub async fn get_all(pool: &PgPool) -> sqlx::Result<Vec<Feed>> {
     sqlx::query_as!(
         Feed,
         r#"select
@@ -98,6 +143,10 @@ pub async fn get_feeds(pool: &PgPool) -> sqlx::Result<Vec<Feed>> {
             url,
             type as "feed_type: FeedType",
             description,
+            crawl_interval_minutes,
+            last_crawl_error,
+            last_crawled_at,
+            last_entry_published_at,
             created_at,
             updated_at,
             deleted_at
@@ -108,34 +157,37 @@ pub async fn get_feeds(pool: &PgPool) -> sqlx::Result<Vec<Feed>> {
     .await
 }
 
-pub async fn create_feed(pool: &PgPool, payload: CreateFeed) -> Result<Feed> {
+    pub async fn create(pool: &PgPool, payload: CreateFeed) -> Result<Feed> {
     payload.validate()?;
     Ok(sqlx::query_as!(
         Feed,
         r#"insert into feed (
-            title, url, type, description
+            title, url, description
         ) values (
-            $1, $2, $3, $4
+            $1, $2, $3
        ) returning
            feed_id,
            title,
            url,
            type as "feed_type: FeedType",
            description,
+            crawl_interval_minutes,
+            last_crawl_error,
+            last_crawled_at,
+            last_entry_published_at,
            created_at,
            updated_at,
            deleted_at
        "#,
        payload.title,
        payload.url,
-        payload.feed_type as FeedType,
        payload.description
    )
    .fetch_one(pool)
    .await?)
 }
 
-pub async fn upsert_feed(pool: &PgPool, payload: CreateFeed) -> Result<Feed> {
+    pub async fn upsert(pool: &PgPool, payload: UpsertFeed) -> Result<Feed> {
     payload.validate()?;
     Ok(sqlx::query_as!(
         Feed,
@@ -146,7 +198,7 @@ pub async fn upsert_feed(pool: &PgPool, payload: CreateFeed) -> Result<Feed> {
         ) on conflict (url) do update set
             title = excluded.title,
             url = excluded.url,
-            type = excluded.type,
+            type = COALESCE(excluded.type, feed.type),
             description = excluded.description
         returning
             feed_id,
@@ -154,20 +206,67 @@
            url,
            type as "feed_type: FeedType",
            description,
+            crawl_interval_minutes,
+            last_crawl_error,
+            last_crawled_at,
+            last_entry_published_at,
            created_at,
            updated_at,
            deleted_at
        "#,
        payload.title,
        payload.url,
-        payload.feed_type as FeedType,
+        payload.feed_type as Option<FeedType>,
        payload.description
    )
    .fetch_one(pool)
    .await?)
 }
 
-pub async fn delete_feed(pool: &PgPool, feed_id: Uuid) -> Result<()> {
+    pub async fn update(pool: &PgPool, feed_id: Uuid, payload: UpdateFeed) -> Result<Feed> {
+        payload.validate()?;
+        let mut query = sqlx::QueryBuilder::new("UPDATE feed SET ");
+        let mut updates = query.separated(", ");
+        if let Some(title) = payload.title {
+            updates.push_unseparated("title = ");
+            updates.push_bind(title);
+        }
+        if let Some(url) = payload.url {
+            updates.push_unseparated("url = ");
+            updates.push_bind(url);
+        }
+        if let Some(description) = payload.description {
+            updates.push_unseparated("description = ");
+            updates.push_bind(description);
+        }
+        if let Some(crawl_interval_minutes) = payload.crawl_interval_minutes {
+            updates.push("crawl_interval_minutes = ");
+            updates.push_bind(crawl_interval_minutes);
+        }
+        if let Some(last_crawl_error) = payload.last_crawl_error {
+            updates.push_unseparated("last_crawl_error = ");
+            updates.push_bind(last_crawl_error);
+        }
+        if let Some(last_crawled_at) = payload.last_crawled_at {
+            updates.push_unseparated("last_crawled_at = ");
+            updates.push_bind(last_crawled_at);
+        }
+        if let Some(last_entry_published_at) = payload.last_entry_published_at {
+            updates.push_unseparated("last_entry_published_at = ");
+            updates.push_bind(last_entry_published_at);
+        }
+        query.push(" WHERE id = ");
+        query.push_bind(feed_id);
+        query.push(" RETURNING *");
+        let query = query.build_query_as();
+        Ok(query.fetch_one(pool).await?)
+    }
+
+    pub async fn delete(pool: &PgPool, feed_id: Uuid) -> Result<()> {
     sqlx::query!(
         "update feed set deleted_at = now() where feed_id = $1",
         feed_id
@@ -176,3 +275,45 @@ pub async fn delete_feed(pool: &PgPool, feed_id: Uuid) -> Result<()> {
     .await?;
     Ok(())
 }
+
+    pub async fn save(&self, pool: &PgPool) -> Result<Feed> {
+        Ok(sqlx::query_as!(
+            Feed,
+            r#"update feed set
+                title = $2,
+                url = $3,
+                type = $4,
+                description = $5,
+                crawl_interval_minutes = $6,
+                last_crawl_error = $7,
+                last_crawled_at = $8,
+                last_entry_published_at = $9
+            where feed_id = $1
+            returning
+                feed_id,
+                title,
+                url,
+                type as "feed_type: FeedType",
+                description,
+                crawl_interval_minutes,
+                last_crawl_error,
+                last_crawled_at,
+                last_entry_published_at,
+                created_at,
+                updated_at,
+                deleted_at
+            "#,
+            self.feed_id,
+            self.title,
+            self.url,
+            self.feed_type as FeedType,
+            self.description,
+            self.crawl_interval_minutes,
+            self.last_crawl_error,
+            self.last_crawled_at,
+            self.last_entry_published_at,
+        )
+        .fetch_one(pool)
+        .await?)
+    }
+}
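
`UpdateFeed` uses the nested `Option<Option<_>>` pattern: the outer `Option` marks whether a column should change at all, and the inner one carries the new (possibly NULL) value. A sketch of recording crawl metadata through `Feed::update`, assuming `pool`, `feed_id`, and `chrono::Utc` are in scope:

```rust
// Record a successful crawl: set last_crawled_at and clear any previous error,
// leaving every other column untouched (their outer Option stays None).
let feed = Feed::update(
    &pool,
    feed_id,
    UpdateFeed {
        last_crawled_at: Some(Some(Utc::now())),
        last_crawl_error: Some(None),
        ..Default::default()
    },
)
.await?;
```

The crawler itself takes the other route: it loads a `Feed`, mutates the fields in place, and calls `save`, which writes every column back in one statement.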

View File

@@ -9,7 +9,7 @@ const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 ///
 /// Database rows have a UUID primary key, but they are encoded in Base62 to be shorter and more
 /// URL-friendly for the frontend.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Clone, Copy)]
 pub struct Base62Uuid(
     #[serde(deserialize_with = "uuid_from_base62_str")]
     #[serde(serialize_with = "uuid_to_base62_str")]