Fetch and save entry HTML content with metadata

Also render the extracted HTML on the entry page in the frontend.
2023-06-07 01:06:03 -04:00
parent 786f3a194f
commit 3f29138bd1
7 changed files with 516 additions and 12 deletions
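
Only four of the seven changed files are shown below; the schema change itself is not in the excerpt. Given the new html_content field on Entry and the extra column in the insert statements, the accompanying migration presumably adds a nullable text column to entries, roughly like this (the exact column type is an assumption):

    -- hypothetical migration, not shown in this diff
    ALTER TABLE entries ADD COLUMN html_content TEXT;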

View File

@@ -98,8 +98,6 @@ pub async fn main() -> Result<()> {
let args: Args = argh::from_env();
info!("hello?");
match args.commands {
Commands::AddFeed(args) => {
let feed = create_feed(
@@ -125,6 +123,7 @@ pub async fn main() -> Result<()> {
title: args.title,
url: args.url,
description: args.description,
html_content: None,
feed_id: args.feed_id,
},
)

View File

@@ -1,6 +1,6 @@
use axum::extract::{State, Path};
use axum::response::Response;
use maud::html;
use maud::{html, PreEscaped};
use sqlx::PgPool;
use crate::error::Result;
@@ -12,7 +12,7 @@ pub async fn get(Path(id): Path<i32>, State(pool): State<PgPool>, layout: Layout
Ok(layout.render(html! {
@let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
h1 { a href=(entry.url) { (title) } }
@let description = entry.description.unwrap_or_else(|| "No description".to_string());
p { (description) }
@let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
(PreEscaped(content))
}))
}
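
maud escapes interpolated strings by default, so the stored article HTML has to be spliced in with PreEscaped; the template therefore trusts whatever the scraper saved. A minimal sketch of the difference outside the handler (render_stored is a hypothetical helper):

    use maud::{html, PreEscaped};

    fn render_stored(stored: String) -> String {
        html! {
            // (stored) alone would be HTML-escaped by maud;
            // PreEscaped splices the string into the output verbatim.
            div { (PreEscaped(stored)) }
        }
        .into_string()
    }

    // render_stored("<p>Hi <em>there</em></p>".into())
    //   => "<div><p>Hi <em>there</em></p></div>"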

View File

@@ -1,5 +1,6 @@
use article_scraper::ArticleScraper;
use feed_rs::parser;
use reqwest::Client;
use reqwest::{Client, Url};
use sqlx::PgPool;
use tracing::{info, warn};
@@ -9,18 +10,23 @@ use crate::models::entry::{upsert_entries, CreateEntry};
/// For every feed in the database, fetches the feed, parses it, and saves new entries to the
/// database.
pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
let scraper = ArticleScraper::new(None).await;
let client = Client::new();
let feeds = get_feeds(pool).await?;
for feed in feeds {
info!("Fetching feed {}: {}", feed.id, feed.url);
let bytes = client.get(feed.url).send().await?.bytes().await?;
let parsed_feed = parser::parse(&bytes[..])?;
let mut payload = Vec::with_capacity(parsed_feed.entries.len());
for entry in parsed_feed.entries {
if let Some(link) = entry.links.get(0) {
info!("Fetching entry article: {}", link.href);
let article = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await?;
let entry = CreateEntry {
title: entry.title.map(|t| t.content),
url: link.href.clone(),
description: entry.summary.map(|s| s.content),
html_content: article.get_content(),
feed_id: feed.id,
};
payload.push(entry);
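
For reference, a minimal sketch of the scraping step in isolation, mirroring the calls in the loop above (scrape_one is a hypothetical helper, not part of this commit):

    use article_scraper::ArticleScraper;
    use reqwest::{Client, Url};

    // Parse the entry URL, run the scraper against it, and return the
    // extracted HTML; get_content() yields None when nothing was extracted.
    async fn scrape_one(url: &str) -> anyhow::Result<Option<String>> {
        let scraper = ArticleScraper::new(None).await;
        let client = Client::new();
        let article = scraper.parse(&Url::parse(url)?, true, &client, None).await?;
        Ok(article.get_content())
    }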

View File

@@ -11,6 +11,7 @@ pub struct Entry {
pub title: Option<String>,
pub url: String,
pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: i32,
pub created_at: NaiveDateTime,
pub updated_at: NaiveDateTime,
@@ -25,6 +26,7 @@ pub struct CreateEntry {
pub url: String,
#[validate(length(max = 524288))]
pub description: Option<String>,
pub html_content: Option<String>,
#[validate(range(min = 1))]
pub feed_id: i32,
}
@@ -52,13 +54,14 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
sqlx::query_as!(
Entry,
"INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at
title, url, description, html_content, feed_id, created_at, updated_at
) VALUES (
$1, $2, $3, $4, now(), now()
$1, $2, $3, $4, $5, now(), now()
) RETURNING *",
payload.title,
payload.url,
payload.description,
payload.html_content,
payload.feed_id,
)
.fetch_one(pool)
@@ -77,23 +80,26 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
payload.iter().map(|entry| {
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
entry.validate()
}).collect::<Result<Vec<()>, ValidationErrors>>()?;
sqlx::query_as!(
Entry,
"INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::int[])
title, url, description, html_content, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[], $5::int[])
RETURNING *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
)
.fetch_all(pool)
@@ -112,24 +118,27 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
payload.iter().map(|entry| {
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
entry.validate()
}).collect::<Result<Vec<()>, ValidationErrors>>()?;
sqlx::query_as!(
Entry,
"INSERT INTO entries (
title, url, description, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::int[])
title, url, description, html_content, feed_id, created_at, updated_at
) SELECT *, now(), now() FROM UNNEST($1::text[], $2::text[], $3::text[], $4::text[], $5::int[])
ON CONFLICT DO NOTHING
RETURNING *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
)
.fetch_all(pool)
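
In Postgres, UNNEST over several equal-length arrays zips them row-wise, so the parallel title/url/description/html_content/feed_id vectors become a single multi-row INSERT. A hypothetical call site at the end of the crawl loop (the actual call falls outside the hunks shown above):

    // Batched upsert of everything collected for this feed; ON CONFLICT
    // DO NOTHING silently skips entries that already exist.
    let entries = upsert_entries(pool, payload).await?;
    info!("Upserted {} entries for feed {}", entries.len(), feed.id);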