From 3f028c30880eff71c5d5688eda3ffcba687c7075 Mon Sep 17 00:00:00 2001
From: Tyler Hallada
Date: Wed, 5 Jul 2023 23:45:49 -0400
Subject: [PATCH] Store entry html content outside DB in file storage

Since the HTML content can get quite big and can have embedded images.
---
 .gitignore                            |  1 +
 README.md                             |  1 +
 justfile                              |  1 +
 migrations/20230507201612_initial.sql |  9 +++--
 src/bin/cli.rs                        |  1 -
 src/config.rs                         |  2 ++
 src/handlers/entry.rs                 |  8 ++++-
 src/jobs/crawl.rs                     | 41 ++++++++++++++-------
 src/models/entry.rs                   | 52 +++++++++++++++++++--------
 src/utils.rs                          |  2 --
 10 files changed, 81 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index a72e8eb..eac289a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content
diff --git a/README.md b/README.md
index 49c924a..2cd5be4 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,7 @@ builds
     DATABASE_MAX_CONNECTIONS=5
     TITLE=crawlnicle
     MAX_MEM_LOG_SIZE=1000000
+    CONTENT_DIR=./content
     ```

 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database
diff --git a/justfile b/justfile
index 01febf0..85e3ee2 100755
--- a/justfile
+++ b/justfile
@@ -39,6 +39,7 @@ watch-backend:
         --ignore 'logs/*' \
         --ignore 'static/*' \
         --ignore 'frontend/*' \
+        --ignore 'content/*' \
         --no-vcs-ignores \
         -x run
diff --git a/migrations/20230507201612_initial.sql b/migrations/20230507201612_initial.sql
index c3a93dc..c0b9e80 100644
--- a/migrations/20230507201612_initial.sql
+++ b/migrations/20230507201612_initial.sql
@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;

 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
- -- over things like usernames and emails, ithout needing to remember to do case-conversion.
- create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
+-- over things like usernames and emails, ithout needing to remember to do case-conversion.
+create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);

- create type feed_type as enum ('atom', 'rss');
+create type feed_type as enum ('atom', 'rss');

- create table if not exists "feed" (
+create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
     title text,
     url varchar(2048) not null,
     description text,
-    html_content text,
     feed_id uuid not null references "feed" (feed_id) on delete cascade,
     published_at timestamptz not null,
     created_at timestamptz not null default now(),
diff --git a/src/bin/cli.rs b/src/bin/cli.rs
index 0c685a5..eefad14 100644
--- a/src/bin/cli.rs
+++ b/src/bin/cli.rs
@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
                     title: args.title,
                     url: args.url,
                     description: args.description,
-                    html_content: None,
                     feed_id: args.feed_id,
                     published_at: Utc::now(),
                 },
diff --git a/src/config.rs b/src/config.rs
index 680d777..8f55037 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -14,4 +14,6 @@ pub struct Config {
     pub title: String,
     #[clap(long, env)]
     pub max_mem_log_size: usize,
+    #[clap(long, env)]
+    pub content_dir: String,
 }
diff --git a/src/handlers/entry.rs b/src/handlers/entry.rs
index e1e7dde..a2edf97 100644
--- a/src/handlers/entry.rs
+++ b/src/handlers/entry.rs
@@ -1,8 +1,11 @@
+use std::fs;
+
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;

+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
     Path(id): Path<Base62Uuid>,
     State(pool): State<PgPool>,
+    State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
     let entry = get_entry(&pool, id.as_uuid()).await?;
+    let content_dir = std::path::Path::new(&config.content_dir);
+    let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {
         article {
             @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
                     (published_at)
                 }
             }
-            @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+            @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
             (PreEscaped(content))
         }
     }))
diff --git a/src/jobs/crawl.rs b/src/jobs/crawl.rs
index 1c3d36b..cd7ff9c 100644
--- a/src/jobs/crawl.rs
+++ b/src/jobs/crawl.rs
@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
+
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};

 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;

 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
     let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
+    let content_dir = env::var("CONTENT_DIR")?;
+    let content_dir = Path::new(&content_dir);
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
             if let Some(link) = entry.links.get(0) {
                 // if no scraped or feed date is available, fallback to the current time
                 let published_at = entry.published.unwrap_or_else(Utc::now);
-                let mut entry = CreateEntry {
+                let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
-                    html_content: None,
                     feed_id: feed.feed_id,
                     published_at,
                 };
-                info!("Fetching and parsing entry link: {}", link.href);
-                if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-                    if let Some(date) = article.date {
-                        // prefer scraped date over rss feed date
-                        entry.published_at = date;
-                    };
-                    entry.html_content = article.get_content();
-                } else {
-                    warn!("Failed to fetch article for entry: {:?}", link);
-                }
                 payload.push(entry);
             } else {
                 warn!("Skipping feed entry with no links");
             }
         }
         let entries = upsert_entries(pool, payload).await?;
         info!("Created {} entries", entries.len());
+
+        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+        // doesn't implement Send so this isn't trivial.
+        for mut entry in entries {
+            info!("Fetching and parsing entry link: {}", entry.url);
+            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+                let id = entry.entry_id;
+                if let Some(date) = article.date {
+                    // prefer scraped date over rss feed date
+                    entry.published_at = date;
+                    update_entry(pool, entry).await?;
+                };
+                let html_content = article.get_content();
+                if let Some(content) = html_content {
+                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
+                }
+            } else {
+                warn!("Failed to fetch article for entry: {:?}", &entry.url);
+            }
+        }
     }
     Ok(())
 }
diff --git a/src/models/entry.rs b/src/models/entry.rs
index 44425fd..7f6dbdb 100644
--- a/src/models/entry.rs
+++ b/src/models/entry.rs
@@ -14,7 +14,6 @@ pub struct Entry {
     pub title: Option<String>,
     pub url: String,
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
     pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
     pub url: String,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
 }
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
+            title, url, description, feed_id, published_at
         ) values (
-            $1, $2, $3, $4, $5, $6
+            $1, $2, $3, $4, $5
         ) returning *",
         payload.title,
         payload.url,
         payload.description,
-        payload.html_content,
         payload.feed_id,
         payload.published_at,
     )
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
         .iter()
         .map(|entry| {
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         on conflict do nothing
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     })
 }

+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    sqlx::query_as!(
+        Entry,
+        "update entry set
+            title = $2,
+            url = $3,
+            description = $4,
+            feed_id = $5,
+            published_at = $6
+        where entry_id = $1
+        returning *
+        ",
+        payload.entry_id,
+        payload.title,
+        payload.url,
+        payload.description,
+        payload.feed_id,
+        payload.published_at,
+    )
+    .fetch_one(pool)
+    .await
+    .map_err(|error| {
+        if let sqlx::error::Error::Database(ref psql_error) = error {
+            if psql_error.code().as_deref() == Some("23503") {
+                return Error::RelationNotFound("feed");
+            }
+        }
+        Error::Sqlx(error)
+    })
+}
+
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
         .execute(pool)
diff --git a/src/utils.rs b/src/utils.rs
index b5f03b4..dca8b1e 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,7 +1,5 @@
 use url::Url;

-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
 pub fn get_domain(url: &str) -> Option<String> {
     Url::parse(url)
         .ok()
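A possible follow-up, not part of the patch above: the `{entry_id}.html` naming convention now appears in both src/handlers/entry.rs and src/jobs/crawl.rs, and nothing in the diff creates the content directory itself before `fs::write` runs. A small shared module along the lines of the sketch below could keep the two call sites in sync; `content_path` and `ensure_content_dir` are hypothetical names, not functions from this repository.

```rust
// Sketch only: hypothetical helpers, not part of the patch above.
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

use uuid::Uuid;

/// Build the on-disk path for an entry's scraped HTML,
/// e.g. `<CONTENT_DIR>/<entry_id>.html`.
pub fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

/// Create the content directory (and any missing parents) so that writes
/// from the crawl job cannot fail on a missing directory.
pub fn ensure_content_dir(content_dir: &Path) -> io::Result<()> {
    fs::create_dir_all(content_dir)
}
```

The handler's `fs::read_to_string(content_path)` and the crawl job's `fs::write(content_dir.join(...))` could then both go through `content_path`, and `ensure_content_dir` could run once at startup, before the first crawl.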