Store entry HTML content outside DB in file storage

The HTML content can get quite big and can include embedded images, so store it on disk instead of in the database.
Tyler Hallada 2023-07-05 23:45:49 -04:00
parent 7289151318
commit 3f028c3088
10 changed files with 81 additions and 37 deletions
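
At a glance, the crawler now writes each scraped article to `<CONTENT_DIR>/<entry_id>.html` on disk, and the entry page handler reads that file back at render time, falling back to "No content" when it is missing. Below is a minimal sketch of that layout, assuming the `CONTENT_DIR` setting added in this commit; the helper names and the `main` driver are illustrative, not functions from the diff:

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Write one entry's scraped HTML to `<content_dir>/<entry_id>.html`.
fn write_entry_html(content_dir: &Path, entry_id: &str, html: &str) -> io::Result<()> {
    fs::write(content_dir.join(format!("{}.html", entry_id)), html)
}

/// Read it back for rendering; a missing file degrades to a placeholder.
fn read_entry_html(content_dir: &Path, entry_id: &str) -> String {
    fs::read_to_string(content_dir.join(format!("{}.html", entry_id)))
        .unwrap_or_else(|_| "No content".to_string())
}

fn main() -> io::Result<()> {
    // In the real code the directory comes from the CONTENT_DIR config/env value
    // and the id is the entry's UUID; plain strings keep the sketch self-contained.
    let dir = Path::new("./content");
    fs::create_dir_all(dir)?;
    write_entry_html(dir, "example-entry-id", "<p>Hello</p>")?;
    println!("{}", read_entry_html(dir, "example-entry-id"));
    Ok(())
}
```

This keeps database rows small while the bulky HTML, which may embed images, lives in ordinary files keyed by entry UUID.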

.gitignore

@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content


@@ -55,6 +55,7 @@ builds
 DATABASE_MAX_CONNECTIONS=5
 TITLE=crawlnicle
 MAX_MEM_LOG_SIZE=1000000
+CONTENT_DIR=./content
 ```
 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database


@@ -39,6 +39,7 @@ watch-backend:
 --ignore 'logs/*' \
 --ignore 'static/*' \
 --ignore 'frontend/*' \
+--ignore 'content/*' \
 --no-vcs-ignores \
 -x run


@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;
 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
 -- over things like usernames and emails, ithout needing to remember to do case-conversion.
 create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
 create type feed_type as enum ('atom', 'rss');
 create table if not exists "feed" (
 feed_id uuid primary key default uuid_generate_v1mc(),
 title text,
 url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
 title text,
 url varchar(2048) not null,
 description text,
-html_content text,
 feed_id uuid not null references "feed" (feed_id) on delete cascade,
 published_at timestamptz not null,
 created_at timestamptz not null default now(),


@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
 title: args.title,
 url: args.url,
 description: args.description,
-html_content: None,
 feed_id: args.feed_id,
 published_at: Utc::now(),
 },


@@ -14,4 +14,6 @@ pub struct Config {
 pub title: String,
 #[clap(long, env)]
 pub max_mem_log_size: usize,
+#[clap(long, env)]
+pub content_dir: String,
 }


@@ -1,8 +1,11 @@
+use std::fs;
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;
+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
 Path(id): Path<Base62Uuid>,
 State(pool): State<PgPool>,
+State(config): State<Config>,
 layout: Layout,
 ) -> Result<Response> {
 let entry = get_entry(&pool, id.as_uuid()).await?;
+let content_dir = std::path::Path::new(&config.content_dir);
+let content_path = content_dir.join(format!("{}.html", entry.entry_id));
 Ok(layout.render(html! {
 article {
 @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
 (published_at)
 }
 }
-@let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+@let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
 (PreEscaped(content))
 }
 }))


@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};
 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;
 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
 let scraper = ArticleScraper::new(None).await;
 let client = Client::new();
+let content_dir = env::var("CONTENT_DIR")?;
+let content_dir = Path::new(&content_dir);
 let feeds = get_feeds(pool).await?;
 for feed in feeds {
 let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
 if let Some(link) = entry.links.get(0) {
 // if no scraped or feed date is available, fallback to the current time
 let published_at = entry.published.unwrap_or_else(Utc::now);
-let mut entry = CreateEntry {
+let entry = CreateEntry {
 title: entry.title.map(|t| t.content),
 url: link.href.clone(),
 description: entry.summary.map(|s| s.content),
-html_content: None,
 feed_id: feed.feed_id,
 published_at,
 };
-info!("Fetching and parsing entry link: {}", link.href);
-if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-if let Some(date) = article.date {
-// prefer scraped date over rss feed date
-entry.published_at = date;
-};
-entry.html_content = article.get_content();
-} else {
-warn!("Failed to fetch article for entry: {:?}", link);
-}
 payload.push(entry);
 } else {
 warn!("Skipping feed entry with no links");
@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
 }
 let entries = upsert_entries(pool, payload).await?;
 info!("Created {} entries", entries.len());
+// TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+// doesn't implement Send so this isn't trivial.
+for mut entry in entries {
+info!("Fetching and parsing entry link: {}", entry.url);
+if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+let id = entry.entry_id;
+if let Some(date) = article.date {
+// prefer scraped date over rss feed date
+entry.published_at = date;
+update_entry(pool, entry).await?;
+};
+let html_content = article.get_content();
+if let Some(content) = html_content {
+fs::write(content_dir.join(format!("{}.html", id)), content)?;
+}
+} else {
+warn!("Failed to fetch article for entry: {:?}", &entry.url);
+}
+}
 }
 Ok(())
 }


@@ -14,7 +14,6 @@ pub struct Entry {
 pub title: Option<String>,
 pub url: String,
 pub description: Option<String>,
-pub html_content: Option<String>,
 pub feed_id: Uuid,
 pub published_at: DateTime<Utc>,
 pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
 pub url: String,
 #[validate(length(max = 524288))]
 pub description: Option<String>,
-pub html_content: Option<String>,
 pub feed_id: Uuid,
 pub published_at: DateTime<Utc>,
 }
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
 sqlx::query_as!(
 Entry,
 "insert into entry (
-title, url, description, html_content, feed_id, published_at
+title, url, description, feed_id, published_at
 ) values (
-$1, $2, $3, $4, $5, $6
+$1, $2, $3, $4, $5
 ) returning *",
 payload.title,
 payload.url,
 payload.description,
-payload.html_content,
 payload.feed_id,
 payload.published_at,
 )
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 let mut titles = Vec::with_capacity(payload.len());
 let mut urls = Vec::with_capacity(payload.len());
 let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
 let mut feed_ids = Vec::with_capacity(payload.len());
 let mut published_ats = Vec::with_capacity(payload.len());
 payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 titles.push(entry.title.clone());
 urls.push(entry.url.clone());
 descriptions.push(entry.description.clone());
-html_contents.push(entry.html_content.clone());
 feed_ids.push(entry.feed_id);
 published_ats.push(entry.published_at);
 entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 sqlx::query_as!(
 Entry,
 "insert into entry (
-title, url, description, html_content, feed_id, published_at
-) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+title, url, description, feed_id, published_at
+) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
 returning *",
 titles.as_slice() as &[Option<String>],
 urls.as_slice(),
 descriptions.as_slice() as &[Option<String>],
-html_contents.as_slice() as &[Option<String>],
 feed_ids.as_slice(),
 published_ats.as_slice(),
 )
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 let mut titles = Vec::with_capacity(payload.len());
 let mut urls = Vec::with_capacity(payload.len());
 let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
 let mut feed_ids = Vec::with_capacity(payload.len());
 let mut published_ats = Vec::with_capacity(payload.len());
 payload
@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 titles.push(entry.title.clone());
 urls.push(entry.url.clone());
 descriptions.push(entry.description.clone());
-html_contents.push(entry.html_content.clone());
 feed_ids.push(entry.feed_id);
 published_ats.push(entry.published_at);
 entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 sqlx::query_as!(
 Entry,
 "insert into entry (
-title, url, description, html_content, feed_id, published_at
-) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+title, url, description, feed_id, published_at
+) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
 on conflict do nothing
 returning *",
 titles.as_slice() as &[Option<String>],
 urls.as_slice(),
 descriptions.as_slice() as &[Option<String>],
-html_contents.as_slice() as &[Option<String>],
 feed_ids.as_slice(),
 published_ats.as_slice(),
 )
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
 })
 }
+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+sqlx::query_as!(
+Entry,
+"update entry set
+title = $2,
+url = $3,
+description = $4,
+feed_id = $5,
+published_at = $6
+where entry_id = $1
+returning *
+",
+payload.entry_id,
+payload.title,
+payload.url,
+payload.description,
+payload.feed_id,
+payload.published_at,
+)
+.fetch_one(pool)
+.await
+.map_err(|error| {
+if let sqlx::error::Error::Database(ref psql_error) = error {
+if psql_error.code().as_deref() == Some("23503") {
+return Error::RelationNotFound("feed");
+}
+}
+Error::Sqlx(error)
+})
+}
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
 sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
 .execute(pool)


@@ -1,7 +1,5 @@
 use url::Url;
-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 pub fn get_domain(url: &str) -> Option<String> {
 Url::parse(url)
 .ok()