Store entry html content outside DB in file storage

The HTML content can get quite big and can include embedded images.
Tyler Hallada 2023-07-05 23:45:49 -04:00
parent 7289151318
commit 3f028c3088
10 changed files with 81 additions and 37 deletions
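
Concretely, each entry's scraped HTML now lives in a flat file named `{entry_id}.html` inside the directory configured by `CONTENT_DIR`, instead of in the `entry.html_content` column. A minimal sketch of that convention (the helper names here are illustrative, not from the codebase):

```rust
use std::{fs, io, path::{Path, PathBuf}};
use uuid::Uuid;

// Illustrative helpers mirroring the layout this commit introduces:
// one file per entry at {CONTENT_DIR}/{entry_id}.html.
fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

fn write_entry_html(content_dir: &Path, entry_id: Uuid, html: &str) -> io::Result<()> {
    fs::write(content_path(content_dir, entry_id), html)
}

fn read_entry_html(content_dir: &Path, entry_id: Uuid) -> io::Result<String> {
    fs::read_to_string(content_path(content_dir, entry_id))
}
```

In the diff below, the crawl job writes these files after upserting entries and the entry handler reads them back when rendering, so the database row keeps only metadata.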

.gitignore
View File

@@ -4,3 +4,4 @@
/static/js/*
/static/css/*
.frontend-built
/content

View File

@@ -55,6 +55,7 @@ builds
DATABASE_MAX_CONNECTIONS=5
TITLE=crawlnicle
MAX_MEM_LOG_SIZE=1000000
CONTENT_DIR=./content
```
1. Run `just migrate` (or `sqlx migrate run`) which will run all the database

View File

@@ -39,6 +39,7 @@ watch-backend:
--ignore 'logs/*' \
--ignore 'static/*' \
--ignore 'frontend/*' \
--ignore 'content/*' \
--no-vcs-ignores \
-x run

View File

@@ -28,12 +28,12 @@ end;
$$ language plpgsql;
-- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
-- over things like usernames and emails, without needing to remember to do case-conversion.
create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
create type feed_type as enum ('atom', 'rss');
create table if not exists "feed" (
feed_id uuid primary key default uuid_generate_v1mc(),
title text,
url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
title text,
url varchar(2048) not null,
description text,
html_content text,
feed_id uuid not null references "feed" (feed_id) on delete cascade,
published_at timestamptz not null,
created_at timestamptz not null default now(),

View File

@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
title: args.title,
url: args.url,
description: args.description,
html_content: None,
feed_id: args.feed_id,
published_at: Utc::now(),
},

View File

@@ -14,4 +14,6 @@ pub struct Config {
pub title: String,
#[clap(long, env)]
pub max_mem_log_size: usize,
#[clap(long, env)]
pub content_dir: String,
}
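
The new `content_dir` setting follows the same clap pattern as the other fields: it can be passed as a `--content-dir` flag or read from the `CONTENT_DIR` environment variable added to the setup instructions above. A reduced sketch of just that field, assuming clap's derive and env features are enabled:

```rust
use clap::Parser;

/// Reduced sketch of the Config struct; only the new field is shown here.
#[derive(Parser, Clone, Debug)]
struct Config {
    /// Directory where entry HTML files are stored, e.g. ./content
    #[clap(long, env)]
    content_dir: String,
}

fn main() {
    // Reads --content-dir or, failing that, the CONTENT_DIR environment variable.
    let config = Config::parse();
    println!("content_dir = {}", config.content_dir);
}
```

The handler change below extracts this with `State(config): State<Config>`, which implies `Config` is cloneable and retrievable from the router's state (e.g. via `FromRef`).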

View File

@@ -1,8 +1,11 @@
use std::fs;
use axum::extract::{Path, State};
use axum::response::Response;
use maud::{html, PreEscaped};
use sqlx::PgPool;
use crate::config::Config;
use crate::error::Result;
use crate::models::entry::get_entry;
use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
pub async fn get(
Path(id): Path<Base62Uuid>,
State(pool): State<PgPool>,
State(config): State<Config>,
layout: Layout,
) -> Result<Response> {
let entry = get_entry(&pool, id.as_uuid()).await?;
let content_dir = std::path::Path::new(&config.content_dir);
let content_path = content_dir.join(format!("{}.html", entry.entry_id));
Ok(layout.render(html! {
article {
@let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
(published_at)
}
}
@let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
@let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
(PreEscaped(content))
}
}))
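
One design note on the handler above: `fs::read_to_string` is synchronous, so the file read briefly blocks the async executor. If that ever becomes a concern, a non-blocking variant is straightforward; here is a sketch assuming tokio's `fs` feature is available (the function name is hypothetical):

```rust
use std::path::Path;
use uuid::Uuid;

// Hypothetical async variant of the content lookup: read
// {content_dir}/{entry_id}.html without blocking the executor,
// falling back to a placeholder when the file is missing.
async fn load_entry_html(content_dir: &Path, entry_id: Uuid) -> String {
    let path = content_dir.join(format!("{}.html", entry_id));
    tokio::fs::read_to_string(path)
        .await
        .unwrap_or_else(|_| "No content".to_string())
}
```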

View File

@@ -1,3 +1,7 @@
use std::fs;
use std::env;
use std::path::Path;
use article_scraper::ArticleScraper;
use chrono::Utc;
use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
use tracing::{info, info_span, warn};
use crate::models::feed::get_feeds;
use crate::models::entry::{upsert_entries, CreateEntry};
use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
use crate::uuid::Base62Uuid;
/// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
let scraper = ArticleScraper::new(None).await;
let client = Client::new();
let content_dir = env::var("CONTENT_DIR")?;
let content_dir = Path::new(&content_dir);
let feeds = get_feeds(pool).await?;
for feed in feeds {
let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
if let Some(link) = entry.links.get(0) {
// if no scraped or feed date is available, fallback to the current time
let published_at = entry.published.unwrap_or_else(Utc::now);
let mut entry = CreateEntry {
let entry = CreateEntry {
title: entry.title.map(|t| t.content),
url: link.href.clone(),
description: entry.summary.map(|s| s.content),
html_content: None,
feed_id: feed.feed_id,
published_at,
};
info!("Fetching and parsing entry link: {}", link.href);
if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
if let Some(date) = article.date {
// prefer scraped date over rss feed date
entry.published_at = date;
};
entry.html_content = article.get_content();
} else {
warn!("Failed to fetch article for entry: {:?}", link);
}
payload.push(entry);
} else {
warn!("Skipping feed entry with no links");
@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
}
let entries = upsert_entries(pool, payload).await?;
info!("Created {} entries", entries.len());
// TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
// doesn't implement Send so this isn't trivial.
for mut entry in entries {
info!("Fetching and parsing entry link: {}", entry.url);
if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
let id = entry.entry_id;
if let Some(date) = article.date {
// prefer scraped date over rss feed date
entry.published_at = date;
update_entry(pool, entry).await?;
};
let html_content = article.get_content();
if let Some(content) = html_content {
fs::write(content_dir.join(format!("{}.html", id)), content)?;
}
} else {
warn!("Failed to fetch article for entry: {:?}", &entry.url);
}
}
}
Ok(())
}
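
A small note on the new post-upsert loop: `fs::write(...)?` propagates the first failed write, which aborts the remaining entries for that feed. If partial progress is preferable, the write could be made non-fatal; a sketch with a hypothetical helper:

```rust
use std::{fs, path::Path};
use tracing::warn;
use uuid::Uuid;

// Hypothetical helper: persist one entry's scraped HTML, logging failures
// instead of returning an error so a single bad write does not stop the crawl.
fn save_entry_content(content_dir: &Path, entry_id: Uuid, html: &str) {
    let path = content_dir.join(format!("{}.html", entry_id));
    if let Err(error) = fs::write(&path, html) {
        warn!("Failed to write {}: {}", path.display(), error);
    }
}
```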

View File

@@ -14,7 +14,6 @@ pub struct Entry {
pub title: Option<String>,
pub url: String,
pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: Uuid,
pub published_at: DateTime<Utc>,
pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
pub url: String,
#[validate(length(max = 524288))]
pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: Uuid,
pub published_at: DateTime<Utc>,
}
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
title, url, description, feed_id, published_at
) values (
$1, $2, $3, $4, $5, $6
$1, $2, $3, $4, $5
) returning *",
payload.title,
payload.url,
payload.description,
payload.html_content,
payload.feed_id,
payload.published_at,
)
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
let mut published_ats = Vec::with_capacity(payload.len());
payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
published_ats.push(entry.published_at);
entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
title, url, description, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
returning *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
published_ats.as_slice(),
)
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
let mut published_ats = Vec::with_capacity(payload.len());
payload
@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
published_ats.push(entry.published_at);
entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
title, url, description, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
on conflict do nothing
returning *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
published_ats.as_slice(),
)
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
})
}
pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
sqlx::query_as!(
Entry,
"update entry set
title = $2,
url = $3,
description = $4,
feed_id = $5,
published_at = $6
where entry_id = $1
returning *
",
payload.entry_id,
payload.title,
payload.url,
payload.description,
payload.feed_id,
payload.published_at,
)
.fetch_one(pool)
.await
.map_err(|error| {
if let sqlx::error::Error::Database(ref psql_error) = error {
if psql_error.code().as_deref() == Some("23503") {
return Error::RelationNotFound("feed");
}
}
Error::Sqlx(error)
})
}
pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
.execute(pool)
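
For context, the new `update_entry` writes back a full `Entry` row and maps foreign-key violations (Postgres error code 23503) to `Error::RelationNotFound("feed")`, mirroring the insert helpers above. A hypothetical usage sketch, written as if it lived alongside `update_entry` so the crate's `Entry` and `Result` types are in scope:

```rust
use chrono::{DateTime, Utc};
use sqlx::PgPool;

// Hypothetical example: overwrite an entry's published_at with the date
// recovered by the article scraper, then persist the whole row.
async fn apply_scraped_date(
    pool: &PgPool,
    mut entry: Entry,
    scraped_at: DateTime<Utc>,
) -> Result<Entry> {
    entry.published_at = scraped_at;
    update_entry(pool, entry).await
}
```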

View File

@@ -1,7 +1,5 @@
use url::Url;
const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
pub fn get_domain(url: &str) -> Option<String> {
Url::parse(url)
.ok()