Store entry html content outside DB in file storage

The HTML content can get quite big and can contain embedded images.

parent 7289151318
commit 3f028c3088
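
The change moves each entry's scraped HTML out of the `entry` table and into a per-entry file named after the entry id under `CONTENT_DIR`. A minimal sketch of the resulting read/write pattern (the helper names `content_path`, `save_entry_html`, and `load_entry_html` are illustrative only; the commit inlines the equivalent logic in the crawler and the entry handler shown in the diffs below):

```rust
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

use uuid::Uuid;

// Assumed layout from this commit: entry HTML lives at
// {CONTENT_DIR}/{entry_id}.html instead of an html_content column.
fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

// Write side (crawler): persist the scraped article HTML for an entry.
fn save_entry_html(content_dir: &Path, entry_id: Uuid, html: &str) -> io::Result<()> {
    fs::write(content_path(content_dir, entry_id), html)
}

// Read side (entry handler): load the HTML, falling back to a placeholder
// when no file has been written yet.
fn load_entry_html(content_dir: &Path, entry_id: Uuid) -> String {
    fs::read_to_string(content_path(content_dir, entry_id))
        .unwrap_or_else(|_| "No content".to_string())
}
```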

.gitignore (vendored)
@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content

@@ -55,6 +55,7 @@ builds
 DATABASE_MAX_CONNECTIONS=5
 TITLE=crawlnicle
 MAX_MEM_LOG_SIZE=1000000
+CONTENT_DIR=./content
 ```
 
 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database

justfile

@@ -39,6 +39,7 @@ watch-backend:
     --ignore 'logs/*' \
     --ignore 'static/*' \
     --ignore 'frontend/*' \
+    --ignore 'content/*' \
     --no-vcs-ignores \
     -x run
 
@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;
 
 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
 -- over things like usernames and emails, ithout needing to remember to do case-conversion.
 create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
 
 create type feed_type as enum ('atom', 'rss');
 
 create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,

@@ -52,7 +52,6 @@ create table if not exists "entry" (
     title text,
     url varchar(2048) not null,
     description text,
-    html_content text,
     feed_id uuid not null references "feed" (feed_id) on delete cascade,
     published_at timestamptz not null,
     created_at timestamptz not null default now(),

@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
             title: args.title,
             url: args.url,
             description: args.description,
-            html_content: None,
             feed_id: args.feed_id,
             published_at: Utc::now(),
         },

@@ -14,4 +14,6 @@ pub struct Config {
     pub title: String,
     #[clap(long, env)]
     pub max_mem_log_size: usize,
+    #[clap(long, env)]
+    pub content_dir: String,
 }

@@ -1,8 +1,11 @@
+use std::fs;
+
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;
 
+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;

@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
     Path(id): Path<Base62Uuid>,
     State(pool): State<PgPool>,
+    State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
     let entry = get_entry(&pool, id.as_uuid()).await?;
+    let content_dir = std::path::Path::new(&config.content_dir);
+    let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {
         article {
             @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());

@@ -25,7 +31,7 @@ pub async fn get(
                     (published_at)
                 }
             }
-            @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+            @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
             (PreEscaped(content))
         }
     }))

@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
+
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;

@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};
 
 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;
 
 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the

@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
     let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
+    let content_dir = env::var("CONTENT_DIR")?;
+    let content_dir = Path::new(&content_dir);
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();

@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
             if let Some(link) = entry.links.get(0) {
                 // if no scraped or feed date is available, fallback to the current time
                 let published_at = entry.published.unwrap_or_else(Utc::now);
-                let mut entry = CreateEntry {
+                let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
-                    html_content: None,
                     feed_id: feed.feed_id,
                     published_at,
                 };
-                info!("Fetching and parsing entry link: {}", link.href);
-                if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-                    if let Some(date) = article.date {
-                        // prefer scraped date over rss feed date
-                        entry.published_at = date;
-                    };
-                    entry.html_content = article.get_content();
-                } else {
-                    warn!("Failed to fetch article for entry: {:?}", link);
-                }
                 payload.push(entry);
             } else {
                 warn!("Skipping feed entry with no links");

@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
         }
         let entries = upsert_entries(pool, payload).await?;
         info!("Created {} entries", entries.len());
+
+        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+        // doesn't implement Send so this isn't trivial.
+        for mut entry in entries {
+            info!("Fetching and parsing entry link: {}", entry.url);
+            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+                let id = entry.entry_id;
+                if let Some(date) = article.date {
+                    // prefer scraped date over rss feed date
+                    entry.published_at = date;
+                    update_entry(pool, entry).await?;
+                };
+                let html_content = article.get_content();
+                if let Some(content) = html_content {
+                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
+                }
+            } else {
+                warn!("Failed to fetch article for entry: {:?}", &entry.url);
+            }
+        }
     }
     Ok(())
 }

@@ -14,7 +14,6 @@ pub struct Entry {
     pub title: Option<String>,
     pub url: String,
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
     pub created_at: DateTime<Utc>,

@@ -30,7 +29,6 @@ pub struct CreateEntry {
     pub url: String,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
 }

@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
+            title, url, description, feed_id, published_at
         ) values (
-            $1, $2, $3, $4, $5, $6
+            $1, $2, $3, $4, $5
         ) returning *",
         payload.title,
         payload.url,
         payload.description,
-        payload.html_content,
         payload.feed_id,
         payload.published_at,
     )

@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload

@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()

@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )

@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload

@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()

@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         on conflict do nothing
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )

@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     })
 }
 
+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    sqlx::query_as!(
+        Entry,
+        "update entry set
+            title = $2,
+            url = $3,
+            description = $4,
+            feed_id = $5,
+            published_at = $6
+        where entry_id = $1
+        returning *
+        ",
+        payload.entry_id,
+        payload.title,
+        payload.url,
+        payload.description,
+        payload.feed_id,
+        payload.published_at,
+    )
+    .fetch_one(pool)
+    .await
+    .map_err(|error| {
+        if let sqlx::error::Error::Database(ref psql_error) = error {
+            if psql_error.code().as_deref() == Some("23503") {
+                return Error::RelationNotFound("feed");
+            }
+        }
+        Error::Sqlx(error)
+    })
+}
+
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
         .execute(pool)

@@ -1,7 +1,5 @@
 use url::Url;
 
-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
 pub fn get_domain(url: &str) -> Option<String> {
     Url::parse(url)
         .ok()