Store entry html content outside DB in file storage

The HTML content can get quite big and can include embedded images.
Tyler Hallada 2023-07-05 23:45:49 -04:00
parent 7289151318
commit 3f028c3088
10 changed files with 81 additions and 37 deletions
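
Concretely, each entry's scraped HTML now lives in a flat file named `{entry_id}.html` inside the directory configured by `CONTENT_DIR`, instead of in the `entry.html_content` column. A minimal sketch of that convention (the helper names here are illustrative, not from the codebase):

```rust
use std::{fs, io, path::{Path, PathBuf}};
use uuid::Uuid;

// Illustrative helpers mirroring the layout this commit introduces:
// one file per entry at {CONTENT_DIR}/{entry_id}.html.
fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

fn write_entry_html(content_dir: &Path, entry_id: Uuid, html: &str) -> io::Result<()> {
    fs::write(content_path(content_dir, entry_id), html)
}

fn read_entry_html(content_dir: &Path, entry_id: Uuid) -> io::Result<String> {
    fs::read_to_string(content_path(content_dir, entry_id))
}
```

In the diff below, the crawl job writes these files after upserting entries and the entry handler reads them back when rendering, so the database row keeps only metadata.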

.gitignore
View File

@@ -4,3 +4,4 @@
/static/js/*
/static/css/*
.frontend-built
/content

View File

@@ -55,6 +55,7 @@ builds
DATABASE_MAX_CONNECTIONS=5
TITLE=crawlnicle
MAX_MEM_LOG_SIZE=1000000
CONTENT_DIR=./content
```
1. Run `just migrate` (or `sqlx migrate run`) which will run all the database

View File

@@ -39,6 +39,7 @@ watch-backend:
--ignore 'logs/*' \
--ignore 'static/*' \
--ignore 'frontend/*' \
--ignore 'content/*' \
--no-vcs-ignores \
-x run

View File

@@ -28,12 +28,12 @@ end;
$$ language plpgsql;
-- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
-- over things like usernames and emails, without needing to remember to do case-conversion.
create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
create type feed_type as enum ('atom', 'rss');
create table if not exists "feed" (
feed_id uuid primary key default uuid_generate_v1mc(),
title text,
url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
title text,
url varchar(2048) not null,
description text,
html_content text,
feed_id uuid not null references "feed" (feed_id) on delete cascade,
published_at timestamptz not null,
created_at timestamptz not null default now(),

View File

@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
title: args.title,
url: args.url,
description: args.description,
html_content: None,
feed_id: args.feed_id,
published_at: Utc::now(),
},

View File

@@ -14,4 +14,6 @@ pub struct Config {
pub title: String,
#[clap(long, env)]
pub max_mem_log_size: usize,
#[clap(long, env)]
pub content_dir: String,
}
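
The new `content_dir` setting follows the same clap pattern as the other fields: it can be passed as a `--content-dir` flag or read from the `CONTENT_DIR` environment variable added to the setup instructions above. A reduced sketch of just that field, assuming clap's derive and env features are enabled:

```rust
use clap::Parser;

/// Reduced sketch of the Config struct; only the new field is shown here.
#[derive(Parser, Clone, Debug)]
struct Config {
    /// Directory where entry HTML files are stored, e.g. ./content
    #[clap(long, env)]
    content_dir: String,
}

fn main() {
    // Reads --content-dir or, failing that, the CONTENT_DIR environment variable.
    let config = Config::parse();
    println!("content_dir = {}", config.content_dir);
}
```

The handler change below extracts this with `State(config): State<Config>`, which implies `Config` is cloneable and retrievable from the router's state (e.g. via `FromRef`).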

View File

@@ -1,8 +1,11 @@
use std::fs;
use axum::extract::{Path, State};
use axum::response::Response;
use maud::{html, PreEscaped};
use sqlx::PgPool;
use crate::config::Config;
use crate::error::Result;
use crate::models::entry::get_entry;
use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
pub async fn get(
Path(id): Path<Base62Uuid>,
State(pool): State<PgPool>,
State(config): State<Config>,
layout: Layout,
) -> Result<Response> {
let entry = get_entry(&pool, id.as_uuid()).await?;
let content_dir = std::path::Path::new(&config.content_dir);
let content_path = content_dir.join(format!("{}.html", entry.entry_id));
Ok(layout.render(html! {
article {
@let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
(published_at)
}
}
@let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
@let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
(PreEscaped(content))
}
}))
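
One design note on the handler above: `fs::read_to_string` is synchronous, so the file read briefly blocks the async executor. If that ever becomes a concern, a non-blocking variant is straightforward; here is a sketch assuming tokio's `fs` feature is available (the function name is hypothetical):

```rust
use std::path::Path;
use uuid::Uuid;

// Hypothetical async variant of the content lookup: read
// {content_dir}/{entry_id}.html without blocking the executor,
// falling back to a placeholder when the file is missing.
async fn load_entry_html(content_dir: &Path, entry_id: Uuid) -> String {
    let path = content_dir.join(format!("{}.html", entry_id));
    tokio::fs::read_to_string(path)
        .await
        .unwrap_or_else(|_| "No content".to_string())
}
```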

View File

@@ -1,3 +1,7 @@
use std::fs;
use std::env;
use std::path::Path;
use article_scraper::ArticleScraper;
use chrono::Utc;
use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
use tracing::{info, info_span, warn};
use crate::models::feed::get_feeds;
use crate::models::entry::{upsert_entries, CreateEntry};
use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
use crate::uuid::Base62Uuid;
/// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
let scraper = ArticleScraper::new(None).await;
let client = Client::new();
let content_dir = env::var("CONTENT_DIR")?;
let content_dir = Path::new(&content_dir);
let feeds = get_feeds(pool).await?;
for feed in feeds {
let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
if let Some(link) = entry.links.get(0) {
// if no scraped or feed date is available, fallback to the current time
let published_at = entry.published.unwrap_or_else(Utc::now);
let mut entry = CreateEntry {
let entry = CreateEntry {
title: entry.title.map(|t| t.content),
url: link.href.clone(),
description: entry.summary.map(|s| s.content),
html_content: None,
feed_id: feed.feed_id,
published_at,
};
info!("Fetching and parsing entry link: {}", link.href);
if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
if let Some(date) = article.date {
// prefer scraped date over rss feed date
entry.published_at = date;
};
entry.html_content = article.get_content();
} else {
warn!("Failed to fetch article for entry: {:?}", link);
}
payload.push(entry);
} else {
warn!("Skipping feed entry with no links");
@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
}
let entries = upsert_entries(pool, payload).await?;
info!("Created {} entries", entries.len());
// TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
// doesn't implement Send so this isn't trivial.
for mut entry in entries {
info!("Fetching and parsing entry link: {}", entry.url);
if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
let id = entry.entry_id;
if let Some(date) = article.date {
// prefer scraped date over rss feed date
entry.published_at = date;
update_entry(pool, entry).await?;
};
let html_content = article.get_content();
if let Some(content) = html_content {
fs::write(content_dir.join(format!("{}.html", id)), content)?;
}
} else {
warn!("Failed to fetch article for entry: {:?}", &entry.url);
}
}
}
Ok(())
}
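
A small note on the new post-upsert loop: `fs::write(...)?` propagates the first failed write, which aborts the remaining entries for that feed. If partial progress is preferable, the write could be made non-fatal; a sketch with a hypothetical helper:

```rust
use std::{fs, path::Path};
use tracing::warn;
use uuid::Uuid;

// Hypothetical helper: persist one entry's scraped HTML, logging failures
// instead of returning an error so a single bad write does not stop the crawl.
fn save_entry_content(content_dir: &Path, entry_id: Uuid, html: &str) {
    let path = content_dir.join(format!("{}.html", entry_id));
    if let Err(error) = fs::write(&path, html) {
        warn!("Failed to write {}: {}", path.display(), error);
    }
}
```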

View File

@@ -14,7 +14,6 @@ pub struct Entry {
pub title: Option<String>,
pub url: String,
pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: Uuid,
pub published_at: DateTime<Utc>,
pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
pub url: String,
#[validate(length(max = 524288))]
pub description: Option<String>,
pub html_content: Option<String>,
pub feed_id: Uuid,
pub published_at: DateTime<Utc>,
}
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
title, url, description, feed_id, published_at
) values (
$1, $2, $3, $4, $5, $6
$1, $2, $3, $4, $5
) returning *",
payload.title,
payload.url,
payload.description,
payload.html_content,
payload.feed_id,
payload.published_at,
)
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
let mut published_ats = Vec::with_capacity(payload.len());
payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
published_ats.push(entry.published_at);
entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
title, url, description, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
returning *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
published_ats.as_slice(),
)
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
let mut titles = Vec::with_capacity(payload.len());
let mut urls = Vec::with_capacity(payload.len());
let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
let mut feed_ids = Vec::with_capacity(payload.len());
let mut published_ats = Vec::with_capacity(payload.len());
payload
@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
titles.push(entry.title.clone());
urls.push(entry.url.clone());
descriptions.push(entry.description.clone());
html_contents.push(entry.html_content.clone());
feed_ids.push(entry.feed_id);
published_ats.push(entry.published_at);
entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
sqlx::query_as!(
Entry,
"insert into entry (
title, url, description, html_content, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
title, url, description, feed_id, published_at
) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
on conflict do nothing
returning *",
titles.as_slice() as &[Option<String>],
urls.as_slice(),
descriptions.as_slice() as &[Option<String>],
html_contents.as_slice() as &[Option<String>],
feed_ids.as_slice(),
published_ats.as_slice(),
)
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
})
}
pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
sqlx::query_as!(
Entry,
"update entry set
title = $2,
url = $3,
description = $4,
feed_id = $5,
published_at = $6
where entry_id = $1
returning *
",
payload.entry_id,
payload.title,
payload.url,
payload.description,
payload.feed_id,
payload.published_at,
)
.fetch_one(pool)
.await
.map_err(|error| {
if let sqlx::error::Error::Database(ref psql_error) = error {
if psql_error.code().as_deref() == Some("23503") {
return Error::RelationNotFound("feed");
}
}
Error::Sqlx(error)
})
}
pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
.execute(pool)
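
For context, the new `update_entry` writes back a full `Entry` row and maps foreign-key violations (Postgres error code 23503) to `Error::RelationNotFound("feed")`, mirroring the insert helpers above. A hypothetical usage sketch, written as if it lived alongside `update_entry` so the crate's `Entry` and `Result` types are in scope:

```rust
use chrono::{DateTime, Utc};
use sqlx::PgPool;

// Hypothetical example: overwrite an entry's published_at with the date
// recovered by the article scraper, then persist the whole row.
async fn apply_scraped_date(
    pool: &PgPool,
    mut entry: Entry,
    scraped_at: DateTime<Utc>,
) -> Result<Entry> {
    entry.published_at = scraped_at;
    update_entry(pool, entry).await
}
```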

View File

@@ -1,7 +1,5 @@
use url::Url;
const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
pub fn get_domain(url: &str) -> Option<String> {
Url::parse(url)
.ok()