From 3f028c30880eff71c5d5688eda3ffcba687c7075 Mon Sep 17 00:00:00 2001
From: Tyler Hallada
Date: Wed, 5 Jul 2023 23:45:49 -0400
Subject: [PATCH] Store entry html content outside DB in file storage

Since the HTML content can get quite big and can have embedded images.
---
 .gitignore                            |  1 +
 README.md                             |  1 +
 justfile                              |  1 +
 migrations/20230507201612_initial.sql |  9 +++--
 src/bin/cli.rs                        |  1 -
 src/config.rs                         |  2 ++
 src/handlers/entry.rs                 |  8 ++++-
 src/jobs/crawl.rs                     | 41 ++++++++++++++-------
 src/models/entry.rs                   | 52 +++++++++++++++++++--------
 src/utils.rs                          |  2 --
 10 files changed, 81 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index a72e8eb..eac289a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content
diff --git a/README.md b/README.md
index 49c924a..2cd5be4 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,7 @@ builds
     DATABASE_MAX_CONNECTIONS=5
     TITLE=crawlnicle
     MAX_MEM_LOG_SIZE=1000000
+    CONTENT_DIR=./content
     ```

 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database
diff --git a/justfile b/justfile
index 01febf0..85e3ee2 100755
--- a/justfile
+++ b/justfile
@@ -39,6 +39,7 @@ watch-backend:
         --ignore 'logs/*' \
         --ignore 'static/*' \
         --ignore 'frontend/*' \
+        --ignore 'content/*' \
         --no-vcs-ignores \
         -x run
diff --git a/migrations/20230507201612_initial.sql b/migrations/20230507201612_initial.sql
index c3a93dc..c0b9e80 100644
--- a/migrations/20230507201612_initial.sql
+++ b/migrations/20230507201612_initial.sql
@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;

 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
- -- over things like usernames and emails, ithout needing to remember to do case-conversion.
- create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
+-- over things like usernames and emails, ithout needing to remember to do case-conversion.
+create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);

- create type feed_type as enum ('atom', 'rss');
+create type feed_type as enum ('atom', 'rss');

- create table if not exists "feed" (
+create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
     title text,
     url varchar(2048) not null,
     description text,
-    html_content text,
     feed_id uuid not null references "feed" (feed_id) on delete cascade,
     published_at timestamptz not null,
     created_at timestamptz not null default now(),
diff --git a/src/bin/cli.rs b/src/bin/cli.rs
index 0c685a5..eefad14 100644
--- a/src/bin/cli.rs
+++ b/src/bin/cli.rs
@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
                     title: args.title,
                     url: args.url,
                     description: args.description,
-                    html_content: None,
                     feed_id: args.feed_id,
                     published_at: Utc::now(),
                 },
diff --git a/src/config.rs b/src/config.rs
index 680d777..8f55037 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -14,4 +14,6 @@ pub struct Config {
     pub title: String,
     #[clap(long, env)]
     pub max_mem_log_size: usize,
+    #[clap(long, env)]
+    pub content_dir: String,
 }
diff --git a/src/handlers/entry.rs b/src/handlers/entry.rs
index e1e7dde..a2edf97 100644
--- a/src/handlers/entry.rs
+++ b/src/handlers/entry.rs
@@ -1,8 +1,11 @@
+use std::fs;
+
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;

+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
     Path(id): Path<Base62Uuid>,
     State(pool): State<PgPool>,
+    State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
     let entry = get_entry(&pool, id.as_uuid()).await?;
+    let content_dir = std::path::Path::new(&config.content_dir);
+    let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {
         article {
             @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
                     (published_at)
                 }
             }
-            @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+            @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
             (PreEscaped(content))
         }
     }))
diff --git a/src/jobs/crawl.rs b/src/jobs/crawl.rs
index 1c3d36b..cd7ff9c 100644
--- a/src/jobs/crawl.rs
+++ b/src/jobs/crawl.rs
@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
+
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};

 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;

 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
     let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
+    let content_dir = env::var("CONTENT_DIR")?;
+    let content_dir = Path::new(&content_dir);
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
             if let Some(link) = entry.links.get(0) {
                 // if no scraped or feed date is available, fallback to the current time
                 let published_at = entry.published.unwrap_or_else(Utc::now);
-                let mut entry = CreateEntry {
+                let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
-                    html_content: None,
                     feed_id: feed.feed_id,
                     published_at,
                 };
-                info!("Fetching and parsing entry link: {}", link.href);
-                if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-                    if let Some(date) = article.date {
-                        // prefer scraped date over rss feed date
-                        entry.published_at = date;
-                    };
-                    entry.html_content = article.get_content();
-                } else {
-                    warn!("Failed to fetch article for entry: {:?}", link);
-                }
                 payload.push(entry);
             } else {
                 warn!("Skipping feed entry with no links");
             }
         }
         let entries = upsert_entries(pool, payload).await?;
         info!("Created {} entries", entries.len());
+
+        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+        // doesn't implement Send so this isn't trivial.
+        for mut entry in entries {
+            info!("Fetching and parsing entry link: {}", entry.url);
+            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+                let id = entry.entry_id;
+                if let Some(date) = article.date {
+                    // prefer scraped date over rss feed date
+                    entry.published_at = date;
+                    update_entry(pool, entry).await?;
+                };
+                let html_content = article.get_content();
+                if let Some(content) = html_content {
+                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
+                }
+            } else {
+                warn!("Failed to fetch article for entry: {:?}", &entry.url);
+            }
+        }
     }
     Ok(())
 }
diff --git a/src/models/entry.rs b/src/models/entry.rs
index 44425fd..7f6dbdb 100644
--- a/src/models/entry.rs
+++ b/src/models/entry.rs
@@ -14,7 +14,6 @@ pub struct Entry {
     pub title: Option<String>,
     pub url: String,
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
     pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
     pub url: String,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
 }
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
+            title, url, description, feed_id, published_at
         ) values (
-            $1, $2, $3, $4, $5, $6
+            $1, $2, $3, $4, $5
         ) returning *",
         payload.title,
         payload.url,
         payload.description,
-        payload.html_content,
         payload.feed_id,
         payload.published_at,
     )
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
         .iter()
         .map(|entry| {
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         on conflict do nothing
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<Vec<Entry>> {
     })
 }

+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    sqlx::query_as!(
+        Entry,
+        "update entry set
+            title = $2,
+            url = $3,
+            description = $4,
+            feed_id = $5,
+            published_at = $6
+        where entry_id = $1
+        returning *
+        ",
+        payload.entry_id,
+        payload.title,
+        payload.url,
+        payload.description,
+        payload.feed_id,
+        payload.published_at,
+    )
+    .fetch_one(pool)
+    .await
+    .map_err(|error| {
+        if let sqlx::error::Error::Database(ref psql_error) = error {
+            if psql_error.code().as_deref() == Some("23503") {
+                return Error::RelationNotFound("feed");
+            }
+        }
+        Error::Sqlx(error)
+    })
+}
+
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
         .execute(pool)
diff --git a/src/utils.rs b/src/utils.rs
index b5f03b4..dca8b1e 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,7 +1,5 @@
 use url::Url;

-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
 pub fn get_domain(url: &str) -> Option<String> {
     Url::parse(url)
         .ok()
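A possible follow-up, not part of the patch above: the `{entry_id}.html` naming convention now appears in both src/handlers/entry.rs and src/jobs/crawl.rs, and nothing in the diff creates the content directory itself before `fs::write` runs. A small shared module along the lines of the sketch below could keep the two call sites in sync; `content_path` and `ensure_content_dir` are hypothetical names, not functions from this repository.

```rust
// Sketch only: hypothetical helpers, not part of the patch above.
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

use uuid::Uuid;

/// Build the on-disk path for an entry's scraped HTML,
/// e.g. `<CONTENT_DIR>/<entry_id>.html`.
pub fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

/// Create the content directory (and any missing parents) so that writes
/// from the crawl job cannot fail on a missing directory.
pub fn ensure_content_dir(content_dir: &Path) -> io::Result<()> {
    fs::create_dir_all(content_dir)
}
```

The handler's `fs::read_to_string(content_path)` and the crawl job's `fs::write(content_dir.join(...))` could then both go through `content_path`, and `ensure_content_dir` could run once at startup, before the first crawl.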