Store entry HTML content outside the DB in file storage

The HTML content can get quite big and can contain embedded images, so it is
now written to files on disk instead of being stored in the database.
parent 7289151318
commit 3f028c3088
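A minimal sketch of the on-disk convention this commit adopts, assuming `CONTENT_DIR` points at a writable directory: the crawl job writes each entry's scraped HTML to `{CONTENT_DIR}/{entry_id}.html`, and the entry handler reads that file back when rendering, falling back to a placeholder when it is missing. The helper names below are illustrative, not part of the commit.

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Illustrative helper: persist an entry's scraped HTML as
/// `{content_dir}/{entry_id}.html` (the crawl job inlines this logic).
fn write_entry_html(content_dir: &Path, entry_id: &str, html: &str) -> io::Result<()> {
    fs::write(content_dir.join(format!("{}.html", entry_id)), html)
}

/// Illustrative helper: read the stored HTML back for rendering, mirroring the
/// handler's `unwrap_or_else(|_| "No content".to_string())` fallback.
fn read_entry_html(content_dir: &Path, entry_id: &str) -> String {
    fs::read_to_string(content_dir.join(format!("{}.html", entry_id)))
        .unwrap_or_else(|_| "No content".to_string())
}

fn main() -> io::Result<()> {
    let content_dir = Path::new("./content");
    fs::create_dir_all(content_dir)?;
    // In the real code the file name comes from the entry's UUID primary key.
    let entry_id = "00000000-0000-0000-0000-000000000000";
    write_entry_html(content_dir, entry_id, "<p>example content</p>")?;
    println!("{}", read_entry_html(content_dir, entry_id));
    Ok(())
}
```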
.gitignore (vendored): 1 line changed

@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content
@@ -55,6 +55,7 @@ builds
 DATABASE_MAX_CONNECTIONS=5
 TITLE=crawlnicle
 MAX_MEM_LOG_SIZE=1000000
+CONTENT_DIR=./content
 ```

 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database
justfile: 1 line changed

@@ -39,6 +39,7 @@ watch-backend:
     --ignore 'logs/*' \
     --ignore 'static/*' \
     --ignore 'frontend/*' \
+    --ignore 'content/*' \
     --no-vcs-ignores \
     -x run

@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;

 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
 -- over things like usernames and emails, ithout needing to remember to do case-conversion.
 create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);

 create type feed_type as enum ('atom', 'rss');

 create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,
@@ -52,7 +52,6 @@ create table if not exists "entry" (
     title text,
     url varchar(2048) not null,
     description text,
-    html_content text,
     feed_id uuid not null references "feed" (feed_id) on delete cascade,
     published_at timestamptz not null,
     created_at timestamptz not null default now(),
@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
             title: args.title,
             url: args.url,
             description: args.description,
-            html_content: None,
             feed_id: args.feed_id,
             published_at: Utc::now(),
         },
@@ -14,4 +14,6 @@ pub struct Config {
     pub title: String,
     #[clap(long, env)]
     pub max_mem_log_size: usize,
+    #[clap(long, env)]
+    pub content_dir: String,
 }
@@ -1,8 +1,11 @@
+use std::fs;
+
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;

+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;
@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
     Path(id): Path<Base62Uuid>,
     State(pool): State<PgPool>,
+    State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
     let entry = get_entry(&pool, id.as_uuid()).await?;
+    let content_dir = std::path::Path::new(&config.content_dir);
+    let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {
         article {
             @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
@@ -25,7 +31,7 @@ pub async fn get(
                 (published_at)
             }
         }
-        @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+        @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
         (PreEscaped(content))
     }
 }))
@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
+
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;
@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};

 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;

 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
     let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
+    let content_dir = env::var("CONTENT_DIR")?;
+    let content_dir = Path::new(&content_dir);
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();
@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
             if let Some(link) = entry.links.get(0) {
                 // if no scraped or feed date is available, fallback to the current time
                 let published_at = entry.published.unwrap_or_else(Utc::now);
-                let mut entry = CreateEntry {
+                let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
-                    html_content: None,
                     feed_id: feed.feed_id,
                     published_at,
                 };
-                info!("Fetching and parsing entry link: {}", link.href);
-                if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-                    if let Some(date) = article.date {
-                        // prefer scraped date over rss feed date
-                        entry.published_at = date;
-                    };
-                    entry.html_content = article.get_content();
-                } else {
-                    warn!("Failed to fetch article for entry: {:?}", link);
-                }
                 payload.push(entry);
             } else {
                 warn!("Skipping feed entry with no links");
@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
         }
         let entries = upsert_entries(pool, payload).await?;
         info!("Created {} entries", entries.len());
+
+        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+        // doesn't implement Send so this isn't trivial.
+        for mut entry in entries {
+            info!("Fetching and parsing entry link: {}", entry.url);
+            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+                let id = entry.entry_id;
+                if let Some(date) = article.date {
+                    // prefer scraped date over rss feed date
+                    entry.published_at = date;
+                    update_entry(pool, entry).await?;
+                };
+                let html_content = article.get_content();
+                if let Some(content) = html_content {
+                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
+                }
+            } else {
+                warn!("Failed to fetch article for entry: {:?}", &entry.url);
+            }
+        }
     }
     Ok(())
 }
@@ -14,7 +14,6 @@ pub struct Entry {
     pub title: Option<String>,
     pub url: String,
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
     pub created_at: DateTime<Utc>,
@@ -30,7 +29,6 @@ pub struct CreateEntry {
     pub url: String,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
 }
@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
+            title, url, description, feed_id, published_at
         ) values (
-            $1, $2, $3, $4, $5, $6
+            $1, $2, $3, $4, $5
         ) returning *",
         payload.title,
         payload.url,
         payload.description,
-        payload.html_content,
         payload.feed_id,
         payload.published_at,
     )
@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload
@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()
@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         on conflict do nothing
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )
@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     })
 }

+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    sqlx::query_as!(
+        Entry,
+        "update entry set
+            title = $2,
+            url = $3,
+            description = $4,
+            feed_id = $5,
+            published_at = $6
+        where entry_id = $1
+        returning *
+        ",
+        payload.entry_id,
+        payload.title,
+        payload.url,
+        payload.description,
+        payload.feed_id,
+        payload.published_at,
+    )
+    .fetch_one(pool)
+    .await
+    .map_err(|error| {
+        if let sqlx::error::Error::Database(ref psql_error) = error {
+            if psql_error.code().as_deref() == Some("23503") {
+                return Error::RelationNotFound("feed");
+            }
+        }
+        Error::Sqlx(error)
+    })
+}
+
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
         .execute(pool)
@@ -1,7 +1,5 @@
 use url::Url;

-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
 pub fn get_domain(url: &str) -> Option<String> {
     Url::parse(url)
         .ok()