Store entry html content outside DB in file storage

The HTML content can get quite big and can contain embedded images.

parent 7289151318
commit 3f028c3088
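
The change moves each entry's scraped HTML out of the `entry` table and into a per-entry file named after the entry id under `CONTENT_DIR`. A minimal sketch of the resulting read/write pattern (the helper names `content_path`, `save_entry_html`, and `load_entry_html` are illustrative only; the commit inlines the equivalent logic in the crawler and the entry handler shown in the diffs below):

```rust
use std::fs;
use std::io;
use std::path::{Path, PathBuf};

use uuid::Uuid;

// Assumed layout from this commit: entry HTML lives at
// {CONTENT_DIR}/{entry_id}.html instead of an html_content column.
fn content_path(content_dir: &Path, entry_id: Uuid) -> PathBuf {
    content_dir.join(format!("{}.html", entry_id))
}

// Write side (crawler): persist the scraped article HTML for an entry.
fn save_entry_html(content_dir: &Path, entry_id: Uuid, html: &str) -> io::Result<()> {
    fs::write(content_path(content_dir, entry_id), html)
}

// Read side (entry handler): load the HTML, falling back to a placeholder
// when no file has been written yet.
fn load_entry_html(content_dir: &Path, entry_id: Uuid) -> String {
    fs::read_to_string(content_path(content_dir, entry_id))
        .unwrap_or_else(|_| "No content".to_string())
}
```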

.gitignore (vendored)
@@ -4,3 +4,4 @@
 /static/js/*
 /static/css/*
 .frontend-built
+/content

@@ -55,6 +55,7 @@ builds
 DATABASE_MAX_CONNECTIONS=5
 TITLE=crawlnicle
 MAX_MEM_LOG_SIZE=1000000
+CONTENT_DIR=./content
 ```
 
 1. Run `just migrate` (or `sqlx migrate run`) which will run all the database

justfile

@@ -39,6 +39,7 @@ watch-backend:
     --ignore 'logs/*' \
     --ignore 'static/*' \
     --ignore 'frontend/*' \
+    --ignore 'content/*' \
     --no-vcs-ignores \
     -x run
 
@@ -28,12 +28,12 @@ end;
 $$ language plpgsql;
 
 -- This is a text collation that sorts text case-insensitively, useful for `UNIQUE` indexes
 -- over things like usernames and emails, ithout needing to remember to do case-conversion.
 create collation case_insensitive (provider = icu, locale = 'und-u-ks-level2', deterministic = false);
 
 create type feed_type as enum ('atom', 'rss');
 
 create table if not exists "feed" (
     feed_id uuid primary key default uuid_generate_v1mc(),
     title text,
     url varchar(2048) not null,

@@ -52,7 +52,6 @@ create table if not exists "entry" (
     title text,
     url varchar(2048) not null,
     description text,
-    html_content text,
     feed_id uuid not null references "feed" (feed_id) on delete cascade,
     published_at timestamptz not null,
     created_at timestamptz not null default now(),

@@ -117,7 +117,6 @@ pub async fn main() -> Result<()> {
             title: args.title,
             url: args.url,
             description: args.description,
-            html_content: None,
             feed_id: args.feed_id,
             published_at: Utc::now(),
         },

@@ -14,4 +14,6 @@ pub struct Config {
     pub title: String,
     #[clap(long, env)]
     pub max_mem_log_size: usize,
+    #[clap(long, env)]
+    pub content_dir: String,
 }

@@ -1,8 +1,11 @@
+use std::fs;
+
 use axum::extract::{Path, State};
 use axum::response::Response;
 use maud::{html, PreEscaped};
 use sqlx::PgPool;
 
+use crate::config::Config;
 use crate::error::Result;
 use crate::models::entry::get_entry;
 use crate::partials::layout::Layout;

@@ -11,9 +14,12 @@ use crate::uuid::Base62Uuid;
 pub async fn get(
     Path(id): Path<Base62Uuid>,
     State(pool): State<PgPool>,
+    State(config): State<Config>,
     layout: Layout,
 ) -> Result<Response> {
     let entry = get_entry(&pool, id.as_uuid()).await?;
+    let content_dir = std::path::Path::new(&config.content_dir);
+    let content_path = content_dir.join(format!("{}.html", entry.entry_id));
     Ok(layout.render(html! {
         article {
             @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());

@@ -25,7 +31,7 @@ pub async fn get(
                     (published_at)
                 }
             }
-            @let content = entry.html_content.unwrap_or_else(|| "No content".to_string());
+            @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
             (PreEscaped(content))
         }
     }))

@@ -1,3 +1,7 @@
+use std::fs;
+use std::env;
+use std::path::Path;
+
 use article_scraper::ArticleScraper;
 use chrono::Utc;
 use feed_rs::parser;

@@ -6,7 +10,7 @@ use sqlx::PgPool;
 use tracing::{info, info_span, warn};
 
 use crate::models::feed::get_feeds;
-use crate::models::entry::{upsert_entries, CreateEntry};
+use crate::models::entry::{update_entry, upsert_entries, CreateEntry};
 use crate::uuid::Base62Uuid;
 
 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the

@@ -14,6 +18,8 @@ use crate::uuid::Base62Uuid;
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
     let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
+    let content_dir = env::var("CONTENT_DIR")?;
+    let content_dir = Path::new(&content_dir);
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         let feed_id_str: String = Base62Uuid::from(feed.feed_id).into();

@@ -31,24 +37,13 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
             if let Some(link) = entry.links.get(0) {
                 // if no scraped or feed date is available, fallback to the current time
                 let published_at = entry.published.unwrap_or_else(Utc::now);
-                let mut entry = CreateEntry {
+                let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
-                    html_content: None,
                     feed_id: feed.feed_id,
                     published_at,
                 };
-                info!("Fetching and parsing entry link: {}", link.href);
-                if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
-                    if let Some(date) = article.date {
-                        // prefer scraped date over rss feed date
-                        entry.published_at = date;
-                    };
-                    entry.html_content = article.get_content();
-                } else {
-                    warn!("Failed to fetch article for entry: {:?}", link);
-                }
                 payload.push(entry);
             } else {
                 warn!("Skipping feed entry with no links");

@@ -56,6 +51,26 @@ pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
         }
         let entries = upsert_entries(pool, payload).await?;
         info!("Created {} entries", entries.len());
+
+        // TODO: figure out how to do this in parallel. ArticleScraper uses some libxml thing that
+        // doesn't implement Send so this isn't trivial.
+        for mut entry in entries {
+            info!("Fetching and parsing entry link: {}", entry.url);
+            if let Ok(article) = scraper.parse(&Url::parse(&entry.url)?, true, &client, None).await {
+                let id = entry.entry_id;
+                if let Some(date) = article.date {
+                    // prefer scraped date over rss feed date
+                    entry.published_at = date;
+                    update_entry(pool, entry).await?;
+                };
+                let html_content = article.get_content();
+                if let Some(content) = html_content {
+                    fs::write(content_dir.join(format!("{}.html", id)), content)?;
+                }
+            } else {
+                warn!("Failed to fetch article for entry: {:?}", &entry.url);
+            }
+        }
     }
     Ok(())
 }

@@ -14,7 +14,6 @@ pub struct Entry {
     pub title: Option<String>,
     pub url: String,
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
     pub created_at: DateTime<Utc>,

@@ -30,7 +29,6 @@ pub struct CreateEntry {
     pub url: String,
     #[validate(length(max = 524288))]
     pub description: Option<String>,
-    pub html_content: Option<String>,
     pub feed_id: Uuid,
     pub published_at: DateTime<Utc>,
 }

@@ -92,14 +90,13 @@ pub async fn create_entry(pool: &PgPool, payload: CreateEntry) -> Result<Entry>
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
+            title, url, description, feed_id, published_at
         ) values (
-            $1, $2, $3, $4, $5, $6
+            $1, $2, $3, $4, $5
         ) returning *",
         payload.title,
         payload.url,
         payload.description,
-        payload.html_content,
         payload.feed_id,
         payload.published_at,
     )

@@ -119,7 +116,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload

@@ -128,7 +124,6 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()

@@ -137,13 +132,12 @@ pub async fn create_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )

@@ -163,7 +157,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     let mut titles = Vec::with_capacity(payload.len());
     let mut urls = Vec::with_capacity(payload.len());
     let mut descriptions: Vec<Option<String>> = Vec::with_capacity(payload.len());
-    let mut html_contents: Vec<Option<String>> = Vec::with_capacity(payload.len());
     let mut feed_ids = Vec::with_capacity(payload.len());
     let mut published_ats = Vec::with_capacity(payload.len());
     payload

@@ -172,7 +165,6 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
             titles.push(entry.title.clone());
             urls.push(entry.url.clone());
             descriptions.push(entry.description.clone());
-            html_contents.push(entry.html_content.clone());
             feed_ids.push(entry.feed_id);
             published_ats.push(entry.published_at);
             entry.validate()

@@ -181,14 +173,13 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     sqlx::query_as!(
         Entry,
         "insert into entry (
-            title, url, description, html_content, feed_id, published_at
-        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::text[], $5::uuid[], $6::timestamptz[])
+            title, url, description, feed_id, published_at
+        ) select * from unnest($1::text[], $2::text[], $3::text[], $4::uuid[], $5::timestamptz[])
         on conflict do nothing
         returning *",
         titles.as_slice() as &[Option<String>],
         urls.as_slice(),
         descriptions.as_slice() as &[Option<String>],
-        html_contents.as_slice() as &[Option<String>],
         feed_ids.as_slice(),
         published_ats.as_slice(),
     )

@@ -204,6 +195,37 @@ pub async fn upsert_entries(pool: &PgPool, payload: Vec<CreateEntry>) -> Result<
     })
 }
 
+pub async fn update_entry(pool: &PgPool, payload: Entry) -> Result<Entry> {
+    sqlx::query_as!(
+        Entry,
+        "update entry set
+            title = $2,
+            url = $3,
+            description = $4,
+            feed_id = $5,
+            published_at = $6
+        where entry_id = $1
+        returning *
+        ",
+        payload.entry_id,
+        payload.title,
+        payload.url,
+        payload.description,
+        payload.feed_id,
+        payload.published_at,
+    )
+    .fetch_one(pool)
+    .await
+    .map_err(|error| {
+        if let sqlx::error::Error::Database(ref psql_error) = error {
+            if psql_error.code().as_deref() == Some("23503") {
+                return Error::RelationNotFound("feed");
+            }
+        }
+        Error::Sqlx(error)
+    })
+}
+
 pub async fn delete_entry(pool: &PgPool, entry_id: Uuid) -> Result<()> {
     sqlx::query!("update entry set deleted_at = now() where entry_id = $1", entry_id)
         .execute(pool)

@@ -1,7 +1,5 @@
 use url::Url;
 
-const BASE62_CHARS: &[u8] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
 pub fn get_domain(url: &str) -> Option<String> {
     Url::parse(url)
         .ok()