From ceac234ce78e218188067a53d9247559fe1d1160 Mon Sep 17 00:00:00 2001 From: Tyler Hallada Date: Tue, 29 Aug 2023 23:04:35 -0400 Subject: [PATCH] Sanitize entry html content with ammonia --- Cargo.lock | 81 +++++++++++++++++++++++++++++++++---- Cargo.toml | 1 + src/actors/entry_crawler.rs | 5 ++- src/actors/feed_crawler.rs | 20 +++++++-- src/handlers/entry.rs | 6 +-- src/handlers/import.rs | 11 +---- 6 files changed, 99 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 87c3d17..42cdea6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -34,6 +34,19 @@ dependencies = [ "memchr", ] +[[package]] +name = "ammonia" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e6d1c7838db705c9b756557ee27c384ce695a1c51a6fe528784cb1c6840170" +dependencies = [ + "html5ever 0.26.0", + "maplit", + "once_cell", + "tendril", + "url", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -359,6 +372,7 @@ dependencies = [ name = "crawlnicle" version = "0.1.0" dependencies = [ + "ammonia", "ansi-to-html", "anyhow", "axum", @@ -980,7 +994,21 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.10.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever 0.11.0", "proc-macro2", "quote", "syn 1.0.109", @@ -1321,6 +1349,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + 
[[package]] name = "markup5ever" version = "0.10.1" @@ -1328,8 +1362,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" dependencies = [ "log", - "phf", - "phf_codegen", + "phf 0.8.0", + "phf_codegen 0.8.0", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen 0.10.0", "string_cache", "string_cache_codegen", "tendril", @@ -1341,8 +1389,8 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b" dependencies = [ - "html5ever", - "markup5ever", + "html5ever 0.25.2", + "markup5ever 0.10.1", "tendril", "xml5ever", ] @@ -1699,6 +1747,15 @@ dependencies = [ "phf_shared 0.8.0", ] +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + [[package]] name = "phf_codegen" version = "0.8.0" @@ -1709,6 +1766,16 @@ dependencies = [ "phf_shared 0.8.0", ] +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + [[package]] name = "phf_generator" version = "0.8.0" @@ -1957,7 +2024,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e7843b159286299dd2b90f06d904ae1a8017a650d88d716c85dd6f123947f399" dependencies = [ - "html5ever", + "html5ever 0.25.2", "lazy_static", "markup5ever_rcdom", 
"regex", @@ -3430,7 +3497,7 @@ checksum = "9234163818fd8e2418fcde330655e757900d4236acd8cc70fef345ef91f6d865" dependencies = [ "log", "mac", - "markup5ever", + "markup5ever 0.10.1", "time 0.1.45", ] diff --git a/Cargo.toml b/Cargo.toml index be4811c..409add1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,3 +49,4 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } uuid = { version = "1.3", features = ["serde"] } url = "2.4" validator = { version = "0.16", features = ["derive"] } +ammonia = "3.3.0" diff --git a/src/actors/entry_crawler.rs b/src/actors/entry_crawler.rs index e23dd0f..d666a2d 100644 --- a/src/actors/entry_crawler.rs +++ b/src/actors/entry_crawler.rs @@ -2,6 +2,7 @@ use std::fmt::{self, Display, Formatter}; use std::fs; use std::path::Path; +use ammonia::clean; use bytes::Buf; use readability::extractor; use reqwest::Client; @@ -114,7 +115,9 @@ impl EntryCrawler { // .await // .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?; // }; - fs::write(content_dir.join(format!("{}.html", id)), article.content) + let content = clean(&article.content); + info!("sanitized content"); + fs::write(content_dir.join(format!("{}.html", id)), content) .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?; fs::write(content_dir.join(format!("{}.txt", id)), article.text) .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?; diff --git a/src/actors/feed_crawler.rs b/src/actors/feed_crawler.rs index face3d1..757ea75 100644 --- a/src/actors/feed_crawler.rs +++ b/src/actors/feed_crawler.rs @@ -201,12 +201,24 @@ impl FeedCrawler { .cmp(&Duration::minutes(feed.crawl_interval_minutes.into())) { Ordering::Greater => { - feed.crawl_interval_minutes = - i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES); + feed.crawl_interval_minutes = i32::min( + (feed.crawl_interval_minutes as f32 * 1.2).round() as i32, + MAX_CRAWL_INTERVAL_MINUTES, + ); + info!( + interval = feed.crawl_interval_minutes, 
"increased crawl interval" + ); } Ordering::Less => { - feed.crawl_interval_minutes = - i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES); + feed.crawl_interval_minutes = i32::max( + (feed.crawl_interval_minutes as f32 / 1.2).round() as i32, + MIN_CRAWL_INTERVAL_MINUTES, + ); + info!( + interval = feed.crawl_interval_minutes, + "decreased crawl interval" + ); } Ordering::Equal => {} } diff --git a/src/handlers/entry.rs b/src/handlers/entry.rs index 51d60e5..9028326 100644 --- a/src/handlers/entry.rs +++ b/src/handlers/entry.rs @@ -20,18 +20,18 @@ pub async fn get( let entry = Entry::get(&pool, id.as_uuid()).await?; let content_dir = std::path::Path::new(&config.content_dir); let content_path = content_dir.join(format!("{}.html", entry.entry_id)); + let title = entry.title.unwrap_or_else(|| "Untitled".to_string()); + let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true); + let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string()); Ok(layout.render(html! { article { - @let title = entry.title.unwrap_or_else(|| "Untitled".to_string()); h2 { a href=(entry.url) { (title) } } - @let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true); span class="published" { strong { "Published: " } time datetime=(published_at) data-controller="local-time" { (published_at) } } - @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string()); (PreEscaped(content)) } })) diff --git a/src/handlers/import.rs b/src/handlers/import.rs index d3c631b..0ff07ec 100644 --- a/src/handlers/import.rs +++ b/src/handlers/import.rs @@ -23,24 +23,16 @@ pub async fn opml( State(importer): State, mut multipart: Multipart, ) -> Result { - dbg!("opml handler"); - if let Some(field) = multipart.next_field().await.map_err(|err| { - dbg!(&err); - err - })? { + if let Some(field) = multipart.next_field().await? 
{ let import_id = Base62Uuid::new(); - dbg!(&import_id); let file_name = field.file_name().map(|s| s.to_string()); - dbg!(&file_name); let bytes = field.bytes().await?; - dbg!(&bytes.len()); let receiver = importer.import(import_id, file_name, bytes).await; { let mut imports = imports.lock().await; imports.insert(import_id.as_uuid(), receiver); } - let import_html_id = format!("import-{}", import_id); let import_stream = format!("/import/{}/stream", import_id); return Ok(( StatusCode::CREATED, @@ -59,7 +51,6 @@ pub async fn opml( ) .into_response()); } - dbg!("no file"); Err(Error::NoFile) }