Sanitize entry html content with ammonia
This commit is contained in:
parent
2f39be4152
commit
ceac234ce7
81
Cargo.lock
generated
81
Cargo.lock
generated
@ -34,6 +34,19 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ammonia"
|
||||||
|
version = "3.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "64e6d1c7838db705c9b756557ee27c384ce695a1c51a6fe528784cb1c6840170"
|
||||||
|
dependencies = [
|
||||||
|
"html5ever 0.26.0",
|
||||||
|
"maplit",
|
||||||
|
"once_cell",
|
||||||
|
"tendril",
|
||||||
|
"url",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "android_system_properties"
|
name = "android_system_properties"
|
||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
@ -359,6 +372,7 @@ dependencies = [
|
|||||||
name = "crawlnicle"
|
name = "crawlnicle"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"ammonia",
|
||||||
"ansi-to-html",
|
"ansi-to-html",
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"axum",
|
"axum",
|
||||||
@ -980,7 +994,21 @@ checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"log",
|
"log",
|
||||||
"mac",
|
"mac",
|
||||||
"markup5ever",
|
"markup5ever 0.10.1",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 1.0.109",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "html5ever"
|
||||||
|
version = "0.26.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"mac",
|
||||||
|
"markup5ever 0.11.0",
|
||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
"quote",
|
"quote",
|
||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
@ -1321,6 +1349,12 @@ version = "0.1.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "maplit"
|
||||||
|
version = "1.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "markup5ever"
|
name = "markup5ever"
|
||||||
version = "0.10.1"
|
version = "0.10.1"
|
||||||
@ -1328,8 +1362,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
|
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"log",
|
"log",
|
||||||
"phf",
|
"phf 0.8.0",
|
||||||
"phf_codegen",
|
"phf_codegen 0.8.0",
|
||||||
|
"string_cache",
|
||||||
|
"string_cache_codegen",
|
||||||
|
"tendril",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "markup5ever"
|
||||||
|
version = "0.11.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"phf 0.10.1",
|
||||||
|
"phf_codegen 0.10.0",
|
||||||
"string_cache",
|
"string_cache",
|
||||||
"string_cache_codegen",
|
"string_cache_codegen",
|
||||||
"tendril",
|
"tendril",
|
||||||
@ -1341,8 +1389,8 @@ version = "0.1.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
|
checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"html5ever",
|
"html5ever 0.25.2",
|
||||||
"markup5ever",
|
"markup5ever 0.10.1",
|
||||||
"tendril",
|
"tendril",
|
||||||
"xml5ever",
|
"xml5ever",
|
||||||
]
|
]
|
||||||
@ -1699,6 +1747,15 @@ dependencies = [
|
|||||||
"phf_shared 0.8.0",
|
"phf_shared 0.8.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf"
|
||||||
|
version = "0.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||||
|
dependencies = [
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf_codegen"
|
name = "phf_codegen"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
@ -1709,6 +1766,16 @@ dependencies = [
|
|||||||
"phf_shared 0.8.0",
|
"phf_shared 0.8.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "phf_codegen"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||||
|
dependencies = [
|
||||||
|
"phf_generator 0.10.0",
|
||||||
|
"phf_shared 0.10.0",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "phf_generator"
|
name = "phf_generator"
|
||||||
version = "0.8.0"
|
version = "0.8.0"
|
||||||
@ -1957,7 +2024,7 @@ version = "0.2.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e7843b159286299dd2b90f06d904ae1a8017a650d88d716c85dd6f123947f399"
|
checksum = "e7843b159286299dd2b90f06d904ae1a8017a650d88d716c85dd6f123947f399"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"html5ever",
|
"html5ever 0.25.2",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"markup5ever_rcdom",
|
"markup5ever_rcdom",
|
||||||
"regex",
|
"regex",
|
||||||
@ -3430,7 +3497,7 @@ checksum = "9234163818fd8e2418fcde330655e757900d4236acd8cc70fef345ef91f6d865"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"log",
|
"log",
|
||||||
"mac",
|
"mac",
|
||||||
"markup5ever",
|
"markup5ever 0.10.1",
|
||||||
"time 0.1.45",
|
"time 0.1.45",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -49,3 +49,4 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|||||||
uuid = { version = "1.3", features = ["serde"] }
|
uuid = { version = "1.3", features = ["serde"] }
|
||||||
url = "2.4"
|
url = "2.4"
|
||||||
validator = { version = "0.16", features = ["derive"] }
|
validator = { version = "0.16", features = ["derive"] }
|
||||||
|
ammonia = "3.3.0"
|
||||||
|
@ -2,6 +2,7 @@ use std::fmt::{self, Display, Formatter};
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
use ammonia::clean;
|
||||||
use bytes::Buf;
|
use bytes::Buf;
|
||||||
use readability::extractor;
|
use readability::extractor;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
@ -114,7 +115,9 @@ impl EntryCrawler {
|
|||||||
// .await
|
// .await
|
||||||
// .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?;
|
// .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?;
|
||||||
// };
|
// };
|
||||||
fs::write(content_dir.join(format!("{}.html", id)), article.content)
|
let content = clean(&article.content);
|
||||||
|
info!("sanitized content");
|
||||||
|
fs::write(content_dir.join(format!("{}.html", id)), content)
|
||||||
.map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
|
.map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
|
||||||
fs::write(content_dir.join(format!("{}.txt", id)), article.text)
|
fs::write(content_dir.join(format!("{}.txt", id)), article.text)
|
||||||
.map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
|
.map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
|
||||||
|
@ -201,12 +201,24 @@ impl FeedCrawler {
|
|||||||
.cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
|
.cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
|
||||||
{
|
{
|
||||||
Ordering::Greater => {
|
Ordering::Greater => {
|
||||||
feed.crawl_interval_minutes =
|
feed.crawl_interval_minutes = i32::max(
|
||||||
i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
|
(feed.crawl_interval_minutes as f32 * 1.2).round() as i32,
|
||||||
|
MAX_CRAWL_INTERVAL_MINUTES,
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
interval = feed.crawl_interval_minutes,
|
||||||
|
"increased crawl interval"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Ordering::Less => {
|
Ordering::Less => {
|
||||||
feed.crawl_interval_minutes =
|
feed.crawl_interval_minutes = i32::max(
|
||||||
i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
|
(feed.crawl_interval_minutes as f32 / 1.2).round() as i32,
|
||||||
|
MIN_CRAWL_INTERVAL_MINUTES,
|
||||||
|
);
|
||||||
|
info!(
|
||||||
|
interval = feed.crawl_interval_minutes,
|
||||||
|
"decreased crawl interval"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
Ordering::Equal => {}
|
Ordering::Equal => {}
|
||||||
}
|
}
|
||||||
|
@ -20,18 +20,18 @@ pub async fn get(
|
|||||||
let entry = Entry::get(&pool, id.as_uuid()).await?;
|
let entry = Entry::get(&pool, id.as_uuid()).await?;
|
||||||
let content_dir = std::path::Path::new(&config.content_dir);
|
let content_dir = std::path::Path::new(&config.content_dir);
|
||||||
let content_path = content_dir.join(format!("{}.html", entry.entry_id));
|
let content_path = content_dir.join(format!("{}.html", entry.entry_id));
|
||||||
|
let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
|
||||||
|
let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
|
||||||
|
let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
|
||||||
Ok(layout.render(html! {
|
Ok(layout.render(html! {
|
||||||
article {
|
article {
|
||||||
@let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
|
|
||||||
h2 { a href=(entry.url) { (title) } }
|
h2 { a href=(entry.url) { (title) } }
|
||||||
@let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
|
|
||||||
span class="published" {
|
span class="published" {
|
||||||
strong { "Published: " }
|
strong { "Published: " }
|
||||||
time datetime=(published_at) data-controller="local-time" {
|
time datetime=(published_at) data-controller="local-time" {
|
||||||
(published_at)
|
(published_at)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
|
|
||||||
(PreEscaped(content))
|
(PreEscaped(content))
|
||||||
}
|
}
|
||||||
}))
|
}))
|
||||||
|
@ -23,24 +23,16 @@ pub async fn opml(
|
|||||||
State(importer): State<ImporterHandle>,
|
State(importer): State<ImporterHandle>,
|
||||||
mut multipart: Multipart,
|
mut multipart: Multipart,
|
||||||
) -> Result<Response> {
|
) -> Result<Response> {
|
||||||
dbg!("opml handler");
|
if let Some(field) = multipart.next_field().await? {
|
||||||
if let Some(field) = multipart.next_field().await.map_err(|err| {
|
|
||||||
dbg!(&err);
|
|
||||||
err
|
|
||||||
})? {
|
|
||||||
let import_id = Base62Uuid::new();
|
let import_id = Base62Uuid::new();
|
||||||
dbg!(&import_id);
|
|
||||||
let file_name = field.file_name().map(|s| s.to_string());
|
let file_name = field.file_name().map(|s| s.to_string());
|
||||||
dbg!(&file_name);
|
|
||||||
let bytes = field.bytes().await?;
|
let bytes = field.bytes().await?;
|
||||||
dbg!(&bytes.len());
|
|
||||||
let receiver = importer.import(import_id, file_name, bytes).await;
|
let receiver = importer.import(import_id, file_name, bytes).await;
|
||||||
{
|
{
|
||||||
let mut imports = imports.lock().await;
|
let mut imports = imports.lock().await;
|
||||||
imports.insert(import_id.as_uuid(), receiver);
|
imports.insert(import_id.as_uuid(), receiver);
|
||||||
}
|
}
|
||||||
|
|
||||||
let import_html_id = format!("import-{}", import_id);
|
|
||||||
let import_stream = format!("/import/{}/stream", import_id);
|
let import_stream = format!("/import/{}/stream", import_id);
|
||||||
return Ok((
|
return Ok((
|
||||||
StatusCode::CREATED,
|
StatusCode::CREATED,
|
||||||
@ -59,7 +51,6 @@ pub async fn opml(
|
|||||||
)
|
)
|
||||||
.into_response());
|
.into_response());
|
||||||
}
|
}
|
||||||
dbg!("no file");
|
|
||||||
Err(Error::NoFile)
|
Err(Error::NoFile)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user