Sanitize entry HTML content with ammonia

2023-08-29 23:04:35 -04:00
parent 2f39be4152
commit ceac234ce7
6 changed files with 99 additions and 25 deletions


@@ -2,6 +2,7 @@ use std::fmt::{self, Display, Formatter};
 use std::fs;
 use std::path::Path;
+use ammonia::clean;
 use bytes::Buf;
 use readability::extractor;
 use reqwest::Client;
@@ -114,7 +115,9 @@ impl EntryCrawler {
         // .await
         // .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?;
         // };
-        fs::write(content_dir.join(format!("{}.html", id)), article.content)
+        let content = clean(&article.content);
+        info!("sanitized content");
+        fs::write(content_dir.join(format!("{}.html", id)), content)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
         fs::write(content_dir.join(format!("{}.txt", id)), article.text)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;

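The crawler now passes extracted article HTML through ammonia::clean before writing it to disk. A minimal standalone sketch of what the default allowlist does; the input string here is hypothetical:

    use ammonia::clean;

    fn main() {
        // With ammonia's defaults, <script> elements are removed along with
        // their contents, and non-allowlisted attributes such as event
        // handlers are stripped from the tags that remain.
        let dirty = r#"<p onmouseover="steal()">Hello</p><script>alert('xss')</script>"#;
        let safe = clean(dirty);
        println!("{}", safe); // prints: <p>Hello</p>
    }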

@@ -201,12 +201,24 @@ impl FeedCrawler {
             .cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
         {
             Ordering::Greater => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::min(
+                    (feed.crawl_interval_minutes as f32 * 1.2).round() as i32,
+                    MAX_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "increased crawl interval"
+                );
             }
             Ordering::Less => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::max(
+                    (feed.crawl_interval_minutes as f32 / 1.2).round() as i32,
+                    MIN_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "decreased crawl interval"
+                );
             }
             Ordering::Equal => {}
         }

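The crawler now backs the interval off by 20% per idle crawl instead of doubling it, and tightens it by the same factor when new entries show up. A standalone sketch of that policy, simplified to a boolean input and using i32::clamp for both bounds; the constant values here are illustrative, not taken from the crate:

    const MIN_CRAWL_INTERVAL_MINUTES: i32 = 15;
    const MAX_CRAWL_INTERVAL_MINUTES: i32 = 10080; // illustrative: one week

    // Multiplicative increase/decrease: grow the interval 20% when a crawl
    // finds nothing new, shrink it 20% when it does, clamped to the range.
    fn next_interval(current: i32, found_new_entries: bool) -> i32 {
        let factor: f32 = if found_new_entries { 1.0 / 1.2 } else { 1.2 };
        ((current as f32 * factor).round() as i32)
            .clamp(MIN_CRAWL_INTERVAL_MINUTES, MAX_CRAWL_INTERVAL_MINUTES)
    }

    fn main() {
        assert_eq!(next_interval(100, false), 120); // idle crawl: back off
        assert_eq!(next_interval(100, true), 83);   // new entries: poll sooner
    }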

@@ -20,18 +20,18 @@ pub async fn get(
     let entry = Entry::get(&pool, id.as_uuid()).await?;
     let content_dir = std::path::Path::new(&config.content_dir);
     let content_path = content_dir.join(format!("{}.html", entry.entry_id));
-    let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
-    let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
-    let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
     Ok(layout.render(html! {
         article {
+            @let title = entry.title.unwrap_or_else(|| "Untitled".to_string());
             h2 { a href=(entry.url) { (title) } }
+            @let published_at = entry.published_at.to_rfc3339_opts(chrono::SecondsFormat::Millis, true);
             span class="published" {
                 strong { "Published: " }
                 time datetime=(published_at) data-controller="local-time" {
                     (published_at)
                 }
             }
+            @let content = fs::read_to_string(content_path).unwrap_or_else(|_| "No content".to_string());
             (PreEscaped(content))
         }
     }))

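The handler moves its bindings into the template body with maud's @let, which scopes each value next to the markup that uses it. A small sketch of @let and PreEscaped; maud escapes ordinary (value) interpolations by default, while PreEscaped opts out, which is only safe here because the content was sanitized at crawl time:

    use maud::{html, PreEscaped};

    fn main() {
        let markup = html! {
            article {
                // @let binds a value inside the template scope.
                @let title = "Hello & welcome";
                h2 { (title) } // escaped: renders "Hello &amp; welcome"
                // PreEscaped skips escaping for already-sanitized HTML.
                (PreEscaped("<p>trusted markup</p>"))
            }
        };
        println!("{}", markup.into_string());
    }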

@@ -23,24 +23,16 @@ pub async fn opml(
     State(importer): State<ImporterHandle>,
     mut multipart: Multipart,
 ) -> Result<Response> {
-    dbg!("opml handler");
-    if let Some(field) = multipart.next_field().await.map_err(|err| {
-        dbg!(&err);
-        err
-    })? {
+    if let Some(field) = multipart.next_field().await? {
         let import_id = Base62Uuid::new();
-        dbg!(&import_id);
         let file_name = field.file_name().map(|s| s.to_string());
-        dbg!(&file_name);
         let bytes = field.bytes().await?;
-        dbg!(&bytes.len());
         let receiver = importer.import(import_id, file_name, bytes).await;
         {
             let mut imports = imports.lock().await;
             imports.insert(import_id.as_uuid(), receiver);
         }
         let import_html_id = format!("import-{}", import_id);
         let import_stream = format!("/import/{}/stream", import_id);
         return Ok((
             StatusCode::CREATED,
@@ -59,7 +51,6 @@ pub async fn opml(
         )
         .into_response());
     }
-    dbg!("no file");
     Err(Error::NoFile)
 }