Sanitize entry HTML content with ammonia

2023-08-29 23:04:35 -04:00
parent 2f39be4152
commit ceac234ce7
6 changed files with 99 additions and 25 deletions


@@ -2,6 +2,7 @@ use std::fmt::{self, Display, Formatter};
 use std::fs;
 use std::path::Path;
+use ammonia::clean;
 use bytes::Buf;
 use readability::extractor;
 use reqwest::Client;
@@ -114,7 +115,9 @@ impl EntryCrawler {
         // .await
         // .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?;
         // };
-        fs::write(content_dir.join(format!("{}.html", id)), article.content)
+        let content = clean(&article.content);
+        info!("sanitized content");
+        fs::write(content_dir.join(format!("{}.html", id)), content)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
         fs::write(content_dir.join(format!("{}.txt", id)), article.text)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
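
For context, `ammonia::clean` sanitizes HTML against a conservative default allowlist, dropping `<script>`/`<style>` elements (including their contents) and event-handler attributes while keeping ordinary formatting tags. A minimal standalone sketch of the call this hunk introduces; the input fragment is illustrative:

use ammonia::clean;

fn main() {
    // Hypothetical crawled fragment with an injected script.
    let dirty = r#"<p>Hello <script>alert('xss')</script>world!</p>"#;
    let safe = clean(dirty);
    // The script element and its body are removed; benign markup survives.
    println!("{safe}"); // expected: <p>Hello world!</p>
}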


@@ -201,12 +201,24 @@ impl FeedCrawler {
             .cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
         {
             Ordering::Greater => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::max(
+                    (feed.crawl_interval_minutes as f32 * 1.2).round() as i32,
+                    MAX_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "increased crawl interval"
+                );
             }
             Ordering::Less => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::max(
+                    (feed.crawl_interval_minutes as f32 / 1.2).round() as i32,
+                    MIN_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "decreased crawl interval"
+                );
             }
             Ordering::Equal => {}
         }
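
This hunk changes the crawl-interval adjustment from doubling/halving to a gentler multiplicative backoff: scale by 1.2 when the feed is changing more slowly than the current interval, by 1/1.2 when it is changing faster, bounded by the MIN/MAX constants. A self-contained sketch of the same arithmetic, with illustrative bound values and clamp-style bounding of this sketch's own choosing:

// Illustrative values; the real constants live in the crawler module.
const MIN_CRAWL_INTERVAL_MINUTES: i32 = 1;
const MAX_CRAWL_INTERVAL_MINUTES: i32 = 1440;

/// Scale the interval by 1.2 in either direction and keep it in range.
fn adjust_interval(minutes: i32, grow: bool) -> i32 {
    let factor: f32 = if grow { 1.2 } else { 1.0 / 1.2 };
    ((minutes as f32 * factor).round() as i32)
        .clamp(MIN_CRAWL_INTERVAL_MINUTES, MAX_CRAWL_INTERVAL_MINUTES)
}

fn main() {
    // 60 minutes grows to 72 and shrinks to 50.
    println!("{} {}", adjust_interval(60, true), adjust_interval(60, false));
}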