Sanitize entry html content with ammonia
@@ -2,6 +2,7 @@ use std::fmt::{self, Display, Formatter};
 use std::fs;
 use std::path::Path;
 
+use ammonia::clean;
 use bytes::Buf;
 use readability::extractor;
 use reqwest::Client;
@@ -114,7 +115,9 @@ impl EntryCrawler {
         // .await
         // .map_err(|_| EntryCrawlerError::CreateEntryError(entry.url.clone()))?;
         // };
-        fs::write(content_dir.join(format!("{}.html", id)), article.content)
+        let content = clean(&article.content);
+        info!("sanitized content");
+        fs::write(content_dir.join(format!("{}.html", id)), content)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
         fs::write(content_dir.join(format!("{}.txt", id)), article.text)
             .map_err(|_| EntryCrawlerError::SaveContentError(entry.url.clone()))?;
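For context, `ammonia::clean` takes an HTML fragment and returns a sanitized `String`, keeping only tags and attributes on ammonia's default allowlist; `<script>` elements (including their contents) and event-handler attributes are dropped, which is what protects the crawled article content before it is written to disk. A minimal, self-contained sketch of that behavior (the input string here is a hypothetical payload, not output from the crawler):

```rust
use ammonia::clean;

fn main() {
    // Hypothetical extracted article content carrying an XSS payload.
    let dirty = r#"<p onclick="alert('xss')">Hello<script>steal()</script></p>"#;

    // With the default config, the <script> element and its body are
    // removed entirely, and non-allowlisted attributes like `onclick`
    // are stripped from otherwise-allowed tags.
    let safe = clean(dirty);
    assert_eq!(safe, "<p>Hello</p>");
}
```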
@@ -201,12 +201,24 @@ impl FeedCrawler {
             .cmp(&Duration::minutes(feed.crawl_interval_minutes.into()))
         {
             Ordering::Greater => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes * 2, MAX_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::max(
+                    (feed.crawl_interval_minutes as f32 * 1.2).round() as i32,
+                    MAX_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "increased crawl interval"
+                );
             }
             Ordering::Less => {
-                feed.crawl_interval_minutes =
-                    i32::max(feed.crawl_interval_minutes / 2, MIN_CRAWL_INTERVAL_MINUTES);
+                feed.crawl_interval_minutes = i32::max(
+                    (feed.crawl_interval_minutes as f32 / 1.2).round() as i32,
+                    MIN_CRAWL_INTERVAL_MINUTES,
+                );
+                info!(
+                    interval = feed.crawl_interval_minutes,
+                    "decreased crawl interval"
+                );
             }
             Ordering::Equal => {}
         }
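The `FeedCrawler` hunk replaces the doubling/halving backoff with a gentler 1.2x adjustment and logs each change. A self-contained sketch of the adjustment rule, under stated assumptions: the constants and the function name here are hypothetical, and this sketch clamps the result into [MIN, MAX] with `i32::clamp`, whereas the diff above applies `i32::max` against each bound separately.

```rust
use std::cmp::Ordering;

// Hypothetical bounds; the real constants live elsewhere in the crate.
const MIN_CRAWL_INTERVAL_MINUTES: i32 = 1;
const MAX_CRAWL_INTERVAL_MINUTES: i32 = 120;

/// Nudge the crawl interval toward the feed's observed update cadence:
/// grow by 20% when the feed updates slower than the current interval,
/// shrink by 20% when it updates faster, leave it alone when they match.
fn adjust_interval(current: i32, cadence_vs_interval: Ordering) -> i32 {
    let next = match cadence_vs_interval {
        Ordering::Greater => (current as f32 * 1.2).round() as i32,
        Ordering::Less => (current as f32 / 1.2).round() as i32,
        Ordering::Equal => return current,
    };
    // Keep the interval inside the configured window.
    next.clamp(MIN_CRAWL_INTERVAL_MINUTES, MAX_CRAWL_INTERVAL_MINUTES)
}

fn main() {
    assert_eq!(adjust_interval(10, Ordering::Greater), 12); // 10 * 1.2
    assert_eq!(adjust_interval(10, Ordering::Less), 8);     // 10 / 1.2, rounded
    assert_eq!(
        adjust_interval(MAX_CRAWL_INTERVAL_MINUTES, Ordering::Greater),
        MAX_CRAWL_INTERVAL_MINUTES // already at the cap
    );
}
```

The multiplicative 1.2 factor makes the interval converge on a feed's real publishing rate in smaller steps than doubling/halving, so a single slow or fast crawl no longer swings the schedule by 2x.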