Files
crawlnicle/src/jobs/crawl.rs
2023-06-09 01:06:19 -04:00

60 lines
2.6 KiB
Rust

use article_scraper::ArticleScraper;
use chrono::Utc;
use feed_rs::parser;
use reqwest::{Client, Url};
use sqlx::PgPool;
use tracing::{info, info_span, warn};
use crate::models::feed::get_feeds;
use crate::models::entry::{upsert_entries, CreateEntry};
/// For every feed in the database, fetches the feed, parses it, and saves new entries to the
/// database.
pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
let scraper = ArticleScraper::new(None).await;
let client = Client::new();
let feeds = get_feeds(pool).await?;
for feed in feeds {
let feed_span = info_span!("feed", id = feed.id, url = feed.url.as_str());
let _feed_span_guard = feed_span.enter();
info!("Fetching feed");
// TODO: handle these results
let bytes = client.get(feed.url).send().await?.bytes().await?;
info!("Parsing feed");
let parsed_feed = parser::parse(&bytes[..])?;
let mut payload = Vec::with_capacity(parsed_feed.entries.len());
for entry in parsed_feed.entries {
let entry_span = info_span!("entry", id = entry.id, title = entry.title.clone().map(|t| t.content));
let _entry_span_guard = entry_span.enter();
if let Some(link) = entry.links.get(0) {
// if no scraped or feed date is available, fallback to the current time
let published_at = entry.published.unwrap_or_else(Utc::now).naive_utc();
let mut entry = CreateEntry {
title: entry.title.map(|t| t.content),
url: link.href.clone(),
description: entry.summary.map(|s| s.content),
html_content: None,
feed_id: feed.id,
published_at,
};
info!("Fetching and parsing entry link: {}", link.href);
if let Ok(article) = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await {
if let Some(date) = article.date {
// prefer scraped date over rss feed date
entry.published_at = date.naive_utc()
};
entry.html_content = article.get_content();
} else {
warn!("Failed to fetch article for entry: {:?}", link);
}
payload.push(entry);
} else {
warn!("Skipping feed entry with no links");
}
}
let entries = upsert_entries(pool, payload).await?;
info!("Created {} entries", entries.len());
}
Ok(())
}