Fetch and save entry HTML content with metadata

And render the extracted HTML on the entry page in the frontend.
2023-06-07 01:06:03 -04:00
parent 786f3a194f
commit 3f29138bd1
7 changed files with 516 additions and 12 deletions
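
The hunks below construct CreateEntry values with a new html_content field. As a reading aid, here is a plausible shape for that struct, inferred only from the fields used in this diff; the actual definition lives in crate::models::entry and is not shown in this commit:

// Hypothetical sketch inferred from usage in the diff below; the real
// definition in crate::models::entry may use different types or derives.
pub struct CreateEntry {
    pub title: Option<String>,        // entry.title.map(|t| t.content)
    pub url: String,                  // link.href.clone()
    pub description: Option<String>,  // entry.summary.map(|s| s.content)
    pub html_content: Option<String>, // article.get_content(), new in this commit
    pub feed_id: i32,                 // feed.id; the actual ID type is not shown
}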

@@ -1,5 +1,6 @@
+use article_scraper::ArticleScraper;
 use feed_rs::parser;
-use reqwest::Client;
+use reqwest::{Client, Url};
 use sqlx::PgPool;
 use tracing::{info, warn};
@@ -9,18 +10,23 @@ use crate::models::entry::{upsert_entries, CreateEntry};
 /// For every feed in the database, fetches the feed, parses it, and saves new entries to the
 /// database.
 pub async fn crawl(pool: &PgPool) -> anyhow::Result<()> {
+    let scraper = ArticleScraper::new(None).await;
     let client = Client::new();
     let feeds = get_feeds(pool).await?;
     for feed in feeds {
         info!("Fetching feed {}: {}", feed.id, feed.url);
         let bytes = client.get(feed.url).send().await?.bytes().await?;
         let parsed_feed = parser::parse(&bytes[..])?;
         let mut payload = Vec::with_capacity(parsed_feed.entries.len());
         for entry in parsed_feed.entries {
             if let Some(link) = entry.links.get(0) {
+                info!("Fetching entry article: {}", link.href);
+                let article = scraper.parse(&Url::parse(&link.href)?, true, &client, None).await?;
                 let entry = CreateEntry {
                     title: entry.title.map(|t| t.content),
                     url: link.href.clone(),
                     description: entry.summary.map(|s| s.content),
+                    html_content: article.get_content(),
                     feed_id: feed.id,
                 };
                 payload.push(entry);
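
A minimal sketch of how this crawl job could be driven, assuming a Tokio runtime and a Postgres connection string; none of this harness is part of the commit, which only touches the crawl function itself:

// Hypothetical harness; the module path of crawl, the pool size, and the
// connection string are assumptions for illustration only.
use sqlx::postgres::PgPoolOptions;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let pool = PgPoolOptions::new()
        .max_connections(5)
        .connect("postgres://localhost/feeds") // assumed database URL
        .await?;
    crawl(&pool).await // the function updated in this diff
}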