Scrape additional fields to mod table

This commit is contained in:
Tyler Hallada 2022-01-17 23:37:58 -05:00
parent f23cf526e5
commit b97689b7fd
4 changed files with 147 additions and 41 deletions

View File

@ -0,0 +1,7 @@
/* Extends the "mods" table: adds five new columns and renames the existing
   "author" and "category" columns. None of the new columns carries a
   NOT NULL constraint here; the TODOs mark the ones meant to become
   NOT NULL once existing rows have been backfilled. */
ALTER TABLE "mods" ADD COLUMN "last_update_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
ALTER TABLE "mods" ADD COLUMN "first_upload_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
ALTER TABLE "mods" ADD COLUMN "thumbnail_link" VARCHAR(255);
ALTER TABLE "mods" ADD COLUMN "author_id" INTEGER; /* TODO: make NOT NULL after backfill */
ALTER TABLE "mods" ADD COLUMN "category_id" INTEGER;
/* The existing text columns get a "_name" suffix so they sit alongside the
   new "author_id" / "category_id" columns added above. */
ALTER TABLE "mods" RENAME COLUMN "author" TO "author_name";
ALTER TABLE "mods" RENAME COLUMN "category" TO "category_name";

View File

@ -277,7 +277,10 @@ pub async fn main() -> Result<()> {
.find(|processed_mod| processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id)
{
if processed_mod.last_updated_files_at
> NaiveDateTime::new(scraped_mod.last_update, NaiveTime::from_hms(0, 0, 0))
> NaiveDateTime::new(
scraped_mod.last_update_at,
NaiveTime::from_hms(0, 0, 0),
)
{
return false;
}
@ -287,10 +290,21 @@ pub async fn main() -> Result<()> {
.map(|scraped_mod| UnsavedMod {
name: scraped_mod.name,
nexus_mod_id: scraped_mod.nexus_mod_id,
author: scraped_mod.author,
category: scraped_mod.category,
author_name: scraped_mod.author_name,
author_id: Some(scraped_mod.author_id),
category_name: scraped_mod.category_name,
category_id: scraped_mod.category_id,
description: scraped_mod.desc,
thumbnail_link: scraped_mod.thumbnail_link,
game_id: game.id,
last_update_at: Some(NaiveDateTime::new(
scraped_mod.last_update_at,
NaiveTime::from_hms(0, 0, 0),
)),
first_upload_at: Some(NaiveDateTime::new(
scraped_mod.first_upload_at,
NaiveTime::from_hms(0, 0, 0),
)),
})
.collect();

View File

@ -11,12 +11,17 @@ pub struct Mod {
pub id: i32,
pub name: String,
pub nexus_mod_id: i32,
pub author: String,
pub category: Option<String>,
pub author_name: String,
pub author_id: Option<i32>,
pub category_name: Option<String>,
pub category_id: Option<i32>,
pub description: Option<String>,
pub thumbnail_link: Option<String>,
pub game_id: i32,
pub updated_at: NaiveDateTime,
pub created_at: NaiveDateTime,
pub last_update_at: Option<NaiveDateTime>,
pub first_upload_at: Option<NaiveDateTime>,
pub last_updated_files_at: Option<NaiveDateTime>,
}
@ -24,10 +29,15 @@ pub struct Mod {
pub struct UnsavedMod<'a> {
pub name: &'a str,
pub nexus_mod_id: i32,
pub author: &'a str,
pub category: Option<&'a str>,
pub author_name: &'a str,
pub author_id: Option<i32>,
pub category_name: Option<&'a str>,
pub category_id: Option<i32>,
pub description: Option<&'a str>,
pub thumbnail_link: Option<&'a str>,
pub game_id: i32,
pub last_update_at: Option<NaiveDateTime>,
pub first_upload_at: Option<NaiveDateTime>,
}
#[instrument(level = "debug", skip(pool))]
@ -77,26 +87,36 @@ pub async fn insert(
pool: &sqlx::Pool<sqlx::Postgres>,
name: &str,
nexus_mod_id: i32,
author: &str,
category: Option<&str>,
author_name: &str,
author_id: i32,
category_name: Option<&str>,
category_id: Option<i32>,
description: Option<&str>,
thumbnail_link: Option<&str>,
game_id: i32,
last_update_at: Option<NaiveDateTime>,
first_upload_at: Option<NaiveDateTime>,
) -> Result<Mod> {
sqlx::query_as!(
Mod,
"INSERT INTO mods
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, now(), now())
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, now(), now())
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
SET (name, author, category, description, updated_at) =
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now())
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
RETURNING *",
name,
nexus_mod_id,
author,
category,
author_name,
author_id,
category_name,
category_id,
description,
game_id
thumbnail_link,
game_id,
last_update_at,
first_upload_at
)
.fetch_one(pool)
.await
@ -112,36 +132,51 @@ pub async fn batched_insert<'a>(
for batch in mods.chunks(BATCH_SIZE) {
let mut names: Vec<&str> = vec![];
let mut nexus_mod_ids: Vec<i32> = vec![];
let mut authors: Vec<&str> = vec![];
let mut categories: Vec<Option<&str>> = vec![];
let mut author_names: Vec<&str> = vec![];
let mut author_ids: Vec<Option<i32>> = vec![];
let mut category_names: Vec<Option<&str>> = vec![];
let mut category_ids: Vec<Option<i32>> = vec![];
let mut descriptions: Vec<Option<&str>> = vec![];
let mut thumbnail_links: Vec<Option<&str>> = vec![];
let mut game_ids: Vec<i32> = vec![];
let mut last_update_ats: Vec<Option<NaiveDateTime>> = vec![];
let mut first_upload_ats: Vec<Option<NaiveDateTime>> = vec![];
batch.iter().for_each(|unsaved_mod| {
names.push(unsaved_mod.name);
nexus_mod_ids.push(unsaved_mod.nexus_mod_id);
authors.push(unsaved_mod.author);
categories.push(unsaved_mod.category);
author_names.push(unsaved_mod.author_name);
author_ids.push(unsaved_mod.author_id);
category_names.push(unsaved_mod.category_name);
category_ids.push(unsaved_mod.category_id);
descriptions.push(unsaved_mod.description);
thumbnail_links.push(unsaved_mod.thumbnail_link);
game_ids.push(unsaved_mod.game_id);
last_update_ats.push(unsaved_mod.last_update_at);
first_upload_ats.push(unsaved_mod.first_upload_at);
});
saved_mods.append(
// sqlx doesn't understand arrays of Options with the query_as! macro
&mut sqlx::query_as(
r#"INSERT INTO mods
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at)
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
SELECT *, now(), now()
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::text[], $5::text[], $6::int[])
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::int[], $5::text[], $6::int[], $7::text[], $8::text[], $9::int[], $10::timestamp(3)[], $11::timestamp(3)[])
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
SET (name, author, category, description, updated_at) =
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now())
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
RETURNING *"#,
)
.bind(&names)
.bind(&nexus_mod_ids)
.bind(&authors)
.bind(&categories)
.bind(&author_names)
.bind(&author_ids)
.bind(&category_names)
.bind(&category_ids)
.bind(&descriptions)
.bind(&thumbnail_links)
.bind(&game_ids)
.bind(&last_update_ats)
.bind(&first_upload_ats)
.fetch_all(pool)
.await
.context("Failed to insert mods")?,

View File

@ -13,10 +13,14 @@ pub struct ModListResponse {
pub struct ScrapedMod<'a> {
pub nexus_mod_id: i32,
pub name: &'a str,
pub category: Option<&'a str>,
pub author: &'a str,
pub category_name: Option<&'a str>,
pub category_id: Option<i32>,
pub author_name: &'a str,
pub author_id: i32,
pub desc: Option<&'a str>,
pub last_update: NaiveDate,
pub thumbnail_link: Option<&'a str>,
pub last_update_at: NaiveDate,
pub first_upload_at: NaiveDate,
}
pub struct ModListScrape<'a> {
@ -55,7 +59,12 @@ impl ModListResponse {
Selector::parse("div.category a").expect("failed to parse CSS selector");
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
let last_update_select = Selector::parse("div.date").expect("failed to parse CSS selector");
let thumbnail_select =
Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
let first_upload_date_select =
Selector::parse("time.date").expect("failed to parse CSS selector");
let last_update_date_select =
Selector::parse("div.date").expect("failed to parse CSS selector");
let next_page_select =
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
@ -90,12 +99,31 @@ impl ModListResponse {
.select(&category_select)
.next()
.expect("Missing category link for mod");
let category = category_elem.text().next();
let category_id = match category_elem.value().attr("href") {
Some(href) => Some(
href.split("/")
.nth(6)
.expect("Missing category id for mod")
.parse::<i32>()
.expect("Failed to parse category id"),
),
None => None,
};
let category_name = category_elem.text().next();
let author_elem = right
.select(&author_select)
.next()
.expect("Missing author link for mod");
let author = author_elem
let author_id = author_elem
.value()
.attr("href")
.expect("Missing author link href for mod")
.split("/")
.last()
.expect("Missing author id for mod")
.parse::<i32>()
.expect("Failed to parse author id");
let author_name = author_elem
.text()
.next()
.expect("Missing author text for mod");
@ -104,26 +132,48 @@ impl ModListResponse {
.next()
.expect("Missing desc elem for mod");
let desc = desc_elem.text().next();
let last_update_elem = right
.select(&last_update_select)
let thumbnail_elem = left
.select(&thumbnail_select)
.next()
.expect("Missing last update elem for mod");
let last_update = last_update_elem
.text()
.expect("Missing thumbnail elem for mod");
let thumbnail_link = thumbnail_elem.value().attr("src");
let first_upload_date_text = right
.select(&first_upload_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let first_upload_at = first_upload_date_text
.skip(2)
.next()
.expect("Missing last update text for mod")
.trim();
dbg!(&first_upload_at);
let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
.expect("Cannot parse first upload date");
let last_update_date_text = right
.select(&last_update_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let last_update_at = last_update_date_text
.skip(1)
.next()
.expect("Missing last update text for mod")
.trim();
let last_update = NaiveDate::parse_from_str(last_update, "%d %b %Y")
let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
.expect("Cannot parse last update date");
ScrapedMod {
nexus_mod_id,
name,
category,
author,
category_name,
category_id,
author_name,
author_id,
desc,
last_update,
thumbnail_link,
last_update_at,
first_upload_at,
}
})
.collect();