Scrape additional fields to mod table

This commit is contained in:
Tyler Hallada 2022-01-17 23:37:58 -05:00
parent f23cf526e5
commit b97689b7fd
4 changed files with 147 additions and 41 deletions

View File

@ -0,0 +1,7 @@
/* Migration: extend "mods" with additional per-mod fields and rename the
   existing author/category text columns so they sit alongside the new
   numeric id columns. */

/* The new timestamp columns carry no NOT NULL constraint yet; the TODOs below
   track tightening them once existing rows have been backfilled. */
ALTER TABLE "mods" ADD COLUMN "last_update_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
ALTER TABLE "mods" ADD COLUMN "first_upload_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
/* NOTE(review): VARCHAR(255) caps the stored link length; presumably an insert
   with a longer URL is rejected rather than truncated - confirm 255 is enough. */
ALTER TABLE "mods" ADD COLUMN "thumbnail_link" VARCHAR(255);
ALTER TABLE "mods" ADD COLUMN "author_id" INTEGER; /* TODO: make NOT NULL after backfill */
/* No backfill TODO here: category_id is left nullable. */
ALTER TABLE "mods" ADD COLUMN "category_id" INTEGER;
/* Rename the existing text columns to *_name to distinguish them from the new
   *_id columns. Any query that still references "author" or "category" must be
   updated together with this migration. */
ALTER TABLE "mods" RENAME COLUMN "author" TO "author_name";
ALTER TABLE "mods" RENAME COLUMN "category" TO "category_name";

View File

@ -277,7 +277,10 @@ pub async fn main() -> Result<()> {
.find(|processed_mod| processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id) .find(|processed_mod| processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id)
{ {
if processed_mod.last_updated_files_at if processed_mod.last_updated_files_at
> NaiveDateTime::new(scraped_mod.last_update, NaiveTime::from_hms(0, 0, 0)) > NaiveDateTime::new(
scraped_mod.last_update_at,
NaiveTime::from_hms(0, 0, 0),
)
{ {
return false; return false;
} }
@ -287,10 +290,21 @@ pub async fn main() -> Result<()> {
.map(|scraped_mod| UnsavedMod { .map(|scraped_mod| UnsavedMod {
name: scraped_mod.name, name: scraped_mod.name,
nexus_mod_id: scraped_mod.nexus_mod_id, nexus_mod_id: scraped_mod.nexus_mod_id,
author: scraped_mod.author, author_name: scraped_mod.author_name,
category: scraped_mod.category, author_id: Some(scraped_mod.author_id),
category_name: scraped_mod.category_name,
category_id: scraped_mod.category_id,
description: scraped_mod.desc, description: scraped_mod.desc,
thumbnail_link: scraped_mod.thumbnail_link,
game_id: game.id, game_id: game.id,
last_update_at: Some(NaiveDateTime::new(
scraped_mod.last_update_at,
NaiveTime::from_hms(0, 0, 0),
)),
first_upload_at: Some(NaiveDateTime::new(
scraped_mod.first_upload_at,
NaiveTime::from_hms(0, 0, 0),
)),
}) })
.collect(); .collect();

View File

@ -11,12 +11,17 @@ pub struct Mod {
pub id: i32, pub id: i32,
pub name: String, pub name: String,
pub nexus_mod_id: i32, pub nexus_mod_id: i32,
pub author: String, pub author_name: String,
pub category: Option<String>, pub author_id: Option<i32>,
pub category_name: Option<String>,
pub category_id: Option<i32>,
pub description: Option<String>, pub description: Option<String>,
pub thumbnail_link: Option<String>,
pub game_id: i32, pub game_id: i32,
pub updated_at: NaiveDateTime, pub updated_at: NaiveDateTime,
pub created_at: NaiveDateTime, pub created_at: NaiveDateTime,
pub last_update_at: Option<NaiveDateTime>,
pub first_upload_at: Option<NaiveDateTime>,
pub last_updated_files_at: Option<NaiveDateTime>, pub last_updated_files_at: Option<NaiveDateTime>,
} }
@ -24,10 +29,15 @@ pub struct Mod {
pub struct UnsavedMod<'a> { pub struct UnsavedMod<'a> {
pub name: &'a str, pub name: &'a str,
pub nexus_mod_id: i32, pub nexus_mod_id: i32,
pub author: &'a str, pub author_name: &'a str,
pub category: Option<&'a str>, pub author_id: Option<i32>,
pub category_name: Option<&'a str>,
pub category_id: Option<i32>,
pub description: Option<&'a str>, pub description: Option<&'a str>,
pub thumbnail_link: Option<&'a str>,
pub game_id: i32, pub game_id: i32,
pub last_update_at: Option<NaiveDateTime>,
pub first_upload_at: Option<NaiveDateTime>,
} }
#[instrument(level = "debug", skip(pool))] #[instrument(level = "debug", skip(pool))]
@ -77,26 +87,36 @@ pub async fn insert(
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
name: &str, name: &str,
nexus_mod_id: i32, nexus_mod_id: i32,
author: &str, author_name: &str,
category: Option<&str>, author_id: i32,
category_name: Option<&str>,
category_id: Option<i32>,
description: Option<&str>, description: Option<&str>,
thumbnail_link: Option<&str>,
game_id: i32, game_id: i32,
last_update_at: Option<NaiveDateTime>,
first_upload_at: Option<NaiveDateTime>,
) -> Result<Mod> { ) -> Result<Mod> {
sqlx::query_as!( sqlx::query_as!(
Mod, Mod,
"INSERT INTO mods "INSERT INTO mods
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at) (name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, now(), now()) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, now(), now())
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
SET (name, author, category, description, updated_at) = SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now()) (EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
RETURNING *", RETURNING *",
name, name,
nexus_mod_id, nexus_mod_id,
author, author_name,
category, author_id,
category_name,
category_id,
description, description,
game_id thumbnail_link,
game_id,
last_update_at,
first_upload_at
) )
.fetch_one(pool) .fetch_one(pool)
.await .await
@ -112,36 +132,51 @@ pub async fn batched_insert<'a>(
for batch in mods.chunks(BATCH_SIZE) { for batch in mods.chunks(BATCH_SIZE) {
let mut names: Vec<&str> = vec![]; let mut names: Vec<&str> = vec![];
let mut nexus_mod_ids: Vec<i32> = vec![]; let mut nexus_mod_ids: Vec<i32> = vec![];
let mut authors: Vec<&str> = vec![]; let mut author_names: Vec<&str> = vec![];
let mut categories: Vec<Option<&str>> = vec![]; let mut author_ids: Vec<Option<i32>> = vec![];
let mut category_names: Vec<Option<&str>> = vec![];
let mut category_ids: Vec<Option<i32>> = vec![];
let mut descriptions: Vec<Option<&str>> = vec![]; let mut descriptions: Vec<Option<&str>> = vec![];
let mut thumbnail_links: Vec<Option<&str>> = vec![];
let mut game_ids: Vec<i32> = vec![]; let mut game_ids: Vec<i32> = vec![];
let mut last_update_ats: Vec<Option<NaiveDateTime>> = vec![];
let mut first_upload_ats: Vec<Option<NaiveDateTime>> = vec![];
batch.iter().for_each(|unsaved_mod| { batch.iter().for_each(|unsaved_mod| {
names.push(unsaved_mod.name); names.push(unsaved_mod.name);
nexus_mod_ids.push(unsaved_mod.nexus_mod_id); nexus_mod_ids.push(unsaved_mod.nexus_mod_id);
authors.push(unsaved_mod.author); author_names.push(unsaved_mod.author_name);
categories.push(unsaved_mod.category); author_ids.push(unsaved_mod.author_id);
category_names.push(unsaved_mod.category_name);
category_ids.push(unsaved_mod.category_id);
descriptions.push(unsaved_mod.description); descriptions.push(unsaved_mod.description);
thumbnail_links.push(unsaved_mod.thumbnail_link);
game_ids.push(unsaved_mod.game_id); game_ids.push(unsaved_mod.game_id);
last_update_ats.push(unsaved_mod.last_update_at);
first_upload_ats.push(unsaved_mod.first_upload_at);
}); });
saved_mods.append( saved_mods.append(
// sqlx doesn't understand arrays of Options with the query_as! macro // sqlx doesn't understand arrays of Options with the query_as! macro
&mut sqlx::query_as( &mut sqlx::query_as(
r#"INSERT INTO mods r#"INSERT INTO mods
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at) (name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
SELECT *, now(), now() SELECT *, now(), now()
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::text[], $5::text[], $6::int[]) FROM UNNEST($1::text[], $2::int[], $3::text[], $4::int[], $5::text[], $6::int[], $7::text[], $8::text[], $9::int[], $10::timestamp(3)[], $11::timestamp(3)[])
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
SET (name, author, category, description, updated_at) = SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now()) (EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
RETURNING *"#, RETURNING *"#,
) )
.bind(&names) .bind(&names)
.bind(&nexus_mod_ids) .bind(&nexus_mod_ids)
.bind(&authors) .bind(&author_names)
.bind(&categories) .bind(&author_ids)
.bind(&category_names)
.bind(&category_ids)
.bind(&descriptions) .bind(&descriptions)
.bind(&thumbnail_links)
.bind(&game_ids) .bind(&game_ids)
.bind(&last_update_ats)
.bind(&first_upload_ats)
.fetch_all(pool) .fetch_all(pool)
.await .await
.context("Failed to insert mods")?, .context("Failed to insert mods")?,

View File

@ -13,10 +13,14 @@ pub struct ModListResponse {
pub struct ScrapedMod<'a> { pub struct ScrapedMod<'a> {
pub nexus_mod_id: i32, pub nexus_mod_id: i32,
pub name: &'a str, pub name: &'a str,
pub category: Option<&'a str>, pub category_name: Option<&'a str>,
pub author: &'a str, pub category_id: Option<i32>,
pub author_name: &'a str,
pub author_id: i32,
pub desc: Option<&'a str>, pub desc: Option<&'a str>,
pub last_update: NaiveDate, pub thumbnail_link: Option<&'a str>,
pub last_update_at: NaiveDate,
pub first_upload_at: NaiveDate,
} }
pub struct ModListScrape<'a> { pub struct ModListScrape<'a> {
@ -55,7 +59,12 @@ impl ModListResponse {
Selector::parse("div.category a").expect("failed to parse CSS selector"); Selector::parse("div.category a").expect("failed to parse CSS selector");
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector"); let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector"); let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
let last_update_select = Selector::parse("div.date").expect("failed to parse CSS selector"); let thumbnail_select =
Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
let first_upload_date_select =
Selector::parse("time.date").expect("failed to parse CSS selector");
let last_update_date_select =
Selector::parse("div.date").expect("failed to parse CSS selector");
let next_page_select = let next_page_select =
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector"); Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
@ -90,12 +99,31 @@ impl ModListResponse {
.select(&category_select) .select(&category_select)
.next() .next()
.expect("Missing category link for mod"); .expect("Missing category link for mod");
let category = category_elem.text().next(); let category_id = match category_elem.value().attr("href") {
Some(href) => Some(
href.split("/")
.nth(6)
.expect("Missing category id for mod")
.parse::<i32>()
.expect("Failed to parse category id"),
),
None => None,
};
let category_name = category_elem.text().next();
let author_elem = right let author_elem = right
.select(&author_select) .select(&author_select)
.next() .next()
.expect("Missing author link for mod"); .expect("Missing author link for mod");
let author = author_elem let author_id = author_elem
.value()
.attr("href")
.expect("Missing author link href for mod")
.split("/")
.last()
.expect("Missing author id for mod")
.parse::<i32>()
.expect("Failed to parse author id");
let author_name = author_elem
.text() .text()
.next() .next()
.expect("Missing author text for mod"); .expect("Missing author text for mod");
@ -104,26 +132,48 @@ impl ModListResponse {
.next() .next()
.expect("Missing desc elem for mod"); .expect("Missing desc elem for mod");
let desc = desc_elem.text().next(); let desc = desc_elem.text().next();
let last_update_elem = right let thumbnail_elem = left
.select(&last_update_select) .select(&thumbnail_select)
.next() .next()
.expect("Missing last update elem for mod"); .expect("Missing thumbnail elem for mod");
let last_update = last_update_elem let thumbnail_link = thumbnail_elem.value().attr("src");
.text() let first_upload_date_text = right
.select(&first_upload_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let first_upload_at = first_upload_date_text
.skip(2)
.next()
.expect("Missing last update text for mod")
.trim();
dbg!(&first_upload_at);
let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
.expect("Cannot parse first upload date");
let last_update_date_text = right
.select(&last_update_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let last_update_at = last_update_date_text
.skip(1) .skip(1)
.next() .next()
.expect("Missing last update text for mod") .expect("Missing last update text for mod")
.trim(); .trim();
let last_update = NaiveDate::parse_from_str(last_update, "%d %b %Y") let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
.expect("Cannot parse last update date"); .expect("Cannot parse last update date");
ScrapedMod { ScrapedMod {
nexus_mod_id, nexus_mod_id,
name, name,
category, category_name,
author, category_id,
author_name,
author_id,
desc, desc,
last_update, thumbnail_link,
last_update_at,
first_upload_at,
} }
}) })
.collect(); .collect();