Scrape additional fields to mod table
This commit is contained in:
parent
f23cf526e5
commit
b97689b7fd
@ -0,0 +1,7 @@
|
||||
-- Migration: widen the "mods" table with additional scraped fields.
-- Every new column is added without a NOT NULL constraint, so existing rows
-- keep working; the TODOs below mark the columns that are meant to become
-- NOT NULL once they have been backfilled.

-- Last-update timestamp of the mod, stored with millisecond precision.
ALTER TABLE "mods" ADD COLUMN "last_update_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
|
||||
-- First-upload timestamp of the mod, stored with millisecond precision.
ALTER TABLE "mods" ADD COLUMN "first_upload_at" TIMESTAMP(3); /* TODO: make NOT NULL after backfill */
|
||||
-- Thumbnail image link; limited to 255 characters and left optional.
ALTER TABLE "mods" ADD COLUMN "thumbnail_link" VARCHAR(255);
|
||||
-- Numeric author id, stored next to the author's display name.
ALTER TABLE "mods" ADD COLUMN "author_id" INTEGER; /* TODO: make NOT NULL after backfill */
|
||||
-- Numeric category id; no NOT NULL TODO here, so it is left optional.
ALTER TABLE "mods" ADD COLUMN "category_id" INTEGER;
|
||||
-- The existing text columns are renamed so the "*_name" / "*_id" pairs are
-- unambiguous. Any query still referencing "author" or "category" will fail
-- after this migration runs.
ALTER TABLE "mods" RENAME COLUMN "author" TO "author_name";
|
||||
ALTER TABLE "mods" RENAME COLUMN "category" TO "category_name";
|
20
src/main.rs
20
src/main.rs
@ -277,7 +277,10 @@ pub async fn main() -> Result<()> {
|
||||
.find(|processed_mod| processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id)
|
||||
{
|
||||
if processed_mod.last_updated_files_at
|
||||
> NaiveDateTime::new(scraped_mod.last_update, NaiveTime::from_hms(0, 0, 0))
|
||||
> NaiveDateTime::new(
|
||||
scraped_mod.last_update_at,
|
||||
NaiveTime::from_hms(0, 0, 0),
|
||||
)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@ -287,10 +290,21 @@ pub async fn main() -> Result<()> {
|
||||
.map(|scraped_mod| UnsavedMod {
|
||||
name: scraped_mod.name,
|
||||
nexus_mod_id: scraped_mod.nexus_mod_id,
|
||||
author: scraped_mod.author,
|
||||
category: scraped_mod.category,
|
||||
author_name: scraped_mod.author_name,
|
||||
author_id: Some(scraped_mod.author_id),
|
||||
category_name: scraped_mod.category_name,
|
||||
category_id: scraped_mod.category_id,
|
||||
description: scraped_mod.desc,
|
||||
thumbnail_link: scraped_mod.thumbnail_link,
|
||||
game_id: game.id,
|
||||
last_update_at: Some(NaiveDateTime::new(
|
||||
scraped_mod.last_update_at,
|
||||
NaiveTime::from_hms(0, 0, 0),
|
||||
)),
|
||||
first_upload_at: Some(NaiveDateTime::new(
|
||||
scraped_mod.first_upload_at,
|
||||
NaiveTime::from_hms(0, 0, 0),
|
||||
)),
|
||||
})
|
||||
.collect();
|
||||
|
||||
|
@ -11,12 +11,17 @@ pub struct Mod {
|
||||
pub id: i32,
|
||||
pub name: String,
|
||||
pub nexus_mod_id: i32,
|
||||
pub author: String,
|
||||
pub category: Option<String>,
|
||||
pub author_name: String,
|
||||
pub author_id: Option<i32>,
|
||||
pub category_name: Option<String>,
|
||||
pub category_id: Option<i32>,
|
||||
pub description: Option<String>,
|
||||
pub thumbnail_link: Option<String>,
|
||||
pub game_id: i32,
|
||||
pub updated_at: NaiveDateTime,
|
||||
pub created_at: NaiveDateTime,
|
||||
pub last_update_at: Option<NaiveDateTime>,
|
||||
pub first_upload_at: Option<NaiveDateTime>,
|
||||
pub last_updated_files_at: Option<NaiveDateTime>,
|
||||
}
|
||||
|
||||
@ -24,10 +29,15 @@ pub struct Mod {
|
||||
/// Borrowed field set for one mod row handed to the batched insert.
///
/// The batched insert reads each field into a per-column vector and upserts
/// the rows keyed on `(game_id, nexus_mod_id)`.
///
/// NOTE(review): this listing contains both `author`/`category` and
/// `author_name`/`category_name`. The former look like the pre-rename lines of
/// a diff rather than fields that coexist with the new ones — confirm against
/// the real file before relying on them.
pub struct UnsavedMod<'a> {
|
||||
/// Value written to the `name` column.
pub name: &'a str,
|
||||
/// Mod id on the scraped site; half of the upsert conflict key.
pub nexus_mod_id: i32,
|
||||
/// NOTE(review): presumably the old name of `author_name` — confirm.
pub author: &'a str,
|
||||
/// NOTE(review): presumably the old name of `category_name` — confirm.
pub category: Option<&'a str>,
|
||||
/// Author display name; written to the `author_name` column.
pub author_name: &'a str,
|
||||
/// Numeric author id. Optional here even though the scraper always yields
/// one (the caller wraps it in `Some`).
pub author_id: Option<i32>,
|
||||
/// Category display name, if any.
pub category_name: Option<&'a str>,
|
||||
/// Numeric category id, if any.
pub category_id: Option<i32>,
|
||||
/// Short description text, if any (filled from the scraped `desc`).
pub description: Option<&'a str>,
|
||||
/// Thumbnail image link, if any.
pub thumbnail_link: Option<&'a str>,
|
||||
/// Id of the game this mod belongs to; other half of the upsert conflict key.
pub game_id: i32,
|
||||
/// Last-update timestamp. The caller builds it from a scraped date at
/// 00:00:00, so only the date part carries information.
pub last_update_at: Option<NaiveDateTime>,
|
||||
/// First-upload timestamp, built the same way as `last_update_at`.
pub first_upload_at: Option<NaiveDateTime>,
|
||||
}
|
||||
|
||||
#[instrument(level = "debug", skip(pool))]
|
||||
@ -77,26 +87,36 @@ pub async fn insert(
|
||||
pool: &sqlx::Pool<sqlx::Postgres>,
|
||||
name: &str,
|
||||
nexus_mod_id: i32,
|
||||
author: &str,
|
||||
category: Option<&str>,
|
||||
author_name: &str,
|
||||
author_id: i32,
|
||||
category_name: Option<&str>,
|
||||
category_id: Option<i32>,
|
||||
description: Option<&str>,
|
||||
thumbnail_link: Option<&str>,
|
||||
game_id: i32,
|
||||
last_update_at: Option<NaiveDateTime>,
|
||||
first_upload_at: Option<NaiveDateTime>,
|
||||
) -> Result<Mod> {
|
||||
sqlx::query_as!(
|
||||
Mod,
|
||||
"INSERT INTO mods
|
||||
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, now(), now())
|
||||
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, now(), now())
|
||||
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
||||
SET (name, author, category, description, updated_at) =
|
||||
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now())
|
||||
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
|
||||
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
||||
RETURNING *",
|
||||
name,
|
||||
nexus_mod_id,
|
||||
author,
|
||||
category,
|
||||
author_name,
|
||||
author_id,
|
||||
category_name,
|
||||
category_id,
|
||||
description,
|
||||
game_id
|
||||
thumbnail_link,
|
||||
game_id,
|
||||
last_update_at,
|
||||
first_upload_at
|
||||
)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
@ -112,36 +132,51 @@ pub async fn batched_insert<'a>(
|
||||
for batch in mods.chunks(BATCH_SIZE) {
|
||||
let mut names: Vec<&str> = vec![];
|
||||
let mut nexus_mod_ids: Vec<i32> = vec![];
|
||||
let mut authors: Vec<&str> = vec![];
|
||||
let mut categories: Vec<Option<&str>> = vec![];
|
||||
let mut author_names: Vec<&str> = vec![];
|
||||
let mut author_ids: Vec<Option<i32>> = vec![];
|
||||
let mut category_names: Vec<Option<&str>> = vec![];
|
||||
let mut category_ids: Vec<Option<i32>> = vec![];
|
||||
let mut descriptions: Vec<Option<&str>> = vec![];
|
||||
let mut thumbnail_links: Vec<Option<&str>> = vec![];
|
||||
let mut game_ids: Vec<i32> = vec![];
|
||||
let mut last_update_ats: Vec<Option<NaiveDateTime>> = vec![];
|
||||
let mut first_upload_ats: Vec<Option<NaiveDateTime>> = vec![];
|
||||
batch.iter().for_each(|unsaved_mod| {
|
||||
names.push(unsaved_mod.name);
|
||||
nexus_mod_ids.push(unsaved_mod.nexus_mod_id);
|
||||
authors.push(unsaved_mod.author);
|
||||
categories.push(unsaved_mod.category);
|
||||
author_names.push(unsaved_mod.author_name);
|
||||
author_ids.push(unsaved_mod.author_id);
|
||||
category_names.push(unsaved_mod.category_name);
|
||||
category_ids.push(unsaved_mod.category_id);
|
||||
descriptions.push(unsaved_mod.description);
|
||||
thumbnail_links.push(unsaved_mod.thumbnail_link);
|
||||
game_ids.push(unsaved_mod.game_id);
|
||||
last_update_ats.push(unsaved_mod.last_update_at);
|
||||
first_upload_ats.push(unsaved_mod.first_upload_at);
|
||||
});
|
||||
saved_mods.append(
|
||||
// sqlx doesn't understand arrays of Options with the query_as! macro
|
||||
&mut sqlx::query_as(
|
||||
r#"INSERT INTO mods
|
||||
(name, nexus_mod_id, author, category, description, game_id, created_at, updated_at)
|
||||
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
|
||||
SELECT *, now(), now()
|
||||
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::text[], $5::text[], $6::int[])
|
||||
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::int[], $5::text[], $6::int[], $7::text[], $8::text[], $9::int[], $10::timestamp(3)[], $11::timestamp(3)[])
|
||||
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
||||
SET (name, author, category, description, updated_at) =
|
||||
(EXCLUDED.name, EXCLUDED.author, EXCLUDED.category, EXCLUDED.description, now())
|
||||
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
|
||||
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
||||
RETURNING *"#,
|
||||
)
|
||||
.bind(&names)
|
||||
.bind(&nexus_mod_ids)
|
||||
.bind(&authors)
|
||||
.bind(&categories)
|
||||
.bind(&author_names)
|
||||
.bind(&author_ids)
|
||||
.bind(&category_names)
|
||||
.bind(&category_ids)
|
||||
.bind(&descriptions)
|
||||
.bind(&thumbnail_links)
|
||||
.bind(&game_ids)
|
||||
.bind(&last_update_ats)
|
||||
.bind(&first_upload_ats)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.context("Failed to insert mods")?,
|
||||
|
@ -13,10 +13,14 @@ pub struct ModListResponse {
|
||||
/// One mod entry as extracted from a scraped mod-list page.
///
/// String fields borrow (`'a`) from the scraped text rather than owning it.
///
/// NOTE(review): this listing contains `category`, `author` and `last_update`
/// next to `category_name`, `author_name` and `last_update_at`. The former look
/// like the pre-rename lines of a diff — confirm against the real file.
pub struct ScrapedMod<'a> {
|
||||
/// Mod id on the scraped site.
pub nexus_mod_id: i32,
|
||||
/// Mod name.
pub name: &'a str,
|
||||
/// NOTE(review): presumably the old name of `category_name` — confirm.
pub category: Option<&'a str>,
|
||||
/// NOTE(review): presumably the old name of `author_name` — confirm.
pub author: &'a str,
|
||||
/// First text node of the category link, if it has one.
pub category_name: Option<&'a str>,
|
||||
/// Category id parsed from the category link's `href` (the seventh
/// `/`-separated segment); `None` when the link has no `href`.
pub category_id: Option<i32>,
|
||||
/// First text node of the author link.
pub author_name: &'a str,
|
||||
/// Author id parsed from the last `/`-separated segment of the author
/// link's `href`.
pub author_id: i32,
|
||||
/// First text node of the `p.desc` element, if it has one.
pub desc: Option<&'a str>,
|
||||
/// NOTE(review): presumably the old name of `last_update_at` — confirm.
pub last_update: NaiveDate,
|
||||
/// `src` attribute of the `a.mod-image img.fore` thumbnail image, if present.
pub thumbnail_link: Option<&'a str>,
|
||||
/// Last-update date, parsed from the `div.date` text with `"%d %b %Y"`
/// (date only, no time of day).
pub last_update_at: NaiveDate,
|
||||
/// First-upload date, parsed from the `time.date` text with `"%d %b %Y"`
/// (date only, no time of day).
pub first_upload_at: NaiveDate,
|
||||
}
|
||||
|
||||
pub struct ModListScrape<'a> {
|
||||
@ -55,7 +59,12 @@ impl ModListResponse {
|
||||
Selector::parse("div.category a").expect("failed to parse CSS selector");
|
||||
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
|
||||
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
|
||||
let last_update_select = Selector::parse("div.date").expect("failed to parse CSS selector");
|
||||
let thumbnail_select =
|
||||
Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
|
||||
let first_upload_date_select =
|
||||
Selector::parse("time.date").expect("failed to parse CSS selector");
|
||||
let last_update_date_select =
|
||||
Selector::parse("div.date").expect("failed to parse CSS selector");
|
||||
let next_page_select =
|
||||
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
|
||||
|
||||
@ -90,12 +99,31 @@ impl ModListResponse {
|
||||
.select(&category_select)
|
||||
.next()
|
||||
.expect("Missing category link for mod");
|
||||
let category = category_elem.text().next();
|
||||
let category_id = match category_elem.value().attr("href") {
|
||||
Some(href) => Some(
|
||||
href.split("/")
|
||||
.nth(6)
|
||||
.expect("Missing category id for mod")
|
||||
.parse::<i32>()
|
||||
.expect("Failed to parse category id"),
|
||||
),
|
||||
None => None,
|
||||
};
|
||||
let category_name = category_elem.text().next();
|
||||
let author_elem = right
|
||||
.select(&author_select)
|
||||
.next()
|
||||
.expect("Missing author link for mod");
|
||||
let author = author_elem
|
||||
let author_id = author_elem
|
||||
.value()
|
||||
.attr("href")
|
||||
.expect("Missing author link href for mod")
|
||||
.split("/")
|
||||
.last()
|
||||
.expect("Missing author id for mod")
|
||||
.parse::<i32>()
|
||||
.expect("Failed to parse author id");
|
||||
let author_name = author_elem
|
||||
.text()
|
||||
.next()
|
||||
.expect("Missing author text for mod");
|
||||
@ -104,26 +132,48 @@ impl ModListResponse {
|
||||
.next()
|
||||
.expect("Missing desc elem for mod");
|
||||
let desc = desc_elem.text().next();
|
||||
let last_update_elem = right
|
||||
.select(&last_update_select)
|
||||
let thumbnail_elem = left
|
||||
.select(&thumbnail_select)
|
||||
.next()
|
||||
.expect("Missing last update elem for mod");
|
||||
let last_update = last_update_elem
|
||||
.text()
|
||||
.expect("Missing thumbnail elem for mod");
|
||||
let thumbnail_link = thumbnail_elem.value().attr("src");
|
||||
let first_upload_date_text = right
|
||||
.select(&first_upload_date_select)
|
||||
.next()
|
||||
.expect("Missing dates elem for mod")
|
||||
.text();
|
||||
let first_upload_at = first_upload_date_text
|
||||
.skip(2)
|
||||
.next()
|
||||
.expect("Missing last update text for mod")
|
||||
.trim();
|
||||
dbg!(&first_upload_at);
|
||||
let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
|
||||
.expect("Cannot parse first upload date");
|
||||
let last_update_date_text = right
|
||||
.select(&last_update_date_select)
|
||||
.next()
|
||||
.expect("Missing dates elem for mod")
|
||||
.text();
|
||||
let last_update_at = last_update_date_text
|
||||
.skip(1)
|
||||
.next()
|
||||
.expect("Missing last update text for mod")
|
||||
.trim();
|
||||
let last_update = NaiveDate::parse_from_str(last_update, "%d %b %Y")
|
||||
let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
|
||||
.expect("Cannot parse last update date");
|
||||
|
||||
ScrapedMod {
|
||||
nexus_mod_id,
|
||||
name,
|
||||
category,
|
||||
author,
|
||||
category_name,
|
||||
category_id,
|
||||
author_name,
|
||||
author_id,
|
||||
desc,
|
||||
last_update,
|
||||
thumbnail_link,
|
||||
last_update_at,
|
||||
first_upload_at,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
Loading…
Reference in New Issue
Block a user