From 7b8fbea7d4304f0076054eec91d6e9e624515df4 Mon Sep 17 00:00:00 2001 From: Tyler Hallada Date: Fri, 30 May 2025 15:07:53 -0400 Subject: [PATCH] Update Nexus scraper to use new GraphQL API Modmapper hasn't been pulling in new mods for a while because Nexus Mods changed their mod listing page to use a new GraphQL API, and the old API I was using no longer works. --- README.md | 2 +- src/commands/backfills/is_translation.rs | 14 +- src/commands/update.rs | 32 +- src/main.rs | 4 +- src/nexus_scraper.rs | 390 +++++++++++++---------- 5 files changed, 248 insertions(+), 194 deletions(-) diff --git a/README.md b/README.md index bdad458..b9af575 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ RUST_LOG=mod_mapper=debug 4. Install [`sqlx_cli`](https://github.com/launchbadge/sqlx/tree/master/sqlx-cli) with `cargo install sqlx-cli --no-default-features --features postgres` -5. Run `sqlx migrate --source migrations run` which will run all the database migrations. +5. Run `sqlx migrate run` which will run all the database migrations. 6. 
Get your personal Nexus API token from your profile settings and add it to the `.env` file: diff --git a/src/commands/backfills/is_translation.rs b/src/commands/backfills/is_translation.rs index 4e23811..ce7786e 100644 --- a/src/commands/backfills/is_translation.rs +++ b/src/commands/backfills/is_translation.rs @@ -3,7 +3,7 @@ use std::time::Duration; use tokio::time::sleep; use tracing::{debug, info, info_span}; -use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME}; +use crate::nexus_api::SSE_GAME_NAME; use crate::nexus_scraper; const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours @@ -21,16 +21,18 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool) -> Resul .timeout(REQUEST_TIMEOUT) .connect_timeout(CONNECT_TIMEOUT) .build()?; + let scraper_client = nexus_scraper::NexusScraper::new(client.clone()); while has_next_page { let page_span = info_span!("page", page); let _page_span = page_span.enter(); - let mod_list_resp = - nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?; - let scraped = mod_list_resp.scrape_mods()?; - let scraped_ids: Vec = scraped.mods.iter().map(|m| m.nexus_mod_id).collect(); + let mods_response = scraper_client + .get_mods(&SSE_GAME_NAME, page * nexus_scraper::PAGE_SIZE, true) + .await?; + let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?; + let scraped_ids: Vec = scraped_mods.iter().map(|m| m.nexus_mod_id).collect(); - has_next_page = scraped.has_next_page; + has_next_page = scraped_mods.len() == 20; let updated_ids: Vec = sqlx::query_as!( UpdatedMods, diff --git a/src/commands/update.rs b/src/commands/update.rs index 72a33d6..2908b7b 100644 --- a/src/commands/update.rs +++ b/src/commands/update.rs @@ -1,10 +1,12 @@ use anyhow::Result; use chrono::{NaiveDateTime, NaiveTime}; use humansize::{format_size_i, DECIMAL}; -use reqwest::StatusCode; use reqwest::header::{HeaderMap, HeaderValue}; +use reqwest::StatusCode; +use serde_json::json; use 
std::collections::HashSet; use std::io::SeekFrom; +use std::process; use std::time::Duration; use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tokio::time::sleep; @@ -38,6 +40,7 @@ pub async fn update( .connect_timeout(CONNECT_TIMEOUT) .default_headers(headers) .build()?; + let scraper_client = nexus_scraper::NexusScraper::new(client.clone()); let game_id = get_game_id(game_name).expect("valid game name"); let game = game::insert(pool, game_name, game_id).await?; @@ -50,29 +53,28 @@ pub async fn update( let page_span = info_span!("page", page, game_name, include_translations); let _page_span = page_span.enter(); - let mod_list_resp = nexus_scraper::get_mod_list_page( - &client, - page, - game_name, - game.nexus_game_id, - include_translations, - ) - .await?; - let scraped = mod_list_resp.scrape_mods()?; + let mods_response = scraper_client + .get_mods( + &game.name, + page * nexus_scraper::PAGE_SIZE, + include_translations, + ) + .await?; + + let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?; + info!("scraped {} mods from nexus graphql", scraped_mods.len()); + has_next_page = scraped_mods.len() == 20; - has_next_page = scraped.has_next_page; let processed_mods = game_mod::bulk_get_last_updated_by_nexus_mod_ids( pool, game.id, - &scraped - .mods + &scraped_mods .iter() .map(|scraped_mod| scraped_mod.nexus_mod_id) .collect::>(), ) .await?; - let mods_to_create_or_update: Vec = scraped - .mods + let mods_to_create_or_update: Vec = scraped_mods .iter() .filter(|scraped_mod| { if let Some(processed_mod) = processed_mods.iter().find(|processed_mod| { diff --git a/src/main.rs b/src/main.rs index 6317754..3b6ab98 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,12 +42,12 @@ struct Args { #[argh(option, short = 'e')] dump_edits: Option, - /// file to output the cell mod edit counts over time as json (time_step option required with + /// file to output the cell mod edit counts over time as json (time_step option required with /// this 
option) #[argh(option, short = 'E')] dump_edits_over_time: Option, - /// the span of time to group cell edit counts into (day, week, or month) when dumping cell + /// the span of time to group cell edit counts into (day, week, or month) when dumping cell /// edits (only relevant for use with dump_edits_over_time option) #[argh(option, short = 'T')] time_step: Option, diff --git a/src/nexus_scraper.rs b/src/nexus_scraper.rs index 3fa155b..744684b 100644 --- a/src/nexus_scraper.rs +++ b/src/nexus_scraper.rs @@ -1,12 +1,11 @@ -use anyhow::Result; +use anyhow::{anyhow, Result}; use chrono::NaiveDate; use reqwest::Client; -use scraper::{Html, Selector}; -use tracing::{info, instrument}; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use tracing::instrument; -pub struct ModListResponse { - html: Html, -} +pub const PAGE_SIZE: usize = 20; #[derive(Debug)] pub struct ScrapedMod<'a> { @@ -22,178 +21,229 @@ pub struct ScrapedMod<'a> { pub first_upload_at: NaiveDate, } -pub struct ModListScrape<'a> { - pub mods: Vec>, - pub has_next_page: bool, +#[derive(Debug, Serialize, Deserialize)] +pub struct GraphQLRequest { + query: String, + variables: Value, + #[serde(rename = "operationName")] + operation_name: String, } -#[instrument(skip(client))] -pub async fn get_mod_list_page( - client: &Client, - page: usize, - game_name: &str, - game_id: i32, - include_translations: bool, -) -> Result { - let res = client - .get(format!( - "https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate", - game_id, - match include_translations { true => "yes", false => "no" }, - page - )) - .header("host", "www.nexusmods.com") - .header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name)) - .header("sec-fetch-dest", "empty") - .header("sec-fetch-mode", "cors") - 
.header("sec-fetch-site", "same-origin") - .header("x-requested-with", "XMLHttpRequest") - .send() - .await? - .error_for_status()?; - info!(status = %res.status(), "fetched mod list page"); - let text = res.text().await?; - let html = Html::parse_document(&text); - - Ok(ModListResponse { html }) +#[derive(Debug, Deserialize)] +pub struct GraphQLResponse { + data: Option, + errors: Option>, } -impl ModListResponse { - #[instrument(skip(self))] - pub fn scrape_mods<'a>(&'a self) -> Result { - let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector"); - let left_select = - Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector"); - let right_select = - Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector"); - let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector"); - let category_select = - Selector::parse("div.category a").expect("failed to parse CSS selector"); - let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector"); - let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector"); - let thumbnail_select = - Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector"); - let first_upload_date_select = - Selector::parse("time.date").expect("failed to parse CSS selector"); - let last_update_date_select = - Selector::parse("div.date").expect("failed to parse CSS selector"); - let next_page_select = - Selector::parse("div.pagination li:last-child a.page-selected").expect("failed to parse CSS selector"); +#[derive(Debug, Deserialize)] +pub struct GraphQLError { + #[allow(dead_code)] + message: String, +} - let next_page_elem = self.html.select(&next_page_select).next(); +#[derive(Debug, Deserialize)] +pub struct ModsResponse { + pub mods: ModsData, +} - let has_next_page = next_page_elem.is_none(); +#[derive(Debug, Deserialize)] +pub struct ModsData { + #[serde(rename = "facetsData")] + 
#[allow(dead_code)] + pub facets_data: Option, + pub nodes: Vec, + #[allow(dead_code)] + #[serde(rename = "totalCount")] + pub total_count: i32, +} - let mods: Vec = self - .html - .select(&mod_select) - .map(|element| { - let left = element - .select(&left_select) - .next() - .expect("Missing left div for mod"); - let right = element - .select(&right_select) - .next() - .expect("Missing right div for mod"); - let nexus_mod_id = left - .value() - .attr("data-mod-id") - .expect("Missing mod id attribute") - .parse::() - .expect("Failed to parse mod id"); - let name_elem = right - .select(&name_select) - .next() - .expect("Missing name link for mod"); - let name = name_elem.text().next().expect("Missing name text for mod"); - let category_elem = right - .select(&category_select) - .next() - .expect("Missing category link for mod"); - let category_id = match category_elem.value().attr("href") { - Some(href) => href - .split("/") - .nth(6) - .expect("Missing category id for mod") - .parse::() - .ok(), - None => None, - }; - let category_name = category_elem.text().next(); - let author_elem = right - .select(&author_select) - .next() - .expect("Missing author link for mod"); - let author_id = author_elem - .value() - .attr("href") - .expect("Missing author link href for mod") - .split("/") - .last() - .expect("Missing author id for mod") - .parse::() - .expect("Failed to parse author id"); - let author_name = author_elem - .text() - .next() - .unwrap_or("Unknown"); - let desc_elem = right - .select(&desc_select) - .next() - .expect("Missing desc elem for mod"); - let desc = desc_elem.text().next(); - let thumbnail_elem = left - .select(&thumbnail_select) - .next() - .expect("Missing thumbnail elem for mod"); - let thumbnail_link = thumbnail_elem.value().attr("src"); - let first_upload_date_text = right - .select(&first_upload_date_select) - .next() - .expect("Missing dates elem for mod") - .text(); - let first_upload_at = first_upload_date_text - .skip(2) - .next() - 
.expect("Missing last update text for mod") - .trim(); - let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y") - .expect("Cannot parse first upload date"); - let last_update_date_text = right - .select(&last_update_date_select) - .next() - .expect("Missing dates elem for mod") - .text(); - let last_update_at = last_update_date_text - .skip(1) - .next() - .expect("Missing last update text for mod") - .trim(); - let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y") - .expect("Cannot parse last update date"); +#[derive(Debug, Deserialize)] +pub struct Mod { + #[serde(rename = "modId")] + pub mod_id: i32, + pub name: String, + pub summary: Option, + #[allow(dead_code)] + pub downloads: i32, + #[allow(dead_code)] + pub endorsements: i32, + #[serde(rename = "createdAt")] + pub created_at: String, + #[serde(rename = "updatedAt")] + pub updated_at: String, + #[serde(rename = "modCategory")] + pub mod_category: Option, + pub uploader: Uploader, + #[serde(rename = "thumbnailUrl")] + pub thumbnail_url: Option, +} - ScrapedMod { - nexus_mod_id, - name, - category_name, - category_id, - author_name, - author_id, - desc, - thumbnail_link, - last_update_at, - first_upload_at, - } - }) - .collect(); - info!( - len = mods.len(), - has_next_page, "scraped mods from mod list page" - ); - Ok(ModListScrape { - mods, - has_next_page, +#[derive(Debug, Deserialize)] +pub struct ModCategory { + #[serde(rename = "categoryId")] + pub category_id: i32, + pub name: String, +} + +#[derive(Debug, Deserialize)] +pub struct Uploader { + #[serde(rename = "memberId")] + pub member_id: i32, + pub name: String, +} + +pub struct NexusScraper { + client: Client, + base_url: String, +} + +impl<'a> ScrapedMod<'a> { + pub fn from_api_mod(api_mod: &'a Mod) -> Result { + // Parse dates from ISO 8601 format like "2025-05-30T15:29:50Z" + let parse_date = |date_str: &str| -> Result { + chrono::DateTime::parse_from_rfc3339(date_str).map(|dt| dt.naive_utc().date()) + }; 
+ + let last_update_at = parse_date(&api_mod.updated_at)?; + let first_upload_at = parse_date(&api_mod.created_at)?; + + Ok(ScrapedMod { + nexus_mod_id: api_mod.mod_id, + name: &api_mod.name, + category_name: api_mod.mod_category.as_ref().map(|cat| cat.name.as_str()), + category_id: api_mod.mod_category.as_ref().map(|cat| cat.category_id), + author_name: &api_mod.uploader.name, + author_id: api_mod.uploader.member_id, + desc: api_mod.summary.as_deref(), + thumbnail_link: api_mod.thumbnail_url.as_deref(), + last_update_at, + first_upload_at, }) } } + +pub fn convert_mods_to_scraped<'a>(api_mods: &'a [Mod]) -> Result>> { + api_mods.iter().map(ScrapedMod::from_api_mod).collect() +} + +impl NexusScraper { + pub fn new(client: Client) -> Self { + Self { + client, + base_url: "https://api-router.nexusmods.com/graphql".to_string(), + } + } + + #[instrument(skip(self))] + pub async fn get_mods( + &self, + game_domain: &str, + offset: usize, + include_translations: bool, + ) -> Result { + let mut filter = json!({ "tag": [{ "op": "NOT_EQUALS", "value": "Translation" }] }); + if include_translations { + filter = json!({ "tag": [{ "op": "EQUALS", "value": "Translation" }] }); + } + let query = r#" + query ModsListing($count: Int = 0, $facets: ModsFacet, $filter: ModsFilter, $offset: Int, $postFilter: ModsFilter, $sort: [ModsSort!]) { + mods( + count: $count + facets: $facets + filter: $filter + offset: $offset + postFilter: $postFilter + sort: $sort + viewUserBlockedContent: false + ) { + facetsData + nodes { + ...ModFragment + } + totalCount + } +} + fragment ModFragment on Mod { + adultContent + createdAt + downloads + endorsements + fileSize + game { + domainName + id + name + } + modCategory { + categoryId + name + } + modId + name + status + summary + thumbnailUrl + thumbnailBlurredUrl + uid + updatedAt + uploader { + avatar + memberId + name + } + viewerDownloaded + viewerEndorsed + viewerTracked + viewerUpdateAvailable +}"#; + + let variables = json!({ + "count": 20, + 
"facets": { + "categoryName": [], + "languageName": [], + "tag": [] + }, + "filter": { + "filter": [], + "gameDomainName": [{"op": "EQUALS", "value": game_domain}], + "name": [] + }, + "offset": offset, + "postFilter": filter, + "sort": { + "updatedAt": {"direction": "DESC"} + } + }); + + let request_body = GraphQLRequest { + query: query.to_string(), + variables, + operation_name: "ModsListing".to_string(), + }; + + let response = self + .client + .post(&self.base_url) + .header("Referer", "https://www.nexusmods.com/") + .header("content-type", "application/json") + .header("x-graphql-operationname", "GameModsListing") + .header("Origin", "https://www.nexusmods.com") + .header("Sec-Fetch-Dest", "empty") + .header("Sec-Fetch-Mode", "cors") + .header("Sec-Fetch-Site", "same-site") + .json(&request_body) + .send() + .await?; + + let graphql_response: GraphQLResponse = response.json().await?; + + if let Some(errors) = graphql_response.errors { + return Err(anyhow!("GraphQL errors: {:?}", errors)); + } + + graphql_response + .data + .ok_or_else(|| anyhow!("No data returned from GraphQL")) + } +}