Update nexus scraper to use new graphql api

Modmapper hasn't been pulling in new mods for a while because Nexus Mods changed their mod listing page to use a new GraphQL API, and the old mod-list widget endpoint this scraper relied on no longer works. The scraper now queries the GraphQL endpoint instead; a usage sketch follows the commit metadata below.
2025-05-30 15:07:53 -04:00
parent 1a105a3ea2
commit 7b8fbea7d4
5 changed files with 248 additions and 194 deletions
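For context, here is roughly how the rewritten scraper is driven by the command code in this diff. This is a minimal sketch, not part of the commit itself: it assumes the crate's nexus_scraper module is in scope and that the Skyrim Special Edition game domain (the SSE_GAME_NAME constant used in the diff) is "skyrimspecialedition".

use anyhow::Result;
use reqwest::Client;

use crate::nexus_scraper::{self, NexusScraper, PAGE_SIZE};

/// Sketch: page through every non-translation SSE mod via the new GraphQL API.
async fn scrape_all_mods() -> Result<()> {
    let scraper = NexusScraper::new(Client::builder().build()?);
    let mut page = 0;
    let mut has_next_page = true;
    while has_next_page {
        let response = scraper
            .get_mods("skyrimspecialedition", page * PAGE_SIZE, false)
            .await?;
        let mods = nexus_scraper::convert_mods_to_scraped(&response.mods.nodes)?;
        // The response carries no explicit "has next page" flag in the fields
        // used here, so a full page of PAGE_SIZE (20) mods means keep going.
        has_next_page = mods.len() == PAGE_SIZE;
        page += 1;
    }
    Ok(())
}

Note the end-of-listing heuristic in the diff: the old HTML scraper read a has_next_page flag out of the pagination markup, while the GraphQL version infers it from whether a full page came back.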


@@ -37,7 +37,7 @@ RUST_LOG=mod_mapper=debug
 4. Install
    [`sqlx_cli`](https://github.com/launchbadge/sqlx/tree/master/sqlx-cli) with
    `cargo install sqlx-cli --no-default-features --features postgres`
-5. Run `sqlx migrate --source migrations run` which will run all the database migrations.
+5. Run `sqlx migrate run` which will run all the database migrations.
 6. Get your personal Nexus API token from your profile settings and add it to
    the `.env` file:


@@ -3,7 +3,7 @@ use std::time::Duration;
 use tokio::time::sleep;
 use tracing::{debug, info, info_span};
 
-use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME};
+use crate::nexus_api::SSE_GAME_NAME;
 use crate::nexus_scraper;
 
 const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@@ -21,16 +21,18 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
         .timeout(REQUEST_TIMEOUT)
         .connect_timeout(CONNECT_TIMEOUT)
         .build()?;
+    let scraper_client = nexus_scraper::NexusScraper::new(client.clone());
 
     while has_next_page {
         let page_span = info_span!("page", page);
         let _page_span = page_span.enter();
-        let mod_list_resp =
-            nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?;
-        let scraped = mod_list_resp.scrape_mods()?;
-        let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();
+        let mods_response = scraper_client
+            .get_mods(&SSE_GAME_NAME, page * nexus_scraper::PAGE_SIZE, true)
+            .await?;
+        let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?;
+        let scraped_ids: Vec<i32> = scraped_mods.iter().map(|m| m.nexus_mod_id).collect();
 
-        has_next_page = scraped.has_next_page;
+        has_next_page = scraped_mods.len() == 20;
 
         let updated_ids: Vec<i32> = sqlx::query_as!(
             UpdatedMods,


@@ -1,10 +1,12 @@
 use anyhow::Result;
 use chrono::{NaiveDateTime, NaiveTime};
 use humansize::{format_size_i, DECIMAL};
-use reqwest::StatusCode;
 use reqwest::header::{HeaderMap, HeaderValue};
+use reqwest::StatusCode;
+use serde_json::json;
 use std::collections::HashSet;
 use std::io::SeekFrom;
+use std::process;
 use std::time::Duration;
 use tokio::io::{AsyncReadExt, AsyncSeekExt};
 use tokio::time::sleep;
@@ -38,6 +40,7 @@ pub async fn update(
         .connect_timeout(CONNECT_TIMEOUT)
         .default_headers(headers)
         .build()?;
+    let scraper_client = nexus_scraper::NexusScraper::new(client.clone());
 
     let game_id = get_game_id(game_name).expect("valid game name");
     let game = game::insert(pool, game_name, game_id).await?;
@@ -50,29 +53,28 @@ pub async fn update(
         let page_span = info_span!("page", page, game_name, include_translations);
         let _page_span = page_span.enter();
-        let mod_list_resp = nexus_scraper::get_mod_list_page(
-            &client,
-            page,
-            game_name,
-            game.nexus_game_id,
-            include_translations,
-        )
-        .await?;
-        let scraped = mod_list_resp.scrape_mods()?;
-
-        has_next_page = scraped.has_next_page;
+        let mods_response = scraper_client
+            .get_mods(
+                &game.name,
+                page * nexus_scraper::PAGE_SIZE,
+                include_translations,
+            )
+            .await?;
+        let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?;
+        info!("scraped {} mods from nexus graphql", scraped_mods.len());
+        has_next_page = scraped_mods.len() == 20;
 
         let processed_mods = game_mod::bulk_get_last_updated_by_nexus_mod_ids(
             pool,
             game.id,
-            &scraped
-                .mods
+            &scraped_mods
                 .iter()
                 .map(|scraped_mod| scraped_mod.nexus_mod_id)
                 .collect::<Vec<i32>>(),
         )
         .await?;
 
-        let mods_to_create_or_update: Vec<UnsavedMod> = scraped
-            .mods
+        let mods_to_create_or_update: Vec<UnsavedMod> = scraped_mods
             .iter()
             .filter(|scraped_mod| {
                 if let Some(processed_mod) = processed_mods.iter().find(|processed_mod| {


@@ -42,12 +42,12 @@ struct Args {
     #[argh(option, short = 'e')]
     dump_edits: Option<String>,
     /// file to output the cell mod edit counts over time as json (time_step option required with
    /// this option)
     #[argh(option, short = 'E')]
     dump_edits_over_time: Option<String>,
     /// the span of time to group cell edit counts into (day, week, or month) when dumping cell
     /// edits (only relevant for use with dump_edits_over_time option)
     #[argh(option, short = 'T')]
     time_step: Option<TimeStep>,


@@ -1,12 +1,11 @@
-use anyhow::Result;
+use anyhow::{anyhow, Result};
 use chrono::NaiveDate;
 use reqwest::Client;
-use scraper::{Html, Selector};
-use tracing::{info, instrument};
+use serde::{Deserialize, Serialize};
+use serde_json::{json, Value};
+use tracing::instrument;
 
-pub struct ModListResponse {
-    html: Html,
-}
+pub const PAGE_SIZE: usize = 20;
 
 #[derive(Debug)]
 pub struct ScrapedMod<'a> {
@@ -22,178 +21,229 @@ pub struct ScrapedMod<'a> {
     pub first_upload_at: NaiveDate,
 }
 
-pub struct ModListScrape<'a> {
-    pub mods: Vec<ScrapedMod<'a>>,
-    pub has_next_page: bool,
-}
-
-#[instrument(skip(client))]
-pub async fn get_mod_list_page(
-    client: &Client,
-    page: usize,
-    game_name: &str,
-    game_id: i32,
-    include_translations: bool,
-) -> Result<ModListResponse> {
-    let res = client
-        .get(format!(
-            "https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
-            game_id,
-            match include_translations { true => "yes", false => "no" },
-            page
-        ))
-        .header("host", "www.nexusmods.com")
-        .header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name))
-        .header("sec-fetch-dest", "empty")
-        .header("sec-fetch-mode", "cors")
-        .header("sec-fetch-site", "same-origin")
-        .header("x-requested-with", "XMLHttpRequest")
-        .send()
-        .await?
-        .error_for_status()?;
-    info!(status = %res.status(), "fetched mod list page");
-    let text = res.text().await?;
-    let html = Html::parse_document(&text);
-    Ok(ModListResponse { html })
-}
-
-impl ModListResponse {
-    #[instrument(skip(self))]
-    pub fn scrape_mods<'a>(&'a self) -> Result<ModListScrape> {
-        let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector");
-        let left_select =
-            Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector");
-        let right_select =
-            Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector");
-        let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector");
-        let category_select =
-            Selector::parse("div.category a").expect("failed to parse CSS selector");
-        let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
-        let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
-        let thumbnail_select =
-            Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
-        let first_upload_date_select =
-            Selector::parse("time.date").expect("failed to parse CSS selector");
-        let last_update_date_select =
-            Selector::parse("div.date").expect("failed to parse CSS selector");
-        let next_page_select =
-            Selector::parse("div.pagination li:last-child a.page-selected").expect("failed to parse CSS selector");
-
-        let next_page_elem = self.html.select(&next_page_select).next();
-
-        let has_next_page = next_page_elem.is_none();
-
-        let mods: Vec<ScrapedMod> = self
-            .html
-            .select(&mod_select)
-            .map(|element| {
-                let left = element
-                    .select(&left_select)
-                    .next()
-                    .expect("Missing left div for mod");
-                let right = element
-                    .select(&right_select)
-                    .next()
-                    .expect("Missing right div for mod");
-                let nexus_mod_id = left
-                    .value()
-                    .attr("data-mod-id")
-                    .expect("Missing mod id attribute")
-                    .parse::<i32>()
-                    .expect("Failed to parse mod id");
-                let name_elem = right
-                    .select(&name_select)
-                    .next()
-                    .expect("Missing name link for mod");
-                let name = name_elem.text().next().expect("Missing name text for mod");
-                let category_elem = right
-                    .select(&category_select)
-                    .next()
-                    .expect("Missing category link for mod");
-                let category_id = match category_elem.value().attr("href") {
-                    Some(href) => href
-                        .split("/")
-                        .nth(6)
-                        .expect("Missing category id for mod")
-                        .parse::<i32>()
-                        .ok(),
-                    None => None,
-                };
-                let category_name = category_elem.text().next();
-                let author_elem = right
-                    .select(&author_select)
-                    .next()
-                    .expect("Missing author link for mod");
-                let author_id = author_elem
-                    .value()
-                    .attr("href")
-                    .expect("Missing author link href for mod")
-                    .split("/")
-                    .last()
-                    .expect("Missing author id for mod")
-                    .parse::<i32>()
-                    .expect("Failed to parse author id");
-                let author_name = author_elem
-                    .text()
-                    .next()
-                    .unwrap_or("Unknown");
-                let desc_elem = right
-                    .select(&desc_select)
-                    .next()
-                    .expect("Missing desc elem for mod");
-                let desc = desc_elem.text().next();
-                let thumbnail_elem = left
-                    .select(&thumbnail_select)
-                    .next()
-                    .expect("Missing thumbnail elem for mod");
-                let thumbnail_link = thumbnail_elem.value().attr("src");
-                let first_upload_date_text = right
-                    .select(&first_upload_date_select)
-                    .next()
-                    .expect("Missing dates elem for mod")
-                    .text();
-                let first_upload_at = first_upload_date_text
-                    .skip(2)
-                    .next()
-                    .expect("Missing last update text for mod")
-                    .trim();
-                let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
-                    .expect("Cannot parse first upload date");
-                let last_update_date_text = right
-                    .select(&last_update_date_select)
-                    .next()
-                    .expect("Missing dates elem for mod")
-                    .text();
-                let last_update_at = last_update_date_text
-                    .skip(1)
-                    .next()
-                    .expect("Missing last update text for mod")
-                    .trim();
-                let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
-                    .expect("Cannot parse last update date");
-
-                ScrapedMod {
-                    nexus_mod_id,
-                    name,
-                    category_name,
-                    category_id,
-                    author_name,
-                    author_id,
-                    desc,
-                    thumbnail_link,
-                    last_update_at,
-                    first_upload_at,
-                }
-            })
-            .collect();
-
-        info!(
-            len = mods.len(),
-            has_next_page, "scraped mods from mod list page"
-        );
-
-        Ok(ModListScrape {
-            mods,
-            has_next_page,
-        })
-    }
-}
+#[derive(Debug, Serialize, Deserialize)]
+pub struct GraphQLRequest {
+    query: String,
+    variables: Value,
+    #[serde(rename = "operationName")]
+    operation_name: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct GraphQLResponse<T> {
+    data: Option<T>,
+    errors: Option<Vec<GraphQLError>>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct GraphQLError {
+    #[allow(dead_code)]
+    message: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ModsResponse {
+    pub mods: ModsData,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ModsData {
+    #[serde(rename = "facetsData")]
+    #[allow(dead_code)]
+    pub facets_data: Option<Value>,
+    pub nodes: Vec<Mod>,
+    #[allow(dead_code)]
+    #[serde(rename = "totalCount")]
+    pub total_count: i32,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Mod {
+    #[serde(rename = "modId")]
+    pub mod_id: i32,
+    pub name: String,
+    pub summary: Option<String>,
+    #[allow(dead_code)]
+    pub downloads: i32,
+    #[allow(dead_code)]
+    pub endorsements: i32,
+    #[serde(rename = "createdAt")]
+    pub created_at: String,
+    #[serde(rename = "updatedAt")]
+    pub updated_at: String,
+    #[serde(rename = "modCategory")]
+    pub mod_category: Option<ModCategory>,
+    pub uploader: Uploader,
+    #[serde(rename = "thumbnailUrl")]
+    pub thumbnail_url: Option<String>,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct ModCategory {
+    #[serde(rename = "categoryId")]
+    pub category_id: i32,
+    pub name: String,
+}
+
+#[derive(Debug, Deserialize)]
+pub struct Uploader {
+    #[serde(rename = "memberId")]
+    pub member_id: i32,
+    pub name: String,
+}
+
+pub struct NexusScraper {
+    client: Client,
+    base_url: String,
+}
+
+impl<'a> ScrapedMod<'a> {
+    pub fn from_api_mod(api_mod: &'a Mod) -> Result<Self> {
+        // Parse dates from ISO 8601 format like "2025-05-30T15:29:50Z"
+        let parse_date = |date_str: &str| -> Result<NaiveDate, chrono::ParseError> {
+            chrono::DateTime::parse_from_rfc3339(date_str).map(|dt| dt.naive_utc().date())
+        };
+
+        let last_update_at = parse_date(&api_mod.updated_at)?;
+        let first_upload_at = parse_date(&api_mod.created_at)?;
+
+        Ok(ScrapedMod {
+            nexus_mod_id: api_mod.mod_id,
+            name: &api_mod.name,
+            category_name: api_mod.mod_category.as_ref().map(|cat| cat.name.as_str()),
+            category_id: api_mod.mod_category.as_ref().map(|cat| cat.category_id),
+            author_name: &api_mod.uploader.name,
+            author_id: api_mod.uploader.member_id,
+            desc: api_mod.summary.as_deref(),
+            thumbnail_link: api_mod.thumbnail_url.as_deref(),
+            last_update_at,
+            first_upload_at,
+        })
+    }
+}
+
+pub fn convert_mods_to_scraped<'a>(api_mods: &'a [Mod]) -> Result<Vec<ScrapedMod<'a>>> {
+    api_mods.iter().map(ScrapedMod::from_api_mod).collect()
+}
+
+impl NexusScraper {
+    pub fn new(client: Client) -> Self {
+        Self {
+            client,
+            base_url: "https://api-router.nexusmods.com/graphql".to_string(),
+        }
+    }
+
+    #[instrument(skip(self))]
+    pub async fn get_mods(
+        &self,
+        game_domain: &str,
+        offset: usize,
+        include_translations: bool,
+    ) -> Result<ModsResponse> {
+        let mut filter = json!({ "tag": [{ "op": "NOT_EQUALS", "value": "Translation" }] });
+        if include_translations {
+            filter = json!({ "tag": [{ "op": "EQUALS", "value": "Translation" }] });
+        }
+
+        let query = r#"
+            query ModsListing($count: Int = 0, $facets: ModsFacet, $filter: ModsFilter, $offset: Int, $postFilter: ModsFilter, $sort: [ModsSort!]) {
+                mods(
+                    count: $count
+                    facets: $facets
+                    filter: $filter
+                    offset: $offset
+                    postFilter: $postFilter
+                    sort: $sort
+                    viewUserBlockedContent: false
+                ) {
+                    facetsData
+                    nodes {
+                        ...ModFragment
+                    }
+                    totalCount
+                }
+            }
+
+            fragment ModFragment on Mod {
+                adultContent
+                createdAt
+                downloads
+                endorsements
+                fileSize
+                game {
+                    domainName
+                    id
+                    name
+                }
+                modCategory {
+                    categoryId
+                    name
+                }
+                modId
+                name
+                status
+                summary
+                thumbnailUrl
+                thumbnailBlurredUrl
+                uid
+                updatedAt
+                uploader {
+                    avatar
+                    memberId
+                    name
+                }
+                viewerDownloaded
+                viewerEndorsed
+                viewerTracked
+                viewerUpdateAvailable
+            }"#;
+
+        let variables = json!({
+            "count": 20,
+            "facets": {
+                "categoryName": [],
+                "languageName": [],
+                "tag": []
+            },
+            "filter": {
+                "filter": [],
+                "gameDomainName": [{"op": "EQUALS", "value": game_domain}],
+                "name": []
+            },
+            "offset": offset,
+            "postFilter": filter,
+            "sort": {
+                "updatedAt": {"direction": "DESC"}
+            }
+        });
+
+        let request_body = GraphQLRequest {
+            query: query.to_string(),
+            variables,
+            operation_name: "ModsListing".to_string(),
+        };
+
+        let response = self
+            .client
+            .post(&self.base_url)
+            .header("Referer", "https://www.nexusmods.com/")
+            .header("content-type", "application/json")
+            .header("x-graphql-operationname", "GameModsListing")
+            .header("Origin", "https://www.nexusmods.com")
+            .header("Sec-Fetch-Dest", "empty")
+            .header("Sec-Fetch-Mode", "cors")
+            .header("Sec-Fetch-Site", "same-site")
+            .json(&request_body)
+            .send()
+            .await?;

+        let graphql_response: GraphQLResponse<ModsResponse> = response.json().await?;
+
+        if let Some(errors) = graphql_response.errors {
+            return Err(anyhow!("GraphQL errors: {:?}", errors));
+        }
+
+        graphql_response
+            .data
+            .ok_or_else(|| anyhow!("No data returned from GraphQL"))
+    }
+}