Update Nexus scraper to use new GraphQL API

Modmapper hasn't been pulling in new mods for a while because Nexus Mods changed their mod listing page to use a new GraphQL API, and the old API I was using no longer works.
This commit is contained in:
2025-05-30 15:07:53 -04:00
parent 1a105a3ea2
commit 7b8fbea7d4
5 changed files with 248 additions and 194 deletions

View File

@@ -37,7 +37,7 @@ RUST_LOG=mod_mapper=debug
4. Install
[`sqlx_cli`](https://github.com/launchbadge/sqlx/tree/master/sqlx-cli) with
`cargo install sqlx-cli --no-default-features --features postgres`
5. Run `sqlx migrate --source migrations run` which will run all the database migrations.
5. Run `sqlx migrate run` which will run all the database migrations.
6. Get your personal Nexus API token from your profile settings and add it to
the `.env` file:

View File

@@ -3,7 +3,7 @@ use std::time::Duration;
use tokio::time::sleep;
use tracing::{debug, info, info_span};
use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME};
use crate::nexus_api::SSE_GAME_NAME;
use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@@ -21,16 +21,18 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
.timeout(REQUEST_TIMEOUT)
.connect_timeout(CONNECT_TIMEOUT)
.build()?;
let scraper_client = nexus_scraper::NexusScraper::new(client.clone());
while has_next_page {
let page_span = info_span!("page", page);
let _page_span = page_span.enter();
let mod_list_resp =
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?;
let scraped = mod_list_resp.scrape_mods()?;
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();
let mods_response = scraper_client
.get_mods(&SSE_GAME_NAME, page * nexus_scraper::PAGE_SIZE, true)
.await?;
let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?;
let scraped_ids: Vec<i32> = scraped_mods.iter().map(|m| m.nexus_mod_id).collect();
has_next_page = scraped.has_next_page;
has_next_page = scraped_mods.len() == 20;
let updated_ids: Vec<i32> = sqlx::query_as!(
UpdatedMods,

View File

@@ -1,10 +1,12 @@
use anyhow::Result;
use chrono::{NaiveDateTime, NaiveTime};
use humansize::{format_size_i, DECIMAL};
use reqwest::StatusCode;
use reqwest::header::{HeaderMap, HeaderValue};
use reqwest::StatusCode;
use serde_json::json;
use std::collections::HashSet;
use std::io::SeekFrom;
use std::process;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncSeekExt};
use tokio::time::sleep;
@@ -38,6 +40,7 @@ pub async fn update(
.connect_timeout(CONNECT_TIMEOUT)
.default_headers(headers)
.build()?;
let scraper_client = nexus_scraper::NexusScraper::new(client.clone());
let game_id = get_game_id(game_name).expect("valid game name");
let game = game::insert(pool, game_name, game_id).await?;
@@ -50,29 +53,28 @@ pub async fn update(
let page_span = info_span!("page", page, game_name, include_translations);
let _page_span = page_span.enter();
let mod_list_resp = nexus_scraper::get_mod_list_page(
&client,
page,
game_name,
game.nexus_game_id,
include_translations,
)
.await?;
let scraped = mod_list_resp.scrape_mods()?;
let mods_response = scraper_client
.get_mods(
&game.name,
page * nexus_scraper::PAGE_SIZE,
include_translations,
)
.await?;
let scraped_mods = nexus_scraper::convert_mods_to_scraped(&mods_response.mods.nodes)?;
info!("scraped {} mods from nexus graphql", scraped_mods.len());
has_next_page = scraped_mods.len() == 20;
has_next_page = scraped.has_next_page;
let processed_mods = game_mod::bulk_get_last_updated_by_nexus_mod_ids(
pool,
game.id,
&scraped
.mods
&scraped_mods
.iter()
.map(|scraped_mod| scraped_mod.nexus_mod_id)
.collect::<Vec<i32>>(),
)
.await?;
let mods_to_create_or_update: Vec<UnsavedMod> = scraped
.mods
let mods_to_create_or_update: Vec<UnsavedMod> = scraped_mods
.iter()
.filter(|scraped_mod| {
if let Some(processed_mod) = processed_mods.iter().find(|processed_mod| {

View File

@@ -42,12 +42,12 @@ struct Args {
#[argh(option, short = 'e')]
dump_edits: Option<String>,
/// file to output the cell mod edit counts over time as json (time_step option required with
/// file to output the cell mod edit counts over time as json (time_step option required with
/// this option)
#[argh(option, short = 'E')]
dump_edits_over_time: Option<String>,
/// the span of time to group cell edit counts into (day, week, or month) when dumping cell
/// the span of time to group cell edit counts into (day, week, or month) when dumping cell
/// edits (only relevant for use with dump_edits_over_time option)
#[argh(option, short = 'T')]
time_step: Option<TimeStep>,

View File

@@ -1,12 +1,11 @@
use anyhow::Result;
use anyhow::{anyhow, Result};
use chrono::NaiveDate;
use reqwest::Client;
use scraper::{Html, Selector};
use tracing::{info, instrument};
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
use tracing::instrument;
pub struct ModListResponse {
html: Html,
}
pub const PAGE_SIZE: usize = 20;
#[derive(Debug)]
pub struct ScrapedMod<'a> {
@@ -22,178 +21,229 @@ pub struct ScrapedMod<'a> {
pub first_upload_at: NaiveDate,
}
pub struct ModListScrape<'a> {
pub mods: Vec<ScrapedMod<'a>>,
pub has_next_page: bool,
/// Body of a GraphQL POST request to the Nexus Mods API router.
#[derive(Debug, Serialize, Deserialize)]
pub struct GraphQLRequest {
    // Full GraphQL query document (including fragments).
    query: String,
    // JSON object of variables referenced by the query.
    variables: Value,
    // Name of the operation within `query` to execute.
    #[serde(rename = "operationName")]
    operation_name: String,
}
#[instrument(skip(client))]
pub async fn get_mod_list_page(
client: &Client,
page: usize,
game_name: &str,
game_id: i32,
include_translations: bool,
) -> Result<ModListResponse> {
let res = client
.get(format!(
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
game_id,
match include_translations { true => "yes", false => "no" },
page
))
.header("host", "www.nexusmods.com")
.header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name))
.header("sec-fetch-dest", "empty")
.header("sec-fetch-mode", "cors")
.header("sec-fetch-site", "same-origin")
.header("x-requested-with", "XMLHttpRequest")
.send()
.await?
.error_for_status()?;
info!(status = %res.status(), "fetched mod list page");
let text = res.text().await?;
let html = Html::parse_document(&text);
Ok(ModListResponse { html })
/// Generic GraphQL response envelope; a response may carry `data`,
/// `errors`, or both.
#[derive(Debug, Deserialize)]
pub struct GraphQLResponse<T> {
    // Deserialized payload when the query succeeded; absent on failure.
    data: Option<T>,
    // Server-reported GraphQL errors, if any.
    errors: Option<Vec<GraphQLError>>,
}
impl ModListResponse {
#[instrument(skip(self))]
pub fn scrape_mods<'a>(&'a self) -> Result<ModListScrape> {
let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector");
let left_select =
Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector");
let right_select =
Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector");
let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector");
let category_select =
Selector::parse("div.category a").expect("failed to parse CSS selector");
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
let thumbnail_select =
Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
let first_upload_date_select =
Selector::parse("time.date").expect("failed to parse CSS selector");
let last_update_date_select =
Selector::parse("div.date").expect("failed to parse CSS selector");
let next_page_select =
Selector::parse("div.pagination li:last-child a.page-selected").expect("failed to parse CSS selector");
/// A single entry from a GraphQL response's `errors` array.
#[derive(Debug, Deserialize)]
pub struct GraphQLError {
    // Only surfaced via the struct's Debug formatting in error messages,
    // hence never read directly.
    #[allow(dead_code)]
    message: String,
}
let next_page_elem = self.html.select(&next_page_select).next();
/// Top-level `data` payload returned for the `ModsListing` query.
#[derive(Debug, Deserialize)]
pub struct ModsResponse {
    pub mods: ModsData,
}
let has_next_page = next_page_elem.is_none();
/// One page of mod-listing results from the `mods` GraphQL field.
#[derive(Debug, Deserialize)]
pub struct ModsData {
    // Raw facet aggregation data; deserialized but currently unused.
    #[serde(rename = "facetsData")]
    #[allow(dead_code)]
    pub facets_data: Option<Value>,
    // The mods on this page of results.
    pub nodes: Vec<Mod>,
    // Total number of matching mods across all pages; currently unused.
    #[allow(dead_code)]
    #[serde(rename = "totalCount")]
    pub total_count: i32,
}
let mods: Vec<ScrapedMod> = self
.html
.select(&mod_select)
.map(|element| {
let left = element
.select(&left_select)
.next()
.expect("Missing left div for mod");
let right = element
.select(&right_select)
.next()
.expect("Missing right div for mod");
let nexus_mod_id = left
.value()
.attr("data-mod-id")
.expect("Missing mod id attribute")
.parse::<i32>()
.expect("Failed to parse mod id");
let name_elem = right
.select(&name_select)
.next()
.expect("Missing name link for mod");
let name = name_elem.text().next().expect("Missing name text for mod");
let category_elem = right
.select(&category_select)
.next()
.expect("Missing category link for mod");
let category_id = match category_elem.value().attr("href") {
Some(href) => href
.split("/")
.nth(6)
.expect("Missing category id for mod")
.parse::<i32>()
.ok(),
None => None,
};
let category_name = category_elem.text().next();
let author_elem = right
.select(&author_select)
.next()
.expect("Missing author link for mod");
let author_id = author_elem
.value()
.attr("href")
.expect("Missing author link href for mod")
.split("/")
.last()
.expect("Missing author id for mod")
.parse::<i32>()
.expect("Failed to parse author id");
let author_name = author_elem
.text()
.next()
.unwrap_or("Unknown");
let desc_elem = right
.select(&desc_select)
.next()
.expect("Missing desc elem for mod");
let desc = desc_elem.text().next();
let thumbnail_elem = left
.select(&thumbnail_select)
.next()
.expect("Missing thumbnail elem for mod");
let thumbnail_link = thumbnail_elem.value().attr("src");
let first_upload_date_text = right
.select(&first_upload_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let first_upload_at = first_upload_date_text
.skip(2)
.next()
.expect("Missing last update text for mod")
.trim();
let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
.expect("Cannot parse first upload date");
let last_update_date_text = right
.select(&last_update_date_select)
.next()
.expect("Missing dates elem for mod")
.text();
let last_update_at = last_update_date_text
.skip(1)
.next()
.expect("Missing last update text for mod")
.trim();
let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
.expect("Cannot parse last update date");
/// A single mod as returned by the GraphQL `ModFragment` selection.
#[derive(Debug, Deserialize)]
pub struct Mod {
    #[serde(rename = "modId")]
    pub mod_id: i32,
    pub name: String,
    // Short description; may be absent.
    pub summary: Option<String>,
    #[allow(dead_code)]
    pub downloads: i32,
    #[allow(dead_code)]
    pub endorsements: i32,
    // Timestamp string like "2025-05-30T15:29:50Z" (parsed as RFC 3339
    // elsewhere).
    #[serde(rename = "createdAt")]
    pub created_at: String,
    // Timestamp string of the last update, same format as `created_at`.
    #[serde(rename = "updatedAt")]
    pub updated_at: String,
    // Category, when the mod has one assigned.
    #[serde(rename = "modCategory")]
    pub mod_category: Option<ModCategory>,
    pub uploader: Uploader,
    #[serde(rename = "thumbnailUrl")]
    pub thumbnail_url: Option<String>,
}
ScrapedMod {
nexus_mod_id,
name,
category_name,
category_id,
author_name,
author_id,
desc,
thumbnail_link,
last_update_at,
first_upload_at,
}
})
.collect();
info!(
len = mods.len(),
has_next_page, "scraped mods from mod list page"
);
Ok(ModListScrape {
mods,
has_next_page,
/// Category assigned to a mod on Nexus Mods.
#[derive(Debug, Deserialize)]
pub struct ModCategory {
    #[serde(rename = "categoryId")]
    pub category_id: i32,
    pub name: String,
}
/// The Nexus Mods user who uploaded a mod.
#[derive(Debug, Deserialize)]
pub struct Uploader {
    #[serde(rename = "memberId")]
    pub member_id: i32,
    pub name: String,
}
/// Client for the Nexus Mods GraphQL API router.
pub struct NexusScraper {
    // HTTP client provided by the caller (cloned in at construction).
    client: Client,
    // GraphQL endpoint URL.
    base_url: String,
}
impl<'a> ScrapedMod<'a> {
    /// Builds a `ScrapedMod` whose string fields borrow from `api_mod`.
    ///
    /// # Errors
    /// Fails when `createdAt`/`updatedAt` are not valid RFC 3339
    /// timestamps (e.g. "2025-05-30T15:29:50Z").
    pub fn from_api_mod(api_mod: &'a Mod) -> Result<Self> {
        // Turn an RFC 3339 timestamp into its UTC calendar date.
        fn date_of(raw: &str) -> Result<NaiveDate, chrono::ParseError> {
            let parsed = chrono::DateTime::parse_from_rfc3339(raw)?;
            Ok(parsed.naive_utc().date())
        }

        let category = api_mod.mod_category.as_ref();
        Ok(ScrapedMod {
            nexus_mod_id: api_mod.mod_id,
            name: &api_mod.name,
            category_name: category.map(|cat| cat.name.as_str()),
            category_id: category.map(|cat| cat.category_id),
            author_name: &api_mod.uploader.name,
            author_id: api_mod.uploader.member_id,
            desc: api_mod.summary.as_deref(),
            thumbnail_link: api_mod.thumbnail_url.as_deref(),
            last_update_at: date_of(&api_mod.updated_at)?,
            first_upload_at: date_of(&api_mod.created_at)?,
        })
    }
}
/// Converts each API `Mod` into a borrowed `ScrapedMod`, stopping at the
/// first mod whose timestamps fail to parse.
pub fn convert_mods_to_scraped<'a>(api_mods: &'a [Mod]) -> Result<Vec<ScrapedMod<'a>>> {
    let mut scraped = Vec::with_capacity(api_mods.len());
    for api_mod in api_mods {
        scraped.push(ScrapedMod::from_api_mod(api_mod)?);
    }
    Ok(scraped)
}
impl NexusScraper {
    /// Creates a scraper that talks to the Nexus Mods GraphQL API router.
    pub fn new(client: Client) -> Self {
        Self {
            client,
            base_url: "https://api-router.nexusmods.com/graphql".to_string(),
        }
    }

    /// Fetches one page (`PAGE_SIZE` mods) of the listing for `game_domain`,
    /// sorted by most recently updated, starting at `offset`.
    ///
    /// When `include_translations` is true, only mods tagged "Translation"
    /// are returned; when false, translation-tagged mods are excluded.
    ///
    /// # Errors
    /// Fails on network errors, non-success HTTP statuses, undecodable
    /// response bodies, or when the server reports GraphQL errors or
    /// returns no `data`.
    #[instrument(skip(self))]
    pub async fn get_mods(
        &self,
        game_domain: &str,
        offset: usize,
        include_translations: bool,
    ) -> Result<ModsResponse> {
        // Post-filter on the "Translation" tag: select only translations,
        // or exclude them entirely, depending on the flag.
        let filter = if include_translations {
            json!({ "tag": [{ "op": "EQUALS", "value": "Translation" }] })
        } else {
            json!({ "tag": [{ "op": "NOT_EQUALS", "value": "Translation" }] })
        };

        let query = r#"
query ModsListing($count: Int = 0, $facets: ModsFacet, $filter: ModsFilter, $offset: Int, $postFilter: ModsFilter, $sort: [ModsSort!]) {
  mods(
    count: $count
    facets: $facets
    filter: $filter
    offset: $offset
    postFilter: $postFilter
    sort: $sort
    viewUserBlockedContent: false
  ) {
    facetsData
    nodes {
      ...ModFragment
    }
    totalCount
  }
}
fragment ModFragment on Mod {
  adultContent
  createdAt
  downloads
  endorsements
  fileSize
  game {
    domainName
    id
    name
  }
  modCategory {
    categoryId
    name
  }
  modId
  name
  status
  summary
  thumbnailUrl
  thumbnailBlurredUrl
  uid
  updatedAt
  uploader {
    avatar
    memberId
    name
  }
  viewerDownloaded
  viewerEndorsed
  viewerTracked
  viewerUpdateAvailable
}"#;

        let variables = json!({
            // Page size kept in sync with the PAGE_SIZE const that callers
            // use to compute `offset` and detect the last page.
            "count": PAGE_SIZE,
            "facets": {
                "categoryName": [],
                "languageName": [],
                "tag": []
            },
            "filter": {
                "filter": [],
                "gameDomainName": [{"op": "EQUALS", "value": game_domain}],
                "name": []
            },
            "offset": offset,
            "postFilter": filter,
            "sort": {
                "updatedAt": {"direction": "DESC"}
            }
        });

        let request_body = GraphQLRequest {
            query: query.to_string(),
            variables,
            operation_name: "ModsListing".to_string(),
        };

        let response = self
            .client
            .post(&self.base_url)
            .header("Referer", "https://www.nexusmods.com/")
            .header("content-type", "application/json")
            // NOTE(review): this header says "GameModsListing" while the
            // request's operationName is "ModsListing" — presumably copied
            // from the website's own requests; confirm the mismatch is
            // intentional before changing it.
            .header("x-graphql-operationname", "GameModsListing")
            .header("Origin", "https://www.nexusmods.com")
            .header("Sec-Fetch-Dest", "empty")
            .header("Sec-Fetch-Mode", "cors")
            .header("Sec-Fetch-Site", "same-site")
            .json(&request_body)
            .send()
            .await?
            // Surface HTTP-level failures (4xx/5xx) directly instead of
            // letting them show up as a confusing JSON decode error below.
            .error_for_status()?;

        let graphql_response: GraphQLResponse<ModsResponse> = response.json().await?;
        if let Some(errors) = graphql_response.errors {
            return Err(anyhow!("GraphQL errors: {:?}", errors));
        }
        graphql_response
            .data
            .ok_or_else(|| anyhow!("No data returned from GraphQL"))
    }
}