Add support for scraping classic Skyrim mods

Not adding it to the update.sh script quite yet, since I first need to run the very long process of downloading every classic Skyrim mod that currently exists on the Nexus.
This commit is contained in:
Tyler Hallada 2022-09-02 00:43:53 -04:00
parent 89428da6e0
commit 7d229ccd1a
14 changed files with 54 additions and 30 deletions

View File

@ -42,7 +42,8 @@ RUST_LOG=mod_mapper=debug
NEXUS_API_KEY=... NEXUS_API_KEY=...
``` ```
7. Either run `cargo run` for development mode, or build the release binary with `cargo build --release`, which will get saved to `target/release/`. 7. Build the release binary by running `cargo build --release`.
8. See `./target/release/mod-mapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.
## Sync and Backup Setup ## Sync and Backup Setup

View File

@ -3,7 +3,7 @@ if [ -f cells/edits.json ]; then
last_update_time=$(date -r cells/edits.json +'%Y-%m-%dT%H:%M:%S') last_update_time=$(date -r cells/edits.json +'%Y-%m-%dT%H:%M:%S')
fi fi
mkdir -p logs mkdir -p logs
./target/release/mod-mapper &>> logs/modmapper.log ./target/release/mod-mapper -g skyrimspecialedition &>> logs/modmapper.log
mkdir -p cells mkdir -p cells
mkdir -p mods mkdir -p mods
mkdir -p files mkdir -p files

View File

@ -4,6 +4,7 @@ use tokio::time::sleep;
use tracing::{debug, info, info_span}; use tracing::{debug, info, info_span};
use crate::nexus_scraper; use crate::nexus_scraper;
use crate::nexus_api::SSE_GAME_ID;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
const CONNECT_TIMEOUT: Duration = Duration::from_secs(30); const CONNECT_TIMEOUT: Duration = Duration::from_secs(30);
@ -24,7 +25,7 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
while has_next_page { while has_next_page {
let page_span = info_span!("page", page); let page_span = info_span!("page", page);
let _page_span = page_span.enter(); let _page_span = page_span.enter();
let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page, true).await?; let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_ID, true).await?;
let scraped = mod_list_resp.scrape_mods()?; let scraped = mod_list_resp.scrape_mods()?;
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect(); let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();

View File

@ -13,7 +13,7 @@ use crate::extractors::{self, extract_with_7zip, extract_with_compress_tools, ex
use crate::models::file; use crate::models::file;
use crate::models::game; use crate::models::game;
use crate::models::{game_mod, game_mod::UnsavedMod}; use crate::models::{game_mod, game_mod::UnsavedMod};
use crate::nexus_api::{self, GAME_ID, GAME_NAME}; use crate::nexus_api::{self, get_game_id};
use crate::nexus_scraper; use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -22,6 +22,7 @@ const CONNECT_TIMEOUT: Duration = Duration::from_secs(30);
pub async fn update( pub async fn update(
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
start_page: usize, start_page: usize,
game_name: &str,
full: bool, full: bool,
) -> Result<()> { ) -> Result<()> {
for include_translations in [false, true] { for include_translations in [false, true] {
@ -29,23 +30,24 @@ pub async fn update(
let mut has_next_page = true; let mut has_next_page = true;
let mut pages_with_no_updates = 0; let mut pages_with_no_updates = 0;
let game = game::insert(&pool, GAME_NAME, GAME_ID as i32).await?;
let client = reqwest::Client::builder() let client = reqwest::Client::builder()
.timeout(REQUEST_TIMEOUT) .timeout(REQUEST_TIMEOUT)
.connect_timeout(CONNECT_TIMEOUT) .connect_timeout(CONNECT_TIMEOUT)
.build()?; .build()?;
let game_id = get_game_id(game_name).expect("valid game name");
let game = game::insert(&pool, game_name, game_id).await?;
while has_next_page { while has_next_page {
if !full && pages_with_no_updates >= 50 { if !full && pages_with_no_updates >= 50 {
warn!("No updates found for 50 pages in a row, aborting"); warn!("No updates found for 50 pages in a row, aborting");
break; break;
} }
let page_span = info_span!("page", page, include_translations); let page_span = info_span!("page", page, game_name, include_translations);
let _page_span = page_span.enter(); let _page_span = page_span.enter();
let mod_list_resp = let mod_list_resp =
nexus_scraper::get_mod_list_page(&client, page, include_translations).await?; nexus_scraper::get_mod_list_page(&client, page, game.nexus_game_id, include_translations).await?;
let scraped = mod_list_resp.scrape_mods()?; let scraped = mod_list_resp.scrape_mods()?;
has_next_page = scraped.has_next_page; has_next_page = scraped.has_next_page;
@ -109,7 +111,7 @@ pub async fn update(
for db_mod in mods { for db_mod in mods {
let mod_span = info_span!("mod", name = ?&db_mod.name, id = &db_mod.nexus_mod_id); let mod_span = info_span!("mod", name = ?&db_mod.name, id = &db_mod.nexus_mod_id);
let _mod_span = mod_span.enter(); let _mod_span = mod_span.enter();
let files_resp = nexus_api::files::get(&client, db_mod.nexus_mod_id).await?; let files_resp = nexus_api::files::get(&client, game_name, db_mod.nexus_mod_id).await?;
debug!(duration = ?files_resp.wait, "sleeping"); debug!(duration = ?files_resp.wait, "sleeping");
sleep(files_resp.wait).await; sleep(files_resp.wait).await;
@ -190,6 +192,7 @@ pub async fn update(
info!(size = %humanized_size, "decided to download file"); info!(size = %humanized_size, "decided to download file");
let download_link_resp = nexus_api::download_link::get( let download_link_resp = nexus_api::download_link::get(
&client, &client,
game_name,
db_mod.nexus_mod_id, db_mod.nexus_mod_id,
api_file.file_id, api_file.file_id,
) )
@ -252,6 +255,7 @@ pub async fn update(
&pool, &pool,
&db_file, &db_file,
&db_mod, &db_mod,
game_name,
checked_metadata, checked_metadata,
) )
.await .await
@ -261,7 +265,7 @@ pub async fn update(
// unrar failed to extract rar file (e.g. archive has unicode filenames) // unrar failed to extract rar file (e.g. archive has unicode filenames)
// Attempt to uncompress the archive using `7z` unix command instead // Attempt to uncompress the archive using `7z` unix command instead
warn!(error = %err, "failed to extract file with unrar, extracting whole archive with 7z instead"); warn!(error = %err, "failed to extract file with unrar, extracting whole archive with 7z instead");
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await extract_with_7zip(&mut file, &pool, &db_file, &db_mod, game_name).await
} }
}?; }?;
} }
@ -269,7 +273,7 @@ pub async fn update(
tokio_file.seek(SeekFrom::Start(0)).await?; tokio_file.seek(SeekFrom::Start(0)).await?;
let mut file = tokio_file.try_clone().await?.into_std().await; let mut file = tokio_file.try_clone().await?.into_std().await;
match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod) match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod, game_name)
.await .await
{ {
Ok(_) => Ok(()), Ok(_) => Ok(()),
@ -284,7 +288,7 @@ pub async fn update(
// compress_tools or libarchive failed to extract zip/7z file (e.g. archive is deflate64 compressed) // compress_tools or libarchive failed to extract zip/7z file (e.g. archive is deflate64 compressed)
// Attempt to uncompress the archive using `7z` unix command instead // Attempt to uncompress the archive using `7z` unix command instead
warn!(error = %err, "failed to extract file with compress_tools, extracting whole archive with 7z instead"); warn!(error = %err, "failed to extract file with compress_tools, extracting whole archive with 7z instead");
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await extract_with_7zip(&mut file, &pool, &db_file, &db_mod, game_name).await
} else { } else {
Err(err) Err(err)
} }

View File

@ -88,13 +88,14 @@ pub async fn extract_with_compress_tools(
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File, db_file: &File,
db_mod: &Mod, db_mod: &Mod,
game_name: &str,
) -> Result<()> { ) -> Result<()> {
let extractor = Extractor::new(file); let extractor = Extractor::new(file);
for plugin in extractor.into_iter() { for plugin in extractor.into_iter() {
let (file_path, mut plugin_buf) = plugin?; let (file_path, mut plugin_buf) = plugin?;
let plugin_span = info_span!("plugin", name = ?file_path); let plugin_span = info_span!("plugin", name = ?file_path);
let _plugin_span = plugin_span.enter(); let _plugin_span = plugin_span.enter();
process_plugin(&mut plugin_buf, &pool, &db_file, &db_mod, &file_path).await?; process_plugin(&mut plugin_buf, &pool, &db_file, &db_mod, &file_path, game_name).await?;
} }
Ok(()) Ok(())
} }

View File

@ -14,6 +14,7 @@ pub async fn extract_with_7zip(
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File, db_file: &File,
db_mod: &Mod, db_mod: &Mod,
game_name: &str,
) -> Result<()> { ) -> Result<()> {
file.seek(SeekFrom::Start(0))?; file.seek(SeekFrom::Start(0))?;
let temp_dir = tempdir()?; let temp_dir = tempdir()?;
@ -54,6 +55,7 @@ pub async fn extract_with_7zip(
&db_file, &db_file,
&db_mod, &db_mod,
&file_path.to_string_lossy(), &file_path.to_string_lossy(),
game_name,
) )
.await?; .await?;
} }

View File

@ -12,6 +12,7 @@ pub async fn extract_with_unrar(
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File, db_file: &File,
db_mod: &Mod, db_mod: &Mod,
game_name: &str,
checked_metadata: bool, checked_metadata: bool,
) -> Result<()> { ) -> Result<()> {
let temp_dir = tempdir()?; let temp_dir = tempdir()?;
@ -80,6 +81,7 @@ pub async fn extract_with_unrar(
&db_file, &db_file,
&db_mod, &db_mod,
&file_path.to_string_lossy(), &file_path.to_string_lossy(),
game_name,
) )
.await?; .await?;
} }

View File

@ -24,6 +24,10 @@ struct Args {
/// the page number to start scraping for mods on nexus mods /// the page number to start scraping for mods on nexus mods
page: usize, page: usize,
#[argh(option, short = 'g', default = "String::from(\"skyrimspecialedition\")")]
/// name of nexus game to scrape (e.g. "skyrim" or "skyrimspecialedition")
game: String,
#[argh(switch, short = 'f')] #[argh(switch, short = 'f')]
/// enable full scrape of all pages, rather than stopping after 50 pages of no updates /// enable full scrape of all pages, rather than stopping after 50 pages of no updates
full: bool, full: bool,
@ -110,5 +114,5 @@ pub async fn main() -> Result<()> {
return backfill_is_translation(&pool).await; return backfill_is_translation(&pool).await;
} }
return update(&pool, args.page, args.full).await; return update(&pool, args.page, &args.game, args.full).await;
} }

View File

@ -8,7 +8,7 @@ use tokio::fs::File;
use tokio_util::compat::FuturesAsyncReadCompatExt; use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct DownloadLinkResponse { pub struct DownloadLinkResponse {
pub wait: Duration, pub wait: Duration,
@ -16,12 +16,12 @@ pub struct DownloadLinkResponse {
} }
#[instrument(skip(client))] #[instrument(skip(client))]
pub async fn get(client: &Client, mod_id: i32, file_id: i64) -> Result<DownloadLinkResponse> { pub async fn get(client: &Client, game_name: &str, mod_id: i32, file_id: i64) -> Result<DownloadLinkResponse> {
for attempt in 1..=3 { for attempt in 1..=3 {
let res = match client let res = match client
.get(format!( .get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}/files/{}/download_link.json", "https://api.nexusmods.com/v1/games/{}/mods/{}/files/{}/download_link.json",
GAME_NAME, mod_id, file_id game_name, mod_id, file_id
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration}; use std::{env, time::Duration};
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct FilesResponse { pub struct FilesResponse {
pub wait: Duration, pub wait: Duration,
@ -25,12 +25,12 @@ pub struct ApiFile<'a> {
} }
#[instrument(skip(client))] #[instrument(skip(client))]
pub async fn get(client: &Client, nexus_mod_id: i32) -> Result<FilesResponse> { pub async fn get(client: &Client, game_name: &str, nexus_mod_id: i32) -> Result<FilesResponse> {
for attempt in 1..=3 { for attempt in 1..=3 {
let res = match client let res = match client
.get(format!( .get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}/files.json", "https://api.nexusmods.com/v1/games/{}/mods/{}/files.json",
GAME_NAME, nexus_mod_id game_name, nexus_mod_id
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration}; use std::{env, time::Duration};
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct ModResponse { pub struct ModResponse {
pub wait: Duration, pub wait: Duration,
@ -13,12 +13,12 @@ pub struct ModResponse {
} }
#[instrument(skip(client))] #[instrument(skip(client))]
pub async fn get(client: &Client, mod_id: i32) -> Result<ModResponse> { pub async fn get(client: &Client, game_name: &str, mod_id: i32) -> Result<ModResponse> {
for attempt in 1..=3 { for attempt in 1..=3 {
let res = match client let res = match client
.get(format!( .get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}.json", "https://api.nexusmods.com/v1/games/{}/mods/{}.json",
GAME_NAME, mod_id game_name, mod_id
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -11,10 +11,20 @@ pub mod files;
pub mod game_mod; pub mod game_mod;
pub mod metadata; pub mod metadata;
pub static GAME_NAME: &str = "skyrimspecialedition"; pub const SKYRIM_GAME_NAME: &str = "skyrim";
pub const GAME_ID: u32 = 1704; pub const SKYRIM_GAME_ID: i32 = 110;
pub const SSE_GAME_NAME: &str = "skyrimspecialedition";
pub const SSE_GAME_ID: i32 = 1704;
pub static USER_AGENT: &str = "mod-mapper/0.1"; pub static USER_AGENT: &str = "mod-mapper/0.1";
pub fn get_game_id(name: &str) -> Option<i32> {
match name {
SKYRIM_GAME_NAME => Some(SKYRIM_GAME_ID),
SSE_GAME_NAME => Some(SSE_GAME_ID),
_ => None,
}
}
pub fn rate_limit_wait_duration(res: &Response) -> Result<std::time::Duration> { pub fn rate_limit_wait_duration(res: &Response) -> Result<std::time::Duration> {
let daily_remaining: i32 = res let daily_remaining: i32 = res
.headers() .headers()

View File

@ -4,8 +4,6 @@ use reqwest::Client;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use tracing::{info, instrument}; use tracing::{info, instrument};
use crate::nexus_api::GAME_ID;
pub struct ModListResponse { pub struct ModListResponse {
html: Html, html: Html,
} }
@ -32,12 +30,13 @@ pub struct ModListScrape<'a> {
pub async fn get_mod_list_page( pub async fn get_mod_list_page(
client: &Client, client: &Client,
page: usize, page: usize,
game_id: i32,
include_translations: bool, include_translations: bool,
) -> Result<ModListResponse> { ) -> Result<ModListResponse> {
let res = client let res = client
.get(format!( .get(format!(
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate", "https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
GAME_ID, game_id,
match include_translations { true => "yes", false => "no" }, match include_translations { true => "yes", false => "no" },
page page
)) ))

View File

@ -14,7 +14,6 @@ use crate::models::{plugin, plugin::UnsavedPlugin};
use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell}; use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell};
use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld}; use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld};
use crate::models::{world, world::UnsavedWorld}; use crate::models::{world, world::UnsavedWorld};
use crate::nexus_api::GAME_NAME;
fn get_local_form_id_and_master<'a>( fn get_local_form_id_and_master<'a>(
form_id: u32, form_id: u32,
@ -35,6 +34,7 @@ pub async fn process_plugin(
db_file: &File, db_file: &File,
db_mod: &Mod, db_mod: &Mod,
file_path: &str, file_path: &str,
game_name: &str,
) -> Result<()> { ) -> Result<()> {
if plugin_buf.is_empty() { if plugin_buf.is_empty() {
warn!("skipping processing of invalid empty plugin"); warn!("skipping processing of invalid empty plugin");
@ -148,7 +148,7 @@ pub async fn process_plugin(
let plugin_path = [ let plugin_path = [
"plugins", "plugins",
GAME_NAME, game_name,
&format!("{}", db_mod.nexus_mod_id), &format!("{}", db_mod.nexus_mod_id),
&format!("{}", db_file.nexus_file_id), &format!("{}", db_file.nexus_file_id),
file_path, file_path,