Add support for scraping classic Skyrim mods

Not adding it to the `update.sh` script quite yet, since I first need to run the very long process of downloading every classic Skyrim mod that currently exists on Nexus Mods.
This commit is contained in:
Tyler Hallada 2022-09-02 00:43:53 -04:00
parent 89428da6e0
commit 7d229ccd1a
14 changed files with 54 additions and 30 deletions

View File

@ -42,7 +42,8 @@ RUST_LOG=mod_mapper=debug
NEXUS_API_KEY=...
```
7. Either run `cargo run` for development mode, or build the release binary with `cargo build --release`, which will get saved to `target/release/`.
7. Build the release binary by running `cargo build --release`.
8. See `./target/release/mod-mapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.
## Sync and Backup Setup

View File

@ -3,7 +3,7 @@ if [ -f cells/edits.json ]; then
last_update_time=$(date -r cells/edits.json +'%Y-%m-%dT%H:%M:%S')
fi
mkdir -p logs
./target/release/mod-mapper &>> logs/modmapper.log
./target/release/mod-mapper -g skyrimspecialedition &>> logs/modmapper.log
mkdir -p cells
mkdir -p mods
mkdir -p files

View File

@ -4,6 +4,7 @@ use tokio::time::sleep;
use tracing::{debug, info, info_span};
use crate::nexus_scraper;
use crate::nexus_api::SSE_GAME_ID;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
const CONNECT_TIMEOUT: Duration = Duration::from_secs(30);
@ -24,7 +25,7 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
while has_next_page {
let page_span = info_span!("page", page);
let _page_span = page_span.enter();
let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page, true).await?;
let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_ID, true).await?;
let scraped = mod_list_resp.scrape_mods()?;
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();

View File

@ -13,7 +13,7 @@ use crate::extractors::{self, extract_with_7zip, extract_with_compress_tools, ex
use crate::models::file;
use crate::models::game;
use crate::models::{game_mod, game_mod::UnsavedMod};
use crate::nexus_api::{self, GAME_ID, GAME_NAME};
use crate::nexus_api::{self, get_game_id};
use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -22,6 +22,7 @@ const CONNECT_TIMEOUT: Duration = Duration::from_secs(30);
pub async fn update(
pool: &sqlx::Pool<sqlx::Postgres>,
start_page: usize,
game_name: &str,
full: bool,
) -> Result<()> {
for include_translations in [false, true] {
@ -29,23 +30,24 @@ pub async fn update(
let mut has_next_page = true;
let mut pages_with_no_updates = 0;
let game = game::insert(&pool, GAME_NAME, GAME_ID as i32).await?;
let client = reqwest::Client::builder()
.timeout(REQUEST_TIMEOUT)
.connect_timeout(CONNECT_TIMEOUT)
.build()?;
let game_id = get_game_id(game_name).expect("valid game name");
let game = game::insert(&pool, game_name, game_id).await?;
while has_next_page {
if !full && pages_with_no_updates >= 50 {
warn!("No updates found for 50 pages in a row, aborting");
break;
}
let page_span = info_span!("page", page, include_translations);
let page_span = info_span!("page", page, game_name, include_translations);
let _page_span = page_span.enter();
let mod_list_resp =
nexus_scraper::get_mod_list_page(&client, page, include_translations).await?;
nexus_scraper::get_mod_list_page(&client, page, game.nexus_game_id, include_translations).await?;
let scraped = mod_list_resp.scrape_mods()?;
has_next_page = scraped.has_next_page;
@ -109,7 +111,7 @@ pub async fn update(
for db_mod in mods {
let mod_span = info_span!("mod", name = ?&db_mod.name, id = &db_mod.nexus_mod_id);
let _mod_span = mod_span.enter();
let files_resp = nexus_api::files::get(&client, db_mod.nexus_mod_id).await?;
let files_resp = nexus_api::files::get(&client, game_name, db_mod.nexus_mod_id).await?;
debug!(duration = ?files_resp.wait, "sleeping");
sleep(files_resp.wait).await;
@ -190,6 +192,7 @@ pub async fn update(
info!(size = %humanized_size, "decided to download file");
let download_link_resp = nexus_api::download_link::get(
&client,
game_name,
db_mod.nexus_mod_id,
api_file.file_id,
)
@ -252,6 +255,7 @@ pub async fn update(
&pool,
&db_file,
&db_mod,
game_name,
checked_metadata,
)
.await
@ -261,7 +265,7 @@ pub async fn update(
// unrar failed to extract rar file (e.g. archive has unicode filenames)
// Attempt to uncompress the archive using `7z` unix command instead
warn!(error = %err, "failed to extract file with unrar, extracting whole archive with 7z instead");
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
extract_with_7zip(&mut file, &pool, &db_file, &db_mod, game_name).await
}
}?;
}
@ -269,7 +273,7 @@ pub async fn update(
tokio_file.seek(SeekFrom::Start(0)).await?;
let mut file = tokio_file.try_clone().await?.into_std().await;
match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod)
match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod, game_name)
.await
{
Ok(_) => Ok(()),
@ -284,7 +288,7 @@ pub async fn update(
// compress_tools or libarchive failed to extract zip/7z file (e.g. archive is deflate64 compressed)
// Attempt to uncompress the archive using `7z` unix command instead
warn!(error = %err, "failed to extract file with compress_tools, extracting whole archive with 7z instead");
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
extract_with_7zip(&mut file, &pool, &db_file, &db_mod, game_name).await
} else {
Err(err)
}

View File

@ -88,13 +88,14 @@ pub async fn extract_with_compress_tools(
pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File,
db_mod: &Mod,
game_name: &str,
) -> Result<()> {
let extractor = Extractor::new(file);
for plugin in extractor.into_iter() {
let (file_path, mut plugin_buf) = plugin?;
let plugin_span = info_span!("plugin", name = ?file_path);
let _plugin_span = plugin_span.enter();
process_plugin(&mut plugin_buf, &pool, &db_file, &db_mod, &file_path).await?;
process_plugin(&mut plugin_buf, &pool, &db_file, &db_mod, &file_path, game_name).await?;
}
Ok(())
}

View File

@ -14,6 +14,7 @@ pub async fn extract_with_7zip(
pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File,
db_mod: &Mod,
game_name: &str,
) -> Result<()> {
file.seek(SeekFrom::Start(0))?;
let temp_dir = tempdir()?;
@ -54,6 +55,7 @@ pub async fn extract_with_7zip(
&db_file,
&db_mod,
&file_path.to_string_lossy(),
game_name,
)
.await?;
}

View File

@ -12,6 +12,7 @@ pub async fn extract_with_unrar(
pool: &sqlx::Pool<sqlx::Postgres>,
db_file: &File,
db_mod: &Mod,
game_name: &str,
checked_metadata: bool,
) -> Result<()> {
let temp_dir = tempdir()?;
@ -80,6 +81,7 @@ pub async fn extract_with_unrar(
&db_file,
&db_mod,
&file_path.to_string_lossy(),
game_name,
)
.await?;
}

View File

@ -24,6 +24,10 @@ struct Args {
/// the page number to start scraping for mods on nexus mods
page: usize,
#[argh(option, short = 'g', default = "String::from(\"skyrimspecialedition\")")]
/// name of nexus game to scrape (e.g. "skyrim" or "skyrimspecialedition")
game: String,
#[argh(switch, short = 'f')]
/// enable full scrape of all pages, rather than stopping after 50 pages of no updates
full: bool,
@ -110,5 +114,5 @@ pub async fn main() -> Result<()> {
return backfill_is_translation(&pool).await;
}
return update(&pool, args.page, args.full).await;
return update(&pool, args.page, &args.game, args.full).await;
}

View File

@ -8,7 +8,7 @@ use tokio::fs::File;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct DownloadLinkResponse {
pub wait: Duration,
@ -16,12 +16,12 @@ pub struct DownloadLinkResponse {
}
#[instrument(skip(client))]
pub async fn get(client: &Client, mod_id: i32, file_id: i64) -> Result<DownloadLinkResponse> {
pub async fn get(client: &Client, game_name: &str, mod_id: i32, file_id: i64) -> Result<DownloadLinkResponse> {
for attempt in 1..=3 {
let res = match client
.get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}/files/{}/download_link.json",
GAME_NAME, mod_id, file_id
game_name, mod_id, file_id
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration};
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct FilesResponse {
pub wait: Duration,
@ -25,12 +25,12 @@ pub struct ApiFile<'a> {
}
#[instrument(skip(client))]
pub async fn get(client: &Client, nexus_mod_id: i32) -> Result<FilesResponse> {
pub async fn get(client: &Client, game_name: &str, nexus_mod_id: i32) -> Result<FilesResponse> {
for attempt in 1..=3 {
let res = match client
.get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}/files.json",
GAME_NAME, nexus_mod_id
game_name, nexus_mod_id
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration};
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, GAME_NAME, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
pub struct ModResponse {
pub wait: Duration,
@ -13,12 +13,12 @@ pub struct ModResponse {
}
#[instrument(skip(client))]
pub async fn get(client: &Client, mod_id: i32) -> Result<ModResponse> {
pub async fn get(client: &Client, game_name: &str, mod_id: i32) -> Result<ModResponse> {
for attempt in 1..=3 {
let res = match client
.get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}.json",
GAME_NAME, mod_id
game_name, mod_id
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)

View File

@ -11,10 +11,20 @@ pub mod files;
pub mod game_mod;
pub mod metadata;
pub static GAME_NAME: &str = "skyrimspecialedition";
pub const GAME_ID: u32 = 1704;
/// Nexus game name for classic Skyrim, as accepted by the `--game` option.
pub const SKYRIM_GAME_NAME: &str = "skyrim";
/// Numeric Nexus game id paired with [`SKYRIM_GAME_NAME`].
pub const SKYRIM_GAME_ID: i32 = 110;
/// Nexus game name for Skyrim Special Edition, as accepted by the `--game` option.
pub const SSE_GAME_NAME: &str = "skyrimspecialedition";
/// Numeric Nexus game id paired with [`SSE_GAME_NAME`].
pub const SSE_GAME_ID: i32 = 1704;
// NOTE(review): presumably sent as the User-Agent header on Nexus requests — usage is not visible here.
pub static USER_AGENT: &str = "mod-mapper/0.1";

/// Resolves a Nexus game name to its numeric game id.
///
/// The comparison is exact (case-sensitive). Returns `Some(id)` for
/// [`SKYRIM_GAME_NAME`] and [`SSE_GAME_NAME`], and `None` for any other name.
pub fn get_game_id(name: &str) -> Option<i32> {
    // Table of every supported game; add a row here to support another game.
    let known_games = [
        (SKYRIM_GAME_NAME, SKYRIM_GAME_ID),
        (SSE_GAME_NAME, SSE_GAME_ID),
    ];
    known_games
        .iter()
        .find(|(game_name, _)| *game_name == name)
        .map(|&(_, game_id)| game_id)
}
pub fn rate_limit_wait_duration(res: &Response) -> Result<std::time::Duration> {
let daily_remaining: i32 = res
.headers()

View File

@ -4,8 +4,6 @@ use reqwest::Client;
use scraper::{Html, Selector};
use tracing::{info, instrument};
use crate::nexus_api::GAME_ID;
pub struct ModListResponse {
html: Html,
}
@ -32,12 +30,13 @@ pub struct ModListScrape<'a> {
pub async fn get_mod_list_page(
client: &Client,
page: usize,
game_id: i32,
include_translations: bool,
) -> Result<ModListResponse> {
let res = client
.get(format!(
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
GAME_ID,
game_id,
match include_translations { true => "yes", false => "no" },
page
))

View File

@ -14,7 +14,6 @@ use crate::models::{plugin, plugin::UnsavedPlugin};
use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell};
use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld};
use crate::models::{world, world::UnsavedWorld};
use crate::nexus_api::GAME_NAME;
fn get_local_form_id_and_master<'a>(
form_id: u32,
@ -35,6 +34,7 @@ pub async fn process_plugin(
db_file: &File,
db_mod: &Mod,
file_path: &str,
game_name: &str,
) -> Result<()> {
if plugin_buf.is_empty() {
warn!("skipping processing of invalid empty plugin");
@ -148,7 +148,7 @@ pub async fn process_plugin(
let plugin_path = [
"plugins",
GAME_NAME,
game_name,
&format!("{}", db_mod.nexus_mod_id),
&format!("{}", db_file.nexus_file_id),
file_path,