Modularize scraping and api requests

This commit is contained in:
Tyler Hallada 2021-07-09 00:37:08 -04:00
parent 19350081c3
commit 22757bc475
5 changed files with 416 additions and 292 deletions

View File

@ -1,15 +1,6 @@
use anyhow::{anyhow, Context, Result}; use anyhow::Result;
use chrono::DateTime;
use chrono::Duration;
use chrono::NaiveDateTime;
use chrono::Utc;
use compress_tools::{list_archive_files, uncompress_archive_file}; use compress_tools::{list_archive_files, uncompress_archive_file};
use dotenv::dotenv; use dotenv::dotenv;
use futures::stream::TryStreamExt;
use reqwest::Response;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use skyrim_cell_dump::parse_plugin; use skyrim_cell_dump::parse_plugin;
use sqlx::postgres::PgPoolOptions; use sqlx::postgres::PgPoolOptions;
use std::convert::TryInto; use std::convert::TryInto;
@ -17,14 +8,16 @@ use std::env;
use std::fs::OpenOptions; use std::fs::OpenOptions;
use std::io::Seek; use std::io::Seek;
use std::io::SeekFrom; use std::io::SeekFrom;
use tempfile::{tempdir, tempfile}; use std::time::Duration;
use tempfile::tempdir;
use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tokio::io::{AsyncReadExt, AsyncSeekExt};
use tokio::time::sleep; use tokio::time::sleep;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use unrar::Archive; use unrar::Archive;
use zip::write::{FileOptions, ZipWriter}; use zip::write::{FileOptions, ZipWriter};
mod models; mod models;
mod nexus_api;
mod nexus_scraper;
use models::cell::insert_cell; use models::cell::insert_cell;
use models::file::{insert_file, File}; use models::file::{insert_file, File};
@ -32,51 +25,14 @@ use models::game::insert_game;
use models::game_mod::{get_mod_by_nexus_mod_id, insert_mod, Mod}; use models::game_mod::{get_mod_by_nexus_mod_id, insert_mod, Mod};
use models::plugin::insert_plugin; use models::plugin::insert_plugin;
use models::plugin_cell::insert_plugin_cell; use models::plugin_cell::insert_plugin_cell;
use nexus_api::{GAME_ID, GAME_NAME};
static USER_AGENT: &str = "mod-mapper/0.1";
static GAME_NAME: &str = "skyrimspecialedition";
const GAME_ID: u32 = 1704;
fn rate_limit_wait_duration(res: &Response) -> Result<Option<std::time::Duration>> {
let daily_remaining = res
.headers()
.get("x-rl-daily-remaining")
.expect("No daily remaining in response headers");
let hourly_remaining = res
.headers()
.get("x-rl-hourly-remaining")
.expect("No hourly limit in response headers");
let hourly_reset = res
.headers()
.get("x-rl-hourly-reset")
.expect("No hourly reset in response headers");
dbg!(daily_remaining);
dbg!(hourly_remaining);
if hourly_remaining == "0" {
let hourly_reset = hourly_reset.to_str()?.trim();
let hourly_reset: DateTime<Utc> =
(DateTime::parse_from_str(hourly_reset, "%Y-%m-%d %H:%M:%S %z")?
+ Duration::seconds(5))
.into();
dbg!(hourly_reset);
let duration = (hourly_reset - Utc::now()).to_std()?;
dbg!(duration);
return Ok(Some(duration));
}
Ok(None)
}
async fn process_plugin<W>( async fn process_plugin<W>(
plugin_buf: &mut [u8], plugin_buf: &mut [u8],
pool: &sqlx::Pool<sqlx::Postgres>, pool: &sqlx::Pool<sqlx::Postgres>,
plugin_archive: &mut ZipWriter<W>, plugin_archive: &mut ZipWriter<W>,
name: &str,
db_file: &File, db_file: &File,
mod_obj: &Mod, mod_obj: &Mod,
file_id: i64,
file_name: &str, file_name: &str,
) -> Result<()> ) -> Result<()>
where where
@ -86,7 +42,7 @@ where
let hash = seahash::hash(&plugin_buf); let hash = seahash::hash(&plugin_buf);
let plugin_row = insert_plugin( let plugin_row = insert_plugin(
&pool, &pool,
name, &db_file.name,
hash as i64, hash as i64,
db_file.id, db_file.id,
Some(plugin.header.version as f64), Some(plugin.header.version as f64),
@ -116,7 +72,7 @@ where
plugin_archive.start_file( plugin_archive.start_file(
format!( format!(
"{}/{}/{}/{}", "{}/{}/{}/{}",
GAME_NAME, mod_obj.nexus_mod_id, file_id, file_name GAME_NAME, mod_obj.nexus_mod_id, db_file.nexus_file_id, file_name
), ),
FileOptions::default(), FileOptions::default(),
)?; )?;
@ -126,6 +82,21 @@ where
Ok(()) Ok(())
} }
/// Writes a zip archive to `plugins.zip` that contains a single directory
/// entry named `{GAME_NAME}/{mod_id}/{file_id}`.
///
/// The file is opened for writing and created if it does not exist; it is not
/// opened with truncation.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or if writing the directory
/// entry or finishing the archive fails.
fn initialize_plugins_archive(mod_id: i32, file_id: i32) -> Result<()> {
    let archive_file = OpenOptions::new()
        .write(true)
        .create(true)
        .open("plugins.zip")?;
    let mut writer = ZipWriter::new(archive_file);

    let directory = format!("{}/{}/{}", GAME_NAME, mod_id, file_id);
    writer.add_directory(directory, FileOptions::default())?;

    // `finish` hands back the underlying file, which is not needed here.
    writer.finish()?;
    Ok(())
}
#[tokio::main] #[tokio::main]
pub async fn main() -> Result<()> { pub async fn main() -> Result<()> {
dotenv().ok(); dotenv().ok();
@ -140,263 +111,77 @@ pub async fn main() -> Result<()> {
let mut has_next_page = true; let mut has_next_page = true;
while has_next_page { while has_next_page {
let res = client let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page).await?;
.get(format!( let scraped = mod_list_resp.scrape_mods()?;
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,include_adult:true,page_size:80,show_game_filter:false,open:false,page:{},sort_by:OLD_u_downloads",
GAME_ID,
page
))
.send()
.await?
.error_for_status()?;
let html = res.text().await?;
let document = Html::parse_document(&html);
let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector");
let left_select =
Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector");
let right_select =
Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector");
let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector");
let category_select =
Selector::parse("div.category a").expect("failed to parse CSS selector");
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
let next_page_select =
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
let next_page_elem = document.select(&next_page_select).next(); has_next_page = scraped.has_next_page;
let mut mods = Vec::new();
has_next_page = next_page_elem.is_some(); for scraped_mod in scraped.mods {
if let None = get_mod_by_nexus_mod_id(&pool, scraped_mod.nexus_mod_id).await? {
let mut mods = vec![];
for element in document.select(&mod_select) {
let left = element
.select(&left_select)
.next()
.expect("Missing left div for mod");
let right = element
.select(&right_select)
.next()
.expect("Missing right div for mod");
let nexus_mod_id = left
.value()
.attr("data-mod-id")
.expect("Missing mod id attribute")
.parse::<i32>()
.ok()
.expect("Failed to parse mod id");
let name_elem = right
.select(&name_select)
.next()
.expect("Missing name link for mod");
let name = name_elem.text().next().expect("Missing name text for mod");
let category_elem = right
.select(&category_select)
.next()
.expect("Missing category link for mod");
let category = category_elem
.text()
.next()
.expect("Missing category text for mod");
let author_elem = right
.select(&author_select)
.next()
.expect("Missing author link for mod");
let author = author_elem
.text()
.next()
.expect("Missing author text for mod");
let desc_elem = right
.select(&desc_select)
.next()
.expect("Missing desc elem for mod");
let desc = desc_elem.text().next();
if let None = get_mod_by_nexus_mod_id(&pool, nexus_mod_id).await? {
mods.push( mods.push(
insert_mod(&pool, name, nexus_mod_id, author, category, desc, game.id).await?, insert_mod(
&pool,
scraped_mod.name,
scraped_mod.nexus_mod_id,
scraped_mod.author,
scraped_mod.category,
scraped_mod.desc,
game.id,
)
.await?,
); );
} }
} }
dbg!(mods.len()); dbg!(mods.len());
for mod_obj in mods { for db_mod in mods {
dbg!(&mod_obj.name); dbg!(&db_mod.name);
let res = client let files_resp = nexus_api::files::get(&client, db_mod.nexus_mod_id).await?;
.get(format!( // TODO: download other files than just MAIN files
"https://api.nexusmods.com/v1/games/{}/mods/{}/files.json", // let files = files.into_iter().filter(|file| {
GAME_NAME, mod_obj.nexus_mod_id // if let Some(category_name) = file.get("category_name") {
)) // category_name.as_str() == Some("MAIN")
.header("accept", "application/json") // } else {
.header("apikey", env::var("NEXUS_API_KEY")?) // false
.header("user-agent", USER_AGENT) // }
.send() // });
.await? if let Some(duration) = files_resp.wait {
.error_for_status()?;
if let Some(duration) = rate_limit_wait_duration(&res)? {
sleep(duration).await; sleep(duration).await;
} }
let files = res.json::<Value>().await?; for api_file in files_resp.files()? {
let files = files
.get("files")
.ok_or_else(|| anyhow!("Missing files key in API response"))?
.as_array()
.ok_or_else(|| anyhow!("files value in API response is not an array"))?;
// TODO: download other files than just MAIN files
let files = files.into_iter().filter(|file| {
if let Some(category_name) = file.get("category_name") {
category_name.as_str() == Some("MAIN")
} else {
false
}
});
for file in files {
let file_id = file
.get("file_id")
.ok_or_else(|| anyhow!("Missing file_id key in file in API response"))?
.as_i64()
.ok_or_else(|| anyhow!("file_id value in API response file is not a number"))?;
dbg!(file_id);
let name = file
.get("name")
.ok_or_else(|| anyhow!("Missing name key in file in API response"))?
.as_str()
.ok_or_else(|| anyhow!("name value in API response file is not a string"))?;
let file_name = file
.get("file_name")
.ok_or_else(|| anyhow!("Missing file_name key in file in API response"))?
.as_str()
.ok_or_else(|| {
anyhow!("file_name value in API response file is not a string")
})?;
let category = file
.get("category_name")
.ok_or_else(|| anyhow!("Missing category key in file in API response"))?
.as_str();
let version = file
.get("version")
.ok_or_else(|| anyhow!("Missing version key in file in API response"))?
.as_str();
let mod_version = file
.get("mod_version")
.ok_or_else(|| anyhow!("Missing mod_version key in file in API response"))?
.as_str();
let uploaded_timestamp = file
.get("uploaded_timestamp")
.ok_or_else(|| {
anyhow!("Missing uploaded_timestamp key in file in API response")
})?
.as_i64()
.ok_or_else(|| {
anyhow!("uploaded_timestamp value in API response file is not a number")
})?;
let uploaded_at = NaiveDateTime::from_timestamp(uploaded_timestamp, 0);
let db_file = insert_file( let db_file = insert_file(
&pool, &pool,
name, api_file.name,
file_name, api_file.file_name,
file_id as i32, api_file.file_id as i32,
mod_obj.id, db_mod.id,
category, api_file.category,
version, api_file.version,
mod_version, api_file.mod_version,
uploaded_at, api_file.uploaded_at,
) )
.await?; .await?;
let res = client
.get(format!(
"https://api.nexusmods.com/v1/games/{}/mods/{}/files/{}/download_link.json",
GAME_NAME, mod_obj.nexus_mod_id, file_id
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await?
.error_for_status()?;
let duration = rate_limit_wait_duration(&res)?; let download_link_resp =
nexus_api::download_link::get(&client, db_mod.nexus_mod_id, api_file.file_id)
.await?;
let mut tokio_file = download_link_resp.download_file(&client).await?;
let links = res.json::<Value>().await?; initialize_plugins_archive(db_mod.nexus_mod_id, db_file.nexus_file_id)?;
let link = links let mut plugins_archive = ZipWriter::new_append(
.get(0)
.ok_or_else(|| anyhow!("Links array in API response is missing first element"))?
.get("URI")
.ok_or_else(|| anyhow!("Missing URI key in link in API response"))?
.as_str()
.ok_or_else(|| anyhow!("URI value in API response link is not a string"))?;
let mut tokio_file = tokio::fs::File::from_std(tempfile()?);
let res = client
.get(link)
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await?
.error_for_status()?;
// See: https://github.com/benkay86/async-applied/blob/master/reqwest-tokio-compat/src/main.rs
let mut byte_stream = res
.bytes_stream()
.map_err(|e| futures::io::Error::new(futures::io::ErrorKind::Other, e))
.into_async_read()
.compat();
tokio::io::copy(&mut byte_stream, &mut tokio_file).await?;
let mut plugin_archive = ZipWriter::new(
OpenOptions::new()
.write(true)
.create(true)
.open("plugins.zip")?,
);
plugin_archive.add_directory(
format!("{}/{}/{}", GAME_NAME, mod_obj.nexus_mod_id, file_id),
FileOptions::default(),
)?;
plugin_archive.finish()?;
let mut plugin_archive = ZipWriter::new_append(
OpenOptions::new() OpenOptions::new()
.read(true) .read(true)
.write(true) .write(true)
.open("plugins.zip")?, .open("plugins.zip")?,
)?; )?;
let mut initial_bytes = [0; 8]; let mut initial_bytes = [0; 8];
tokio_file.seek(SeekFrom::Start(0)).await?; tokio_file.seek(SeekFrom::Start(0)).await?;
tokio_file.read_exact(&mut initial_bytes).await?; tokio_file.read_exact(&mut initial_bytes).await?;
let kind = infer::get(&initial_bytes).expect("unknown file type of file download"); let kind = infer::get(&initial_bytes).expect("unknown file type of file download");
dbg!(kind.mime_type()); dbg!(kind.mime_type());
// "application/zip" => {
// let mut archive = ZipArchive::new(reader)?;
// let mut plugin_file_paths = Vec::new();
// for file_name in archive.file_names() {
// dbg!(file_name);
// if file_name.ends_with(".esp")
// || file_name.ends_with(".esm")
// || file_name.ends_with(".esl")
// {
// plugin_file_paths.push(file_name.to_string());
// }
// }
// dbg!(&plugin_file_paths);
// for file_name in plugin_file_paths.iter() {
// let mut file = archive.by_name(file_name)?;
// let plugin = parse_plugin(file)?;
// dbg!(plugin);
// plugin_archive.start_file(
// format!("{}/{}/{}/{}", GAME_NAME, mod_id, file_id, file_name),
// FileOptions::default(),
// )?;
// std::io::copy(&mut file, &mut plugin_archive)?;
// }
// }
// Use unrar to uncompress the entire .rar file to avoid a bug with compress_tools panicking when uncompressing
// certain .rar files: https://github.com/libarchive/libarchive/issues/373
tokio_file.seek(SeekFrom::Start(0)).await?; tokio_file.seek(SeekFrom::Start(0)).await?;
let mut file = tokio_file.try_clone().await?.into_std().await; let mut file = tokio_file.try_clone().await?.into_std().await;
let mut plugin_file_paths = Vec::new(); let mut plugin_file_paths = Vec::new();
@ -419,11 +204,9 @@ pub async fn main() -> Result<()> {
process_plugin( process_plugin(
&mut buf, &mut buf,
&pool, &pool,
&mut plugin_archive, &mut plugins_archive,
name,
&db_file, &db_file,
&mod_obj, &db_mod,
file_id,
file_name, file_name,
) )
.await?; .await?;
@ -433,6 +216,8 @@ pub async fn main() -> Result<()> {
if kind.mime_type() == "application/x-rar-compressed" if kind.mime_type() == "application/x-rar-compressed"
|| kind.mime_type() == "application/vnd.rar" || kind.mime_type() == "application/vnd.rar"
{ {
// Use unrar to uncompress the entire .rar file to avoid a bug with compress_tools panicking when uncompressing
// certain .rar files: https://github.com/libarchive/libarchive/issues/373
tokio_file.seek(SeekFrom::Start(0)).await?; tokio_file.seek(SeekFrom::Start(0)).await?;
let mut file = tokio_file.try_clone().await?.into_std().await; let mut file = tokio_file.try_clone().await?.into_std().await;
let temp_dir = tempdir()?; let temp_dir = tempdir()?;
@ -474,11 +259,9 @@ pub async fn main() -> Result<()> {
process_plugin( process_plugin(
&mut plugin_buf, &mut plugin_buf,
&pool, &pool,
&mut plugin_archive, &mut plugins_archive,
name,
&db_file, &db_file,
&mod_obj, &db_mod,
file_id,
file_name, file_name,
) )
.await?; .await?;
@ -490,8 +273,8 @@ pub async fn main() -> Result<()> {
} }
} }
plugin_archive.finish()?; plugins_archive.finish()?;
if let Some(duration) = duration { if let Some(duration) = download_link_resp.wait {
sleep(duration).await; sleep(duration).await;
} }
} }
@ -500,6 +283,7 @@ pub async fn main() -> Result<()> {
page += 1; page += 1;
dbg!(page); dbg!(page);
dbg!(has_next_page); dbg!(has_next_page);
sleep(Duration::new(1, 0)).await;
} }
Ok(()) Ok(())

View File

@ -0,0 +1,70 @@
use anyhow::{anyhow, Result};
use futures::TryStreamExt;
use reqwest::Client;
use serde_json::Value;
use std::{env, time::Duration};
use tempfile::tempfile;
use tokio::fs::File;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use super::{rate_limit_wait_duration, GAME_NAME, USER_AGENT};
/// Response of the Nexus Mods `download_link.json` endpoint for one mod file.
#[derive(Debug)]
pub struct DownloadLinkResponse {
    /// How long to pause before the next API request, as derived from the
    /// response's rate-limit headers; `None` when no pause was requested.
    pub wait: Option<Duration>,
    /// Raw JSON body of the response.
    json: Value,
}
/// Requests the download links for one file of a mod from the Nexus Mods API.
///
/// Sends a GET to
/// `/v1/games/{GAME_NAME}/mods/{mod_id}/files/{file_id}/download_link.json`,
/// passing the `NEXUS_API_KEY` environment variable as the `apikey` header.
///
/// # Errors
///
/// Returns an error if `NEXUS_API_KEY` cannot be read, the request fails or
/// comes back with an error status, the rate-limit headers cannot be
/// interpreted, or the body is not valid JSON.
pub async fn get(client: &Client, mod_id: i32, file_id: i64) -> Result<DownloadLinkResponse> {
    let url = format!(
        "https://api.nexusmods.com/v1/games/{}/mods/{}/files/{}/download_link.json",
        GAME_NAME, mod_id, file_id
    );
    let api_key = env::var("NEXUS_API_KEY")?;

    let request = client
        .get(url)
        .header("accept", "application/json")
        .header("apikey", api_key)
        .header("user-agent", USER_AGENT);
    let response = request.send().await?.error_for_status()?;

    // The headers must be inspected before `json()` consumes the response.
    let wait = rate_limit_wait_duration(&response)?;
    let json: Value = response.json().await?;

    Ok(DownloadLinkResponse { wait, json })
}
impl DownloadLinkResponse {
pub fn link<'a>(&'a self) -> Result<&'a str> {
let link = self
.json
.get(0)
.ok_or_else(|| anyhow!("Links array in API response is missing first element"))?
.get("URI")
.ok_or_else(|| anyhow!("Missing URI key in link in API response"))?
.as_str()
.ok_or_else(|| anyhow!("URI value in API response link is not a string"))?;
Ok(link)
}
pub async fn download_file(&self, client: &Client) -> Result<File> {
let mut tokio_file = File::from_std(tempfile()?);
let res = client
.get(self.link()?)
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await?
.error_for_status()?;
// See: https://github.com/benkay86/async-applied/blob/master/reqwest-tokio-compat/src/main.rs
let mut byte_stream = res
.bytes_stream()
.map_err(|e| futures::io::Error::new(futures::io::ErrorKind::Other, e))
.into_async_read()
.compat();
tokio::io::copy(&mut byte_stream, &mut tokio_file).await?;
return Ok(tokio_file);
}
}

107
src/nexus_api/files.rs Normal file
View File

@ -0,0 +1,107 @@
use anyhow::{anyhow, Result};
use chrono::NaiveDateTime;
use reqwest::Client;
use serde_json::Value;
use std::{env, time::Duration};
use super::{rate_limit_wait_duration, GAME_NAME, USER_AGENT};
/// Response of the Nexus Mods `files.json` endpoint for one mod.
#[derive(Debug)]
pub struct FilesResponse {
    /// How long to pause before the next API request, as derived from the
    /// response's rate-limit headers; `None` when no pause was requested.
    pub wait: Option<Duration>,
    /// Raw JSON body of the response.
    json: Value,
}
/// One entry of the `files` array of a `files.json` response, borrowing its
/// strings from the response JSON.
#[derive(Debug, Clone, PartialEq)]
pub struct ApiFile<'a> {
    /// Value of the `file_id` key.
    pub file_id: i64,
    /// Value of the `name` key.
    pub name: &'a str,
    /// Value of the `file_name` key.
    pub file_name: &'a str,
    /// Value of the `category_name` key, or `None` when it is not a string.
    pub category: Option<&'a str>,
    /// Value of the `version` key, or `None` when it is not a string.
    pub version: Option<&'a str>,
    /// Value of the `mod_version` key, or `None` when it is not a string.
    pub mod_version: Option<&'a str>,
    /// The `uploaded_timestamp` key, interpreted as seconds, converted to a date-time.
    pub uploaded_at: NaiveDateTime,
}
/// Requests the list of files of a mod from the Nexus Mods API.
///
/// Sends a GET to `/v1/games/{GAME_NAME}/mods/{nexus_mod_id}/files.json`,
/// passing the `NEXUS_API_KEY` environment variable as the `apikey` header.
///
/// # Errors
///
/// Returns an error if `NEXUS_API_KEY` cannot be read, the request fails or
/// comes back with an error status, the rate-limit headers cannot be
/// interpreted, or the body is not valid JSON.
pub async fn get(client: &Client, nexus_mod_id: i32) -> Result<FilesResponse> {
    let url = format!(
        "https://api.nexusmods.com/v1/games/{}/mods/{}/files.json",
        GAME_NAME, nexus_mod_id
    );
    let api_key = env::var("NEXUS_API_KEY")?;

    let request = client
        .get(url)
        .header("accept", "application/json")
        .header("apikey", api_key)
        .header("user-agent", USER_AGENT);
    let response = request.send().await?.error_for_status()?;

    // The headers must be inspected before `json()` consumes the response.
    let wait = rate_limit_wait_duration(&response)?;
    let json: Value = response.json().await?;

    Ok(FilesResponse { wait, json })
}
impl FilesResponse {
pub fn files<'a>(&'a self) -> Result<Vec<ApiFile<'a>>> {
let files = self
.json
.get("files")
.ok_or_else(|| anyhow!("Missing files key in API response"))?
.as_array()
.ok_or_else(|| anyhow!("files value in API response is not an array"))?;
files
.into_iter()
.map(|file| {
let file_id = file
.get("file_id")
.ok_or_else(|| anyhow!("Missing file_id key in file in API response"))?
.as_i64()
.ok_or_else(|| anyhow!("file_id value in API response file is not a number"))?;
dbg!(file_id);
let name = file
.get("name")
.ok_or_else(|| anyhow!("Missing name key in file in API response"))?
.as_str()
.ok_or_else(|| anyhow!("name value in API response file is not a string"))?;
let file_name = file
.get("file_name")
.ok_or_else(|| anyhow!("Missing file_name key in file in API response"))?
.as_str()
.ok_or_else(|| {
anyhow!("file_name value in API response file is not a string")
})?;
let category = file
.get("category_name")
.ok_or_else(|| anyhow!("Missing category key in file in API response"))?
.as_str();
let version = file
.get("version")
.ok_or_else(|| anyhow!("Missing version key in file in API response"))?
.as_str();
let mod_version = file
.get("mod_version")
.ok_or_else(|| anyhow!("Missing mod_version key in file in API response"))?
.as_str();
let uploaded_timestamp = file
.get("uploaded_timestamp")
.ok_or_else(|| {
anyhow!("Missing uploaded_timestamp key in file in API response")
})?
.as_i64()
.ok_or_else(|| {
anyhow!("uploaded_timestamp value in API response file is not a number")
})?;
let uploaded_at = NaiveDateTime::from_timestamp(uploaded_timestamp, 0);
Ok(ApiFile {
file_id,
name,
file_name,
category,
version,
mod_version,
uploaded_at,
})
})
.collect()
}
}

44
src/nexus_api/mod.rs Normal file
View File

@ -0,0 +1,44 @@
use anyhow::Result;
use chrono::DateTime;
use chrono::Duration;
use chrono::Utc;
use reqwest::Response;
/// Request and response types for the `download_link.json` endpoint.
pub mod download_link;
/// Request and response types for the `files.json` endpoint.
pub mod files;
/// Game name placed in the `/v1/games/{}` segment of Nexus Mods API URLs.
pub static GAME_NAME: &str = "skyrimspecialedition";
/// Numeric game id sent as `game_id` when requesting the nexusmods.com mod list.
pub const GAME_ID: u32 = 1704;
/// Value sent in the `user-agent` header of Nexus Mods API requests.
pub static USER_AGENT: &str = "mod-mapper/0.1";
pub fn rate_limit_wait_duration(res: &Response) -> Result<Option<std::time::Duration>> {
let daily_remaining = res
.headers()
.get("x-rl-daily-remaining")
.expect("No daily remaining in response headers");
let hourly_remaining = res
.headers()
.get("x-rl-hourly-remaining")
.expect("No hourly limit in response headers");
let hourly_reset = res
.headers()
.get("x-rl-hourly-reset")
.expect("No hourly reset in response headers");
dbg!(daily_remaining);
dbg!(hourly_remaining);
if hourly_remaining == "0" {
let hourly_reset = hourly_reset.to_str()?.trim();
let hourly_reset: DateTime<Utc> =
(DateTime::parse_from_str(hourly_reset, "%Y-%m-%d %H:%M:%S %z")?
+ Duration::seconds(5))
.into();
dbg!(hourly_reset);
let duration = (hourly_reset - Utc::now()).to_std()?;
dbg!(duration);
return Ok(Some(duration));
}
Ok(None)
}

119
src/nexus_scraper.rs Normal file
View File

@ -0,0 +1,119 @@
use anyhow::Result;
use reqwest::Client;
use scraper::{Html, Selector};
use crate::nexus_api::GAME_ID;
/// A fetched page of the nexusmods.com mod list, held as a parsed HTML document.
#[derive(Debug)]
pub struct ModListResponse {
    /// Parsed HTML of the page body.
    html: Html,
}
/// One mod tile scraped from a mod list page, borrowing its text from the
/// parsed HTML document.
#[derive(Debug, Clone, PartialEq)]
pub struct ScrapedMod<'a> {
    /// Value of the tile's `data-mod-id` attribute.
    pub nexus_mod_id: i32,
    /// Text of the tile's name link.
    pub name: &'a str,
    /// Text of the tile's category link.
    pub category: &'a str,
    /// Text of the tile's author link.
    pub author: &'a str,
    /// First text node of the tile's description, if it has one.
    pub desc: Option<&'a str>,
}

/// Everything scraped from one mod list page.
#[derive(Debug, Clone, PartialEq)]
pub struct ModListScrape<'a> {
    /// The mods found on the page, in document order.
    pub mods: Vec<ScrapedMod<'a>>,
    /// Whether the page's pagination contains a "next" entry.
    pub has_next_page: bool,
}
/// Fetches one page of the nexusmods.com mod list widget for `GAME_ID` and
/// parses the body as an HTML document.
///
/// The request asks for 80 mods per page, includes adult mods and sorts by
/// `OLD_u_downloads`; `page` selects which page to fetch.
///
/// # Errors
///
/// Returns an error if the request fails, comes back with an error status, or
/// the body cannot be read as text.
pub async fn get_mod_list_page(client: &Client, page: i32) -> Result<ModListResponse> {
    let url = format!(
        "https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,include_adult:true,page_size:80,show_game_filter:false,open:false,page:{},sort_by:OLD_u_downloads",
        GAME_ID,
        page
    );
    let response = client.get(url).send().await?.error_for_status()?;
    let body = response.text().await?;

    Ok(ModListResponse {
        html: Html::parse_document(&body),
    })
}
impl ModListResponse {
pub fn scrape_mods<'a>(&'a self) -> Result<ModListScrape> {
let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector");
let left_select =
Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector");
let right_select =
Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector");
let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector");
let category_select =
Selector::parse("div.category a").expect("failed to parse CSS selector");
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
let next_page_select =
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
let next_page_elem = self.html.select(&next_page_select).next();
let has_next_page = next_page_elem.is_some();
let mods: Vec<ScrapedMod> = self
.html
.select(&mod_select)
.map(|element| {
let left = element
.select(&left_select)
.next()
.expect("Missing left div for mod");
let right = element
.select(&right_select)
.next()
.expect("Missing right div for mod");
let nexus_mod_id = left
.value()
.attr("data-mod-id")
.expect("Missing mod id attribute")
.parse::<i32>()
.ok()
.expect("Failed to parse mod id");
let name_elem = right
.select(&name_select)
.next()
.expect("Missing name link for mod");
let name = name_elem.text().next().expect("Missing name text for mod");
let category_elem = right
.select(&category_select)
.next()
.expect("Missing category link for mod");
let category = category_elem
.text()
.next()
.expect("Missing category text for mod");
let author_elem = right
.select(&author_select)
.next()
.expect("Missing author link for mod");
let author = author_elem
.text()
.next()
.expect("Missing author text for mod");
let desc_elem = right
.select(&desc_select)
.next()
.expect("Missing desc elem for mod");
let desc = desc_elem.text().next();
ScrapedMod {
nexus_mod_id,
name,
category,
author,
desc,
}
})
.collect();
dbg!(mods.len());
Ok(ModListScrape {
mods,
has_next_page,
})
}
}