2021-07-09 04:37:08 +00:00
|
|
|
use anyhow::Result;
|
2021-09-05 20:02:16 +00:00
|
|
|
use chrono::NaiveDate;
|
2021-07-09 04:37:08 +00:00
|
|
|
use reqwest::Client;
|
|
|
|
use scraper::{Html, Selector};
|
2021-07-11 23:45:26 +00:00
|
|
|
use tracing::{info, instrument};
|
2021-07-09 04:37:08 +00:00
|
|
|
|
|
|
|
pub struct ModListResponse {
|
|
|
|
html: Html,
|
|
|
|
}
|
2021-09-05 20:02:16 +00:00
|
|
|
|
2021-07-09 04:37:08 +00:00
|
|
|
pub struct ScrapedMod<'a> {
|
|
|
|
pub nexus_mod_id: i32,
|
|
|
|
pub name: &'a str,
|
2022-01-18 04:37:58 +00:00
|
|
|
pub category_name: Option<&'a str>,
|
|
|
|
pub category_id: Option<i32>,
|
|
|
|
pub author_name: &'a str,
|
|
|
|
pub author_id: i32,
|
2021-07-09 04:37:08 +00:00
|
|
|
pub desc: Option<&'a str>,
|
2022-01-18 04:37:58 +00:00
|
|
|
pub thumbnail_link: Option<&'a str>,
|
|
|
|
pub last_update_at: NaiveDate,
|
|
|
|
pub first_upload_at: NaiveDate,
|
2021-07-09 04:37:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
pub struct ModListScrape<'a> {
|
|
|
|
pub mods: Vec<ScrapedMod<'a>>,
|
|
|
|
pub has_next_page: bool,
|
|
|
|
}
|
|
|
|
|
2021-07-11 23:45:26 +00:00
|
|
|
#[instrument(skip(client))]
|
2022-03-16 03:59:56 +00:00
|
|
|
pub async fn get_mod_list_page(
|
|
|
|
client: &Client,
|
|
|
|
page: usize,
|
2022-09-02 04:43:53 +00:00
|
|
|
game_id: i32,
|
2022-03-16 03:59:56 +00:00
|
|
|
include_translations: bool,
|
|
|
|
) -> Result<ModListResponse> {
|
2021-07-09 04:37:08 +00:00
|
|
|
let res = client
|
|
|
|
.get(format!(
|
2022-03-16 03:59:56 +00:00
|
|
|
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
|
2022-09-02 04:43:53 +00:00
|
|
|
game_id,
|
2022-03-16 03:59:56 +00:00
|
|
|
match include_translations { true => "yes", false => "no" },
|
2021-07-09 04:37:08 +00:00
|
|
|
page
|
|
|
|
))
|
|
|
|
.send()
|
|
|
|
.await?
|
|
|
|
.error_for_status()?;
|
2021-07-11 23:45:26 +00:00
|
|
|
info!(status = %res.status(), "fetched mod list page");
|
2021-07-09 04:37:08 +00:00
|
|
|
let text = res.text().await?;
|
|
|
|
let html = Html::parse_document(&text);
|
|
|
|
|
|
|
|
Ok(ModListResponse { html })
|
|
|
|
}
|
|
|
|
|
|
|
|
impl ModListResponse {
|
2021-07-11 23:45:26 +00:00
|
|
|
#[instrument(skip(self))]
|
2021-07-09 04:37:08 +00:00
|
|
|
pub fn scrape_mods<'a>(&'a self) -> Result<ModListScrape> {
|
|
|
|
let mod_select = Selector::parse("li.mod-tile").expect("failed to parse CSS selector");
|
|
|
|
let left_select =
|
|
|
|
Selector::parse("div.mod-tile-left").expect("failed to parse CSS selector");
|
|
|
|
let right_select =
|
|
|
|
Selector::parse("div.mod-tile-right").expect("failed to parse CSS selector");
|
|
|
|
let name_select = Selector::parse("p.tile-name a").expect("failed to parse CSS selector");
|
|
|
|
let category_select =
|
|
|
|
Selector::parse("div.category a").expect("failed to parse CSS selector");
|
|
|
|
let author_select = Selector::parse("div.author a").expect("failed to parse CSS selector");
|
|
|
|
let desc_select = Selector::parse("p.desc").expect("failed to parse CSS selector");
|
2022-01-18 04:37:58 +00:00
|
|
|
let thumbnail_select =
|
|
|
|
Selector::parse("a.mod-image img.fore").expect("failed to parse CSS selector");
|
|
|
|
let first_upload_date_select =
|
|
|
|
Selector::parse("time.date").expect("failed to parse CSS selector");
|
|
|
|
let last_update_date_select =
|
|
|
|
Selector::parse("div.date").expect("failed to parse CSS selector");
|
2021-07-09 04:37:08 +00:00
|
|
|
let next_page_select =
|
|
|
|
Selector::parse("div.pagination li.next").expect("failed to parse CSS selector");
|
|
|
|
|
|
|
|
let next_page_elem = self.html.select(&next_page_select).next();
|
|
|
|
|
|
|
|
let has_next_page = next_page_elem.is_some();
|
|
|
|
|
|
|
|
let mods: Vec<ScrapedMod> = self
|
|
|
|
.html
|
|
|
|
.select(&mod_select)
|
|
|
|
.map(|element| {
|
|
|
|
let left = element
|
|
|
|
.select(&left_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing left div for mod");
|
|
|
|
let right = element
|
|
|
|
.select(&right_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing right div for mod");
|
|
|
|
let nexus_mod_id = left
|
|
|
|
.value()
|
|
|
|
.attr("data-mod-id")
|
|
|
|
.expect("Missing mod id attribute")
|
|
|
|
.parse::<i32>()
|
|
|
|
.expect("Failed to parse mod id");
|
|
|
|
let name_elem = right
|
|
|
|
.select(&name_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing name link for mod");
|
|
|
|
let name = name_elem.text().next().expect("Missing name text for mod");
|
|
|
|
let category_elem = right
|
|
|
|
.select(&category_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing category link for mod");
|
2022-01-18 04:37:58 +00:00
|
|
|
let category_id = match category_elem.value().attr("href") {
|
2022-01-18 22:03:22 +00:00
|
|
|
Some(href) => href
|
|
|
|
.split("/")
|
|
|
|
.nth(6)
|
|
|
|
.expect("Missing category id for mod")
|
|
|
|
.parse::<i32>()
|
|
|
|
.ok(),
|
2022-01-18 04:37:58 +00:00
|
|
|
None => None,
|
|
|
|
};
|
|
|
|
let category_name = category_elem.text().next();
|
2021-07-09 04:37:08 +00:00
|
|
|
let author_elem = right
|
|
|
|
.select(&author_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing author link for mod");
|
2022-01-18 04:37:58 +00:00
|
|
|
let author_id = author_elem
|
|
|
|
.value()
|
|
|
|
.attr("href")
|
|
|
|
.expect("Missing author link href for mod")
|
|
|
|
.split("/")
|
|
|
|
.last()
|
|
|
|
.expect("Missing author id for mod")
|
|
|
|
.parse::<i32>()
|
|
|
|
.expect("Failed to parse author id");
|
|
|
|
let author_name = author_elem
|
2021-07-09 04:37:08 +00:00
|
|
|
.text()
|
|
|
|
.next()
|
|
|
|
.expect("Missing author text for mod");
|
|
|
|
let desc_elem = right
|
|
|
|
.select(&desc_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing desc elem for mod");
|
|
|
|
let desc = desc_elem.text().next();
|
2022-01-18 04:37:58 +00:00
|
|
|
let thumbnail_elem = left
|
|
|
|
.select(&thumbnail_select)
|
2021-09-05 20:02:16 +00:00
|
|
|
.next()
|
2022-01-18 04:37:58 +00:00
|
|
|
.expect("Missing thumbnail elem for mod");
|
|
|
|
let thumbnail_link = thumbnail_elem.value().attr("src");
|
|
|
|
let first_upload_date_text = right
|
|
|
|
.select(&first_upload_date_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing dates elem for mod")
|
|
|
|
.text();
|
|
|
|
let first_upload_at = first_upload_date_text
|
|
|
|
.skip(2)
|
|
|
|
.next()
|
|
|
|
.expect("Missing last update text for mod")
|
|
|
|
.trim();
|
|
|
|
let first_upload_at = NaiveDate::parse_from_str(first_upload_at, "%d %b %Y")
|
|
|
|
.expect("Cannot parse first upload date");
|
|
|
|
let last_update_date_text = right
|
|
|
|
.select(&last_update_date_select)
|
|
|
|
.next()
|
|
|
|
.expect("Missing dates elem for mod")
|
|
|
|
.text();
|
|
|
|
let last_update_at = last_update_date_text
|
2021-09-05 20:02:16 +00:00
|
|
|
.skip(1)
|
|
|
|
.next()
|
|
|
|
.expect("Missing last update text for mod")
|
|
|
|
.trim();
|
2022-01-18 04:37:58 +00:00
|
|
|
let last_update_at = NaiveDate::parse_from_str(last_update_at, "%d %b %Y")
|
2021-09-05 20:02:16 +00:00
|
|
|
.expect("Cannot parse last update date");
|
2021-07-09 04:37:08 +00:00
|
|
|
|
|
|
|
ScrapedMod {
|
|
|
|
nexus_mod_id,
|
|
|
|
name,
|
2022-01-18 04:37:58 +00:00
|
|
|
category_name,
|
|
|
|
category_id,
|
|
|
|
author_name,
|
|
|
|
author_id,
|
2021-07-09 04:37:08 +00:00
|
|
|
desc,
|
2022-01-18 04:37:58 +00:00
|
|
|
thumbnail_link,
|
|
|
|
last_update_at,
|
|
|
|
first_upload_at,
|
2021-07-09 04:37:08 +00:00
|
|
|
}
|
|
|
|
})
|
|
|
|
.collect();
|
2021-07-11 23:45:26 +00:00
|
|
|
info!(
|
|
|
|
len = mods.len(),
|
|
|
|
has_next_page, "scraped mods from mod list page"
|
|
|
|
);
|
2021-07-09 04:37:08 +00:00
|
|
|
Ok(ModListScrape {
|
|
|
|
mods,
|
|
|
|
has_next_page,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|