Fix mod page scraping

Requests for the mod list page had been returning 403s for the past month.
This commit is contained in:
Tyler Hallada 2023-06-30 12:30:12 -04:00
parent 694ef6e89b
commit 56864c7f29
7 changed files with 19 additions and 12 deletions

View File

@ -3,7 +3,7 @@ use std::time::Duration;
use tokio::time::sleep; use tokio::time::sleep;
use tracing::{debug, info, info_span}; use tracing::{debug, info, info_span};
use crate::nexus_api::SSE_GAME_ID; use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME};
use crate::nexus_scraper; use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -26,7 +26,7 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
let page_span = info_span!("page", page); let page_span = info_span!("page", page);
let _page_span = page_span.enter(); let _page_span = page_span.enter();
let mod_list_resp = let mod_list_resp =
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_ID, true).await?; nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?;
let scraped = mod_list_resp.scrape_mods()?; let scraped = mod_list_resp.scrape_mods()?;
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect(); let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();

View File

@ -2,6 +2,7 @@ use anyhow::Result;
use chrono::{NaiveDateTime, NaiveTime}; use chrono::{NaiveDateTime, NaiveTime};
use humansize::{format_size_i, DECIMAL}; use humansize::{format_size_i, DECIMAL};
use reqwest::StatusCode; use reqwest::StatusCode;
use reqwest::header::{HeaderMap, HeaderValue};
use std::collections::HashSet; use std::collections::HashSet;
use std::io::SeekFrom; use std::io::SeekFrom;
use std::time::Duration; use std::time::Duration;
@ -13,7 +14,7 @@ use crate::extractors::{self, extract_with_7zip, extract_with_compress_tools, ex
use crate::models::file; use crate::models::file;
use crate::models::game; use crate::models::game;
use crate::models::{game_mod, game_mod::UnsavedMod}; use crate::models::{game_mod, game_mod::UnsavedMod};
use crate::nexus_api::{self, get_game_id}; use crate::nexus_api::{self, get_game_id, USER_AGENT};
use crate::nexus_scraper; use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -30,9 +31,12 @@ pub async fn update(
let mut has_next_page = true; let mut has_next_page = true;
let mut pages_with_no_updates = 0; let mut pages_with_no_updates = 0;
let mut headers = HeaderMap::new();
headers.insert("user-agent", HeaderValue::from_static(USER_AGENT));
let client = reqwest::Client::builder() let client = reqwest::Client::builder()
.timeout(REQUEST_TIMEOUT) .timeout(REQUEST_TIMEOUT)
.connect_timeout(CONNECT_TIMEOUT) .connect_timeout(CONNECT_TIMEOUT)
.default_headers(headers)
.build()?; .build()?;
let game_id = get_game_id(game_name).expect("valid game name"); let game_id = get_game_id(game_name).expect("valid game name");
@ -49,6 +53,7 @@ pub async fn update(
let mod_list_resp = nexus_scraper::get_mod_list_page( let mod_list_resp = nexus_scraper::get_mod_list_page(
&client, &client,
page, page,
game_name,
game.nexus_game_id, game.nexus_game_id,
include_translations, include_translations,
) )

View File

@ -8,7 +8,7 @@ use tokio::fs::File;
use tokio_util::compat::FuturesAsyncReadCompatExt; use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct DownloadLinkResponse { pub struct DownloadLinkResponse {
pub wait: Duration, pub wait: Duration,
@ -30,7 +30,6 @@ pub async fn get(
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send() .send()
.await .await
{ {
@ -80,7 +79,6 @@ impl DownloadLinkResponse {
let res = match client let res = match client
.get(self.link()?) .get(self.link()?)
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send() .send()
.await .await
{ {

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration}; use std::{env, time::Duration};
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct FilesResponse { pub struct FilesResponse {
pub wait: Duration, pub wait: Duration,
@ -34,7 +34,6 @@ pub async fn get(client: &Client, game_name: &str, nexus_mod_id: i32) -> Result<
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send() .send()
.await .await
{ {

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration}; use std::{env, time::Duration};
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT}; use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct ModResponse { pub struct ModResponse {
pub wait: Duration, pub wait: Duration,
@ -22,7 +22,6 @@ pub async fn get(client: &Client, game_name: &str, mod_id: i32) -> Result<ModRes
)) ))
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send() .send()
.await .await
{ {

View File

@ -5,7 +5,7 @@ use std::env;
use tracing::{info, instrument}; use tracing::{info, instrument};
use super::files::ApiFile; use super::files::ApiFile;
use super::{warn_and_sleep, USER_AGENT}; use super::warn_and_sleep;
fn has_plugin(json: &Value) -> Result<bool> { fn has_plugin(json: &Value) -> Result<bool> {
let node_type = json let node_type = json
@ -50,7 +50,6 @@ pub async fn contains_plugin(client: &Client, api_file: &ApiFile<'_>) -> Result<
.get(metadata_link) .get(metadata_link)
.header("accept", "application/json") .header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?) .header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send() .send()
.await .await
{ {

View File

@ -31,6 +31,7 @@ pub struct ModListScrape<'a> {
pub async fn get_mod_list_page( pub async fn get_mod_list_page(
client: &Client, client: &Client,
page: usize, page: usize,
game_name: &str,
game_id: i32, game_id: i32,
include_translations: bool, include_translations: bool,
) -> Result<ModListResponse> { ) -> Result<ModListResponse> {
@ -41,6 +42,12 @@ pub async fn get_mod_list_page(
match include_translations { true => "yes", false => "no" }, match include_translations { true => "yes", false => "no" },
page page
)) ))
.header("host", "www.nexusmods.com")
.header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name))
.header("sec-fetch-dest", "empty")
.header("sec-fetch-mode", "cors")
.header("sec-fetch-site", "same-origin")
.header("x-requested-with", "XMLHttpRequest")
.send() .send()
.await? .await?
.error_for_status()?; .error_for_status()?;