Fix mod page scraping

Scraping requests had been returning 403s for the past month.
This commit is contained in:
Tyler Hallada 2023-06-30 12:30:12 -04:00
parent 694ef6e89b
commit 56864c7f29
7 changed files with 19 additions and 12 deletions

View File

@ -3,7 +3,7 @@ use std::time::Duration;
use tokio::time::sleep;
use tracing::{debug, info, info_span};
use crate::nexus_api::SSE_GAME_ID;
use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME};
use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -26,7 +26,7 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
let page_span = info_span!("page", page);
let _page_span = page_span.enter();
let mod_list_resp =
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_ID, true).await?;
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?;
let scraped = mod_list_resp.scrape_mods()?;
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();

View File

@ -2,6 +2,7 @@ use anyhow::Result;
use chrono::{NaiveDateTime, NaiveTime};
use humansize::{format_size_i, DECIMAL};
use reqwest::StatusCode;
use reqwest::header::{HeaderMap, HeaderValue};
use std::collections::HashSet;
use std::io::SeekFrom;
use std::time::Duration;
@ -13,7 +14,7 @@ use crate::extractors::{self, extract_with_7zip, extract_with_compress_tools, ex
use crate::models::file;
use crate::models::game;
use crate::models::{game_mod, game_mod::UnsavedMod};
use crate::nexus_api::{self, get_game_id};
use crate::nexus_api::{self, get_game_id, USER_AGENT};
use crate::nexus_scraper;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
@ -30,9 +31,12 @@ pub async fn update(
let mut has_next_page = true;
let mut pages_with_no_updates = 0;
let mut headers = HeaderMap::new();
headers.insert("user-agent", HeaderValue::from_static(USER_AGENT));
let client = reqwest::Client::builder()
.timeout(REQUEST_TIMEOUT)
.connect_timeout(CONNECT_TIMEOUT)
.default_headers(headers)
.build()?;
let game_id = get_game_id(game_name).expect("valid game name");
@ -49,6 +53,7 @@ pub async fn update(
let mod_list_resp = nexus_scraper::get_mod_list_page(
&client,
page,
game_name,
game.nexus_game_id,
include_translations,
)

View File

@ -8,7 +8,7 @@ use tokio::fs::File;
use tokio_util::compat::FuturesAsyncReadCompatExt;
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct DownloadLinkResponse {
pub wait: Duration,
@ -30,7 +30,6 @@ pub async fn get(
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await
{
@ -80,7 +79,6 @@ impl DownloadLinkResponse {
let res = match client
.get(self.link()?)
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await
{

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration};
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct FilesResponse {
pub wait: Duration,
@ -34,7 +34,6 @@ pub async fn get(client: &Client, game_name: &str, nexus_mod_id: i32) -> Result<
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await
{

View File

@ -5,7 +5,7 @@ use serde_json::Value;
use std::{env, time::Duration};
use tracing::{info, instrument};
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
use super::{rate_limit_wait_duration, warn_and_sleep};
pub struct ModResponse {
pub wait: Duration,
@ -22,7 +22,6 @@ pub async fn get(client: &Client, game_name: &str, mod_id: i32) -> Result<ModRes
))
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await
{

View File

@ -5,7 +5,7 @@ use std::env;
use tracing::{info, instrument};
use super::files::ApiFile;
use super::{warn_and_sleep, USER_AGENT};
use super::warn_and_sleep;
fn has_plugin(json: &Value) -> Result<bool> {
let node_type = json
@ -50,7 +50,6 @@ pub async fn contains_plugin(client: &Client, api_file: &ApiFile<'_>) -> Result<
.get(metadata_link)
.header("accept", "application/json")
.header("apikey", env::var("NEXUS_API_KEY")?)
.header("user-agent", USER_AGENT)
.send()
.await
{

View File

@ -31,6 +31,7 @@ pub struct ModListScrape<'a> {
pub async fn get_mod_list_page(
client: &Client,
page: usize,
game_name: &str,
game_id: i32,
include_translations: bool,
) -> Result<ModListResponse> {
@ -41,6 +42,12 @@ pub async fn get_mod_list_page(
match include_translations { true => "yes", false => "no" },
page
))
.header("host", "www.nexusmods.com")
.header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name))
.header("sec-fetch-dest", "empty")
.header("sec-fetch-mode", "cors")
.header("sec-fetch-site", "same-origin")
.header("x-requested-with", "XMLHttpRequest")
.send()
.await?
.error_for_status()?;