Fix mod page scraping
The mod list page request was returning 403s for the past month.
This commit is contained in:
parent
694ef6e89b
commit
56864c7f29
@ -3,7 +3,7 @@ use std::time::Duration;
|
|||||||
use tokio::time::sleep;
|
use tokio::time::sleep;
|
||||||
use tracing::{debug, info, info_span};
|
use tracing::{debug, info, info_span};
|
||||||
|
|
||||||
use crate::nexus_api::SSE_GAME_ID;
|
use crate::nexus_api::{SSE_GAME_ID, SSE_GAME_NAME};
|
||||||
use crate::nexus_scraper;
|
use crate::nexus_scraper;
|
||||||
|
|
||||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
|
||||||
@ -26,7 +26,7 @@ pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Resul
|
|||||||
let page_span = info_span!("page", page);
|
let page_span = info_span!("page", page);
|
||||||
let _page_span = page_span.enter();
|
let _page_span = page_span.enter();
|
||||||
let mod_list_resp =
|
let mod_list_resp =
|
||||||
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_ID, true).await?;
|
nexus_scraper::get_mod_list_page(&client, page, SSE_GAME_NAME, SSE_GAME_ID, true).await?;
|
||||||
let scraped = mod_list_resp.scrape_mods()?;
|
let scraped = mod_list_resp.scrape_mods()?;
|
||||||
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();
|
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@ use anyhow::Result;
|
|||||||
use chrono::{NaiveDateTime, NaiveTime};
|
use chrono::{NaiveDateTime, NaiveTime};
|
||||||
use humansize::{format_size_i, DECIMAL};
|
use humansize::{format_size_i, DECIMAL};
|
||||||
use reqwest::StatusCode;
|
use reqwest::StatusCode;
|
||||||
|
use reqwest::header::{HeaderMap, HeaderValue};
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::io::SeekFrom;
|
use std::io::SeekFrom;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
@ -13,7 +14,7 @@ use crate::extractors::{self, extract_with_7zip, extract_with_compress_tools, ex
|
|||||||
use crate::models::file;
|
use crate::models::file;
|
||||||
use crate::models::game;
|
use crate::models::game;
|
||||||
use crate::models::{game_mod, game_mod::UnsavedMod};
|
use crate::models::{game_mod, game_mod::UnsavedMod};
|
||||||
use crate::nexus_api::{self, get_game_id};
|
use crate::nexus_api::{self, get_game_id, USER_AGENT};
|
||||||
use crate::nexus_scraper;
|
use crate::nexus_scraper;
|
||||||
|
|
||||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
|
||||||
@ -30,9 +31,12 @@ pub async fn update(
|
|||||||
let mut has_next_page = true;
|
let mut has_next_page = true;
|
||||||
let mut pages_with_no_updates = 0;
|
let mut pages_with_no_updates = 0;
|
||||||
|
|
||||||
|
let mut headers = HeaderMap::new();
|
||||||
|
headers.insert("user-agent", HeaderValue::from_static(USER_AGENT));
|
||||||
let client = reqwest::Client::builder()
|
let client = reqwest::Client::builder()
|
||||||
.timeout(REQUEST_TIMEOUT)
|
.timeout(REQUEST_TIMEOUT)
|
||||||
.connect_timeout(CONNECT_TIMEOUT)
|
.connect_timeout(CONNECT_TIMEOUT)
|
||||||
|
.default_headers(headers)
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
let game_id = get_game_id(game_name).expect("valid game name");
|
let game_id = get_game_id(game_name).expect("valid game name");
|
||||||
@ -49,6 +53,7 @@ pub async fn update(
|
|||||||
let mod_list_resp = nexus_scraper::get_mod_list_page(
|
let mod_list_resp = nexus_scraper::get_mod_list_page(
|
||||||
&client,
|
&client,
|
||||||
page,
|
page,
|
||||||
|
game_name,
|
||||||
game.nexus_game_id,
|
game.nexus_game_id,
|
||||||
include_translations,
|
include_translations,
|
||||||
)
|
)
|
||||||
|
@ -8,7 +8,7 @@ use tokio::fs::File;
|
|||||||
use tokio_util::compat::FuturesAsyncReadCompatExt;
|
use tokio_util::compat::FuturesAsyncReadCompatExt;
|
||||||
use tracing::{info, instrument};
|
use tracing::{info, instrument};
|
||||||
|
|
||||||
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
|
use super::{rate_limit_wait_duration, warn_and_sleep};
|
||||||
|
|
||||||
pub struct DownloadLinkResponse {
|
pub struct DownloadLinkResponse {
|
||||||
pub wait: Duration,
|
pub wait: Duration,
|
||||||
@ -30,7 +30,6 @@ pub async fn get(
|
|||||||
))
|
))
|
||||||
.header("accept", "application/json")
|
.header("accept", "application/json")
|
||||||
.header("apikey", env::var("NEXUS_API_KEY")?)
|
.header("apikey", env::var("NEXUS_API_KEY")?)
|
||||||
.header("user-agent", USER_AGENT)
|
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
@ -80,7 +79,6 @@ impl DownloadLinkResponse {
|
|||||||
let res = match client
|
let res = match client
|
||||||
.get(self.link()?)
|
.get(self.link()?)
|
||||||
.header("apikey", env::var("NEXUS_API_KEY")?)
|
.header("apikey", env::var("NEXUS_API_KEY")?)
|
||||||
.header("user-agent", USER_AGENT)
|
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
|
@ -5,7 +5,7 @@ use serde_json::Value;
|
|||||||
use std::{env, time::Duration};
|
use std::{env, time::Duration};
|
||||||
use tracing::{info, instrument};
|
use tracing::{info, instrument};
|
||||||
|
|
||||||
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
|
use super::{rate_limit_wait_duration, warn_and_sleep};
|
||||||
|
|
||||||
pub struct FilesResponse {
|
pub struct FilesResponse {
|
||||||
pub wait: Duration,
|
pub wait: Duration,
|
||||||
@ -34,7 +34,6 @@ pub async fn get(client: &Client, game_name: &str, nexus_mod_id: i32) -> Result<
|
|||||||
))
|
))
|
||||||
.header("accept", "application/json")
|
.header("accept", "application/json")
|
||||||
.header("apikey", env::var("NEXUS_API_KEY")?)
|
.header("apikey", env::var("NEXUS_API_KEY")?)
|
||||||
.header("user-agent", USER_AGENT)
|
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
|
@ -5,7 +5,7 @@ use serde_json::Value;
|
|||||||
use std::{env, time::Duration};
|
use std::{env, time::Duration};
|
||||||
use tracing::{info, instrument};
|
use tracing::{info, instrument};
|
||||||
|
|
||||||
use super::{rate_limit_wait_duration, warn_and_sleep, USER_AGENT};
|
use super::{rate_limit_wait_duration, warn_and_sleep};
|
||||||
|
|
||||||
pub struct ModResponse {
|
pub struct ModResponse {
|
||||||
pub wait: Duration,
|
pub wait: Duration,
|
||||||
@ -22,7 +22,6 @@ pub async fn get(client: &Client, game_name: &str, mod_id: i32) -> Result<ModRes
|
|||||||
))
|
))
|
||||||
.header("accept", "application/json")
|
.header("accept", "application/json")
|
||||||
.header("apikey", env::var("NEXUS_API_KEY")?)
|
.header("apikey", env::var("NEXUS_API_KEY")?)
|
||||||
.header("user-agent", USER_AGENT)
|
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
|
@ -5,7 +5,7 @@ use std::env;
|
|||||||
use tracing::{info, instrument};
|
use tracing::{info, instrument};
|
||||||
|
|
||||||
use super::files::ApiFile;
|
use super::files::ApiFile;
|
||||||
use super::{warn_and_sleep, USER_AGENT};
|
use super::warn_and_sleep;
|
||||||
|
|
||||||
fn has_plugin(json: &Value) -> Result<bool> {
|
fn has_plugin(json: &Value) -> Result<bool> {
|
||||||
let node_type = json
|
let node_type = json
|
||||||
@ -50,7 +50,6 @@ pub async fn contains_plugin(client: &Client, api_file: &ApiFile<'_>) -> Result<
|
|||||||
.get(metadata_link)
|
.get(metadata_link)
|
||||||
.header("accept", "application/json")
|
.header("accept", "application/json")
|
||||||
.header("apikey", env::var("NEXUS_API_KEY")?)
|
.header("apikey", env::var("NEXUS_API_KEY")?)
|
||||||
.header("user-agent", USER_AGENT)
|
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
|
@ -31,6 +31,7 @@ pub struct ModListScrape<'a> {
|
|||||||
pub async fn get_mod_list_page(
|
pub async fn get_mod_list_page(
|
||||||
client: &Client,
|
client: &Client,
|
||||||
page: usize,
|
page: usize,
|
||||||
|
game_name: &str,
|
||||||
game_id: i32,
|
game_id: i32,
|
||||||
include_translations: bool,
|
include_translations: bool,
|
||||||
) -> Result<ModListResponse> {
|
) -> Result<ModListResponse> {
|
||||||
@ -41,6 +42,12 @@ pub async fn get_mod_list_page(
|
|||||||
match include_translations { true => "yes", false => "no" },
|
match include_translations { true => "yes", false => "no" },
|
||||||
page
|
page
|
||||||
))
|
))
|
||||||
|
.header("host", "www.nexusmods.com")
|
||||||
|
.header("referrer", format!("https://www.nexusmods.com/{}/mods/", game_name))
|
||||||
|
.header("sec-fetch-dest", "empty")
|
||||||
|
.header("sec-fetch-mode", "cors")
|
||||||
|
.header("sec-fetch-site", "same-origin")
|
||||||
|
.header("x-requested-with", "XMLHttpRequest")
|
||||||
.send()
|
.send()
|
||||||
.await?
|
.await?
|
||||||
.error_for_status()?;
|
.error_for_status()?;
|
||||||
|
Loading…
Reference in New Issue
Block a user