Add is_translation to mod with backfill
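The update command now scrapes non-translation mods and translation mods in two separate passes, so that is_translation is set correctly for every mod it creates or updates from now on. The new backfill command sets is_translation on translation mods that are already in the database.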
This commit is contained in:
parent
5d55e78283
commit
a42c22cf4b
1
migrations/20220316031403_add_is_translation_to_mods.sql
Normal file
1
migrations/20220316031403_add_is_translation_to_mods.sql
Normal file
@ -0,0 +1 @@
|
|||||||
|
ALTER TABLE "mods" ADD COLUMN "is_translation" BOOL NOT NULL DEFAULT FALSE;
|
54
src/commands/backfills/is_translation.rs
Normal file
54
src/commands/backfills/is_translation.rs
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
use anyhow::{Context, Result};
|
||||||
|
use std::time::Duration;
|
||||||
|
use tokio::time::sleep;
|
||||||
|
use tracing::{debug, info, info_span};
|
||||||
|
|
||||||
|
use crate::nexus_scraper;
|
||||||
|
|
||||||
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(7200); // 2 hours
|
||||||
|
const CONNECT_TIMEOUT: Duration = Duration::from_secs(30);
|
||||||
|
|
||||||
|
struct UpdatedMods {
|
||||||
|
id: i32,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn backfill_is_translation(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
|
||||||
|
let mut page = 0;
|
||||||
|
let mut has_next_page = true;
|
||||||
|
|
||||||
|
let client = reqwest::Client::builder()
|
||||||
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.connect_timeout(CONNECT_TIMEOUT)
|
||||||
|
.build()?;
|
||||||
|
|
||||||
|
while has_next_page {
|
||||||
|
let page_span = info_span!("page", page);
|
||||||
|
let _page_span = page_span.enter();
|
||||||
|
let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page, true).await?;
|
||||||
|
let scraped = mod_list_resp.scrape_mods()?;
|
||||||
|
let scraped_ids: Vec<i32> = scraped.mods.iter().map(|m| m.nexus_mod_id).collect();
|
||||||
|
|
||||||
|
has_next_page = scraped.has_next_page;
|
||||||
|
|
||||||
|
let updated_ids: Vec<i32> = sqlx::query_as!(
|
||||||
|
UpdatedMods,
|
||||||
|
"UPDATE mods
|
||||||
|
SET is_translation = true
|
||||||
|
WHERE nexus_mod_id = ANY($1::int[])
|
||||||
|
RETURNING id",
|
||||||
|
&scraped_ids,
|
||||||
|
)
|
||||||
|
.fetch_all(pool)
|
||||||
|
.await
|
||||||
|
.context("Failed to update mod is_translation values")?
|
||||||
|
.iter()
|
||||||
|
.map(|u| u.id)
|
||||||
|
.collect();
|
||||||
|
info!(?updated_ids, "updated mods is_translation values");
|
||||||
|
|
||||||
|
page += 1;
|
||||||
|
debug!(?page, ?has_next_page, "sleeping 1 second");
|
||||||
|
sleep(Duration::from_secs(1)).await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
3
src/commands/backfills/mod.rs
Normal file
3
src/commands/backfills/mod.rs
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
pub mod is_translation;
|
||||||
|
|
||||||
|
pub use is_translation::backfill_is_translation;
|
@ -1,3 +1,4 @@
|
|||||||
|
pub mod backfills;
|
||||||
pub mod download_tiles;
|
pub mod download_tiles;
|
||||||
pub mod dump_cell_data;
|
pub mod dump_cell_data;
|
||||||
pub mod dump_cell_edit_counts;
|
pub mod dump_cell_edit_counts;
|
||||||
|
@ -24,276 +24,286 @@ pub async fn update(
|
|||||||
start_page: usize,
|
start_page: usize,
|
||||||
full: bool,
|
full: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut page = start_page;
|
for include_translations in [false, true] {
|
||||||
let mut has_next_page = true;
|
let mut page = start_page;
|
||||||
let mut pages_with_no_updates = 0;
|
let mut has_next_page = true;
|
||||||
|
let mut pages_with_no_updates = 0;
|
||||||
|
|
||||||
let game = game::insert(&pool, GAME_NAME, GAME_ID as i32).await?;
|
let game = game::insert(&pool, GAME_NAME, GAME_ID as i32).await?;
|
||||||
|
|
||||||
let client = reqwest::Client::builder()
|
let client = reqwest::Client::builder()
|
||||||
.timeout(REQUEST_TIMEOUT)
|
.timeout(REQUEST_TIMEOUT)
|
||||||
.connect_timeout(CONNECT_TIMEOUT)
|
.connect_timeout(CONNECT_TIMEOUT)
|
||||||
.build()?;
|
.build()?;
|
||||||
|
|
||||||
while has_next_page {
|
while has_next_page {
|
||||||
if !full && pages_with_no_updates >= 50 {
|
if !full && pages_with_no_updates >= 50 {
|
||||||
warn!("No updates found for 50 pages in a row, aborting");
|
warn!("No updates found for 50 pages in a row, aborting");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
let page_span = info_span!("page", page);
|
let page_span = info_span!("page", page, include_translations);
|
||||||
let _page_span = page_span.enter();
|
let _page_span = page_span.enter();
|
||||||
let mod_list_resp = nexus_scraper::get_mod_list_page(&client, page).await?;
|
let mod_list_resp =
|
||||||
let scraped = mod_list_resp.scrape_mods()?;
|
nexus_scraper::get_mod_list_page(&client, page, include_translations).await?;
|
||||||
|
let scraped = mod_list_resp.scrape_mods()?;
|
||||||
|
|
||||||
has_next_page = scraped.has_next_page;
|
has_next_page = scraped.has_next_page;
|
||||||
let processed_mods = game_mod::bulk_get_last_updated_by_nexus_mod_ids(
|
let processed_mods = game_mod::bulk_get_last_updated_by_nexus_mod_ids(
|
||||||
&pool,
|
&pool,
|
||||||
&scraped
|
&scraped
|
||||||
|
.mods
|
||||||
|
.iter()
|
||||||
|
.map(|scraped_mod| scraped_mod.nexus_mod_id)
|
||||||
|
.collect::<Vec<i32>>(),
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let mods_to_create_or_update: Vec<UnsavedMod> = scraped
|
||||||
.mods
|
.mods
|
||||||
.iter()
|
.iter()
|
||||||
.map(|scraped_mod| scraped_mod.nexus_mod_id)
|
.filter(|scraped_mod| {
|
||||||
.collect::<Vec<i32>>(),
|
if let Some(processed_mod) = processed_mods.iter().find(|processed_mod| {
|
||||||
)
|
processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id
|
||||||
.await?;
|
}) {
|
||||||
let mods_to_create_or_update: Vec<UnsavedMod> = scraped
|
if processed_mod.last_updated_files_at
|
||||||
.mods
|
> NaiveDateTime::new(
|
||||||
.iter()
|
scraped_mod.last_update_at,
|
||||||
.filter(|scraped_mod| {
|
NaiveTime::from_hms(0, 0, 0),
|
||||||
if let Some(processed_mod) = processed_mods
|
)
|
||||||
.iter()
|
{
|
||||||
.find(|processed_mod| processed_mod.nexus_mod_id == scraped_mod.nexus_mod_id)
|
return false;
|
||||||
{
|
}
|
||||||
if processed_mod.last_updated_files_at
|
|
||||||
> NaiveDateTime::new(
|
|
||||||
scraped_mod.last_update_at,
|
|
||||||
NaiveTime::from_hms(0, 0, 0),
|
|
||||||
)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
}
|
true
|
||||||
true
|
})
|
||||||
})
|
.map(|scraped_mod| UnsavedMod {
|
||||||
.map(|scraped_mod| UnsavedMod {
|
name: scraped_mod.name,
|
||||||
name: scraped_mod.name,
|
nexus_mod_id: scraped_mod.nexus_mod_id,
|
||||||
nexus_mod_id: scraped_mod.nexus_mod_id,
|
author_name: scraped_mod.author_name,
|
||||||
author_name: scraped_mod.author_name,
|
author_id: scraped_mod.author_id,
|
||||||
author_id: scraped_mod.author_id,
|
category_name: scraped_mod.category_name,
|
||||||
category_name: scraped_mod.category_name,
|
category_id: scraped_mod.category_id,
|
||||||
category_id: scraped_mod.category_id,
|
description: scraped_mod.desc,
|
||||||
description: scraped_mod.desc,
|
thumbnail_link: scraped_mod.thumbnail_link,
|
||||||
thumbnail_link: scraped_mod.thumbnail_link,
|
game_id: game.id,
|
||||||
game_id: game.id,
|
is_translation: include_translations,
|
||||||
last_update_at: NaiveDateTime::new(
|
last_update_at: NaiveDateTime::new(
|
||||||
scraped_mod.last_update_at,
|
scraped_mod.last_update_at,
|
||||||
NaiveTime::from_hms(0, 0, 0),
|
NaiveTime::from_hms(0, 0, 0),
|
||||||
),
|
),
|
||||||
first_upload_at: NaiveDateTime::new(
|
first_upload_at: NaiveDateTime::new(
|
||||||
scraped_mod.first_upload_at,
|
scraped_mod.first_upload_at,
|
||||||
NaiveTime::from_hms(0, 0, 0),
|
NaiveTime::from_hms(0, 0, 0),
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let mods = game_mod::batched_insert(&pool, &mods_to_create_or_update).await?;
|
let mods = game_mod::batched_insert(&pool, &mods_to_create_or_update).await?;
|
||||||
|
|
||||||
if mods.is_empty() {
|
if mods.is_empty() {
|
||||||
pages_with_no_updates += 1;
|
pages_with_no_updates += 1;
|
||||||
} else {
|
} else {
|
||||||
pages_with_no_updates = 0;
|
pages_with_no_updates = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for db_mod in mods {
|
for db_mod in mods {
|
||||||
let mod_span = info_span!("mod", name = ?&db_mod.name, id = &db_mod.nexus_mod_id);
|
let mod_span = info_span!("mod", name = ?&db_mod.name, id = &db_mod.nexus_mod_id);
|
||||||
let _mod_span = mod_span.enter();
|
let _mod_span = mod_span.enter();
|
||||||
let files_resp = nexus_api::files::get(&client, db_mod.nexus_mod_id).await?;
|
let files_resp = nexus_api::files::get(&client, db_mod.nexus_mod_id).await?;
|
||||||
|
|
||||||
debug!(duration = ?files_resp.wait, "sleeping");
|
debug!(duration = ?files_resp.wait, "sleeping");
|
||||||
sleep(files_resp.wait).await;
|
sleep(files_resp.wait).await;
|
||||||
|
|
||||||
// Filter out replaced/deleted files (indicated by null category) and archived files
|
// Filter out replaced/deleted files (indicated by null category) and archived files
|
||||||
let files = files_resp
|
let files = files_resp
|
||||||
.files()?
|
.files()?
|
||||||
.into_iter()
|
|
||||||
.filter(|file| match file.category {
|
|
||||||
None => {
|
|
||||||
info!(
|
|
||||||
name = file.file_name,
|
|
||||||
id = file.file_id,
|
|
||||||
"skipping file with no category"
|
|
||||||
);
|
|
||||||
false
|
|
||||||
}
|
|
||||||
Some(category) if category == "ARCHIVED" => false,
|
|
||||||
Some(_) => true,
|
|
||||||
});
|
|
||||||
|
|
||||||
let processed_file_ids: HashSet<i32> =
|
|
||||||
file::get_processed_nexus_file_ids_by_mod_id(&pool, db_mod.id)
|
|
||||||
.await?
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.collect();
|
.filter(|file| match file.category {
|
||||||
|
None => {
|
||||||
for api_file in files {
|
info!(
|
||||||
let file_span =
|
name = file.file_name,
|
||||||
info_span!("file", name = &api_file.file_name, id = &api_file.file_id,);
|
id = file.file_id,
|
||||||
let _file_span = file_span.enter();
|
"skipping file with no category"
|
||||||
|
|
||||||
if processed_file_ids.contains(&(api_file.file_id as i32)) {
|
|
||||||
info!("skipping file already present and processed in database");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let db_file = file::insert(
|
|
||||||
&pool,
|
|
||||||
&file::UnsavedFile {
|
|
||||||
name: api_file.name,
|
|
||||||
file_name: api_file.file_name,
|
|
||||||
nexus_file_id: api_file.file_id as i32,
|
|
||||||
mod_id: db_mod.id,
|
|
||||||
category: api_file.category,
|
|
||||||
version: api_file.version,
|
|
||||||
mod_version: api_file.mod_version,
|
|
||||||
size: api_file.size,
|
|
||||||
uploaded_at: api_file.uploaded_at,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let mut checked_metadata = false;
|
|
||||||
match nexus_api::metadata::contains_plugin(&client, &api_file).await {
|
|
||||||
Ok(contains_plugin) => {
|
|
||||||
if let Some(contains_plugin) = contains_plugin {
|
|
||||||
checked_metadata = true;
|
|
||||||
if !contains_plugin {
|
|
||||||
info!("file metadata does not contain a plugin, skip downloading");
|
|
||||||
file::update_has_plugin(&pool, db_file.id, false).await?;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
warn!("file has no metadata link, continuing with download");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(err) => {
|
|
||||||
warn!(error = %err, "error retreiving metadata for file, continuing with download");
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let humanized_size = api_file
|
|
||||||
.size
|
|
||||||
.file_size(file_size_opts::CONVENTIONAL)
|
|
||||||
.expect("unable to create human-readable file size");
|
|
||||||
info!(size = %humanized_size, "decided to download file");
|
|
||||||
let download_link_resp =
|
|
||||||
nexus_api::download_link::get(&client, db_mod.nexus_mod_id, api_file.file_id)
|
|
||||||
.await;
|
|
||||||
if let Err(err) = &download_link_resp {
|
|
||||||
if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
|
|
||||||
if reqwest_err.status() == Some(StatusCode::NOT_FOUND) {
|
|
||||||
warn!(
|
|
||||||
status = ?reqwest_err.status(),
|
|
||||||
"failed to get download link for file, skipping file"
|
|
||||||
);
|
);
|
||||||
file::update_has_download_link(&pool, db_file.id, false).await?;
|
false
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
Some(category) if category == "ARCHIVED" => false,
|
||||||
}
|
Some(_) => true,
|
||||||
let download_link_resp = download_link_resp?;
|
});
|
||||||
|
|
||||||
let mut tokio_file = match download_link_resp.download_file(&client).await {
|
let processed_file_ids: HashSet<i32> =
|
||||||
Ok(file) => {
|
file::get_processed_nexus_file_ids_by_mod_id(&pool, db_mod.id)
|
||||||
info!(bytes = api_file.size, "download finished");
|
.await?
|
||||||
file::update_downloaded_at(&pool, db_file.id).await?;
|
.into_iter()
|
||||||
file
|
.collect();
|
||||||
}
|
|
||||||
Err(err) => {
|
for api_file in files {
|
||||||
warn!(error = %err, "failed all attempts at downloading file, skipping file");
|
let file_span =
|
||||||
|
info_span!("file", name = &api_file.file_name, id = &api_file.file_id,);
|
||||||
|
let _file_span = file_span.enter();
|
||||||
|
|
||||||
|
if processed_file_ids.contains(&(api_file.file_id as i32)) {
|
||||||
|
info!("skipping file already present and processed in database");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
let db_file = file::insert(
|
||||||
|
&pool,
|
||||||
|
&file::UnsavedFile {
|
||||||
|
name: api_file.name,
|
||||||
|
file_name: api_file.file_name,
|
||||||
|
nexus_file_id: api_file.file_id as i32,
|
||||||
|
mod_id: db_mod.id,
|
||||||
|
category: api_file.category,
|
||||||
|
version: api_file.version,
|
||||||
|
mod_version: api_file.mod_version,
|
||||||
|
size: api_file.size,
|
||||||
|
uploaded_at: api_file.uploaded_at,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let mut initial_bytes = [0; 8];
|
let mut checked_metadata = false;
|
||||||
tokio_file.seek(SeekFrom::Start(0)).await?;
|
match nexus_api::metadata::contains_plugin(&client, &api_file).await {
|
||||||
if let Err(err) = tokio_file.read_exact(&mut initial_bytes).await {
|
Ok(contains_plugin) => {
|
||||||
warn!(error = %err, "failed to read initial bytes, skipping file");
|
if let Some(contains_plugin) = contains_plugin {
|
||||||
file::update_unable_to_extract_plugins(&pool, db_file.id, true).await?;
|
checked_metadata = true;
|
||||||
continue;
|
if !contains_plugin {
|
||||||
}
|
info!(
|
||||||
let kind = match infer::get(&initial_bytes) {
|
"file metadata does not contain a plugin, skip downloading"
|
||||||
Some(kind) => kind,
|
);
|
||||||
None => {
|
file::update_has_plugin(&pool, db_file.id, false).await?;
|
||||||
warn!(initial_bytes = ?initial_bytes, "unable to determine file type of archive, skipping file");
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
warn!("file has no metadata link, continuing with download");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!(error = %err, "error retreiving metadata for file, continuing with download");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let humanized_size = api_file
|
||||||
|
.size
|
||||||
|
.file_size(file_size_opts::CONVENTIONAL)
|
||||||
|
.expect("unable to create human-readable file size");
|
||||||
|
info!(size = %humanized_size, "decided to download file");
|
||||||
|
let download_link_resp = nexus_api::download_link::get(
|
||||||
|
&client,
|
||||||
|
db_mod.nexus_mod_id,
|
||||||
|
api_file.file_id,
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
if let Err(err) = &download_link_resp {
|
||||||
|
if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() {
|
||||||
|
if reqwest_err.status() == Some(StatusCode::NOT_FOUND) {
|
||||||
|
warn!(
|
||||||
|
status = ?reqwest_err.status(),
|
||||||
|
"failed to get download link for file, skipping file"
|
||||||
|
);
|
||||||
|
file::update_has_download_link(&pool, db_file.id, false).await?;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let download_link_resp = download_link_resp?;
|
||||||
|
|
||||||
|
let mut tokio_file = match download_link_resp.download_file(&client).await {
|
||||||
|
Ok(file) => {
|
||||||
|
info!(bytes = api_file.size, "download finished");
|
||||||
|
file::update_downloaded_at(&pool, db_file.id).await?;
|
||||||
|
file
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
warn!(error = %err, "failed all attempts at downloading file, skipping file");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut initial_bytes = [0; 8];
|
||||||
|
tokio_file.seek(SeekFrom::Start(0)).await?;
|
||||||
|
if let Err(err) = tokio_file.read_exact(&mut initial_bytes).await {
|
||||||
|
warn!(error = %err, "failed to read initial bytes, skipping file");
|
||||||
file::update_unable_to_extract_plugins(&pool, db_file.id, true).await?;
|
file::update_unable_to_extract_plugins(&pool, db_file.id, true).await?;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
let kind = match infer::get(&initial_bytes) {
|
||||||
info!(
|
Some(kind) => kind,
|
||||||
mime_type = kind.mime_type(),
|
None => {
|
||||||
"inferred mime_type of downloaded archive"
|
warn!(initial_bytes = ?initial_bytes, "unable to determine file type of archive, skipping file");
|
||||||
);
|
file::update_unable_to_extract_plugins(&pool, db_file.id, true).await?;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
info!(
|
||||||
|
mime_type = kind.mime_type(),
|
||||||
|
"inferred mime_type of downloaded archive"
|
||||||
|
);
|
||||||
|
|
||||||
match kind.mime_type() {
|
match kind.mime_type() {
|
||||||
"application/vnd.rar" => {
|
"application/vnd.rar" => {
|
||||||
info!("downloaded archive is RAR archive, attempt to uncompress entire archive");
|
info!("downloaded archive is RAR archive, attempt to uncompress entire archive");
|
||||||
// Use unrar to uncompress the entire .rar file to avoid bugs with compress_tools uncompressing certain .rar files:
|
// Use unrar to uncompress the entire .rar file to avoid bugs with compress_tools uncompressing certain .rar files:
|
||||||
// https://github.com/libarchive/libarchive/issues/373, https://github.com/libarchive/libarchive/issues/1426
|
// https://github.com/libarchive/libarchive/issues/373, https://github.com/libarchive/libarchive/issues/1426
|
||||||
tokio_file.seek(SeekFrom::Start(0)).await?;
|
tokio_file.seek(SeekFrom::Start(0)).await?;
|
||||||
let mut file = tokio_file.try_clone().await?.into_std().await;
|
let mut file = tokio_file.try_clone().await?.into_std().await;
|
||||||
match extract_with_unrar(
|
match extract_with_unrar(
|
||||||
&mut file,
|
&mut file,
|
||||||
&pool,
|
&pool,
|
||||||
&db_file,
|
&db_file,
|
||||||
&db_mod,
|
&db_mod,
|
||||||
checked_metadata,
|
checked_metadata,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(_) => Ok(()),
|
Ok(_) => Ok(()),
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
// unrar failed to extract rar file (e.g. archive has unicode filenames)
|
// unrar failed to extract rar file (e.g. archive has unicode filenames)
|
||||||
// Attempt to uncompress the archive using `7z` unix command instead
|
|
||||||
warn!(error = %err, "failed to extract file with unrar, extracting whole archive with 7z instead");
|
|
||||||
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
|
|
||||||
}
|
|
||||||
}?;
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
tokio_file.seek(SeekFrom::Start(0)).await?;
|
|
||||||
let mut file = tokio_file.try_clone().await?.into_std().await;
|
|
||||||
|
|
||||||
match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod).await
|
|
||||||
{
|
|
||||||
Ok(_) => Ok(()),
|
|
||||||
Err(err) => {
|
|
||||||
if err
|
|
||||||
.downcast_ref::<extractors::compress_tools::ExtractorError>()
|
|
||||||
.is_some()
|
|
||||||
&& (kind.mime_type() == "application/zip"
|
|
||||||
|| kind.mime_type() == "application/x-7z-compressed")
|
|
||||||
{
|
|
||||||
// compress_tools or libarchive failed to extract zip/7z file (e.g. archive is deflate64 compressed)
|
|
||||||
// Attempt to uncompress the archive using `7z` unix command instead
|
// Attempt to uncompress the archive using `7z` unix command instead
|
||||||
warn!(error = %err, "failed to extract file with compress_tools, extracting whole archive with 7z instead");
|
warn!(error = %err, "failed to extract file with unrar, extracting whole archive with 7z instead");
|
||||||
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
|
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
|
||||||
} else {
|
|
||||||
Err(err)
|
|
||||||
}
|
}
|
||||||
}
|
}?;
|
||||||
}?;
|
}
|
||||||
|
_ => {
|
||||||
|
tokio_file.seek(SeekFrom::Start(0)).await?;
|
||||||
|
let mut file = tokio_file.try_clone().await?.into_std().await;
|
||||||
|
|
||||||
|
match extract_with_compress_tools(&mut file, &pool, &db_file, &db_mod)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(_) => Ok(()),
|
||||||
|
Err(err) => {
|
||||||
|
if err
|
||||||
|
.downcast_ref::<extractors::compress_tools::ExtractorError>(
|
||||||
|
)
|
||||||
|
.is_some()
|
||||||
|
&& (kind.mime_type() == "application/zip"
|
||||||
|
|| kind.mime_type() == "application/x-7z-compressed")
|
||||||
|
{
|
||||||
|
// compress_tools or libarchive failed to extract zip/7z file (e.g. archive is deflate64 compressed)
|
||||||
|
// Attempt to uncompress the archive using `7z` unix command instead
|
||||||
|
warn!(error = %err, "failed to extract file with compress_tools, extracting whole archive with 7z instead");
|
||||||
|
extract_with_7zip(&mut file, &pool, &db_file, &db_mod).await
|
||||||
|
} else {
|
||||||
|
Err(err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
debug!(duration = ?download_link_resp.wait, "sleeping");
|
||||||
|
sleep(download_link_resp.wait).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!(duration = ?download_link_resp.wait, "sleeping");
|
game_mod::update_last_updated_files_at(&pool, db_mod.id).await?;
|
||||||
sleep(download_link_resp.wait).await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
game_mod::update_last_updated_files_at(&pool, db_mod.id).await?;
|
page += 1;
|
||||||
|
debug!(?page, ?has_next_page, "sleeping 1 second");
|
||||||
|
sleep(Duration::from_secs(1)).await;
|
||||||
}
|
}
|
||||||
|
|
||||||
page += 1;
|
|
||||||
debug!(?page, ?has_next_page, "sleeping 1 second");
|
|
||||||
sleep(Duration::from_secs(1)).await;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
13
src/main.rs
13
src/main.rs
@ -12,8 +12,8 @@ mod nexus_scraper;
|
|||||||
mod plugin_processor;
|
mod plugin_processor;
|
||||||
|
|
||||||
use commands::{
|
use commands::{
|
||||||
download_tiles, dump_cell_data, dump_cell_edit_counts, dump_mod_data, dump_mod_search_index,
|
backfills::backfill_is_translation, download_tiles, dump_cell_data, dump_cell_edit_counts,
|
||||||
dump_plugin_data, update,
|
dump_mod_data, dump_mod_search_index, dump_plugin_data, update,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(FromArgs)]
|
#[derive(FromArgs)]
|
||||||
@ -23,7 +23,7 @@ struct Args {
|
|||||||
/// the page number to start scraping for mods on nexus mods
|
/// the page number to start scraping for mods on nexus mods
|
||||||
page: usize,
|
page: usize,
|
||||||
|
|
||||||
#[argh(option, short = 'f', default = "false")]
|
#[argh(switch, short = 'f')]
|
||||||
/// enable full scrape of all pages, rather than stopping after 50 pages of no updates
|
/// enable full scrape of all pages, rather than stopping after 50 pages of no updates
|
||||||
full: bool,
|
full: bool,
|
||||||
|
|
||||||
@ -50,6 +50,10 @@ struct Args {
|
|||||||
/// folder to output all map tile images downloaded from the UESP wiki
|
/// folder to output all map tile images downloaded from the UESP wiki
|
||||||
#[argh(option, short = 't')]
|
#[argh(option, short = 't')]
|
||||||
download_tiles: Option<String>,
|
download_tiles: Option<String>,
|
||||||
|
|
||||||
|
/// backfill the is_translation column in the mods table
|
||||||
|
#[argh(switch)]
|
||||||
|
backfill_is_translation: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
@ -83,6 +87,9 @@ pub async fn main() -> Result<()> {
|
|||||||
if let Some(dir) = args.download_tiles {
|
if let Some(dir) = args.download_tiles {
|
||||||
return download_tiles(&dir).await;
|
return download_tiles(&dir).await;
|
||||||
}
|
}
|
||||||
|
if args.backfill_is_translation {
|
||||||
|
return backfill_is_translation(&pool).await;
|
||||||
|
}
|
||||||
|
|
||||||
return update(&pool, args.page, args.full).await;
|
return update(&pool, args.page, args.full).await;
|
||||||
}
|
}
|
||||||
|
@ -20,6 +20,7 @@ pub struct Mod {
|
|||||||
pub description: Option<String>,
|
pub description: Option<String>,
|
||||||
pub thumbnail_link: Option<String>,
|
pub thumbnail_link: Option<String>,
|
||||||
pub game_id: i32,
|
pub game_id: i32,
|
||||||
|
pub is_translation: bool,
|
||||||
pub updated_at: NaiveDateTime,
|
pub updated_at: NaiveDateTime,
|
||||||
pub created_at: NaiveDateTime,
|
pub created_at: NaiveDateTime,
|
||||||
pub last_update_at: NaiveDateTime,
|
pub last_update_at: NaiveDateTime,
|
||||||
@ -38,6 +39,7 @@ pub struct UnsavedMod<'a> {
|
|||||||
pub description: Option<&'a str>,
|
pub description: Option<&'a str>,
|
||||||
pub thumbnail_link: Option<&'a str>,
|
pub thumbnail_link: Option<&'a str>,
|
||||||
pub game_id: i32,
|
pub game_id: i32,
|
||||||
|
pub is_translation: bool,
|
||||||
pub last_update_at: NaiveDateTime,
|
pub last_update_at: NaiveDateTime,
|
||||||
pub first_upload_at: NaiveDateTime,
|
pub first_upload_at: NaiveDateTime,
|
||||||
}
|
}
|
||||||
@ -61,6 +63,7 @@ pub struct ModWithCells {
|
|||||||
pub description: Option<String>,
|
pub description: Option<String>,
|
||||||
pub thumbnail_link: Option<String>,
|
pub thumbnail_link: Option<String>,
|
||||||
pub game_id: i32,
|
pub game_id: i32,
|
||||||
|
pub is_translation: bool,
|
||||||
pub updated_at: NaiveDateTime,
|
pub updated_at: NaiveDateTime,
|
||||||
pub created_at: NaiveDateTime,
|
pub created_at: NaiveDateTime,
|
||||||
pub last_update_at: NaiveDateTime,
|
pub last_update_at: NaiveDateTime,
|
||||||
@ -123,17 +126,18 @@ pub async fn insert(
|
|||||||
description: Option<&str>,
|
description: Option<&str>,
|
||||||
thumbnail_link: Option<&str>,
|
thumbnail_link: Option<&str>,
|
||||||
game_id: i32,
|
game_id: i32,
|
||||||
|
is_translation: bool,
|
||||||
last_update_at: NaiveDateTime,
|
last_update_at: NaiveDateTime,
|
||||||
first_upload_at: NaiveDateTime,
|
first_upload_at: NaiveDateTime,
|
||||||
) -> Result<Mod> {
|
) -> Result<Mod> {
|
||||||
sqlx::query_as!(
|
sqlx::query_as!(
|
||||||
Mod,
|
Mod,
|
||||||
"INSERT INTO mods
|
"INSERT INTO mods
|
||||||
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
|
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, is_translation, last_update_at, first_upload_at, created_at, updated_at)
|
||||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, now(), now())
|
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, now(), now())
|
||||||
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
||||||
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
|
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, is_translation, last_update_at, first_upload_at, updated_at) =
|
||||||
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.is_translation, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
||||||
RETURNING *",
|
RETURNING *",
|
||||||
name,
|
name,
|
||||||
nexus_mod_id,
|
nexus_mod_id,
|
||||||
@ -144,6 +148,7 @@ pub async fn insert(
|
|||||||
description,
|
description,
|
||||||
thumbnail_link,
|
thumbnail_link,
|
||||||
game_id,
|
game_id,
|
||||||
|
is_translation,
|
||||||
last_update_at,
|
last_update_at,
|
||||||
first_upload_at
|
first_upload_at
|
||||||
)
|
)
|
||||||
@ -168,6 +173,7 @@ pub async fn batched_insert<'a>(
|
|||||||
let mut descriptions: Vec<Option<&str>> = vec![];
|
let mut descriptions: Vec<Option<&str>> = vec![];
|
||||||
let mut thumbnail_links: Vec<Option<&str>> = vec![];
|
let mut thumbnail_links: Vec<Option<&str>> = vec![];
|
||||||
let mut game_ids: Vec<i32> = vec![];
|
let mut game_ids: Vec<i32> = vec![];
|
||||||
|
let mut is_translations: Vec<bool> = vec![];
|
||||||
let mut last_update_ats: Vec<NaiveDateTime> = vec![];
|
let mut last_update_ats: Vec<NaiveDateTime> = vec![];
|
||||||
let mut first_upload_ats: Vec<NaiveDateTime> = vec![];
|
let mut first_upload_ats: Vec<NaiveDateTime> = vec![];
|
||||||
batch.iter().for_each(|unsaved_mod| {
|
batch.iter().for_each(|unsaved_mod| {
|
||||||
@ -180,6 +186,7 @@ pub async fn batched_insert<'a>(
|
|||||||
descriptions.push(unsaved_mod.description);
|
descriptions.push(unsaved_mod.description);
|
||||||
thumbnail_links.push(unsaved_mod.thumbnail_link);
|
thumbnail_links.push(unsaved_mod.thumbnail_link);
|
||||||
game_ids.push(unsaved_mod.game_id);
|
game_ids.push(unsaved_mod.game_id);
|
||||||
|
is_translations.push(unsaved_mod.is_translation);
|
||||||
last_update_ats.push(unsaved_mod.last_update_at);
|
last_update_ats.push(unsaved_mod.last_update_at);
|
||||||
first_upload_ats.push(unsaved_mod.first_upload_at);
|
first_upload_ats.push(unsaved_mod.first_upload_at);
|
||||||
});
|
});
|
||||||
@ -187,12 +194,12 @@ pub async fn batched_insert<'a>(
|
|||||||
// sqlx doesn't understand arrays of Options with the query_as! macro
|
// sqlx doesn't understand arrays of Options with the query_as! macro
|
||||||
&mut sqlx::query_as(
|
&mut sqlx::query_as(
|
||||||
r#"INSERT INTO mods
|
r#"INSERT INTO mods
|
||||||
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, last_update_at, first_upload_at, created_at, updated_at)
|
(name, nexus_mod_id, author_name, author_id, category_name, category_id, description, thumbnail_link, game_id, is_translation, last_update_at, first_upload_at, created_at, updated_at)
|
||||||
SELECT *, now(), now()
|
SELECT *, now(), now()
|
||||||
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::int[], $5::text[], $6::int[], $7::text[], $8::text[], $9::int[], $10::timestamp(3)[], $11::timestamp(3)[])
|
FROM UNNEST($1::text[], $2::int[], $3::text[], $4::int[], $5::text[], $6::int[], $7::text[], $8::text[], $9::int[], $10::bool[], $11::timestamp(3)[], $12::timestamp(3)[])
|
||||||
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
ON CONFLICT (game_id, nexus_mod_id) DO UPDATE
|
||||||
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, last_update_at, first_upload_at, updated_at) =
|
SET (name, author_name, author_id, category_name, category_id, description, thumbnail_link, is_translation, last_update_at, first_upload_at, updated_at) =
|
||||||
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
(EXCLUDED.name, EXCLUDED.author_name, EXCLUDED.author_id, EXCLUDED.category_name, EXCLUDED.category_id, EXCLUDED.description, EXCLUDED.thumbnail_link, EXCLUDED.is_translation, EXCLUDED.last_update_at, EXCLUDED.first_upload_at, now())
|
||||||
RETURNING *"#,
|
RETURNING *"#,
|
||||||
)
|
)
|
||||||
.bind(&names)
|
.bind(&names)
|
||||||
@ -204,6 +211,7 @@ pub async fn batched_insert<'a>(
|
|||||||
.bind(&descriptions)
|
.bind(&descriptions)
|
||||||
.bind(&thumbnail_links)
|
.bind(&thumbnail_links)
|
||||||
.bind(&game_ids)
|
.bind(&game_ids)
|
||||||
|
.bind(&is_translations)
|
||||||
.bind(&last_update_ats)
|
.bind(&last_update_ats)
|
||||||
.bind(&first_upload_ats)
|
.bind(&first_upload_ats)
|
||||||
.fetch_all(pool)
|
.fetch_all(pool)
|
||||||
@ -240,18 +248,6 @@ pub async fn update_last_updated_files_at(
|
|||||||
.context("Failed to update mod")
|
.context("Failed to update mod")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(level = "debug", skip(pool))]
|
|
||||||
pub async fn bulk_get_need_backfill(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<Vec<Mod>> {
|
|
||||||
sqlx::query_as!(
|
|
||||||
Mod,
|
|
||||||
"SELECT * FROM mods
|
|
||||||
WHERE author_id IS NULL"
|
|
||||||
)
|
|
||||||
.fetch_all(pool)
|
|
||||||
.await
|
|
||||||
.context("Failed to bulk get need backfill")
|
|
||||||
}
|
|
||||||
|
|
||||||
#[instrument(level = "debug", skip(pool, game_mod, mod_data))]
|
#[instrument(level = "debug", skip(pool, game_mod, mod_data))]
|
||||||
pub async fn update_from_api_response<'a>(
|
pub async fn update_from_api_response<'a>(
|
||||||
pool: &sqlx::Pool<sqlx::Postgres>,
|
pool: &sqlx::Pool<sqlx::Postgres>,
|
||||||
|
@ -29,11 +29,16 @@ pub struct ModListScrape<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[instrument(skip(client))]
|
#[instrument(skip(client))]
|
||||||
pub async fn get_mod_list_page(client: &Client, page: usize) -> Result<ModListResponse> {
|
pub async fn get_mod_list_page(
|
||||||
|
client: &Client,
|
||||||
|
page: usize,
|
||||||
|
include_translations: bool,
|
||||||
|
) -> Result<ModListResponse> {
|
||||||
let res = client
|
let res = client
|
||||||
.get(format!(
|
.get(format!(
|
||||||
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
|
"https://www.nexusmods.com/Core/Libs/Common/Widgets/ModList?RH_ModList=nav:true,home:false,type:0,user_id:0,game_id:{},advfilt:true,tags_{}%5B%5D:1428,include_adult:true,page_size:20,show_game_filter:false,open:false,page:{},sort_by:lastupdate",
|
||||||
GAME_ID,
|
GAME_ID,
|
||||||
|
match include_translations { true => "yes", false => "no" },
|
||||||
page
|
page
|
||||||
))
|
))
|
||||||
.send()
|
.send()
|
||||||
|
Loading…
Reference in New Issue
Block a user