Fix bad interior cell data in db
I had a bad unique index on the `cells` table. This commit does not fix it, but it adds two backfill scripts that will let me fix up the data before creating the new unique index. `is_base_game` identifies cells that come from the `Skyrim.esm` shipped with the game (and not from some mod that happens to also have a `Skyrim.esm` with a 0,0 cell under a different form ID, see: https://github.com/thallada/modmapper-web/issues/8). `dump_cell_data` now only dumps data for `is_base_game` cells. `deduplicate_interior_cells` removes the duplicate cell and plugin_cell rows caused by the buggy index. I will still need to upgrade the database and create the new index in a later commit.
parent 56864c7f29 · commit e85edff30a
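For context on that deferred fix, here is a hypothetical sketch (not part of this commit) of what the follow-up index migration could look like, assuming the Postgres 15 upgrade mentioned in `is_base_game.rs` is for the `NULLS NOT DISTINCT` option; the index name is made up:

```sql
-- Hypothetical follow-up migration (assumed, not in this commit).
-- With NULLS NOT DISTINCT (PostgreSQL 15+), two interior cells with the same
-- form_id and master but a NULL world_id conflict instead of both inserting.
CREATE UNIQUE INDEX cells_form_id_master_world_id_unique
    ON cells (form_id, master, world_id) NULLS NOT DISTINCT;
```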
Cargo.lock (generated, 1297 changed lines)
File diff suppressed because it is too large
README.md (11 changed lines)

@@ -21,6 +21,7 @@ postgres=# ALTER DATABASE modmapper OWNER TO modmapper;
& 'C:\Program Files\PostgreSQL\13\bin\createdb.exe' -U postgres modmapper
& 'C:\Program Files\PostgreSQL\13\bin\psql.exe' -U postgres
postgres=# ALTER DATABASE modmapper OWNER TO modmapper;
postgres=# ALTER USER modmapper CREATEDB;
\password modmapper
```

@@ -36,14 +37,20 @@ RUST_LOG=mod_mapper=debug
[`sqlx_cli`](https://github.com/launchbadge/sqlx/tree/master/sqlx-cli) with
`cargo install sqlx-cli --no-default-features --features postgres`
5. Run `sqlx migrate --source migrations run` which will run all the database migrations.
6. Get your personal Nexus API token from your profile settings and add it to the `.env` file:
6. Get your personal Nexus API token from your profile settings and add it to
   the `.env` file:

```
NEXUS_API_KEY=...
```

7. Build the release binary by running `cargo build --release`.
8. See `./target/release/modmapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.
8. Run `./target/release/modmapper --backfill-is-base-game` to pre-populate the
   database with worlds and cells from the base game's Skyrim.esm. (This is so
   that the base game cells can later be differentiated from cells in plugins that
   also happen to be named Skyrim.esm and have cells that reference a world with
   the same form ID as Tamriel.)
9. See `./target/release/modmapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.

## Sync and Backup Setup
data/skyrim.json (new file, 1 line)
File diff suppressed because one or more lines are too long
migrations/20231101025211_add_is_base_game_to_cells.sql (new file, 1 line)

@@ -0,0 +1 @@
ALTER TABLE "cells" ADD COLUMN "is_base_game" BOOLEAN NOT NULL DEFAULT false;
src/commands/backfills/deduplicate_interior_cells.rs (new file, 138 lines)

@@ -0,0 +1,138 @@
/// There was a bug with the unique index on the `cells` table that was causing the same form_id
/// and master to be inserted multiple times. This function deduplicates those, choosing the cell
/// with `is_base_game = true` if there is one, otherwise arbitrarily choosing one.
/// Rows referencing the duplicate cells are updated to reference the chosen cell.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgDatabaseError;
use sqlx::types::Json;
use sqlx::FromRow;
use tracing::{info, warn};

const PAGE_SIZE: i64 = 100;

#[derive(Debug, Clone, Deserialize, Serialize, FromRow)]
pub struct CellId {
    pub id: i32,
    pub is_base_game: bool,
}

#[derive(Debug, Clone, Serialize, FromRow)]
pub struct CellDuplicates {
    pub ids: Option<Json<Vec<CellId>>>,
    pub form_id: i32,
    pub master: String,
}

pub async fn deduplicate_interior_cells(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
    let mut page = 0;
    loop {
        info!("deduplicating interior cells page {}", page);
        let duplicates = sqlx::query_as!(
            CellDuplicates,
            r#"SELECT
                json_agg(
                    json_build_object(
                        'id', id,
                        'is_base_game', is_base_game
                    )) as "ids: Json<Vec<CellId>>",
                form_id,
                master
            FROM cells
            WHERE world_id IS NULL
            GROUP BY (form_id, master)
            HAVING COUNT(*) > 1
            LIMIT $1
            "#,
            PAGE_SIZE,
        )
        .fetch_all(pool)
        .await?;

        if duplicates.is_empty() {
            break;
        }

        for duplicate_cell in duplicates.into_iter() {
            info!(
                "deduplicating cells form_id={} master={}",
                duplicate_cell.form_id, duplicate_cell.master
            );
            let duplicate_ids = duplicate_cell.ids.clone().unwrap();
            let chosen_cell = duplicate_ids
                .iter()
                .find(|cell| cell.is_base_game)
                .unwrap_or_else(|| {
                    duplicate_ids
                        .iter()
                        .next()
                        .expect("duplicate cell has no ids")
                });
            info!(
                "choosing cell_id={} is_base_game={}",
                chosen_cell.id, chosen_cell.is_base_game
            );
            // Update all plugin_cells cell_id references to point to the chosen cell
            let duplicate_ids = duplicate_cell
                .ids
                .unwrap()
                .iter()
                .map(|cell| cell.id)
                .collect::<Vec<_>>();

            // First, I need to fix up any duplicated plugin_cells rows caused by broken
            // plugins that have multiple cells with the same form_id. For these duplicate
            // plugin_cells with the same plugin_id, I just arbitrarily choose one and delete
            // the others (since it's undefined which duplicate record should "win"
            // out in this case anyways). In the case of exterior cells, where the duplicate
            // interior cell bug is not a problem, the last processed cell record in the plugin
            // wins since `process_plugin` uses an upsert method which updates existing
            // `plugin_cells` if it tries to insert a new one that conflicts with an existing one.
            // So I am effectively retroactively doing the same here for interior cells.
            let plugin_cells_delete = sqlx::query!(
                r#"DELETE FROM plugin_cells
                WHERE id NOT IN (
                    SELECT MIN(id)
                    FROM plugin_cells
                    WHERE cell_id = ANY($1)
                    GROUP BY plugin_id
                )
                AND cell_id = ANY($1)
                "#,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!(
                "deleted {} duplicate plugin_cells from broken plugins",
                plugin_cells_delete.rows_affected()
            );

            let update = sqlx::query!(
                r#"UPDATE plugin_cells
                SET
                    cell_id = $1,
                    updated_at = now()
                WHERE cell_id = ANY($2)"#,
                chosen_cell.id,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!("updated {} plugin_cells", update.rows_affected());

            // Delete all cells that are not the chosen cell
            let delete = sqlx::query!(
                r#"DELETE FROM cells
                WHERE id != $1 AND id = ANY($2)"#,
                chosen_cell.id,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!("deleted {} cells", delete.rows_affected());
        }
        page += 1;
    }
    Ok(())
}
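Not part of the diff: a hypothetical sanity check one could run after `--deduplicate-interior-cells`, mirroring the grouping used in the query above; it should return zero rows before the new unique index is created.

```sql
-- Hypothetical verification query (assumed, not in this commit): lists any
-- interior cells that still share a form_id and master after the backfill.
SELECT form_id, master, COUNT(*) AS copies
FROM cells
WHERE world_id IS NULL
GROUP BY form_id, master
HAVING COUNT(*) > 1;
```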
src/commands/backfills/is_base_game.rs (new file, 71 lines)

@@ -0,0 +1,71 @@
use std::borrow::Borrow;
use std::fs::File;
use std::io::BufReader;

use anyhow::{Context, Result};
use skyrim_cell_dump::Plugin;
use tracing::info;

use crate::models::cell::{self, UnsavedCell};
use crate::models::world::{self, UnsavedWorld};
use crate::plugin_processor::get_local_form_id_and_master;

pub async fn backfill_is_base_game(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
    let file = File::open("./data/skyrim.json")?;
    let reader = BufReader::new(file);
    let plugin: Plugin =
        serde_json::from_reader(reader).context("failed to deserialize data/skyrim.json")?;
    let file_name = "Skyrim.esm";
    let masters: Vec<&str> = plugin.header.masters.iter().map(|s| s.borrow()).collect();
    let base_worlds: Vec<UnsavedWorld> = plugin
        .worlds
        .iter()
        .map(|world| {
            let (form_id, master) =
                get_local_form_id_and_master(world.form_id, &masters, file_name)
                    .expect("form_id to be a valid i32");
            UnsavedWorld { form_id, master }
        })
        .collect();
    let db_worlds = world::batched_insert(pool, &base_worlds).await?;
    info!("Upserted {} Skyrim.esm base worlds", db_worlds.len());
    let base_cells: Vec<UnsavedCell> = plugin
        .cells
        .iter()
        .map(|cell| {
            let world_id = if let Some(world_form_id) = cell.world_form_id {
                let (form_id, master) =
                    get_local_form_id_and_master(world_form_id, &masters, file_name)
                        .expect("form_id to be valid i32");
                Some(
                    db_worlds
                        .iter()
                        .find(|&world| world.form_id == form_id && world.master == master)
                        .expect("cell references world in the plugin worlds")
                        .id,
                )
            } else {
                None
            };
            let (form_id, master) =
                get_local_form_id_and_master(cell.form_id, &masters, file_name)
                    .expect("form_id is a valid i32");
            UnsavedCell {
                form_id,
                master,
                x: cell.x,
                y: cell.y,
                world_id,
                is_persistent: cell.is_persistent,
                is_base_game: true, // the whole point of this function
            }
        })
        .collect();
    let db_cells = cell::batched_insert(pool, &base_cells).await?;
    info!("Upserted {} Skyrim.esm base cells", db_cells.len());
    // This works for exterior cells, but there's a bug with the unique index on cells that
    // creates duplicate interior cells. To fix that, I need to upgrade Postgres to 15 or
    // later, migrate the data to the new db cluster, consolidate all of the duplicate cells
    // into one cell in a separate backfill command, then fix the unique index.
    Ok(())
}
@@ -1,3 +1,7 @@
pub mod deduplicate_interior_cells;
pub mod is_translation;
pub mod is_base_game;

pub use deduplicate_interior_cells::deduplicate_interior_cells;
pub use is_translation::backfill_is_translation;
pub use is_base_game::backfill_is_base_game;
@@ -10,7 +10,7 @@ pub async fn dump_cell_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> Res
    let mut cell_count = 0;
    for x in -77..75 {
        for y in -50..44 {
            if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y).await {
            if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y, true).await {
                let path = format!("{}/{}", &dir, x);
                let path = Path::new(&path);
                create_dir_all(path)?;
src/main.rs (17 changed lines)

@@ -13,7 +13,8 @@ mod nexus_scraper;
mod plugin_processor;

use commands::{
    backfills::backfill_is_translation, download_tiles, dump_cell_data, dump_cell_edit_counts,
    backfills::backfill_is_base_game, backfills::backfill_is_translation,
    backfills::deduplicate_interior_cells, download_tiles, dump_cell_data, dump_cell_edit_counts,
    dump_file_data, dump_games, dump_mod_cell_counts, dump_mod_data, dump_mod_search_index,
    dump_plugin_data, update,
};

@@ -77,6 +78,14 @@ struct Args {
    #[argh(switch)]
    backfill_is_translation: bool,

    /// backfill the is_base_game column in the cells table (for Skyrim.esm)
    #[argh(switch)]
    backfill_is_base_game: bool,

    /// deduplicate the interior cells with same form_id and master
    #[argh(switch)]
    deduplicate_interior_cells: bool,

    /// when dumping data, only dump data for mods or files that have been updated since this date
    #[argh(option, short = 'u')]
    updated_after: Option<NaiveDateTime>,

@@ -125,6 +134,12 @@ pub async fn main() -> Result<()> {
    if args.backfill_is_translation {
        return backfill_is_translation(&pool).await;
    }
    if args.backfill_is_base_game {
        return backfill_is_base_game(&pool).await;
    }
    if args.deduplicate_interior_cells {
        return deduplicate_interior_cells(&pool).await;
    }

    update(&pool, args.page, &args.game, args.full).await
}
@@ -15,6 +15,7 @@ pub struct Cell {
    pub y: Option<i32>,
    pub world_id: Option<i32>,
    pub is_persistent: bool,
    pub is_base_game: bool,
    pub updated_at: NaiveDateTime,
    pub created_at: NaiveDateTime,
}

@@ -27,6 +28,7 @@ pub struct UnsavedCell<'a> {
    pub y: Option<i32>,
    pub world_id: Option<i32>,
    pub is_persistent: bool,
    pub is_base_game: bool,
}

#[derive(Debug, Serialize, Deserialize, FromRow)]

@@ -50,22 +52,24 @@ pub async fn insert(
    y: Option<i32>,
    world_id: Option<i32>,
    is_persistent: bool,
    is_base_game: bool,
) -> Result<Cell> {
    sqlx::query_as!(
        Cell,
        "INSERT INTO cells
            (form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
            VALUES ($1, $2, $3, $4, $5, $6, now(), now())
            (form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
            VALUES ($1, $2, $3, $4, $5, $6, $7, now(), now())
            ON CONFLICT (form_id, master, world_id) DO UPDATE
            SET (x, y, is_persistent, updated_at) =
            (EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
            SET (x, y, is_persistent, is_base_game, updated_at) =
            (EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
            RETURNING *",
        form_id,
        master,
        x,
        y,
        world_id,
        is_persistent
        is_persistent,
        is_base_game
    )
    .fetch_one(pool)
    .await

@@ -85,6 +89,7 @@ pub async fn batched_insert<'a>(
    let mut ys: Vec<Option<i32>> = vec![];
    let mut world_ids: Vec<Option<i32>> = vec![];
    let mut is_persistents: Vec<bool> = vec![];
    let mut is_base_games: Vec<bool> = vec![];
    batch.iter().for_each(|unsaved_cell| {
        form_ids.push(unsaved_cell.form_id);
        masters.push(unsaved_cell.master);

@@ -92,15 +97,16 @@ pub async fn batched_insert<'a>(
        ys.push(unsaved_cell.y);
        world_ids.push(unsaved_cell.world_id);
        is_persistents.push(unsaved_cell.is_persistent);
        is_base_games.push(unsaved_cell.is_base_game);
    });
    saved_cells.append(
        // sqlx doesn't understand arrays of Options with the query_as! macro
        &mut sqlx::query_as(
            r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
            SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[])
            r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
            SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[], $7::bool[])
            ON CONFLICT (form_id, master, world_id) DO UPDATE
            SET (x, y, is_persistent, updated_at) =
            (EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
            SET (x, y, is_persistent, is_base_game, updated_at) =
            (EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
            RETURNING *"#,
        )
        .bind(&form_ids)

@@ -109,6 +115,7 @@ pub async fn batched_insert<'a>(
        .bind(&ys)
        .bind(&world_ids)
        .bind(&is_persistents)
        .bind(&is_base_games)
        .fetch_all(pool)
        .await
        .context("Failed to insert cells")?,

@@ -151,7 +158,9 @@ pub async fn get_cell_data(
    world_id: i32,
    x: i32,
    y: i32,
    is_base_game_only: bool,
) -> Result<CellData> {
    if is_base_game_only {
    sqlx::query_as!(
        CellData,
        r#"SELECT

@@ -178,4 +187,33 @@ pub async fn get_cell_data(
    .fetch_one(pool)
    .await
    .context("Failed get cell data")
    } else {
        sqlx::query_as!(
            CellData,
            r#"SELECT
                cells.x,
                cells.y,
                cells.is_persistent,
                cells.form_id,
                COUNT(DISTINCT plugins.id) as plugins_count,
                COUNT(DISTINCT files.id) as files_count,
                COUNT(DISTINCT mods.id) as mods_count,
                json_agg(DISTINCT mods.*) as mods
            FROM cells
            JOIN plugin_cells on cells.id = cell_id
            JOIN plugins ON plugins.id = plugin_id
            JOIN files ON files.id = plugins.file_id
            JOIN mods ON mods.id = files.mod_id
            WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4 AND is_base_game = true
            GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
            master,
            world_id,
            x,
            y
        )
        .fetch_one(pool)
        .await
        .context("Failed get cell data")
    }
}
@@ -15,7 +15,7 @@ use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell};
use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld};
use crate::models::{world, world::UnsavedWorld};

fn get_local_form_id_and_master<'a>(
pub fn get_local_form_id_and_master<'a>(
    form_id: u32,
    masters: &'a [&str],
    file_name: &'a str,

@@ -124,6 +124,7 @@ pub async fn process_plugin(
                y: cell.y,
                world_id,
                is_persistent: cell.is_persistent,
                is_base_game: false,
            }
        })
        .collect();