Fix bad interior cell data in db

I had a bad unique index on the `cells` table. This commit does not fix
it but has two backfill scripts that will allow me to fixup the data
before creating the new unique index.

`is_base_game` identifies cells that are from the `Skyrim.esm` shipped
with the game (and not from some mod that happens to also have a
`Skyrim.esm` with a 0,0 cell with a different form id, see:
https://github.com/thallada/modmapper-web/issues/8).

`dump_cell_data` now only dumps cells from `is_base_game` cells.

`deduplicate_interior_cells` removes duplicate cell and plugin_cell rows
caused by the buggy index.

I will need to upgrade the database and create a new index in a later
commit.
This commit is contained in:
2023-11-08 01:05:19 -05:00
parent 56864c7f29
commit e85edff30a
11 changed files with 984 additions and 669 deletions

View File

@@ -0,0 +1,138 @@
/// There was a bug with the unique index on the `cells` table that was causing the same form_id
/// and master to be inserted multiple times. This function deduplicates those, choosing the cell
/// with `is_base_game = true` if there is one, otherwise randomly chooses one.
/// rows referencing the duplicate cells are updated to reference the chosen cell.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgDatabaseError;
use sqlx::types::Json;
use sqlx::FromRow;
use tracing::{info, warn};
const PAGE_SIZE: i64 = 100;
#[derive(Debug, Clone, Deserialize, Serialize, FromRow)]
pub struct CellId {
pub id: i32,
pub is_base_game: bool,
}
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct CellDuplicates {
pub ids: Option<Json<Vec<CellId>>>,
pub form_id: i32,
pub master: String,
}
pub async fn deduplicate_interior_cells(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
let mut page = 0;
loop {
info!("deduplicating interior cells page {}", page);
let duplicates = sqlx::query_as!(
CellDuplicates,
r#"SELECT
json_agg(
json_build_object(
'id', id,
'is_base_game', is_base_game
)) as "ids: Json<Vec<CellId>>",
form_id,
master
FROM cells
WHERE world_id IS NULL
GROUP BY (form_id, master)
HAVING COUNT(*) > 1
LIMIT $1
"#,
PAGE_SIZE,
)
.fetch_all(pool)
.await?;
if duplicates.is_empty() {
break;
}
for duplicate_cell in duplicates.into_iter() {
info!(
"deduplicating cells form_id={} master={}",
duplicate_cell.form_id, duplicate_cell.master
);
let duplicate_ids = duplicate_cell.ids.clone().unwrap();
let chosen_cell = duplicate_ids
.iter()
.find(|cell| cell.is_base_game)
.unwrap_or_else(|| {
duplicate_ids
.iter()
.next()
.expect("duplicate cell has no ids")
});
info!(
"choosing cell_id={} is_base_game={}",
chosen_cell.id, chosen_cell.is_base_game
);
// Update all plugin_cells cell_id references to point to the chosen cell
let duplicate_ids = duplicate_cell
.ids
.unwrap()
.iter()
.map(|cell| cell.id)
.collect::<Vec<_>>();
// First, I need to fix-up any duplicated plugin_cells rows caused by broken
// plugins that have multiple cells with the same form_id. For these duplicate
// plugin_cells with the same plugin_id, I just arbitrarily choose one and delete
// the others (since it's undefined behavior of which duplicate record should "win"
// out in this case anyways). In the case of exterior cells, where the duplicate
// interior cell bug is not a problem, the last processed cell record in the plugin
// wins since `process_plugin` uses an upsert method which updates existing
// `plugin_cells` if it tries to insert a new one that conflicts with an existing one.
// So I am effectively retroactively doing the same here for interior cells.
let plugin_cells_delete = sqlx::query!(
r#"DELETE FROM plugin_cells
WHERE id NOT IN (
SELECT MIN(id)
FROM plugin_cells
WHERE cell_id = ANY($1)
GROUP BY plugin_id
)
AND cell_id = ANY($1)
"#,
&duplicate_ids
)
.execute(pool)
.await?;
info!(
"deleted {} duplicate plugin_cells from broken plugins",
plugin_cells_delete.rows_affected()
);
let update = sqlx::query!(
r#"UPDATE plugin_cells
SET
cell_id = $1,
updated_at = now()
WHERE cell_id = ANY($2)"#,
chosen_cell.id,
&duplicate_ids
)
.execute(pool)
.await?;
info!("updated {} plugin_cells", update.rows_affected());
// Delete all cells that are not the chosen cell
let delete = sqlx::query!(
r#"DELETE FROM cells
WHERE id != $1 AND id = ANY($2)"#,
chosen_cell.id,
&duplicate_ids
)
.execute(pool)
.await?;
info!("deleted {} cells", delete.rows_affected());
}
page += 1;
}
Ok(())
}

View File

@@ -0,0 +1,71 @@
use std::borrow::Borrow;
use std::fs::File;
use std::io::BufReader;
use anyhow::{Context, Result};
use skyrim_cell_dump::Plugin;
use tracing::info;
use crate::models::cell::{self, UnsavedCell};
use crate::models::world::{self, UnsavedWorld};
use crate::plugin_processor::get_local_form_id_and_master;
pub async fn backfill_is_base_game(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
let file = File::open("./data/skyrim.json")?;
let reader = BufReader::new(file);
let plugin: Plugin =
serde_json::from_reader(reader).context("failed to deserialize data/skyrim.json")?;
let file_name = "Skyrim.esm";
let masters: Vec<&str> = plugin.header.masters.iter().map(|s| s.borrow()).collect();
let base_worlds: Vec<UnsavedWorld> = plugin
.worlds
.iter()
.map(|world| {
let (form_id, master) =
get_local_form_id_and_master(world.form_id, &masters, file_name)
.expect("form_id to be a valid i32");
UnsavedWorld { form_id, master }
})
.collect();
let db_worlds = world::batched_insert(pool, &base_worlds).await?;
info!("Upserted {} Skyrim.esm base worlds", db_worlds.len());
let base_cells: Vec<UnsavedCell> = plugin
.cells
.iter()
.map(|cell| {
let world_id = if let Some(world_form_id) = cell.world_form_id {
let (form_id, master) =
get_local_form_id_and_master(world_form_id, &masters, file_name)
.expect("form_id to be valid i32");
Some(
db_worlds
.iter()
.find(|&world| world.form_id == form_id && world.master == master)
.expect("cell references world in the plugin worlds")
.id,
)
} else {
None
};
let (form_id, master) =
get_local_form_id_and_master(cell.form_id, &masters, file_name)
.expect("form_id is a valid i32");
UnsavedCell {
form_id,
master,
x: cell.x,
y: cell.y,
world_id,
is_persistent: cell.is_persistent,
is_base_game: true, // the whole point of this function
}
})
.collect();
let db_cells = cell::batched_insert(pool, &base_cells).await?;
info!("Upserted {} Skyrim.esm base cells", db_cells.len());
// This works for exterior cells, but there's a bug with the unique index on cells that
// creates duplicate interior cells. To fix that, I need to upgrade postgres to
// 15 or later, migate the data to the new db cluster, consolidate all of the duplicate cells
// into one cell in a separate backfill command, then fix the unique index.
Ok(())
}

View File

@@ -1,3 +1,7 @@
pub mod deduplicate_interior_cells;
pub mod is_translation;
pub mod is_base_game;
pub use deduplicate_interior_cells::deduplicate_interior_cells;
pub use is_translation::backfill_is_translation;
pub use is_base_game::backfill_is_base_game;

View File

@@ -10,7 +10,7 @@ pub async fn dump_cell_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> Res
let mut cell_count = 0;
for x in -77..75 {
for y in -50..44 {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y).await {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y, true).await {
let path = format!("{}/{}", &dir, x);
let path = Path::new(&path);
create_dir_all(path)?;