Fix bad interior cell data in db

I had a bad unique index on the `cells` table. This commit does not fix
it yet, but it adds two backfill scripts that will allow me to fix up the
data before creating the new unique index.

`is_base_game` identifies cells that are from the `Skyrim.esm` shipped
with the game (and not from some mod that happens to also have a
`Skyrim.esm` with a 0,0 cell with a different form id, see:
https://github.com/thallada/modmapper-web/issues/8).

`dump_cell_data` now only dumps data for `is_base_game` cells.

`deduplicate_interior_cells` removes duplicate cell and plugin_cell rows
caused by the buggy index.

I will need to upgrade the database and create a new index in a later
commit.
This commit is contained in:
2023-11-08 01:05:19 -05:00
parent 56864c7f29
commit e85edff30a
11 changed files with 984 additions and 669 deletions

View File

@@ -0,0 +1,138 @@
/// There was a bug with the unique index on the `cells` table that was causing the same form_id
/// and master to be inserted multiple times. This function deduplicates those, choosing the cell
/// with `is_base_game = true` if there is one, otherwise randomly chooses one.
/// Rows referencing the duplicate cells are updated to reference the chosen cell.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgDatabaseError;
use sqlx::types::Json;
use sqlx::FromRow;
use tracing::{info, warn};
const PAGE_SIZE: i64 = 100;
/// One `cells` row id with its `is_base_game` flag, as produced by the
/// `json_agg`/`json_build_object` aggregation in `deduplicate_interior_cells`.
#[derive(Debug, Clone, Deserialize, Serialize, FromRow)]
pub struct CellId {
    /// Primary key of the `cells` row.
    pub id: i32,
    /// True when this row was marked as coming from the base game's `Skyrim.esm`
    /// (set by the `backfill_is_base_game` command).
    pub is_base_game: bool,
}
/// One group of duplicate interior cells: every `cells` row sharing the same
/// (`form_id`, `master`) pair, with the row ids aggregated into `ids`.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct CellDuplicates {
    /// Ids (with `is_base_game` flags) of all rows in this duplicate group.
    /// Option because SQL aggregates are nullable; in practice the
    /// `HAVING COUNT(*) > 1` clause guarantees at least two entries.
    pub ids: Option<Json<Vec<CellId>>>,
    /// The duplicated cell's form id (local to `master`).
    pub form_id: i32,
    /// Master plugin file name the form id belongs to.
    pub master: String,
}
pub async fn deduplicate_interior_cells(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
let mut page = 0;
loop {
info!("deduplicating interior cells page {}", page);
let duplicates = sqlx::query_as!(
CellDuplicates,
r#"SELECT
json_agg(
json_build_object(
'id', id,
'is_base_game', is_base_game
)) as "ids: Json<Vec<CellId>>",
form_id,
master
FROM cells
WHERE world_id IS NULL
GROUP BY (form_id, master)
HAVING COUNT(*) > 1
LIMIT $1
"#,
PAGE_SIZE,
)
.fetch_all(pool)
.await?;
if duplicates.is_empty() {
break;
}
for duplicate_cell in duplicates.into_iter() {
info!(
"deduplicating cells form_id={} master={}",
duplicate_cell.form_id, duplicate_cell.master
);
let duplicate_ids = duplicate_cell.ids.clone().unwrap();
let chosen_cell = duplicate_ids
.iter()
.find(|cell| cell.is_base_game)
.unwrap_or_else(|| {
duplicate_ids
.iter()
.next()
.expect("duplicate cell has no ids")
});
info!(
"choosing cell_id={} is_base_game={}",
chosen_cell.id, chosen_cell.is_base_game
);
// Update all plugin_cells cell_id references to point to the chosen cell
let duplicate_ids = duplicate_cell
.ids
.unwrap()
.iter()
.map(|cell| cell.id)
.collect::<Vec<_>>();
// First, I need to fix-up any duplicated plugin_cells rows caused by broken
// plugins that have multiple cells with the same form_id. For these duplicate
// plugin_cells with the same plugin_id, I just arbitrarily choose one and delete
// the others (since it's undefined behavior of which duplicate record should "win"
// out in this case anyways). In the case of exterior cells, where the duplicate
// interior cell bug is not a problem, the last processed cell record in the plugin
// wins since `process_plugin` uses an upsert method which updates existing
// `plugin_cells` if it tries to insert a new one that conflicts with an existing one.
// So I am effectively retroactively doing the same here for interior cells.
let plugin_cells_delete = sqlx::query!(
r#"DELETE FROM plugin_cells
WHERE id NOT IN (
SELECT MIN(id)
FROM plugin_cells
WHERE cell_id = ANY($1)
GROUP BY plugin_id
)
AND cell_id = ANY($1)
"#,
&duplicate_ids
)
.execute(pool)
.await?;
info!(
"deleted {} duplicate plugin_cells from broken plugins",
plugin_cells_delete.rows_affected()
);
let update = sqlx::query!(
r#"UPDATE plugin_cells
SET
cell_id = $1,
updated_at = now()
WHERE cell_id = ANY($2)"#,
chosen_cell.id,
&duplicate_ids
)
.execute(pool)
.await?;
info!("updated {} plugin_cells", update.rows_affected());
// Delete all cells that are not the chosen cell
let delete = sqlx::query!(
r#"DELETE FROM cells
WHERE id != $1 AND id = ANY($2)"#,
chosen_cell.id,
&duplicate_ids
)
.execute(pool)
.await?;
info!("deleted {} cells", delete.rows_affected());
}
page += 1;
}
Ok(())
}

View File

@@ -0,0 +1,71 @@
use std::borrow::Borrow;
use std::fs::File;
use std::io::BufReader;
use anyhow::{Context, Result};
use skyrim_cell_dump::Plugin;
use tracing::info;
use crate::models::cell::{self, UnsavedCell};
use crate::models::world::{self, UnsavedWorld};
use crate::plugin_processor::get_local_form_id_and_master;
/// Marks every world and cell defined by the game's own `Skyrim.esm` in the
/// database, setting `is_base_game = true` on the cells.
///
/// Reads a pre-dumped `data/skyrim.json` plugin dump, upserts its worlds, then
/// upserts its cells with `is_base_game = true` so they can be distinguished
/// from mod-provided cells that reuse the `Skyrim.esm` master name.
pub async fn backfill_is_base_game(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
    let file_name = "Skyrim.esm";
    let reader = BufReader::new(File::open("./data/skyrim.json")?);
    let plugin: Plugin =
        serde_json::from_reader(reader).context("failed to deserialize data/skyrim.json")?;
    let masters: Vec<&str> = plugin.header.masters.iter().map(|s| s.borrow()).collect();

    // Upsert every world record from the base-game plugin.
    let mut base_worlds: Vec<UnsavedWorld> = Vec::with_capacity(plugin.worlds.len());
    for world in &plugin.worlds {
        let (form_id, master) = get_local_form_id_and_master(world.form_id, &masters, file_name)
            .expect("form_id to be a valid i32");
        base_worlds.push(UnsavedWorld { form_id, master });
    }
    let db_worlds = world::batched_insert(pool, &base_worlds).await?;
    info!("Upserted {} Skyrim.esm base worlds", db_worlds.len());

    // Upsert every cell record, resolving exterior cells to their saved world.
    let mut base_cells: Vec<UnsavedCell> = Vec::with_capacity(plugin.cells.len());
    for cell in &plugin.cells {
        let world_id = match cell.world_form_id {
            Some(world_form_id) => {
                let (form_id, master) =
                    get_local_form_id_and_master(world_form_id, &masters, file_name)
                        .expect("form_id to be valid i32");
                let world = db_worlds
                    .iter()
                    .find(|&world| world.form_id == form_id && world.master == master)
                    .expect("cell references world in the plugin worlds");
                Some(world.id)
            }
            // Interior cells have no containing world.
            None => None,
        };
        let (form_id, master) = get_local_form_id_and_master(cell.form_id, &masters, file_name)
            .expect("form_id is a valid i32");
        base_cells.push(UnsavedCell {
            form_id,
            master,
            x: cell.x,
            y: cell.y,
            world_id,
            is_persistent: cell.is_persistent,
            is_base_game: true, // the whole point of this function
        });
    }
    let db_cells = cell::batched_insert(pool, &base_cells).await?;
    info!("Upserted {} Skyrim.esm base cells", db_cells.len());
    // This works for exterior cells, but there's a bug with the unique index on cells that
    // creates duplicate interior cells. To fix that, I need to upgrade postgres to
    // 15 or later, migrate the data to the new db cluster, consolidate all of the duplicate
    // cells into one cell in a separate backfill command, then fix the unique index.
    Ok(())
}

View File

@@ -1,3 +1,7 @@
// Backfill commands that repair or augment existing rows in the database.
pub mod deduplicate_interior_cells;
pub mod is_base_game;
pub mod is_translation;

pub use deduplicate_interior_cells::deduplicate_interior_cells;
pub use is_base_game::backfill_is_base_game;
pub use is_translation::backfill_is_translation;

View File

@@ -10,7 +10,7 @@ pub async fn dump_cell_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> Res
let mut cell_count = 0;
for x in -77..75 {
for y in -50..44 {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y).await {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y, true).await {
let path = format!("{}/{}", &dir, x);
let path = Path::new(&path);
create_dir_all(path)?;

View File

@@ -13,7 +13,8 @@ mod nexus_scraper;
mod plugin_processor;
use commands::{
backfills::backfill_is_translation, download_tiles, dump_cell_data, dump_cell_edit_counts,
backfills::backfill_is_base_game, backfills::backfill_is_translation,
backfills::deduplicate_interior_cells, download_tiles, dump_cell_data, dump_cell_edit_counts,
dump_file_data, dump_games, dump_mod_cell_counts, dump_mod_data, dump_mod_search_index,
dump_plugin_data, update,
};
@@ -77,6 +78,14 @@ struct Args {
#[argh(switch)]
backfill_is_translation: bool,
/// backfill the is_base_game column in the cells table (for Skyrim.esm)
#[argh(switch)]
backfill_is_base_game: bool,
/// deduplicate the interior cells with same form_id and master
#[argh(switch)]
deduplicate_interior_cells: bool,
/// when dumping data, only dump data for mods or files that have been updated since this date
#[argh(option, short = 'u')]
updated_after: Option<NaiveDateTime>,
@@ -125,6 +134,12 @@ pub async fn main() -> Result<()> {
if args.backfill_is_translation {
return backfill_is_translation(&pool).await;
}
if args.backfill_is_base_game {
return backfill_is_base_game(&pool).await;
}
if args.deduplicate_interior_cells {
return deduplicate_interior_cells(&pool).await;
}
update(&pool, args.page, &args.game, args.full).await
}

View File

@@ -15,6 +15,7 @@ pub struct Cell {
pub y: Option<i32>,
pub world_id: Option<i32>,
pub is_persistent: bool,
pub is_base_game: bool,
pub updated_at: NaiveDateTime,
pub created_at: NaiveDateTime,
}
@@ -27,6 +28,7 @@ pub struct UnsavedCell<'a> {
pub y: Option<i32>,
pub world_id: Option<i32>,
pub is_persistent: bool,
pub is_base_game: bool,
}
#[derive(Debug, Serialize, Deserialize, FromRow)]
@@ -50,22 +52,24 @@ pub async fn insert(
y: Option<i32>,
world_id: Option<i32>,
is_persistent: bool,
is_base_game: bool,
) -> Result<Cell> {
sqlx::query_as!(
Cell,
"INSERT INTO cells
(form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, now(), now())
(form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, now(), now())
ON CONFLICT (form_id, master, world_id) DO UPDATE
SET (x, y, is_persistent, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
SET (x, y, is_persistent, is_base_game, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
RETURNING *",
form_id,
master,
x,
y,
world_id,
is_persistent
is_persistent,
is_base_game
)
.fetch_one(pool)
.await
@@ -85,6 +89,7 @@ pub async fn batched_insert<'a>(
let mut ys: Vec<Option<i32>> = vec![];
let mut world_ids: Vec<Option<i32>> = vec![];
let mut is_persistents: Vec<bool> = vec![];
let mut is_base_games: Vec<bool> = vec![];
batch.iter().for_each(|unsaved_cell| {
form_ids.push(unsaved_cell.form_id);
masters.push(unsaved_cell.master);
@@ -92,15 +97,16 @@ pub async fn batched_insert<'a>(
ys.push(unsaved_cell.y);
world_ids.push(unsaved_cell.world_id);
is_persistents.push(unsaved_cell.is_persistent);
is_base_games.push(unsaved_cell.is_base_game);
});
saved_cells.append(
// sqlx doesn't understand arrays of Options with the query_as! macro
&mut sqlx::query_as(
r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[])
r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[], $7::bool[])
ON CONFLICT (form_id, master, world_id) DO UPDATE
SET (x, y, is_persistent, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
SET (x, y, is_persistent, is_base_game, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
RETURNING *"#,
)
.bind(&form_ids)
@@ -109,6 +115,7 @@ pub async fn batched_insert<'a>(
.bind(&ys)
.bind(&world_ids)
.bind(&is_persistents)
.bind(&is_base_games)
.fetch_all(pool)
.await
.context("Failed to insert cells")?,
@@ -151,31 +158,62 @@ pub async fn get_cell_data(
world_id: i32,
x: i32,
y: i32,
is_base_game_only: bool,
) -> Result<CellData> {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
if is_base_game_only {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
} else {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4 AND is_base_game = true
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
}
}

View File

@@ -15,7 +15,7 @@ use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell};
use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld};
use crate::models::{world, world::UnsavedWorld};
fn get_local_form_id_and_master<'a>(
pub fn get_local_form_id_and_master<'a>(
form_id: u32,
masters: &'a [&str],
file_name: &'a str,
@@ -124,6 +124,7 @@ pub async fn process_plugin(
y: cell.y,
world_id,
is_persistent: cell.is_persistent,
is_base_game: false,
}
})
.collect();