Fix bad interior cell data in db

I had a bad unique index on the `cells` table. This commit does not fix
it, but includes two backfill scripts that will allow me to fix up the data
before creating the new unique index.

`is_base_game` identifies cells that are from the `Skyrim.esm` shipped
with the game (and not from some mod that happens to also have a
`Skyrim.esm` with a 0,0 cell with a different form id, see:
https://github.com/thallada/modmapper-web/issues/8).

`dump_cell_data` now only dumps data from cells with `is_base_game` set.

`deduplicate_interior_cells` removes duplicate cell and plugin_cell rows
caused by the buggy index.

I will need to upgrade the database and create a new index in a later
commit.
This commit is contained in:
Tyler Hallada 2023-11-08 01:05:19 -05:00
parent 56864c7f29
commit e85edff30a
11 changed files with 984 additions and 669 deletions

1297
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -21,6 +21,7 @@ postgres=# ALTER DATABASE modmapper OWNER TO modmapper;
& 'C:\Program Files\PostgreSQL\13\bin\createdb.exe' -U postgres modmapper
& 'C:\Program Files\PostgreSQL\13\bin\psql.exe' -U postgres
postgres=# ALTER DATABASE modmapper OWNER TO modmapper;
postgres=# ALTER USER modmapper CREATEDB;
\password modmapper
```
@ -36,14 +37,20 @@ RUST_LOG=mod_mapper=debug
[`sqlx_cli`](https://github.com/launchbadge/sqlx/tree/master/sqlx-cli) with
`cargo install sqlx-cli --no-default-features --features postgres`
5. Run `sqlx migrate run --source migrations` which will run all the database migrations.
6. Get your personal Nexus API token from your profile settings and add it to the `.env` file:
6. Get your personal Nexus API token from your profile settings and add it to
the `.env` file:
```
NEXUS_API_KEY=...
```
7. Build the release binary by running `cargo build --release`.
8. See `./target/release/modmapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.
8. Run `./target/release/modmapper --backfill-is-game-cell` to pre-populate the
database with worlds and cells from the base game's Skyrim.esm. (This is so
that the base game cells can later be differentiated from cells in plugins that
also happen to be named Skyrim.esm and have cells that reference a world with
the same form ID as Tamriel.)
9. See `./target/release/modmapper -h` for further commands or run `./scripts/update.sh` to start populating the database with scraped mods and dumping the data to JSON files.
## Sync and Backup Setup

1
data/skyrim.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
ALTER TABLE "cells" ADD COLUMN "is_base_game" BOOLEAN NOT NULL DEFAULT false;

View File

@ -0,0 +1,138 @@
/// There was a bug with the unique index on the `cells` table that was causing the same form_id
/// and master to be inserted multiple times. This function deduplicates those, choosing the cell
/// with `is_base_game = true` if there is one, otherwise randomly chooses one.
/// Rows referencing the duplicate cells are updated to reference the chosen cell.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgDatabaseError;
use sqlx::types::Json;
use sqlx::FromRow;
use tracing::{info, warn};
/// Number of duplicate (form_id, master) groups fetched and processed per
/// iteration of the dedup loop.
const PAGE_SIZE: i64 = 100;
/// Minimal projection of a `cells` row used during deduplication: the primary
/// key plus the flag marking rows that came from the base game's Skyrim.esm
/// (as opposed to a mod plugin that is also named Skyrim.esm).
#[derive(Debug, Clone, Deserialize, Serialize, FromRow)]
pub struct CellId {
    pub id: i32,
    pub is_base_game: bool,
}
/// One group of duplicate interior cells: every row sharing the same
/// `(form_id, master)` pair, with the row ids gathered by `json_agg` in the
/// grouped query below.
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct CellDuplicates {
    // Aggregated ids of all rows in the group; produced by json_agg so it is
    // nullable at the SQL type level, but HAVING COUNT(*) > 1 guarantees rows.
    pub ids: Option<Json<Vec<CellId>>>,
    pub form_id: i32,
    pub master: String,
}
/// Collapses interior cells (`world_id IS NULL`) that share the same
/// `(form_id, master)` pair down to a single row, repointing `plugin_cells`
/// references at the surviving row and deleting the rest.
///
/// For each duplicate group the row with `is_base_game = true` is preferred;
/// otherwise the choice is arbitrary.
pub async fn deduplicate_interior_cells(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
    let mut page = 0;
    loop {
        info!("deduplicating interior cells page {}", page);
        // NOTE: no OFFSET is needed here. Every group processed below is
        // collapsed to a single row, so it stops matching `HAVING COUNT(*) > 1`
        // and the next iteration's query returns a fresh batch. `page` exists
        // only for log output.
        let duplicates = sqlx::query_as!(
            CellDuplicates,
            r#"SELECT
                json_agg(
                    json_build_object(
                        'id', id,
                        'is_base_game', is_base_game
                    )) as "ids: Json<Vec<CellId>>",
                form_id,
                master
            FROM cells
            WHERE world_id IS NULL
            GROUP BY (form_id, master)
            HAVING COUNT(*) > 1
            LIMIT $1
            "#,
            PAGE_SIZE,
        )
        .fetch_all(pool)
        .await?;
        if duplicates.is_empty() {
            break;
        }
        for duplicate_cell in duplicates.into_iter() {
            info!(
                "deduplicating cells form_id={} master={}",
                duplicate_cell.form_id, duplicate_cell.master
            );
            // Unpack the aggregated ids exactly once. `json_agg` over a group
            // that passed `HAVING COUNT(*) > 1` always yields rows, so a missing
            // aggregate is a bug, not a recoverable state.
            let ids = duplicate_cell
                .ids
                .expect("duplicate cell group is missing aggregated ids");
            // Prefer the row backfilled from the shipped Skyrim.esm; otherwise
            // the pick is arbitrary, so the first element is as good as any.
            let chosen_cell = ids
                .iter()
                .find(|cell| cell.is_base_game)
                .unwrap_or_else(|| ids.first().expect("duplicate cell has no ids"));
            info!(
                "choosing cell_id={} is_base_game={}",
                chosen_cell.id, chosen_cell.is_base_game
            );
            let chosen_cell_id = chosen_cell.id;
            let duplicate_ids: Vec<i32> = ids.iter().map(|cell| cell.id).collect();
            // First, fix up any duplicated plugin_cells rows caused by broken
            // plugins that contain multiple cells with the same form_id. For
            // duplicate plugin_cells sharing a plugin_id, arbitrarily keep one
            // and delete the others (which duplicate should "win" is undefined
            // behavior in this case anyway). For exterior cells — where the
            // duplicate interior cell bug is not a problem — the last processed
            // cell record in a plugin wins because `process_plugin` upserts
            // `plugin_cells` on conflict; this retroactively does the
            // equivalent for interior cells.
            let plugin_cells_delete = sqlx::query!(
                r#"DELETE FROM plugin_cells
                WHERE id NOT IN (
                    SELECT MIN(id)
                    FROM plugin_cells
                    WHERE cell_id = ANY($1)
                    GROUP BY plugin_id
                )
                AND cell_id = ANY($1)
                "#,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!(
                "deleted {} duplicate plugin_cells from broken plugins",
                plugin_cells_delete.rows_affected()
            );
            // Repoint every remaining plugin_cells reference at the survivor.
            let update = sqlx::query!(
                r#"UPDATE plugin_cells
                SET
                    cell_id = $1,
                    updated_at = now()
                WHERE cell_id = ANY($2)"#,
                chosen_cell_id,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!("updated {} plugin_cells", update.rows_affected());
            // Delete every duplicate cell except the chosen one.
            let delete = sqlx::query!(
                r#"DELETE FROM cells
                WHERE id != $1 AND id = ANY($2)"#,
                chosen_cell_id,
                &duplicate_ids
            )
            .execute(pool)
            .await?;
            info!("deleted {} cells", delete.rows_affected());
        }
        page += 1;
    }
    Ok(())
}

View File

@ -0,0 +1,71 @@
use std::borrow::Borrow;
use std::fs::File;
use std::io::BufReader;
use anyhow::{Context, Result};
use skyrim_cell_dump::Plugin;
use tracing::info;
use crate::models::cell::{self, UnsavedCell};
use crate::models::world::{self, UnsavedWorld};
use crate::plugin_processor::get_local_form_id_and_master;
/// Seeds the database with the worlds and cells of the base game's Skyrim.esm
/// (pre-parsed into `./data/skyrim.json`), marking every inserted cell with
/// `is_base_game = true`.
///
/// This lets base-game cells later be distinguished from cells defined in mod
/// plugins that also happen to be named Skyrim.esm and reference a world with
/// the same form ID (see https://github.com/thallada/modmapper-web/issues/8).
pub async fn backfill_is_base_game(pool: &sqlx::Pool<sqlx::Postgres>) -> Result<()> {
    let file =
        File::open("./data/skyrim.json").context("failed to open data/skyrim.json")?;
    let reader = BufReader::new(file);
    let plugin: Plugin =
        serde_json::from_reader(reader).context("failed to deserialize data/skyrim.json")?;
    let file_name = "Skyrim.esm";
    let masters: Vec<&str> = plugin.header.masters.iter().map(|s| s.borrow()).collect();
    // Upsert every world defined by the base game first so cells can link to them.
    let base_worlds: Vec<UnsavedWorld> = plugin
        .worlds
        .iter()
        .map(|world| {
            let (form_id, master) =
                get_local_form_id_and_master(world.form_id, &masters, file_name)
                    .expect("form_id to be a valid i32");
            UnsavedWorld { form_id, master }
        })
        .collect();
    let db_worlds = world::batched_insert(pool, &base_worlds).await?;
    info!("Upserted {} Skyrim.esm base worlds", db_worlds.len());
    let base_cells: Vec<UnsavedCell> = plugin
        .cells
        .iter()
        .map(|cell| {
            // Resolve the cell's world reference (if any) to the id of the
            // world row upserted above; interior cells carry no world form id.
            let world_id = cell.world_form_id.map(|world_form_id| {
                let (form_id, master) =
                    get_local_form_id_and_master(world_form_id, &masters, file_name)
                        .expect("form_id to be a valid i32");
                db_worlds
                    .iter()
                    .find(|&world| world.form_id == form_id && world.master == master)
                    .expect("cell references world in the plugin worlds")
                    .id
            });
            let (form_id, master) =
                get_local_form_id_and_master(cell.form_id, &masters, file_name)
                    .expect("form_id to be a valid i32");
            UnsavedCell {
                form_id,
                master,
                x: cell.x,
                y: cell.y,
                world_id,
                is_persistent: cell.is_persistent,
                is_base_game: true, // the whole point of this backfill
            }
        })
        .collect();
    let db_cells = cell::batched_insert(pool, &base_cells).await?;
    info!("Upserted {} Skyrim.esm base cells", db_cells.len());
    // This works for exterior cells, but there's a bug with the unique index on
    // cells that creates duplicate interior cells. To fix that, I need to
    // upgrade postgres to 15 or later, migrate the data to the new db cluster,
    // consolidate all of the duplicate cells into one cell in a separate
    // backfill command, then fix the unique index.
    Ok(())
}

View File

@ -1,3 +1,7 @@
// Backfill commands, re-exported so callers can use the function names
// directly. Kept in alphabetical order.
pub mod deduplicate_interior_cells;
pub mod is_base_game;
pub mod is_translation;

pub use deduplicate_interior_cells::deduplicate_interior_cells;
pub use is_base_game::backfill_is_base_game;
pub use is_translation::backfill_is_translation;

View File

@ -10,7 +10,7 @@ pub async fn dump_cell_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> Res
let mut cell_count = 0;
for x in -77..75 {
for y in -50..44 {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y).await {
if let Ok(data) = cell::get_cell_data(pool, "Skyrim.esm", 1, x, y, true).await {
let path = format!("{}/{}", &dir, x);
let path = Path::new(&path);
create_dir_all(path)?;

View File

@ -13,7 +13,8 @@ mod nexus_scraper;
mod plugin_processor;
use commands::{
backfills::backfill_is_translation, download_tiles, dump_cell_data, dump_cell_edit_counts,
backfills::backfill_is_base_game, backfills::backfill_is_translation,
backfills::deduplicate_interior_cells, download_tiles, dump_cell_data, dump_cell_edit_counts,
dump_file_data, dump_games, dump_mod_cell_counts, dump_mod_data, dump_mod_search_index,
dump_plugin_data, update,
};
@ -77,6 +78,14 @@ struct Args {
#[argh(switch)]
backfill_is_translation: bool,
/// backfill the is_base_game column in the cells table (for Skyrim.esm)
#[argh(switch)]
backfill_is_base_game: bool,
/// deduplicate the interior cells with same form_id and master
#[argh(switch)]
deduplicate_interior_cells: bool,
/// when dumping data, only dump data for mods or files that have been updated since this date
#[argh(option, short = 'u')]
updated_after: Option<NaiveDateTime>,
@ -125,6 +134,12 @@ pub async fn main() -> Result<()> {
if args.backfill_is_translation {
return backfill_is_translation(&pool).await;
}
if args.backfill_is_base_game {
return backfill_is_base_game(&pool).await;
}
if args.deduplicate_interior_cells {
return deduplicate_interior_cells(&pool).await;
}
update(&pool, args.page, &args.game, args.full).await
}

View File

@ -15,6 +15,7 @@ pub struct Cell {
pub y: Option<i32>,
pub world_id: Option<i32>,
pub is_persistent: bool,
pub is_base_game: bool,
pub updated_at: NaiveDateTime,
pub created_at: NaiveDateTime,
}
@ -27,6 +28,7 @@ pub struct UnsavedCell<'a> {
pub y: Option<i32>,
pub world_id: Option<i32>,
pub is_persistent: bool,
pub is_base_game: bool,
}
#[derive(Debug, Serialize, Deserialize, FromRow)]
@ -50,22 +52,24 @@ pub async fn insert(
y: Option<i32>,
world_id: Option<i32>,
is_persistent: bool,
is_base_game: bool,
) -> Result<Cell> {
sqlx::query_as!(
Cell,
"INSERT INTO cells
(form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, now(), now())
(form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
VALUES ($1, $2, $3, $4, $5, $6, $7, now(), now())
ON CONFLICT (form_id, master, world_id) DO UPDATE
SET (x, y, is_persistent, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
SET (x, y, is_persistent, is_base_game, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
RETURNING *",
form_id,
master,
x,
y,
world_id,
is_persistent
is_persistent,
is_base_game
)
.fetch_one(pool)
.await
@ -85,6 +89,7 @@ pub async fn batched_insert<'a>(
let mut ys: Vec<Option<i32>> = vec![];
let mut world_ids: Vec<Option<i32>> = vec![];
let mut is_persistents: Vec<bool> = vec![];
let mut is_base_games: Vec<bool> = vec![];
batch.iter().for_each(|unsaved_cell| {
form_ids.push(unsaved_cell.form_id);
masters.push(unsaved_cell.master);
@ -92,15 +97,16 @@ pub async fn batched_insert<'a>(
ys.push(unsaved_cell.y);
world_ids.push(unsaved_cell.world_id);
is_persistents.push(unsaved_cell.is_persistent);
is_base_games.push(unsaved_cell.is_base_game);
});
saved_cells.append(
// sqlx doesn't understand arrays of Options with the query_as! macro
&mut sqlx::query_as(
r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, created_at, updated_at)
SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[])
r#"INSERT INTO cells (form_id, master, x, y, world_id, is_persistent, is_base_game, created_at, updated_at)
SELECT *, now(), now() FROM UNNEST($1::int[], $2::text[], $3::int[], $4::int[], $5::int[], $6::bool[], $7::bool[])
ON CONFLICT (form_id, master, world_id) DO UPDATE
SET (x, y, is_persistent, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, now())
SET (x, y, is_persistent, is_base_game, updated_at) =
(EXCLUDED.x, EXCLUDED.y, EXCLUDED.is_persistent, EXCLUDED.is_base_game, now())
RETURNING *"#,
)
.bind(&form_ids)
@ -109,6 +115,7 @@ pub async fn batched_insert<'a>(
.bind(&ys)
.bind(&world_ids)
.bind(&is_persistents)
.bind(&is_base_games)
.fetch_all(pool)
.await
.context("Failed to insert cells")?,
@ -151,31 +158,62 @@ pub async fn get_cell_data(
world_id: i32,
x: i32,
y: i32,
is_base_game_only: bool,
) -> Result<CellData> {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
if is_base_game_only {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
} else {
sqlx::query_as!(
CellData,
r#"SELECT
cells.x,
cells.y,
cells.is_persistent,
cells.form_id,
COUNT(DISTINCT plugins.id) as plugins_count,
COUNT(DISTINCT files.id) as files_count,
COUNT(DISTINCT mods.id) as mods_count,
json_agg(DISTINCT mods.*) as mods
FROM cells
JOIN plugin_cells on cells.id = cell_id
JOIN plugins ON plugins.id = plugin_id
JOIN files ON files.id = plugins.file_id
JOIN mods ON mods.id = files.mod_id
WHERE cells.master = $1 AND cells.world_id = $2 AND cells.x = $3 and cells.y = $4 AND is_base_game = true
GROUP BY cells.x, cells.y, cells.is_persistent, cells.form_id"#,
master,
world_id,
x,
y
)
.fetch_one(pool)
.await
.context("Failed get cell data")
}
}

View File

@ -15,7 +15,7 @@ use crate::models::{plugin_cell, plugin_cell::UnsavedPluginCell};
use crate::models::{plugin_world, plugin_world::UnsavedPluginWorld};
use crate::models::{world, world::UnsavedWorld};
fn get_local_form_id_and_master<'a>(
pub fn get_local_form_id_and_master<'a>(
form_id: u32,
masters: &'a [&str],
file_name: &'a str,
@ -124,6 +124,7 @@ pub async fn process_plugin(
y: cell.y,
world_id,
is_persistent: cell.is_persistent,
is_base_game: false,
}
})
.collect();