M1-rework: harden legacy-migration reconciliation to full-row equality
Audit finding (review-notes/M1-full-review-1.md, FINDING 1): _reconcile only checked primary-key presence, so a source row skipped by INSERT OR IGNORE due to a value difference against a pre-existing same-PK target row would false-pass. Compare ALL columns with SQLite's NULL-safe IS operator instead, so reconciliation is a true full-row guarantee (idempotent re-runs still pass because the rows match column-for-column). Add tests for the value-mismatch abort and for idempotency under full-row reconciliation. Remove the now-unused pk_cols parameter. pytest 97 passed; ruff clean (pre-existing only); data-safety grep still empty.
This commit is contained in:
@@ -53,17 +53,20 @@ def _sqlite_path_from_url(url: str) -> Path:
|
|||||||
def _reconcile(
|
def _reconcile(
|
||||||
conn: sqlite3.Connection,
|
conn: sqlite3.Connection,
|
||||||
table: str,
|
table: str,
|
||||||
pk_cols: list[str],
|
columns: list[str],
|
||||||
source_count: int,
|
source_count: int,
|
||||||
) -> int:
|
) -> int:
|
||||||
"""Verify every legacy source row is present in the main (app) table.
|
"""Verify every legacy source row is present in the main (app) table.
|
||||||
|
|
||||||
Returns the count of source rows present in main.
|
Matches on ALL columns using SQLite's NULL-safe IS operator so that nullable
|
||||||
Raises RuntimeError if any rows are missing.
|
columns (e.g. altitude) compare correctly. A row that was silently skipped
|
||||||
|
by INSERT OR IGNORE due to a value difference will NOT satisfy this predicate
|
||||||
|
even if its primary key is present in the target.
|
||||||
|
|
||||||
|
Returns the count of source rows whose full-row data is present in main.
|
||||||
|
Raises RuntimeError if any rows are missing or differ in value.
|
||||||
"""
|
"""
|
||||||
join_cond = " AND ".join(
|
join_cond = " AND ".join(f"m.{col} IS l.{col}" for col in columns)
|
||||||
f"m.{col} = l.{col}" for col in pk_cols
|
|
||||||
)
|
|
||||||
sql = (
|
sql = (
|
||||||
f"SELECT COUNT(*) FROM legacy.{table} l "
|
f"SELECT COUNT(*) FROM legacy.{table} l "
|
||||||
f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})"
|
f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})"
|
||||||
@@ -73,7 +76,7 @@ def _reconcile(
|
|||||||
missing = source_count - present
|
missing = source_count - present
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Reconciliation failed for table '{table}': "
|
f"Reconciliation failed for table '{table}': "
|
||||||
f"{missing} of {source_count} source rows are missing from the app DB."
|
f"{missing} of {source_count} source rows are missing or differing in the app DB."
|
||||||
)
|
)
|
||||||
return present
|
return present
|
||||||
|
|
||||||
@@ -114,7 +117,6 @@ def migrate_legacy_data(
|
|||||||
legacy_url=location_url,
|
legacy_url=location_url,
|
||||||
table="location",
|
table="location",
|
||||||
columns=["person", "datetime", "latitude", "longitude", "altitude"],
|
columns=["person", "datetime", "latitude", "longitude", "altitude"],
|
||||||
pk_cols=["person", "datetime"],
|
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -124,7 +126,6 @@ def migrate_legacy_data(
|
|||||||
legacy_url=poo_url,
|
legacy_url=poo_url,
|
||||||
table="poo_records",
|
table="poo_records",
|
||||||
columns=["timestamp", "status", "latitude", "longitude"],
|
columns=["timestamp", "status", "latitude", "longitude"],
|
||||||
pk_cols=["timestamp"],
|
|
||||||
dry_run=dry_run,
|
dry_run=dry_run,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -137,7 +138,6 @@ def _migrate_table(
|
|||||||
legacy_url: str | None,
|
legacy_url: str | None,
|
||||||
table: str,
|
table: str,
|
||||||
columns: list[str],
|
columns: list[str],
|
||||||
pk_cols: list[str],
|
|
||||||
dry_run: bool,
|
dry_run: bool,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Migrate a single table from a legacy DB into the app DB.
|
"""Migrate a single table from a legacy DB into the app DB.
|
||||||
@@ -186,8 +186,8 @@ def _migrate_table(
|
|||||||
(after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
|
(after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
|
||||||
copied = after_count - before_count
|
copied = after_count - before_count
|
||||||
|
|
||||||
# Reconciliation: every source row must be present
|
# Reconciliation: every source row must be present with matching values
|
||||||
_reconcile(conn, table, pk_cols, source_count)
|
_reconcile(conn, table, columns, source_count)
|
||||||
|
|
||||||
conn.execute("DETACH DATABASE legacy")
|
conn.execute("DETACH DATABASE legacy")
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
@@ -261,7 +261,7 @@ def test_reconcile_raises_on_missing_rows(tmp_path: Path) -> None:
|
|||||||
_reconcile(
|
_reconcile(
|
||||||
conn,
|
conn,
|
||||||
table="location",
|
table="location",
|
||||||
pk_cols=["person", "datetime"],
|
columns=["person", "datetime", "latitude", "longitude", "altitude"],
|
||||||
source_count=len(LOCATION_ROWS),
|
source_count=len(LOCATION_ROWS),
|
||||||
)
|
)
|
||||||
conn.execute("DETACH DATABASE legacy")
|
conn.execute("DETACH DATABASE legacy")
|
||||||
@@ -326,6 +326,63 @@ def test_cli_exits_nonzero_on_reconciliation_failure(
|
|||||||
assert exc_info.value.code != 0
|
assert exc_info.value.code != 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_reconcile_catches_value_mismatch_not_just_pk(tmp_path: Path) -> None:
|
||||||
|
"""Full-row reconciliation catches value mismatch that PK-only check would miss.
|
||||||
|
|
||||||
|
Scenario: the app DB is PRE-POPULATED with a row that shares the same PK as
|
||||||
|
a legacy source row but has DIFFERENT non-PK column values. INSERT OR IGNORE
|
||||||
|
skips the source row (PK conflict), so the target retains the stale data.
|
||||||
|
The old PK-only reconciliation would have incorrectly reported success.
|
||||||
|
The new full-row reconciliation must detect the mismatch and raise.
|
||||||
|
"""
|
||||||
|
app_path, app_url = _upgraded_app_db(tmp_path)
|
||||||
|
|
||||||
|
# Legacy source has a row: person="alice", datetime="2026-01-01T10:00:00Z",
|
||||||
|
# latitude=1.23, longitude=4.56, altitude=7.89
|
||||||
|
legacy_path = tmp_path / "locationRecorder.db"
|
||||||
|
_make_legacy_location_db(legacy_path, [("alice", "2026-01-01T10:00:00Z", 1.23, 4.56, 7.89)])
|
||||||
|
legacy_url = f"sqlite:///{legacy_path}"
|
||||||
|
|
||||||
|
# App DB is pre-populated with the SAME PK but DIFFERENT non-PK values
|
||||||
|
# (latitude/longitude/altitude all differ from the source row)
|
||||||
|
conn = sqlite3.connect(app_path)
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO location (person, datetime, latitude, longitude, altitude) "
|
||||||
|
"VALUES (?, ?, ?, ?, ?)",
|
||||||
|
("alice", "2026-01-01T10:00:00Z", 99.0, 99.0, 99.0),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
# migrate_legacy_data must raise: the source row's data is NOT in the target
|
||||||
|
# (INSERT OR IGNORE skipped it because of PK conflict, retaining the 99.0 values)
|
||||||
|
with pytest.raises(RuntimeError, match="Reconciliation failed"):
|
||||||
|
migrate_legacy_data(app_url, legacy_url, None)
|
||||||
|
|
||||||
|
|
||||||
|
def test_full_row_reconciliation_idempotent_on_identical_data(tmp_path: Path) -> None:
|
||||||
|
"""Second run on already-migrated data still reconciles cleanly.
|
||||||
|
|
||||||
|
When the target already holds identical rows (from the first run), the full-row
|
||||||
|
IS predicate matches every column and reconciliation passes (no raise).
|
||||||
|
"""
|
||||||
|
app_path, app_url = _upgraded_app_db(tmp_path)
|
||||||
|
legacy_path = tmp_path / "locationRecorder.db"
|
||||||
|
_make_legacy_location_db(legacy_path, LOCATION_ROWS)
|
||||||
|
legacy_url = f"sqlite:///{legacy_path}"
|
||||||
|
|
||||||
|
# First run: migrate all rows
|
||||||
|
result1 = migrate_legacy_data(app_url, legacy_url, None)
|
||||||
|
assert result1["location"]["copied"] == len(LOCATION_ROWS)
|
||||||
|
|
||||||
|
# Second run: rows already present, INSERT OR IGNORE skips all, full-row
|
||||||
|
# reconciliation must still pass because values are identical
|
||||||
|
result2 = migrate_legacy_data(app_url, legacy_url, None)
|
||||||
|
assert result2["location"]["copied"] == 0
|
||||||
|
assert result2["location"]["final"] == len(LOCATION_ROWS)
|
||||||
|
# No exception raised — idempotency holds under full-row reconciliation
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Test 4: dry_run
|
# Test 4: dry_run
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
Reference in New Issue
Block a user