M1-rework: harden legacy-migration reconciliation to full-row equality

Audit finding (review-notes/M1-full-review-1.md, FINDING 1): _reconcile only
checked primary-key presence, so a source row skipped by INSERT OR IGNORE due
to a value difference against a pre-existing same-PK target row would
false-pass. Compare ALL columns with SQLite's NULL-safe IS operator instead,
so reconciliation is a true full-row guarantee (idempotent re-runs still pass
because the rows match column-for-column). Add tests for the value-mismatch
abort and for idempotency under full-row reconciliation. Remove the now-unused
pk_cols parameter.

pytest 97 passed; ruff clean (pre-existing only); data-safety grep still empty.
This commit is contained in:
2026-06-12 19:05:56 +02:00
parent 2f634006d2
commit 1cbe6c46d2
2 changed files with 70 additions and 13 deletions
+12 -12
View File
@@ -53,17 +53,20 @@ def _sqlite_path_from_url(url: str) -> Path:
def _reconcile(
conn: sqlite3.Connection,
table: str,
pk_cols: list[str],
columns: list[str],
source_count: int,
) -> int:
"""Verify every legacy source row is present in the main (app) table.
Returns the count of source rows present in main.
Raises RuntimeError if any rows are missing.
Matches on ALL columns using SQLite's NULL-safe IS operator so that nullable
columns (e.g. altitude) compare correctly. A row that was silently skipped
by INSERT OR IGNORE due to a value difference will NOT satisfy this predicate
even if its primary key is present in the target.
Returns the count of source rows whose full-row data is present in main.
Raises RuntimeError if any rows are missing or differ in value.
"""
join_cond = " AND ".join(
f"m.{col} = l.{col}" for col in pk_cols
)
join_cond = " AND ".join(f"m.{col} IS l.{col}" for col in columns)
sql = (
f"SELECT COUNT(*) FROM legacy.{table} l "
f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})"
@@ -73,7 +76,7 @@ def _reconcile(
missing = source_count - present
raise RuntimeError(
f"Reconciliation failed for table '{table}': "
f"{missing} of {source_count} source rows are missing from the app DB."
f"{missing} of {source_count} source rows are missing or differing in the app DB."
)
return present
@@ -114,7 +117,6 @@ def migrate_legacy_data(
legacy_url=location_url,
table="location",
columns=["person", "datetime", "latitude", "longitude", "altitude"],
pk_cols=["person", "datetime"],
dry_run=dry_run,
)
@@ -124,7 +126,6 @@ def migrate_legacy_data(
legacy_url=poo_url,
table="poo_records",
columns=["timestamp", "status", "latitude", "longitude"],
pk_cols=["timestamp"],
dry_run=dry_run,
)
@@ -137,7 +138,6 @@ def _migrate_table(
legacy_url: str | None,
table: str,
columns: list[str],
pk_cols: list[str],
dry_run: bool,
) -> dict:
"""Migrate a single table from a legacy DB into the app DB.
@@ -186,8 +186,8 @@ def _migrate_table(
(after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
copied = after_count - before_count
# Reconciliation: every source row must be present
_reconcile(conn, table, pk_cols, source_count)
# Reconciliation: every source row must be present with matching values
_reconcile(conn, table, columns, source_count)
conn.execute("DETACH DATABASE legacy")
finally:
+58 -1
View File
@@ -261,7 +261,7 @@ def test_reconcile_raises_on_missing_rows(tmp_path: Path) -> None:
_reconcile(
conn,
table="location",
pk_cols=["person", "datetime"],
columns=["person", "datetime", "latitude", "longitude", "altitude"],
source_count=len(LOCATION_ROWS),
)
conn.execute("DETACH DATABASE legacy")
@@ -326,6 +326,63 @@ def test_cli_exits_nonzero_on_reconciliation_failure(
assert exc_info.value.code != 0
def test_reconcile_catches_value_mismatch_not_just_pk(tmp_path: Path) -> None:
"""Full-row reconciliation catches value mismatch that PK-only check would miss.
Scenario: the app DB is PRE-POPULATED with a row that shares the same PK as
a legacy source row but has DIFFERENT non-PK column values. INSERT OR IGNORE
skips the source row (PK conflict), so the target retains the stale data.
The old PK-only reconciliation would have incorrectly reported success.
The new full-row reconciliation must detect the mismatch and raise.
"""
app_path, app_url = _upgraded_app_db(tmp_path)
# Legacy source has a row: person="alice", datetime="2026-01-01T10:00:00Z",
# latitude=1.23, longitude=4.56, altitude=7.89
legacy_path = tmp_path / "locationRecorder.db"
_make_legacy_location_db(legacy_path, [("alice", "2026-01-01T10:00:00Z", 1.23, 4.56, 7.89)])
legacy_url = f"sqlite:///{legacy_path}"
# App DB is pre-populated with the SAME PK but DIFFERENT non-PK values
# (latitude/longitude/altitude all differ from the source row)
conn = sqlite3.connect(app_path)
conn.execute(
"INSERT INTO location (person, datetime, latitude, longitude, altitude) "
"VALUES (?, ?, ?, ?, ?)",
("alice", "2026-01-01T10:00:00Z", 99.0, 99.0, 99.0),
)
conn.commit()
conn.close()
# migrate_legacy_data must raise: the source row's data is NOT in the target
# (INSERT OR IGNORE skipped it because of PK conflict, retaining the 99.0 values)
with pytest.raises(RuntimeError, match="Reconciliation failed"):
migrate_legacy_data(app_url, legacy_url, None)
def test_full_row_reconciliation_idempotent_on_identical_data(tmp_path: Path) -> None:
"""Second run on already-migrated data still reconciles cleanly.
When the target already holds identical rows (from the first run), the full-row
IS predicate matches every column and reconciliation passes (no raise).
"""
app_path, app_url = _upgraded_app_db(tmp_path)
legacy_path = tmp_path / "locationRecorder.db"
_make_legacy_location_db(legacy_path, LOCATION_ROWS)
legacy_url = f"sqlite:///{legacy_path}"
# First run: migrate all rows
result1 = migrate_legacy_data(app_url, legacy_url, None)
assert result1["location"]["copied"] == len(LOCATION_ROWS)
# Second run: rows already present, INSERT OR IGNORE skips all, full-row
# reconciliation must still pass because values are identical
result2 = migrate_legacy_data(app_url, legacy_url, None)
assert result2["location"]["copied"] == 0
assert result2["location"]["final"] == len(LOCATION_ROWS)
# No exception raised — idempotency holds under full-row reconciliation
# ---------------------------------------------------------------------------
# Test 4: dry_run
# ---------------------------------------------------------------------------