M1-rework: harden legacy-migration reconciliation to full-row equality

Audit finding (review-notes/M1-full-review-1.md, FINDING 1): _reconcile only
checked primary-key presence. A source row that INSERT OR IGNORE skipped
because a same-PK row already existed in the target would therefore pass
reconciliation even when the two rows differed in value. Compare ALL columns
with SQLite's NULL-safe IS operator instead, so reconciliation is a true
full-row guarantee (idempotent re-runs still pass because the rows match
column-for-column). Add tests for the value-mismatch abort and for
idempotency under full-row reconciliation. Remove the now-unused pk_cols
parameter.

pytest 97 passed; ruff clean (pre-existing only); data-safety grep still empty.
This commit is contained in:
2026-06-12 19:05:56 +02:00
parent 2f634006d2
commit 1cbe6c46d2
2 changed files with 70 additions and 13 deletions
+12 -12
View File
@@ -53,17 +53,20 @@ def _sqlite_path_from_url(url: str) -> Path:
def _reconcile(
conn: sqlite3.Connection,
table: str,
pk_cols: list[str],
columns: list[str],
source_count: int,
) -> int:
"""Verify every legacy source row is present in the main (app) table.
Returns the count of source rows present in main.
Raises RuntimeError if any rows are missing.
Matches on ALL columns using SQLite's NULL-safe IS operator so that nullable
columns (e.g. altitude) compare correctly. A row that was silently skipped
by INSERT OR IGNORE due to a value difference will NOT satisfy this predicate
even if its primary key is present in the target.
Returns the count of source rows whose full-row data is present in main.
Raises RuntimeError if any rows are missing or differ in value.
"""
join_cond = " AND ".join(
f"m.{col} = l.{col}" for col in pk_cols
)
join_cond = " AND ".join(f"m.{col} IS l.{col}" for col in columns)
sql = (
f"SELECT COUNT(*) FROM legacy.{table} l "
f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})"
@@ -73,7 +76,7 @@ def _reconcile(
missing = source_count - present
raise RuntimeError(
f"Reconciliation failed for table '{table}': "
f"{missing} of {source_count} source rows are missing from the app DB."
f"{missing} of {source_count} source rows are missing or differing in the app DB."
)
return present
@@ -114,7 +117,6 @@ def migrate_legacy_data(
legacy_url=location_url,
table="location",
columns=["person", "datetime", "latitude", "longitude", "altitude"],
pk_cols=["person", "datetime"],
dry_run=dry_run,
)
@@ -124,7 +126,6 @@ def migrate_legacy_data(
legacy_url=poo_url,
table="poo_records",
columns=["timestamp", "status", "latitude", "longitude"],
pk_cols=["timestamp"],
dry_run=dry_run,
)
@@ -137,7 +138,6 @@ def _migrate_table(
legacy_url: str | None,
table: str,
columns: list[str],
pk_cols: list[str],
dry_run: bool,
) -> dict:
"""Migrate a single table from a legacy DB into the app DB.
@@ -186,8 +186,8 @@ def _migrate_table(
(after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
copied = after_count - before_count
# Reconciliation: every source row must be present
_reconcile(conn, table, pk_cols, source_count)
# Reconciliation: every source row must be present with matching values
_reconcile(conn, table, columns, source_count)
conn.execute("DETACH DATABASE legacy")
finally: