diff --git a/scripts/migrate_legacy_data.py b/scripts/migrate_legacy_data.py index d36bfae..5cd675f 100644 --- a/scripts/migrate_legacy_data.py +++ b/scripts/migrate_legacy_data.py @@ -53,17 +53,20 @@ def _sqlite_path_from_url(url: str) -> Path: def _reconcile( conn: sqlite3.Connection, table: str, - pk_cols: list[str], + columns: list[str], source_count: int, ) -> int: """Verify every legacy source row is present in the main (app) table. - Returns the count of source rows present in main. - Raises RuntimeError if any rows are missing. + Matches on ALL columns using SQLite's NULL-safe IS operator so that nullable + columns (e.g. altitude) compare correctly. A row that was silently skipped + by INSERT OR IGNORE due to a value difference will NOT satisfy this predicate + even if its primary key is present in the target. + + Returns the count of source rows whose full-row data is present in main. + Raises RuntimeError if any rows are missing or differ in value. """ - join_cond = " AND ".join( - f"m.{col} = l.{col}" for col in pk_cols - ) + join_cond = " AND ".join(f"m.{col} IS l.{col}" for col in columns) sql = ( f"SELECT COUNT(*) FROM legacy.{table} l " f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})" @@ -73,7 +76,7 @@ def _reconcile( missing = source_count - present raise RuntimeError( f"Reconciliation failed for table '{table}': " - f"{missing} of {source_count} source rows are missing from the app DB." + f"{missing} of {source_count} source rows are missing or differing in the app DB." ) return present @@ -114,7 +117,6 @@ def migrate_legacy_data( legacy_url=location_url, table="location", columns=["person", "datetime", "latitude", "longitude", "altitude"], - pk_cols=["person", "datetime"], dry_run=dry_run, ) @@ -124,7 +126,6 @@ def migrate_legacy_data( legacy_url=poo_url, table="poo_records", columns=["timestamp", "status", "latitude", "longitude"], - pk_cols=["timestamp"], dry_run=dry_run, ) @@ -137,7 +138,6 @@ def _migrate_table( legacy_url: str | None, table: str, columns: list[str], - pk_cols: list[str], dry_run: bool, ) -> dict: """Migrate a single table from a legacy DB into the app DB. @@ -186,8 +186,8 @@ def _migrate_table( (after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone() copied = after_count - before_count - # Reconciliation: every source row must be present - _reconcile(conn, table, pk_cols, source_count) + # Reconciliation: every source row must be present with matching values + _reconcile(conn, table, columns, source_count) conn.execute("DETACH DATABASE legacy") finally: diff --git a/tests/test_migrate_legacy_data.py b/tests/test_migrate_legacy_data.py index bd6fa47..88dedd5 100644 --- a/tests/test_migrate_legacy_data.py +++ b/tests/test_migrate_legacy_data.py @@ -261,7 +261,7 @@ def test_reconcile_raises_on_missing_rows(tmp_path: Path) -> None: _reconcile( conn, table="location", - pk_cols=["person", "datetime"], + columns=["person", "datetime", "latitude", "longitude", "altitude"], source_count=len(LOCATION_ROWS), ) conn.execute("DETACH DATABASE legacy") @@ -326,6 +326,63 @@ def test_cli_exits_nonzero_on_reconciliation_failure( assert exc_info.value.code != 0 +def test_reconcile_catches_value_mismatch_not_just_pk(tmp_path: Path) -> None: + """Full-row reconciliation catches value mismatch that PK-only check would miss. + + Scenario: the app DB is PRE-POPULATED with a row that shares the same PK as + a legacy source row but has DIFFERENT non-PK column values. INSERT OR IGNORE + skips the source row (PK conflict), so the target retains the stale data. + The old PK-only reconciliation would have incorrectly reported success. + The new full-row reconciliation must detect the mismatch and raise. + """ + app_path, app_url = _upgraded_app_db(tmp_path) + + # Legacy source has a row: person="alice", datetime="2026-01-01T10:00:00Z", + # latitude=1.23, longitude=4.56, altitude=7.89 + legacy_path = tmp_path / "locationRecorder.db" + _make_legacy_location_db(legacy_path, [("alice", "2026-01-01T10:00:00Z", 1.23, 4.56, 7.89)]) + legacy_url = f"sqlite:///{legacy_path}" + + # App DB is pre-populated with the SAME PK but DIFFERENT non-PK values + # (latitude/longitude/altitude all differ from the source row) + conn = sqlite3.connect(app_path) + conn.execute( + "INSERT INTO location (person, datetime, latitude, longitude, altitude) " + "VALUES (?, ?, ?, ?, ?)", + ("alice", "2026-01-01T10:00:00Z", 99.0, 99.0, 99.0), + ) + conn.commit() + conn.close() + + # migrate_legacy_data must raise: the source row's data is NOT in the target + # (INSERT OR IGNORE skipped it because of PK conflict, retaining the 99.0 values) + with pytest.raises(RuntimeError, match="Reconciliation failed"): + migrate_legacy_data(app_url, legacy_url, None) + + +def test_full_row_reconciliation_idempotent_on_identical_data(tmp_path: Path) -> None: + """Second run on already-migrated data still reconciles cleanly. + + When the target already holds identical rows (from the first run), the full-row + IS predicate matches every column and reconciliation passes (no raise). + """ + app_path, app_url = _upgraded_app_db(tmp_path) + legacy_path = tmp_path / "locationRecorder.db" + _make_legacy_location_db(legacy_path, LOCATION_ROWS) + legacy_url = f"sqlite:///{legacy_path}" + + # First run: migrate all rows + result1 = migrate_legacy_data(app_url, legacy_url, None) + assert result1["location"]["copied"] == len(LOCATION_ROWS) + + # Second run: rows already present, INSERT OR IGNORE skips all, full-row + # reconciliation must still pass because values are identical + result2 = migrate_legacy_data(app_url, legacy_url, None) + assert result2["location"]["copied"] == 0 + assert result2["location"]["final"] == len(LOCATION_ROWS) + # No exception raised — idempotency holds under full-row reconciliation + + # --------------------------------------------------------------------------- # Test 4: dry_run # ---------------------------------------------------------------------------