Files
home-automation/scripts/migrate_legacy_data.py
T
tliu93 1cbe6c46d2 M1-rework: harden legacy-migration reconciliation to full-row equality
Audit finding (review-notes/M1-full-review-1.md, FINDING 1): _reconcile only
checked primary-key presence, so a source row skipped by INSERT OR IGNORE due
to a value difference against a pre-existing same-PK target row would
false-pass. Compare ALL columns with SQLite's NULL-safe IS operator instead,
so reconciliation is a true full-row guarantee (idempotent re-runs still pass
because the rows match column-for-column). Add tests for the value-mismatch
abort and for idempotency under full-row reconciliation. Remove the now-unused
pk_cols parameter.

pytest 97 passed; ruff clean (pre-existing only); data-safety grep still empty.
2026-06-12 19:05:56 +02:00

268 lines
8.4 KiB
Python

"""One-time idempotent data migration: copy rows from legacy locationRecorder.db /
pooRecorder.db into the unified app DB's location / poo_records tables.
NOT part of the Alembic chain. Run manually, once, during production cut-over:
python -m scripts.migrate_legacy_data \\
--app-db sqlite:///./data/app.db \\
--location-db sqlite:///./data/locationRecorder.db \\
--poo-db sqlite:///./data/pooRecorder.db
Or rely on environment variables:
APP_DATABASE_URL, LOCATION_DATABASE_URL, POO_DATABASE_URL
Add --dry-run to preview row counts without writing anything.
Return value of migrate_legacy_data(): a dict shaped like:
{
"location": {"source": N, "copied": C, "skipped": bool, "final": F},
"poo_records": {"source": N, "copied": C, "skipped": bool, "final": F},
}
where:
source - rows in the legacy DB (0 when skipped)
copied - rows inserted by this run (0 when dry_run or skipped)
skipped - True when the legacy file was absent
final - rows present in the app table after the run (0 when dry_run)
"""
from __future__ import annotations
import argparse
import os
import sqlite3
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _sqlite_path_from_url(url: str) -> Path:
"""Extract the filesystem path from a sqlite:///... URL.
If *url* does not start with 'sqlite:///', it is treated as a plain path.
"""
prefix = "sqlite:///"
if url.startswith(prefix):
return Path(url[len(prefix):])
return Path(url)
def _reconcile(
conn: sqlite3.Connection,
table: str,
columns: list[str],
source_count: int,
) -> int:
"""Verify every legacy source row is present in the main (app) table.
Matches on ALL columns using SQLite's NULL-safe IS operator so that nullable
columns (e.g. altitude) compare correctly. A row that was silently skipped
by INSERT OR IGNORE due to a value difference will NOT satisfy this predicate
even if its primary key is present in the target.
Returns the count of source rows whose full-row data is present in main.
Raises RuntimeError if any rows are missing or differ in value.
"""
join_cond = " AND ".join(f"m.{col} IS l.{col}" for col in columns)
sql = (
f"SELECT COUNT(*) FROM legacy.{table} l "
f"WHERE EXISTS (SELECT 1 FROM main.{table} m WHERE {join_cond})"
)
(present,) = conn.execute(sql).fetchone()
if present < source_count:
missing = source_count - present
raise RuntimeError(
f"Reconciliation failed for table '{table}': "
f"{missing} of {source_count} source rows are missing or differing in the app DB."
)
return present
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def migrate_legacy_data(
app_url: str,
location_url: str | None,
poo_url: str | None,
*,
dry_run: bool = False,
) -> dict:
"""Copy rows from legacy DBs into the app DB's location / poo_records tables.
Parameters
----------
app_url: sqlite:///... URL (or plain path) for the unified app DB.
location_url: sqlite:///... URL (or plain path) for the legacy location DB,
or None to skip that table.
poo_url: sqlite:///... URL (or plain path) for the legacy poo DB,
or None to skip that table.
dry_run: When True, gather counts only; perform no writes.
Returns a dict with per-table stats (see module docstring).
Raises RuntimeError on reconciliation failure (non-zero rows missing).
"""
app_path = _sqlite_path_from_url(app_url)
results: dict[str, dict] = {}
# --- location table ---
results["location"] = _migrate_table(
app_path=app_path,
legacy_url=location_url,
table="location",
columns=["person", "datetime", "latitude", "longitude", "altitude"],
dry_run=dry_run,
)
# --- poo_records table ---
results["poo_records"] = _migrate_table(
app_path=app_path,
legacy_url=poo_url,
table="poo_records",
columns=["timestamp", "status", "latitude", "longitude"],
dry_run=dry_run,
)
return results
def _migrate_table(
*,
app_path: Path,
legacy_url: str | None,
table: str,
columns: list[str],
dry_run: bool,
) -> dict:
"""Migrate a single table from a legacy DB into the app DB.
Returns a per-table stats dict.
"""
# If the caller passed None → treat as absent
if legacy_url is None:
return {"source": 0, "copied": 0, "skipped": True, "final": 0}
legacy_path = _sqlite_path_from_url(legacy_url)
# If the file doesn't exist → safe no-op
if not legacy_path.exists():
return {"source": 0, "copied": 0, "skipped": True, "final": 0}
col_list = ", ".join(columns)
conn = sqlite3.connect(app_path)
try:
conn.execute("ATTACH DATABASE ? AS legacy", (str(legacy_path),))
# Count source rows
(source_count,) = conn.execute(f"SELECT COUNT(*) FROM legacy.{table}").fetchone()
if dry_run:
conn.execute("DETACH DATABASE legacy")
return {
"source": source_count,
"copied": 0,
"skipped": False,
"final": 0,
}
# Count rows already in the target before this run
(before_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
# Idempotent insert — PK conflict → skip
conn.execute(
f"INSERT OR IGNORE INTO main.{table} ({col_list}) "
f"SELECT {col_list} FROM legacy.{table}"
)
conn.commit()
# Count rows now
(after_count,) = conn.execute(f"SELECT COUNT(*) FROM main.{table}").fetchone()
copied = after_count - before_count
# Reconciliation: every source row must be present with matching values
_reconcile(conn, table, columns, source_count)
conn.execute("DETACH DATABASE legacy")
finally:
conn.close()
return {
"source": source_count,
"copied": copied,
"skipped": False,
"final": after_count,
}
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
parser = argparse.ArgumentParser(
description="Migrate legacy location/poo data into the unified app DB."
)
parser.add_argument(
"--app-db",
default=os.environ.get("APP_DATABASE_URL"),
help="sqlite:///... URL or path for the app DB "
"(default: $APP_DATABASE_URL)",
)
parser.add_argument(
"--location-db",
default=os.environ.get("LOCATION_DATABASE_URL"),
help="sqlite:///... URL or path for the legacy location DB "
"(default: $LOCATION_DATABASE_URL). Omit to skip location table.",
)
parser.add_argument(
"--poo-db",
default=os.environ.get("POO_DATABASE_URL"),
help="sqlite:///... URL or path for the legacy poo DB "
"(default: $POO_DATABASE_URL). Omit to skip poo_records table.",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Report counts only; do not write any rows.",
)
args = parser.parse_args()
if not args.app_db:
parser.error(
"App DB not specified. Pass --app-db or set APP_DATABASE_URL."
)
try:
results = migrate_legacy_data(
app_url=args.app_db,
location_url=args.location_db,
poo_url=args.poo_db,
dry_run=args.dry_run,
)
except RuntimeError as exc:
print(f"ERROR: {exc}", file=sys.stderr)
sys.exit(1)
prefix = "[DRY RUN] " if args.dry_run else ""
print(f"{prefix}Migration results:")
for table_name, stats in results.items():
if stats["skipped"]:
print(f" {table_name}: SKIPPED (legacy file absent or not provided)")
else:
print(
f" {table_name}: source={stats['source']}, "
f"copied={stats['copied']}, final={stats['final']}"
)
if __name__ == "__main__":
main()