From 7749a02706180bfa8dedbd27d6412d25374f90e9 Mon Sep 17 00:00:00 2001 From: zphinx Date: Mon, 11 May 2026 21:07:39 +0200 Subject: [PATCH] feat: add history UX and expand retention-focused roadmap --- ROADMAP.md | 191 ++++++++++++++++++++++++++++++++++++ src/tai/cli.py | 152 +++++++++++++++++++++++++++- src/tai/session_store.py | 54 ++++++++++ tests/test_cli.py | 106 ++++++++++++++++++++ tests/test_session_store.py | 50 ++++++++++ 5 files changed, 552 insertions(+), 1 deletion(-) diff --git a/ROADMAP.md b/ROADMAP.md index 0144ba1..bc2485f 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -321,3 +321,194 @@ ______________________________________________________________________ | 2026-05-04 | RAG vector store (Tier 2/3) | `chromadb` embedded mode (default) or `qdrant` self-hosted | | 2026-05-04 | RAG chunking unit | Command-boundary splitting — each collected command = one or more chunks | | 2026-05-04 | Runbook format | Markdown with YAML frontmatter, version-controlled in `runbooks/` directory | + +______________________________________________________________________ + +## End-State UX Goal + +After the current CLI and memory roadmap phases are stable, the long-term UX goal is a full-screen terminal TUI with an ncurses-style workflow. 
+ +### Target End-State + +- Split-pane troubleshooting workspace (diagnostics, AI output, and command/input area) +- Live command/probe status with clear success/failure indicators +- In-session history browser for prior questions, retrieved evidence, and related past sessions +- Keyboard-first navigation for operators running in SSH-only environments + +### Delivery Approach + +- Keep shipping incremental CLI features first (current roadmap order remains unchanged) +- Promote stable workflows into TUI panels once behavior is proven in CLI mode +- Treat the TUI as a final UX consolidation milestone, not a blocker for core troubleshooting capabilities + +______________________________________________________________________ + +## Container Distribution Goal (Docker) + +After core CLI/TUI workflows stabilize, provide an official Docker image as an additional distribution target. + +### Container Execution Model (Decision) + +- Docker is a one-shot invocation target, not a daemon/service mode +- Each run executes a single `tai` command and exits +- State is persisted only through mounted host volumes + +### Why Docker Is Valuable Here + +- Reproducible runtime: pin Python and dependency versions to remove host-level drift +- Faster operator onboarding: run with one command instead of local Python setup +- Cleaner CI/CD release path: publish versioned images aligned with git tags +- Safer local footprint: isolate dependencies from the host OS package manager + +### Subgoals + +1. Base image and runtime hardening + +- Multi-stage Dockerfile with slim runtime image +- Non-root runtime user and minimal filesystem permissions +- Healthcheck for CLI startup and version command + +2. 
Runtime integration for SSH workflows + +- Documented mounts for `~/.ssh` (read-only where possible) and known-hosts handling +- Pass-through for SSH config when needed (`--ignore-ssh-config` behavior documented) +- Clear guidance for jump-host and bastion scenarios from inside the container +- Documented one-shot run examples for `tai run` and `tai history` + +3. Persistent data strategy + +- Required volume mount guidance for runbook store (`~/.tai/runbooks`) +- Required volume mount guidance for session memory/history (`~/.tai/sessions`) +- Optional bind mount for JSONL logs and report export artifacts +- Clear defaults for container paths and equivalent host path mappings + +4. Release and quality gates + +- Build and publish image on tagged releases +- Smoke tests in CI: probe mode, collect mode, and history command against mocked endpoints +- Version labeling (image tags and OCI metadata) tied to changelog/release tags + +### Data Retention and Lifecycle Policy + +Retention behavior must be explicit and configurable at runtime. Defaults should be conservative and documented. + +1. Retention classes + +- Session memory store (`~/.tai/sessions`): keep semantically indexed summaries for troubleshooting continuity +- Runbook store (`~/.tai/runbooks`): retain until explicitly replaced or pruned by sync policy +- JSONL logs and exported reports: operator-controlled retention with optional TTL cleanup + +2. Retention controls + +- Add CLI controls for age-based pruning (for example `--retain-days` on cleanup commands) +- Add host-scoped cleanup (delete history for one host) and full-store cleanup (all hosts) +- Add dry-run cleanup mode to show what would be deleted before applying changes + +3. 
No-persist mode + +- Add a documented ephemeral mode where no session memory or logs are written +- Ensure one-shot diagnostics can run in read-only operational contexts + +### Configuration and State Persistence Model + +Configuration and retained state should be predictable across container upgrades and host environments. + +1. Mount and path contract + +- Define canonical container paths for `~/.tai/runbooks`, `~/.tai/sessions`, and optional log/export paths +- Document required versus optional mounts and expected permissions for each +- Document UID/GID mapping guidance to prevent host volume ownership issues + +2. Schema and compatibility + +- Introduce explicit storage schema version metadata for persistent stores +- Define upgrade behavior for older stores (migrate, re-index, or fail with clear guidance) +- Add compatibility notes for image upgrades and rollback expectations + +3. Backup and recovery + +- Provide export/import workflows for session memory and runbook indexes +- Document minimal backup set and restore order for disaster recovery + +### Security and Privacy for Retained Data + +Persisted troubleshooting evidence can include sensitive operational data and must be handled accordingly. + +1. Data minimization + +- Add optional redaction hooks for common sensitive patterns before persistence +- Keep prompt-only transient data separate from persisted summary/index content + +2. Runtime hardening + +- Target non-root container execution with read-only root filesystem by default +- Require explicit writable mounts only for retained data locations + +3. Auditable behavior + +- Log retention-affecting operations (cleanup, purge, export/import) with timestamps and scope +- Define stable exit codes for cleanup and retention workflows to support automation + +### Kubernetes Position + +Kubernetes is out of scope for this delivery plan. 
+ +- `tai` is currently an operator-invoked troubleshooting client, not a long-running service +- AI inference is external to `tai` (OpenAI-compatible endpoint), reducing the need for in-cluster model orchestration +- SSH key/config handling and per-operator context are simpler with local or single-container execution + +Kubernetes can be revisited only if `tai` evolves into a centralized multi-user service with queueing, RBAC, and shared tenancy requirements. + +______________________________________________________________________ + +## Final Long-Term Goal: Full Rust Migration + +This is a final-stage roadmap goal and remains explicitly out of near-term scope. +It should begin only after the Python implementation, TUI direction, Docker one-shot model, +and retention/persistence policies are stable and proven in production usage. + +### Why This Is the Final Goal + +- Improve execution latency and startup speed for both native runs and container one-shot invocations +- Produce a single, portable native binary with minimal runtime dependency footprint +- Strengthen reliability and memory safety under heavy log parsing and concurrent workflows +- Simplify long-term packaging and distribution across Linux targets + +### Migration Objectives + +1. Preserve feature parity first + +- Match existing CLI behavior, interactive workflows, RAG integration, runbook management, and history/session-memory features +- Keep command semantics and safety boundaries equivalent during transition + +2. Target both distribution modes + +- Native Rust binary for direct operator use +- Docker image built around the Rust binary for one-shot execution with mounted persistent volumes + +3. Keep compatibility guardrails + +- Define persistent data format compatibility or migration tooling for runbook/session stores +- Preserve operator-visible flags where practical to reduce migration friction + +### Suggested Delivery Phases + +1. 
Build baseline Rust CLI scaffold with feature-flagged parity checkpoints +2. Port SSH execution and read-only policy enforcement modules +3. Port planner, collectors, prompt composition, and AI client adapters +4. Port session memory/history and runbook workflows with migration tests +5. Port interactive UX/TUI layer and deprecate Python runtime path + +### Rust Toolchain End-State + +- Standardize on Cargo-based build/test/lint pipeline (`cargo fmt`, `cargo clippy`, `cargo test`) +- Add release profile optimization and reproducible build settings +- Publish signed native artifacts and Docker images derived from Rust release binaries + +### Decision Gate Before Starting + +Begin Rust migration only when: + +- Python roadmap milestones are complete and stable +- Container distribution and retention policy workflows are operationally validated +- A parity test matrix exists to prove behavior equivalence during migration diff --git a/src/tai/cli.py b/src/tai/cli.py index 22b048a..7029e66 100644 --- a/src/tai/cli.py +++ b/src/tai/cli.py @@ -39,6 +39,83 @@ app.add_typer(runbooks_app, name="runbooks") console = Console() +@app.command("history") +def history( + query: Annotated[ + str | None, + typer.Option("--query", help="Optional keyword to match issue/summary text."), + ] = None, + host: Annotated[ + str | None, + typer.Option("--host", help="Filter history by host."), + ] = None, + limit: Annotated[ + int, + typer.Option("--limit", min=1, help="Maximum number of sessions to display."), + ] = 20, + export: Annotated[ + str | None, + typer.Option("--export", help="Optional path to write results as Markdown."), + ] = None, + session_memory_path: Annotated[ + str, + typer.Option( + "--session-memory", + help="Path to persistent session memory store. 
Defaults to ~/.tai/sessions.", + ), + ] = "~/.tai/sessions", +) -> None: + """Search or list previously indexed troubleshooting sessions.""" + from pathlib import Path + + try: + store = SessionStore(session_memory_path) + except Exception as exc: # noqa: BLE001 + console.print(f"[red]Could not open session memory:[/red] {exc}") + raise typer.Exit(code=1) from exc + + try: + if query: + sessions = store.search_keyword(query, host=host, limit=limit) + else: + sessions = store.list_recent(host=host, limit=limit) + except Exception as exc: # noqa: BLE001 + console.print(f"[red]History query failed:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if not sessions: + scope = f" for host {host}" if host else "" + qualifier = f" matching '{query}'" if query else "" + console.print(f"[yellow]No session history found{scope}{qualifier}.[/yellow]") + return + + title = f"{len(sessions)} session(s)" + if host: + title += f" for host={host}" + if query: + title += f" matching '{query}'" + console.print(f"[bold]{title}[/bold]") + + for sess in sessions: + summary_line = (sess.summary or "").strip().splitlines() + excerpt = summary_line[0] if summary_line else "(no summary)" + if len(excerpt) > 120: + excerpt = f"{excerpt[:120].rstrip()}..." 
+ console.print( + f" [green]{sess.session_id}[/green] host={sess.host} issue={sess.issue}\n" + f" [dim]{excerpt}[/dim]" + ) + + if export: + output_path = Path(export).expanduser().resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text( + _format_history_markdown(sessions, query=query, host=host), + encoding="utf-8", + ) + console.print(f"[green]✓ Exported[/green] {output_path}") + + @app.command() def run( issue: Annotated[str, typer.Argument(help="Ticket text or issue summary.")], @@ -363,7 +440,7 @@ async def _interactive_loop( console.print( Panel( "Ask questions directly, or use [bold]/collect[/bold], " - "[bold]/analyze[/bold], [bold]/help[/bold], [bold]/quit[/bold]", + "[bold]/analyze[/bold], [bold]/history[/bold], [bold]/help[/bold], [bold]/quit[/bold]", title="[bold cyan]Interactive Mode[/bold cyan]", border_style="cyan", padding=(0, 1), @@ -434,6 +511,7 @@ async def _interactive_loop( Panel( "[bold]/collect[/bold] — re-run diagnostics\n" "[bold]/analyze[/bold] — re-analyze current diagnostics\n" + "[bold]/history[/bold] — show prior sessions for this host\n" "[bold]/help[/bold] — show this message\n" "[bold]/quit[/bold] — end session\n" "[dim]Anything else is sent directly to the AI as a question.[/dim]", @@ -444,6 +522,52 @@ async def _interactive_loop( ) continue + if command.startswith("/history"): + if session_store is None: + console.print( + "[yellow]Session memory is disabled. 
" + "Use --session-memory to enable /history.[/yellow]" + ) + continue + + keyword = command.removeprefix("/history").strip() + try: + sessions = ( + session_store.search_keyword(keyword, host=req.host, limit=5) + if keyword + else session_store.list_recent(host=req.host, limit=5) + ) + except Exception as exc: # noqa: BLE001 + console.print(f"[yellow]History unavailable:[/yellow] {exc}") + continue + + if not sessions: + qualifier = f" matching '{keyword}'" if keyword else "" + console.print(f"[yellow]No prior sessions found{qualifier}.[/yellow]") + continue + + lines = [] + for sess in sessions: + issue = sess.issue.strip() or "(unknown issue)" + lines.append(f"- [bold]{sess.session_id}[/bold] — {issue}") + heading = "Prior sessions" + if keyword: + heading += f" matching '{keyword}'" + console.print( + Panel( + "\n".join(lines), + title=f"[bold cyan]{heading}[/bold cyan]", + border_style="cyan", + padding=(0, 1), + ) + ) + if logger is not None: + logger.log_event( + "interactive_history", + {"keyword": keyword or None, "count": len(sessions)}, + ) + continue + if command == "/collect": plan = plan_from_request(req) console.print(f"[cyan]Collecting diagnostics:[/cyan] {len(plan)} commands") @@ -926,6 +1050,32 @@ def _index_session_memory( logger.log_event("session_memory_error", {"error": str(exc)}) +def _format_history_markdown( + sessions: list[PastSession], + *, + query: str | None, + host: str | None, +) -> str: + """Render session history rows as a Markdown report.""" + lines = ["# tai session history", ""] + if host: + lines.append(f"Host filter: {host}") + if query: + lines.append(f"Keyword filter: {query}") + if host or query: + lines.append("") + + for sess in sessions: + lines.append(f"## {sess.session_id}") + lines.append(f"- Host: {sess.host}") + lines.append(f"- Issue: {sess.issue}") + lines.append("") + lines.append(sess.summary.strip() or "(no summary)") + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + # 
---------------------------------------------------------------------------
 # runbooks sub-app
 # ---------------------------------------------------------------------------
diff --git a/src/tai/session_store.py b/src/tai/session_store.py
index 63ef633..7d428dc 100644
--- a/src/tai/session_store.py
+++ b/src/tai/session_store.py
@@ -98,6 +98,60 @@ class SessionStore:
             )
         return sessions
 
+    def list_recent(self, *, host: str | None = None, limit: int = 20) -> list[PastSession]:
+        """Return up to ``limit`` indexed sessions, optionally filtered by host.
+
+        The host filter and ordering are applied before ``limit`` so matching or newer
+        sessions are never truncated away. Raises ``ValueError`` if ``limit`` < 1.
+        """
+        if limit < 1:
+            raise ValueError("limit must be >= 1")
+        return self._load_sessions(host)[:limit]
+
+    def search_keyword(
+        self,
+        keyword: str,
+        *,
+        host: str | None = None,
+        limit: int = 20,
+    ) -> list[PastSession]:
+        """Return sessions whose issue or summary contains ``keyword``.
+
+        Matching is a case-insensitive substring test; a blank keyword lists all
+        sessions like ``list_recent``. Raises ``ValueError`` if ``limit`` < 1.
+        """
+        if limit < 1:
+            raise ValueError("limit must be >= 1")
+        term = keyword.strip().lower()
+        sessions = self._load_sessions(host)
+        if term:
+            sessions = [
+                s for s in sessions if term in s.issue.lower() or term in s.summary.lower()
+            ]
+        return sessions[:limit]
+
+    def _load_sessions(self, host: str | None) -> list[PastSession]:
+        """Load stored sessions (one host if given), sorted by session id descending."""
+        # Read the whole collection: get(limit=...) would truncate before the host
+        # filter and the sort below, hiding newer or matching sessions.
+        rows = self._collection.get(include=["documents", "metadatas"])
+        ids, docs, metas = (rows.get(key) or [] for key in ("ids", "documents", "metadatas"))
+        sessions = [
+            PastSession(
+                session_id=str(sid),
+                host=str((meta or {}).get("host", "")),
+                issue=str((meta or {}).get("issue", "")),
+                summary=str(doc or ""),
+            )
+            for sid, doc, meta in zip(ids, docs, metas, strict=False)
+        ]
+        if host:
+            wanted = host.strip().lower()
+            sessions = [s for s in sessions if s.host.lower() == wanted]
+        # Ids look like UTC timestamps (assumption), so descending means newest first.
+        sessions.sort(key=lambda s: s.session_id, reverse=True)
+        return sessions
+
 
 def _build_embed_text(*, host: str, issue: str, summary: str) -> str:
     """Build embedding text with host/issue context and summary 
excerpt.""" diff --git a/tests/test_cli.py b/tests/test_cli.py index 2f05dd7..657fd2a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,3 +1,4 @@ +from pathlib import Path from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock @@ -338,3 +339,108 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None: # assert result.exit_code == 0 assert "RAG retrieve:" in result.stdout + + +def test_history_command_lists_sessions(monkeypatch) -> None: # type: ignore[no-untyped-def] + class FakeStore: + def __init__(self, _path: str) -> None: + pass + + def list_recent(self, *, host: str | None = None, limit: int = 20): + del limit + if host == "web01": + return [ + SimpleNamespace( + session_id="20260507T120000Z", + host="web01", + issue="nginx down", + summary="Root cause: bad config", + ) + ] + return [] + + monkeypatch.setattr("tai.cli.SessionStore", FakeStore) + + runner = CliRunner() + result = runner.invoke( + app, + ["history", "--session-memory", "~/.tai/sessions", "--host", "web01"], + ) + + assert result.exit_code == 0 + assert "session(s)" in result.stdout + assert "20260507T120000Z" in result.stdout + + +def test_history_command_exports_markdown(monkeypatch, tmp_path: Path) -> None: # type: ignore[no-untyped-def] + class FakeStore: + def __init__(self, _path: str) -> None: + pass + + def list_recent(self, *, host: str | None = None, limit: int = 20): + del host, limit + return [ + SimpleNamespace( + session_id="20260507T120000Z", + host="web01", + issue="nginx down", + summary="Root cause: bad config", + ) + ] + + monkeypatch.setattr("tai.cli.SessionStore", FakeStore) + export_path = tmp_path / "history.md" + + runner = CliRunner() + result = runner.invoke( + app, + ["history", "--session-memory", "~/.tai/sessions", "--export", str(export_path)], + ) + + assert result.exit_code == 0 + assert "Exported" in result.stdout + text = export_path.read_text(encoding="utf-8") + assert "# tai session history" in text + 
assert "nginx down" in text + + +def test_interactive_history_without_store_shows_hint(monkeypatch) -> None: # type: ignore[no-untyped-def] + _mock_session(monkeypatch) + + async def fake_collect_from_plan(_session, _plan) -> CollectionReport: # type: ignore[no-untyped-def] + return CollectionReport( + host="ssh.archflux.net", + items=[ + CollectedItem( + name="kernel", + result=SSHCommandResult( + command="uname -a", + exit_code=0, + stdout="Linux test", + stderr="", + ), + ), + ], + ) + + commands = iter(["/history", "/quit"]) + monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan) + monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands)) + monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True) + + runner = CliRunner() + result = runner.invoke( + app, + [ + "run", "apache failed", + "--host", + "ssh.archflux.net", + "--port", + "5566", + "--no-probe", + "--interactive", + ], + ) + + assert result.exit_code == 0 + assert "Session memory is disabled" in result.stdout diff --git a/tests/test_session_store.py b/tests/test_session_store.py index e66ed41..b96561e 100644 --- a/tests/test_session_store.py +++ b/tests/test_session_store.py @@ -77,3 +77,53 @@ def test_query_returns_past_sessions(tmp_path: Path) -> None: assert isinstance(results[0], PastSession) assert results[0].host == "web01" assert "package missing" in results[0].summary + + +def test_list_recent_returns_sessions_sorted_desc(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + collection.count.return_value = 3 + collection.get.return_value = { + "ids": ["20260506T120000Z", "20260507T120000Z", "20260505T120000Z"], + "documents": ["older", "newer", "oldest"], + "metadatas": [ + {"host": "web01", "issue": "i1"}, + {"host": "web01", "issue": "i2"}, + {"host": "db01", "issue": "i3"}, + ], + } + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + 
store = SessionStore(tmp_path / "store") + results = store.list_recent(limit=2) + + assert len(results) == 2 + assert results[0].session_id == "20260507T120000Z" + assert results[1].session_id == "20260506T120000Z" + + +def test_search_keyword_filters_by_term_and_host(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + collection.count.return_value = 3 + collection.get.return_value = { + "ids": ["20260505T120000Z", "20260506T120000Z", "20260507T120000Z"], + "documents": [ + "Root cause: nginx config typo", + "Root cause: package missing", + "Root cause: nginx port conflict", + ], + "metadatas": [ + {"host": "web01", "issue": "nginx fails"}, + {"host": "web01", "issue": "sssd fails"}, + {"host": "db01", "issue": "nginx start failed"}, + ], + } + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = SessionStore(tmp_path / "store") + results = store.search_keyword("nginx", host="web01", limit=5) + + assert len(results) == 1 + assert results[0].host == "web01" + assert "nginx" in results[0].issue.lower()