From 7749a02706180bfa8dedbd27d6412d25374f90e9 Mon Sep 17 00:00:00 2001
From: zphinx <cban@gmx.com>
Date: Mon, 11 May 2026 21:07:39 +0200
Subject: [PATCH] feat: add history UX and expand retention-focused roadmap

---
 ROADMAP.md                  | 191 ++++++++++++++++++++++++++++++++++++
 src/tai/cli.py              | 152 +++++++++++++++++++++++++++-
 src/tai/session_store.py    |  54 ++++++++++
 tests/test_cli.py           | 106 ++++++++++++++++++++
 tests/test_session_store.py |  50 ++++++++++
 5 files changed, 552 insertions(+), 1 deletion(-)

diff --git a/ROADMAP.md b/ROADMAP.md
index 0144ba1..bc2485f 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -321,3 +321,194 @@ ______________________________________________________________________
 | 2026-05-04 | RAG vector store (Tier 2/3) | `chromadb` embedded mode (default) or `qdrant` self-hosted |
 | 2026-05-04 | RAG chunking unit | Command-boundary splitting — each collected command = one or more chunks |
 | 2026-05-04 | Runbook format | Markdown with YAML frontmatter, version-controlled in `runbooks/` directory |
+
+______________________________________________________________________
+
+## End-State UX Goal
+
+After the current CLI and memory roadmap phases are stable, the long-term UX goal is a full-screen terminal TUI with an ncurses-style workflow.
+
+### Target End-State
+
+- Split-pane troubleshooting workspace (diagnostics, AI output, and command/input area)
+- Live command/probe status with clear success/failure indicators
+- In-session history browser for prior questions, retrieved evidence, and related past sessions
+- Keyboard-first navigation for operators running in SSH-only environments
+
+### Delivery Approach
+
+- Keep shipping incremental CLI features first (current roadmap order remains unchanged)
+- Promote stable workflows into TUI panels once behavior is proven in CLI mode
+- Treat the TUI as a final UX consolidation milestone, not a blocker for core troubleshooting capabilities
+
+______________________________________________________________________
+
+## Container Distribution Goal (Docker)
+
+After core CLI/TUI workflows stabilize, provide an official Docker image as an additional distribution target.
+
+### Container Execution Model (Decision)
+
+- Docker is a one-shot invocation target, not a daemon/service mode
+- Each run executes a single `tai` command and exits
+- State is persisted only through mounted host volumes
+
+### Why Docker Is Valuable Here
+
+- Reproducible runtime: pin Python and dependency versions to remove host-level drift
+- Faster operator onboarding: run with one command instead of local Python setup
+- Cleaner CI/CD release path: publish versioned images aligned with git tags
+- Safer local footprint: isolate dependencies from the host OS package manager
+
+### Subgoals
+
+1. Base image and runtime hardening
+
+- Multi-stage Dockerfile with slim runtime image
+- Non-root runtime user and minimal filesystem permissions
+- Healthcheck for CLI startup and version command
+
+2. Runtime integration for SSH workflows
+
+- Documented mounts for `~/.ssh` (read-only where possible) and known-hosts handling
+- Pass-through for SSH config when needed (`--ignore-ssh-config` behavior documented)
+- Clear guidance for jump-host and bastion scenarios from inside the container
+- Documented one-shot run examples for `tai run` and `tai history`
+
+3. Persistent data strategy
+
+- Required volume mount guidance for runbook store (`~/.tai/runbooks`)
+- Required volume mount guidance for session memory/history (`~/.tai/sessions`)
+- Optional bind mount for JSONL logs and report export artifacts
+- Clear defaults for container paths and equivalent host path mappings
+
+4. Release and quality gates
+
+- Build and publish image on tagged releases
+- Smoke tests in CI: probe mode, collect mode, and history command against mocked endpoints
+- Version labeling (image tags and OCI metadata) tied to changelog/release tags
+
+### Data Retention and Lifecycle Policy
+
+Retention behavior must be explicit and configurable at runtime. Defaults should be conservative and documented.
+
+1. Retention classes
+
+- Session memory store (`~/.tai/sessions`): keep semantically indexed summaries for troubleshooting continuity
+- Runbook store (`~/.tai/runbooks`): retain until explicitly replaced or pruned by sync policy
+- JSONL logs and exported reports: operator-controlled retention with optional TTL cleanup
+
+2. Retention controls
+
+- Add CLI controls for age-based pruning (for example `--retain-days` on cleanup commands)
+- Add host-scoped cleanup (delete history for one host) and full-store cleanup (all hosts)
+- Add dry-run cleanup mode to show what would be deleted before applying changes
+
+3. No-persist mode
+
+- Add a documented ephemeral mode where no session memory or logs are written
+- Ensure one-shot diagnostics can run in read-only operational contexts
+
+### Configuration and State Persistence Model
+
+Configuration and retained state should be predictable across container upgrades and host environments.
+
+1. Mount and path contract
+
+- Define canonical container paths for `~/.tai/runbooks`, `~/.tai/sessions`, and optional log/export paths
+- Document required versus optional mounts and expected permissions for each
+- Document UID/GID mapping guidance to prevent host volume ownership issues
+
+2. Schema and compatibility
+
+- Introduce explicit storage schema version metadata for persistent stores
+- Define upgrade behavior for older stores (migrate, re-index, or fail with clear guidance)
+- Add compatibility notes for image upgrades and rollback expectations
+
+3. Backup and recovery
+
+- Provide export/import workflows for session memory and runbook indexes
+- Document minimal backup set and restore order for disaster recovery
+
+### Security and Privacy for Retained Data
+
+Persisted troubleshooting evidence can include sensitive operational data and must be handled accordingly.
+
+1. Data minimization
+
+- Add optional redaction hooks for common sensitive patterns before persistence
+- Keep prompt-only transient data separate from persisted summary/index content
+
+2. Runtime hardening
+
+- Target non-root container execution with read-only root filesystem by default
+- Require explicit writable mounts only for retained data locations
+
+3. Auditable behavior
+
+- Log retention-affecting operations (cleanup, purge, export/import) with timestamps and scope
+- Define stable exit codes for cleanup and retention workflows to support automation
+
+### Kubernetes Position
+
+Kubernetes is out of scope for this delivery plan.
+
+- `tai` is currently an operator-invoked troubleshooting client, not a long-running service
+- AI inference is external to `tai` (OpenAI-compatible endpoint), reducing the need for in-cluster model orchestration
+- SSH key/config handling and per-operator context are simpler with local or single-container execution
+
+Kubernetes can be revisited only if `tai` evolves into a centralized multi-user service with queueing, RBAC, and shared tenancy requirements.
+
+______________________________________________________________________
+
+## Final Long-Term Goal: Full Rust Migration
+
+This is a final-stage roadmap goal and remains explicitly out of near-term scope.
+It should begin only after the Python implementation, TUI direction, Docker one-shot model,
+and retention/persistence policies are stable and proven in production usage.
+
+### Why This Is the Final Goal
+
+- Improve execution latency and startup speed for both native runs and container one-shot invocations
+- Produce a single, portable native binary with minimal runtime dependency footprint
+- Strengthen reliability and memory safety under heavy log parsing and concurrent workflows
+- Simplify long-term packaging and distribution across Linux targets
+
+### Migration Objectives
+
+1. Preserve feature parity first
+
+- Match existing CLI behavior, interactive workflows, RAG integration, runbook management, and history/session-memory features
+- Keep command semantics and safety boundaries equivalent during transition
+
+2. Target both distribution modes
+
+- Native Rust binary for direct operator use
+- Docker image built around the Rust binary for one-shot execution with mounted persistent volumes
+
+3. Keep compatibility guardrails
+
+- Define persistent data format compatibility or migration tooling for runbook/session stores
+- Preserve operator-visible flags where practical to reduce migration friction
+
+### Suggested Delivery Phases
+
+1. Build baseline Rust CLI scaffold with feature-flagged parity checkpoints
+2. Port SSH execution and read-only policy enforcement modules
+3. Port planner, collectors, prompt composition, and AI client adapters
+4. Port session memory/history and runbook workflows with migration tests
+5. Port interactive UX/TUI layer and deprecate Python runtime path
+
+### Rust Toolchain End-State
+
+- Standardize on Cargo-based build/test/lint pipeline (`cargo fmt`, `cargo clippy`, `cargo test`)
+- Add release profile optimization and reproducible build settings
+- Publish signed native artifacts and Docker images derived from Rust release binaries
+
+### Decision Gate Before Starting
+
+Begin Rust migration only when:
+
+- Python roadmap milestones are complete and stable
+- Container distribution and retention policy workflows are operationally validated
+- A parity test matrix exists to prove behavior equivalence during migration
diff --git a/src/tai/cli.py b/src/tai/cli.py
index 22b048a..7029e66 100644
--- a/src/tai/cli.py
+++ b/src/tai/cli.py
@@ -39,6 +39,83 @@ app.add_typer(runbooks_app, name="runbooks")
 console = Console()
 
 
+@app.command("history")
+def history(
+    query: Annotated[
+        str | None,
+        typer.Option("--query", help="Optional keyword to match issue/summary text."),
+    ] = None,
+    host: Annotated[
+        str | None,
+        typer.Option("--host", help="Filter history by host."),
+    ] = None,
+    limit: Annotated[
+        int,
+        typer.Option("--limit", min=1, help="Maximum number of sessions to display."),
+    ] = 20,
+    export: Annotated[
+        str | None,
+        typer.Option("--export", help="Optional path to write results as Markdown."),
+    ] = None,
+    session_memory_path: Annotated[
+        str,
+        typer.Option(
+            "--session-memory",
+            help="Path to persistent session memory store. Defaults to ~/.tai/sessions.",
+        ),
+    ] = "~/.tai/sessions",
+) -> None:
+    """Search or list previously indexed troubleshooting sessions."""
+    from pathlib import Path
+
+    from rich.markup import escape
+
+    try:
+        store = SessionStore(session_memory_path)
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Could not open session memory:[/red] {escape(str(exc))}")
+        raise typer.Exit(code=1) from exc
+
+    try:
+        if query:
+            sessions = store.search_keyword(query, host=host, limit=limit)
+        else:
+            sessions = store.list_recent(host=host, limit=limit)
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]History query failed:[/red] {escape(str(exc))}")
+        raise typer.Exit(code=1) from exc
+
+    # Escape user- and store-supplied text: Rich would otherwise parse "[...]" in a
+    # host, keyword, issue, or summary as markup and drop it or raise MarkupError.
+    qualifier = f" matching '{escape(query)}'" if query else ""
+    if not sessions:
+        scope = f" for host {escape(host)}" if host else ""
+        console.print(f"[yellow]No session history found{scope}{qualifier}.[/yellow]")
+        return
+
+    scope = f" for host={escape(host)}" if host else ""
+    console.print(f"[bold]{len(sessions)} session(s){scope}{qualifier}[/bold]")
+    for sess in sessions:
+        summary_line = (sess.summary or "").strip().splitlines()
+        excerpt = summary_line[0] if summary_line else "(no summary)"
+        if len(excerpt) > 120:
+            excerpt = f"{excerpt[:120].rstrip()}..."
+        console.print(
+            f"  [green]{escape(sess.session_id)}[/green] host={escape(sess.host)} "
+            f"issue={escape(sess.issue)}\n"
+            f"    [dim]{escape(excerpt)}[/dim]"
+        )
+
+    if export:
+        output_path = Path(export).expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(
+            _format_history_markdown(sessions, query=query, host=host),
+            encoding="utf-8",
+        )
+        console.print(f"[green]✓ Exported[/green] {escape(str(output_path))}")
+
+
 @app.command()
 def run(
     issue: Annotated[str, typer.Argument(help="Ticket text or issue summary.")],
@@ -363,7 +440,7 @@ async def _interactive_loop(
     console.print(
         Panel(
             "Ask questions directly, or use [bold]/collect[/bold], "
-            "[bold]/analyze[/bold], [bold]/help[/bold], [bold]/quit[/bold]",
+            "[bold]/analyze[/bold], [bold]/history[/bold], [bold]/help[/bold], [bold]/quit[/bold]",
             title="[bold cyan]Interactive Mode[/bold cyan]",
             border_style="cyan",
             padding=(0, 1),
@@ -434,6 +511,7 @@ async def _interactive_loop(
                 Panel(
                     "[bold]/collect[/bold]  — re-run diagnostics\n"
                     "[bold]/analyze[/bold]  — re-analyze current diagnostics\n"
+                    "[bold]/history[/bold]  — show prior sessions for this host\n"
                     "[bold]/help[/bold]     — show this message\n"
                     "[bold]/quit[/bold]     — end session\n"
                     "[dim]Anything else is sent directly to the AI as a question.[/dim]",
@@ -444,6 +522,52 @@ async def _interactive_loop(
             )
             continue
 
+        if command.startswith("/history"):
+            if session_store is None:
+                console.print(
+                    "[yellow]Session memory is disabled. "
+                    "Use --session-memory to enable /history.[/yellow]"
+                )
+                continue
+
+            keyword = command.removeprefix("/history").strip()
+            try:
+                sessions = (
+                    session_store.search_keyword(keyword, host=req.host, limit=5)
+                    if keyword
+                    else session_store.list_recent(host=req.host, limit=5)
+                )
+            except Exception as exc:  # noqa: BLE001
+                console.print(f"[yellow]History unavailable:[/yellow] {exc}")
+                continue
+
+            if not sessions:
+                qualifier = f" matching '{keyword}'" if keyword else ""
+                console.print(f"[yellow]No prior sessions found{qualifier}.[/yellow]")
+                continue
+
+            lines = []
+            for sess in sessions:
+                issue = sess.issue.strip() or "(unknown issue)"
+                lines.append(f"- [bold]{sess.session_id}[/bold] — {issue}")
+            heading = "Prior sessions"
+            if keyword:
+                heading += f" matching '{keyword}'"
+            console.print(
+                Panel(
+                    "\n".join(lines),
+                    title=f"[bold cyan]{heading}[/bold cyan]",
+                    border_style="cyan",
+                    padding=(0, 1),
+                )
+            )
+            if logger is not None:
+                logger.log_event(
+                    "interactive_history",
+                    {"keyword": keyword or None, "count": len(sessions)},
+                )
+            continue
+
         if command == "/collect":
             plan = plan_from_request(req)
             console.print(f"[cyan]Collecting diagnostics:[/cyan] {len(plan)} commands")
@@ -926,6 +1050,32 @@ def _index_session_memory(
             logger.log_event("session_memory_error", {"error": str(exc)})
 
 
+def _format_history_markdown(
+    sessions: list[PastSession],
+    *,
+    query: str | None,
+    host: str | None,
+) -> str:
+    """Render sessions as a Markdown report ending in exactly one newline.
+
+    Active ``host``/``query`` filters are listed under the title, followed by one
+    ``##`` section per session holding its host, issue, and summary text.
+    """
+    lines = ["# tai session history", ""]
+    if host:
+        lines.append(f"Host filter: {host}")
+    if query:
+        lines.append(f"Keyword filter: {query}")
+    if host or query:
+        lines.append("")
+    for sess in sessions:
+        # A missing (None) or blank summary falls back to a placeholder, not a crash.
+        body = (sess.summary or "").strip() or "(no summary)"
+        heading = [f"## {sess.session_id}", f"- Host: {sess.host}", f"- Issue: {sess.issue}"]
+        lines.extend([*heading, "", body, ""])
+    return "\n".join(lines).rstrip() + "\n"
+
+
 # ---------------------------------------------------------------------------
 # runbooks sub-app
 # ---------------------------------------------------------------------------
diff --git a/src/tai/session_store.py b/src/tai/session_store.py
index 63ef633..7d428dc 100644
--- a/src/tai/session_store.py
+++ b/src/tai/session_store.py
@@ -98,6 +98,60 @@ class SessionStore:
                 )
         return sessions
 
+    def list_recent(self, *, host: str | None = None, limit: int = 20) -> list[PastSession]:
+        """Return up to ``limit`` newest sessions, optionally restricted to ``host``.
+
+        Recency is the descending sort order of ``session_id``. The host match is
+        case-insensitive. Raises ``ValueError`` when ``limit`` is below 1.
+        """
+        if limit < 1:
+            raise ValueError("limit must be >= 1")
+        if self._collection.count() == 0:
+            return []
+
+        # Fetch every record: truncating inside get() would run before the host
+        # filter and the sort, returning an arbitrary subset instead of the newest.
+        results = self._collection.get(include=["documents", "metadatas"])
+        ids = results.get("ids") or []
+        docs = results.get("documents") or []
+        metas = results.get("metadatas") or []
+
+        sessions: list[PastSession] = []
+        for sid, doc, meta in zip(ids, docs, metas, strict=False):
+            meta = meta or {}  # tolerate a missing (None) metadata entry
+            sessions.append(
+                PastSession(
+                    session_id=str(sid),
+                    host=str(meta.get("host", "")),
+                    issue=str(meta.get("issue", "")),
+                    summary=str(doc or ""),
+                )
+            )
+        if host:
+            host_norm = host.strip().lower()
+            sessions = [s for s in sessions if s.host.lower() == host_norm]
+        return sorted(sessions, key=lambda s: s.session_id, reverse=True)[:limit]
+
+    def search_keyword(
+        self,
+        keyword: str,
+        *,
+        host: str | None = None,
+        limit: int = 20,
+    ) -> list[PastSession]:
+        """Return up to ``limit`` newest sessions whose issue or summary contains ``keyword``.
+
+        Matching is a case-insensitive substring test; a blank keyword lists recent
+        sessions instead. Raises ``ValueError`` when ``limit`` is below 1.
+        """
+        if limit < 1:  # validate up front so both code paths reject it alike
+            raise ValueError("limit must be >= 1")
+        term = keyword.strip().lower()
+        if not term:
+            return self.list_recent(host=host, limit=limit)
+        pool = self.list_recent(host=host, limit=max(limit, self.count()))
+        return [s for s in pool if term in s.issue.lower() or term in s.summary.lower()][:limit]
+
 
 def _build_embed_text(*, host: str, issue: str, summary: str) -> str:
     """Build embedding text with host/issue context and summary excerpt."""
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 2f05dd7..657fd2a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock
 
@@ -338,3 +339,108 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None:  #
 
     assert result.exit_code == 0
     assert "RAG retrieve:" in result.stdout
+
+
+def test_history_command_lists_sessions(monkeypatch) -> None:  # type: ignore[no-untyped-def]
+    class FakeStore:  # stand-in patched over tai.cli.SessionStore below
+        def __init__(self, _path: str) -> None:
+            pass
+
+        def list_recent(self, *, host: str | None = None, limit: int = 20):
+            del limit
+            if host == "web01":  # only the host given on the command line yields a row
+                return [
+                    SimpleNamespace(
+                        session_id="20260507T120000Z",
+                        host="web01",
+                        issue="nginx down",
+                        summary="Root cause: bad config",
+                    )
+                ]
+            return []
+
+    monkeypatch.setattr("tai.cli.SessionStore", FakeStore)
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        ["history", "--session-memory", "~/.tai/sessions", "--host", "web01"],
+    )
+
+    assert result.exit_code == 0
+    assert "session(s)" in result.stdout  # header line printed before the entries
+    assert "20260507T120000Z" in result.stdout
+
+
+def test_history_command_exports_markdown(monkeypatch, tmp_path: Path) -> None:  # type: ignore[no-untyped-def]
+    class FakeStore:  # stand-in patched over tai.cli.SessionStore below
+        def __init__(self, _path: str) -> None:
+            pass
+
+        def list_recent(self, *, host: str | None = None, limit: int = 20):
+            del host, limit  # filters are ignored: always return the one fixture row
+            return [
+                SimpleNamespace(
+                    session_id="20260507T120000Z",
+                    host="web01",
+                    issue="nginx down",
+                    summary="Root cause: bad config",
+                )
+            ]
+
+    monkeypatch.setattr("tai.cli.SessionStore", FakeStore)
+    export_path = tmp_path / "history.md"  # report target inside pytest's tmp_path
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        ["history", "--session-memory", "~/.tai/sessions", "--export", str(export_path)],
+    )
+
+    assert result.exit_code == 0
+    assert "Exported" in result.stdout
+    text = export_path.read_text(encoding="utf-8")
+    assert "# tai session history" in text  # report title
+    assert "nginx down" in text  # the session's issue text is included
+
+
+def test_interactive_history_without_store_shows_hint(monkeypatch) -> None:  # type: ignore[no-untyped-def]
+    _mock_session(monkeypatch)  # helper defined elsewhere in this test module
+
+    async def fake_collect_from_plan(_session, _plan) -> CollectionReport:  # type: ignore[no-untyped-def]
+        return CollectionReport(
+            host="ssh.archflux.net",
+            items=[
+                CollectedItem(
+                    name="kernel",
+                    result=SSHCommandResult(
+                        command="uname -a",
+                        exit_code=0,
+                        stdout="Linux test",
+                        stderr="",
+                    ),
+                ),
+            ],
+        )
+
+    commands = iter(["/history", "/quit"])  # scripted operator input, consumed in order
+    monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan)
+    monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands))
+    monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True)
+
+    runner = CliRunner()
+    result = runner.invoke(
+        app,
+        [
+            "run", "apache failed",
+            "--host",
+            "ssh.archflux.net",
+            "--port",
+            "5566",
+            "--no-probe",
+            "--interactive",
+        ],
+    )
+
+    assert result.exit_code == 0
+    assert "Session memory is disabled" in result.stdout  # hint shown when no store is set
diff --git a/tests/test_session_store.py b/tests/test_session_store.py
index e66ed41..b96561e 100644
--- a/tests/test_session_store.py
+++ b/tests/test_session_store.py
@@ -77,3 +77,53 @@ def test_query_returns_past_sessions(tmp_path: Path) -> None:
     assert isinstance(results[0], PastSession)
     assert results[0].host == "web01"
     assert "package missing" in results[0].summary
+
+
+def test_list_recent_returns_sessions_sorted_desc(tmp_path: Path) -> None:
+    chroma_mock = _make_chromadb_mock()
+    collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value
+    collection.count.return_value = 3
+    collection.get.return_value = {  # mock ignores its arguments: always all three records
+        "ids": ["20260506T120000Z", "20260507T120000Z", "20260505T120000Z"],  # unsorted
+        "documents": ["older", "newer", "oldest"],
+        "metadatas": [
+            {"host": "web01", "issue": "i1"},
+            {"host": "web01", "issue": "i2"},
+            {"host": "db01", "issue": "i3"},
+        ],
+    }
+
+    with patch.dict("sys.modules", {"chromadb": chroma_mock}):
+        store = SessionStore(tmp_path / "store")
+        results = store.list_recent(limit=2)  # expect the two highest ids, newest first
+
+    assert len(results) == 2
+    assert results[0].session_id == "20260507T120000Z"
+    assert results[1].session_id == "20260506T120000Z"
+
+
+def test_search_keyword_filters_by_term_and_host(tmp_path: Path) -> None:
+    chroma_mock = _make_chromadb_mock()
+    collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value
+    collection.count.return_value = 3
+    collection.get.return_value = {  # mock ignores its arguments: always all three records
+        "ids": ["20260505T120000Z", "20260506T120000Z", "20260507T120000Z"],
+        "documents": [
+            "Root cause: nginx config typo",
+            "Root cause: package missing",
+            "Root cause: nginx port conflict",
+        ],
+        "metadatas": [
+            {"host": "web01", "issue": "nginx fails"},
+            {"host": "web01", "issue": "sssd fails"},
+            {"host": "db01", "issue": "nginx start failed"},
+        ],
+    }
+
+    with patch.dict("sys.modules", {"chromadb": chroma_mock}):
+        store = SessionStore(tmp_path / "store")
+        results = store.search_keyword("nginx", host="web01", limit=5)
+
+    assert len(results) == 1  # only the first record is both web01 and mentions nginx
+    assert results[0].host == "web01"
+    assert "nginx" in results[0].issue.lower()