From 57f4c0efaa0a4b58c42c59f9c53d23546a5cf8fa Mon Sep 17 00:00:00 2001
From: zphinx <cban@gmx.com>
Date: Wed, 6 May 2026 04:48:41 +0200
Subject: [PATCH] feat: complete RAG runbook workflow and release docs

---
 CHANGELOG.md                |  54 ++++---
 README.md                   | 214 ++++++++++++++++++++--------
 ROADMAP.md                  |  55 ++++----
 docs/ARCHITECTURE.md        |  85 +++++++++++
 pyproject.toml              |   2 +-
 runbooks/apparmor.md        |  86 ++++++++++++
 runbooks/disk.md            | 106 ++++++++++++++
 runbooks/docker.md          | 120 ++++++++++++++++
 runbooks/kernel.md          | 117 ++++++++++++++++
 runbooks/nginx.md           |  99 +++++++++++++
 runbooks/postgres.md        | 107 ++++++++++++++
 runbooks/selinux.md         | 112 +++++++++++++++
 runbooks/ssh.md             | 100 +++++++++++++
 runbooks/sssd.md            | 115 +++++++++++++++
 runbooks/wayland.md         |  89 ++++++++++++
 runbooks/x2go.md            | 106 ++++++++++++++
 runbooks/xorg.md            |  94 +++++++++++++
 src/tai/chroma_telemetry.py |  24 ++++
 src/tai/cli.py              | 273 ++++++++++++++++++++++++++++++++++--
 src/tai/plan.py             |  27 ++++
 src/tai/prompt_builder.py   |  84 ++++++++++-
 src/tai/runbook_store.py    | 268 +++++++++++++++++++++++++++++++++++
 tests/test_ai.py            |   1 +
 tests/test_cli.py           |  31 ++--
 tests/test_plan.py          |  25 ++++
 tests/test_runbook_store.py | 253 +++++++++++++++++++++++++++++++++
 26 files changed, 2510 insertions(+), 137 deletions(-)
 create mode 100644 docs/ARCHITECTURE.md
 create mode 100644 runbooks/apparmor.md
 create mode 100644 runbooks/disk.md
 create mode 100644 runbooks/docker.md
 create mode 100644 runbooks/kernel.md
 create mode 100644 runbooks/nginx.md
 create mode 100644 runbooks/postgres.md
 create mode 100644 runbooks/selinux.md
 create mode 100644 runbooks/ssh.md
 create mode 100644 runbooks/sssd.md
 create mode 100644 runbooks/wayland.md
 create mode 100644 runbooks/x2go.md
 create mode 100644 runbooks/xorg.md
 create mode 100644 src/tai/chroma_telemetry.py
 create mode 100644 src/tai/runbook_store.py
 create mode 100644 tests/test_runbook_store.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cfd146d..bbb0180 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,27 +10,37 @@ ______________________________________________________________________
 
 ### Added
 
-- `README.md` — project overview, description, example workflow, supported distributions, and suggested tooling
-- `ROADMAP.md` — phased development plan covering decisions, data collection, AI integration, CLI design, and hardening
-- `CHANGELOG.md` — this file; established changelog tracking for the project
-- `.gitea/workflows/ci.yml` — Gitea Actions CI workflow for push and pull request events
-- Gitea CI now uses native `git` checkout and system Python setup to avoid host-executor JavaScript action path issues
-- Gitea native checkout now uses `CI_GIT_TOKEN` repository secret for authenticated fetch from private repos
-- Gitea CI now installs dependencies in a local `.venv` to avoid Debian/PEP 668 externally-managed pip errors
-- Python package scaffold with `src` layout and project metadata in `pyproject.toml`
-- Initial CLI entrypoint with agreed SSH flags: `--identity-file`, `--jump-host`, and `--ignore-ssh-config`
-- Input parsing/validation module and core request model
-- SSH configuration scaffold module for upcoming connection/read-only execution work
-- Implemented SSH module with real key-based command execution via system `ssh`
-- Added explicit SSH port support across CLI, input parsing, request model, and SSH client (`--port`, e.g. 5566)
-- Added live SSH connectivity probe (`uname -a`) enabled by default, with `--no-probe` opt-out and non-zero exit on failure
-- Added baseline diagnostics collection via `--collect`, including service, journal, disk, and network checks
-- Read-only command policy enforcement (allowlist + blocked shell operators)
-- Added byte-limited SSH output capture with truncation markers for large command output
-- Test scaffold (`pytest`) with initial parser and CLI coverage
-- SSH test coverage for policy checks, SSH argument construction, and config summary behavior
-- CI workflow for lint (`ruff`), type-check (`mypy`), and tests (`pytest`)
-- CI coverage expanded with Markdown formatting checks (`mdformat --check`) and YAML linting (`yamllint`)
+- Nothing yet.
+
+______________________________________________________________________
+
+## [0.4.0] - 2026-05-06
+
+### Added
+
+- `runbooks/` corpus with service troubleshooting guides: `ssh`, `nginx`, `postgres`, `disk`, `kernel`, `docker`, `sssd`, `xorg`, `wayland`, `x2go`, `selinux`, `apparmor`
+- Runbook knowledge store module `src/tai/runbook_store.py` (persistent ChromaDB-backed index and query)
+- Chroma telemetry no-op client `src/tai/chroma_telemetry.py` to suppress noisy local telemetry errors
+- `tai runbooks` command group with:
+  - `sync` for indexing all Markdown runbooks
+  - `list` for listing indexed metadata
+  - `add` for indexing a single runbook file
+- `--runbooks` option on `tai run` to enable Tier 2 runbook retrieval
+- Initial analysis RAG path using retrieved diagnostic chunks (`build_analysis_message_with_chunks`)
+- Follow-up RAG path updates with tighter `top_k` and runbook context injection
+- AI runtime controls:
+  - `--ai-timeout-seconds`
+  - `--ai-max-tokens`
+- Non-streaming AI completion path for improved local backend reliability
+- Service/subsystem presence probes in collection plans:
+  - unit-file checks
+  - expected binary path checks
+  - status/journal/config probes for recognized services including `sssd`
+- Prompt instruction for "component absent or not installed" interpretation when presence signals are missing
+- Runbook store unit tests in `tests/test_runbook_store.py`
+- CLI tests updated for `tai run` subcommand and non-streaming completion mocks
+- README refreshed with current CLI, architecture layout, RAG/runbook workflow, and usage examples
+- `docs/ARCHITECTURE.md` with end-to-end flow, module responsibilities, safety boundaries, and fallback behavior
 
 ### Removed
 
@@ -44,3 +54,5 @@ ______________________________________________________________________
 - SSH bastion support: `--jump-host` flag using SSH native ProxyJump
 - SSH config behavior: use `~/.ssh/config` by default; allow override via `--ignore-ssh-config`
 - Interface: **interactive REPL** for v0.1; `textual`-based TUI (split-pane) for v0.2+
+- RAG Tier 1 strategy: semantic diagnostic chunk retrieval with local embeddings
+- RAG Tier 2 strategy: Markdown runbooks persisted in embedded ChromaDB
diff --git a/README.md b/README.md
index d25e954..5571b0c 100644
--- a/README.md
+++ b/README.md
@@ -1,112 +1,202 @@
-# tai — Linux AI Troubleshooting Agent
+# tai - Linux AI Troubleshooting Agent
 
-`tai` is an agentic AI-driven troubleshooting tool for Linux systems. It autonomously investigates issues on remote hosts via SSH, analyzes relevant logs and configuration files, and provides a clear diagnosis along with suggested remediation steps — all without making any changes to the target system.
+`tai` is a read-only Linux troubleshooting assistant that connects to remote hosts via SSH, collects diagnostics, and runs grounded AI analysis using local models.
 
-## Overview
+The project is designed for operators who want AI speed without losing operational safety or evidence traceability.
 
-Given a problem description and a target hostname, `tai` connects to the remote system over SSH, gathers relevant data (logs, configuration files, service status, etc.), and uses a locally-hosted AI model to reason about the root cause and recommend solutions.
+## What tai Does
 
-The agent operates in **read-only mode at all times**. It will never modify the target system under any circumstances — all suggestions are presented to the human troubleshooter for review and action.
+- Runs safe, read-only remote checks over SSH
+- Builds a diagnostics collection plan from issue text
+- Supports one-shot analysis and interactive follow-up mode
+- Uses local AI backends (OpenAI-compatible endpoint, typically Ollama)
+- Uses RAG over collected diagnostics (Tier 1)
+- Uses persistent runbook retrieval with ChromaDB (Tier 2)
+- Emits structured Markdown analysis with evidence and actions
+- Can log session and retrieval telemetry locally as JSONL
 
-## Supported Distributions
+## Safety Model
 
-- Ubuntu
-- Debian
-- RHEL
-- Rocky Linux
+`tai` enforces read-only command policy on all remote commands.
 
-## Example Workflow
+- Allowlist-based command validation
+- Blocked shell operators (`>`, `>>`, `<`, `|`, `&&`, `||`, `;`)
+- No write/mutation actions are executed on target hosts
 
-A troubleshooter receives a ticket reporting that the Apache service on a remote server has failed to start. They provide `tai` with:
+The tool may suggest remediation commands in output, but does not execute them.
 
-1. The ticket description or error message
-1. The hostname of the affected system
-1. Any relevant directories to focus on
+## Current Feature Set
 
-`tai` then connects to the host, reads through system logs, service configurations, and any other related files, and returns a structured analysis of the likely cause along with recommended next steps.
+### Core CLI
 
-## Suggested Tooling
+- `tai run ...` main troubleshooting entrypoint
+- SSH options: host, port, identity file, jump host, SSH config control
+- Live probe mode (`uname -a`)
+- Diagnostics collection mode
+- AI analysis mode
+- Interactive loop with `/collect`, `/analyze`, `/help`, `/quit`
 
-| Component | Tool |
-|-----------|------|
-| AI inference backend | [Ollama](https://ollama.com) |
-| Chat model | `gemma3:4b`, `llama3.1:8b`, or `qwen2.5:7b` |
-| Embedding model | `nomic-embed-text` (via Ollama) |
-| Vector store | [ChromaDB](https://www.trychroma.com) (embedded, local) |
-| Language | Python 3.11+ |
+### AI and Prompting
 
-______________________________________________________________________
+- OpenAI-compatible AI client
+- Configurable model, timeout, token budget
+- Guardrails to keep responses evidence-based
+- Initial and follow-up prompts grounded in collected diagnostics
+- Non-streaming completion path for local backend reliability
 
-## How-To: Setting Up the AI Backend (Arch Linux + RTX 3080)
+### RAG and Knowledge
 
-`tai` uses [Ollama](https://ollama.com) as its local AI backend. It exposes an OpenAI-compatible HTTP API that `tai` talks to — no cloud services, no data leaving your machine.
+- Tier 1: semantic retrieval of diagnostic chunks per question
+- Tier 2: persistent runbook knowledge base with ChromaDB
+- Runbook retrieval injected as separate prompt context
+- Retrieval debug output (`--rag-debug`)
+- Full-context fallback if retrieval/indexing fails
 
-An RTX 3080 (10 GB VRAM) comfortably runs 7–8B parameter models at 4-bit quantisation.
+### Runbook Management
 
-### 1. Install CUDA and Ollama
+- `tai runbooks sync --path ./runbooks --store ~/.tai/runbooks`
+- `tai runbooks list --store ~/.tai/runbooks`
+- `tai runbooks add <file> --store ~/.tai/runbooks`
 
-```bash
-# CUDA runtime (skip if already installed)
-sudo pacman -S cuda
+### Presence and Absence Signals
 
-# Ollama with CUDA support from the AUR
-yay -S ollama-cuda
-# or: paru -S ollama-cuda
+For recognized services/subsystems (for example `sssd`, `docker`, `x2go`, `xorg`, `wayland`, `selinux`, `apparmor`), collection includes:
 
-# Enable and start the service
-sudo systemctl enable --now ollama
+- service unit-file discovery (`systemctl list-unit-files ...`)
+- binary presence checks via `ls -l <expected path>`
+- service status and journals
+- selected config path probes where defined
+
+This improves analysis quality for "component missing/not installed" scenarios.
+
+## Repository Layout
+
+```text
+src/tai/
+  cli.py                # CLI commands and orchestration
+  ssh_client.py         # SSH execution + read-only policy
+  collectors.py         # execution of collection plans
+  plan.py               # issue -> command plan builder
+  ai_client.py          # OpenAI-compatible AI + embeddings client
+  ai_guardrails.py      # response guardrails/validation
+  prompt_builder.py     # prompt composition
+  rag_retriever.py      # diagnostic chunk retrieval
+  runbook_store.py      # persistent ChromaDB runbook index/query
+  chroma_telemetry.py   # no-op Chroma telemetry client
+  session_log.py        # JSONL session logging
+  input_parser.py       # CLI input validation
+  models.py             # domain request models
+
+runbooks/
+  *.md                  # Markdown runbooks with frontmatter
+
+tests/
+  test_*.py             # unit and CLI coverage
 ```
 
-### 2. Pull a chat model
+## Installation
 
 ```bash
-ollama pull gemma3:4b       # ~3 GB — fast, good for sysadmin tasks
-ollama pull llama3.1:8b     # ~5 GB — stronger reasoning
-ollama pull qwen2.5:7b      # ~4.5 GB — strong structured output
+python -m venv .venv
+source .venv/bin/activate
+pip install -e .
 ```
 
-### 3. Pull the embedding model
-
-`tai` uses `nomic-embed-text` to embed diagnostic data and runbooks for semantic retrieval (RAG). Pull it on the same host as Ollama:
+RAG runbook storage requires optional dependencies:
 
 ```bash
-ollama pull nomic-embed-text   # ~274 MB
+pip install -e .[rag]
 ```
 
-Verify it loaded:
+Development dependencies:
 
 ```bash
-curl http://localhost:11434/api/embeddings \
-  -d '{"model":"nomic-embed-text","prompt":"test"}'
+pip install -e .[dev]
 ```
 
-A JSON response with an `"embedding"` array confirms it is ready.
+## AI Backend Setup (Ollama)
 
-### 4. Verify the chat model works
+`tai` expects an OpenAI-compatible API endpoint, defaulting to `http://localhost:11434/v1`.
 
 ```bash
-ollama run gemma3:4b "what causes a systemd service to enter failed state?"
+ollama pull gemma3:4b
+ollama pull nomic-embed-text
 ```
 
-### 5. Verify the HTTP API is running
-
-`tai` communicates with Ollama over its OpenAI-compatible REST API:
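+Quick backend check (this uses Ollama's native API rather than the `/v1` OpenAI-compatible path that `tai` itself calls):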
 
 ```bash
 curl http://localhost:11434/api/generate \
   -d '{"model":"gemma3:4b","prompt":"hello","stream":false}'
 ```
 
-A JSON response with a `response` field confirms everything is working.
+## Usage
 
-### 6. Point tai at your Ollama instance
-
-Once `tai` AI integration is complete, use these flags:
+### Basic Probe and Collect
 
 ```bash
-tai "nginx failing to start" --host web01 \
-  --ai-host http://localhost:11434 \
-  --model gemma3:4b
+tai run "nginx failing to start" \
+  --host web01 \
+  --probe \
+  --collect
 ```
 
-The default values for `--ai-host` and `--model` will be `http://localhost:11434` and `gemma3:4b` respectively, so for local use you won't need to specify them explicitly.
+### Analyze with RAG and Runbooks
+
+```bash
+tai run "why isn't sssd working?" \
+  --host ssh.archflux.net \
+  --port 5566 \
+  --probe --collect --analyze \
+  --runbooks ~/.tai/runbooks \
+  --rag-debug \
+  --ai-timeout-seconds 45 \
+  --ai-max-tokens 300
+```
+
+### Interactive Session
+
+```bash
+tai run "docker daemon keeps failing" \
+  --host app01 \
+  --collect \
+  --interactive \
+  --runbooks ~/.tai/runbooks
+```
+
+## Runbook Workflow
+
+1. Write Markdown runbooks in `runbooks/` with frontmatter keys: `service`, `symptoms`, `tags`.
+1. Sync the store.
+1. Pass `--runbooks <store-path>` to `tai run`.
+
+Example:
+
+```bash
+tai runbooks sync --path ./runbooks --store ~/.tai/runbooks
+tai runbooks list --store ~/.tai/runbooks
+```
+
+## Testing
+
+```bash
+pytest
+```
+
+Focused suites:
+
+```bash
+pytest tests/test_plan.py tests/test_ai.py tests/test_cli.py
+```
+
+## Known Limits
+
+- Service-specific presence checks currently apply to recognized service/subsystem names.
+- Package-manager-level presence checks are not yet in the default read-only command allowlist.
+- Tier 3 persistent session memory is not implemented yet.
+
+## Changelog and Roadmap
+
+- See `CHANGELOG.md` for release history.
+- See `ROADMAP.md` for phase status and next milestones.
+- See `docs/ARCHITECTURE.md` for module-level architecture and data flow.
diff --git a/ROADMAP.md b/ROADMAP.md
index 208ae12..6a1e8ef 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -18,10 +18,11 @@ These must be resolved before meaningful development can begin.
 
 ### AI Backend & Model
 
-- [ ] Confirm use of [vLLM](https://github.com/vllm-project/vllm) as the inference backend
-- [ ] Confirm `gemma4:a4b` as the default model (or select an alternative)
+- [x] OpenAI-compatible backend client implemented (`AIClient`)
+- [x] Default local backend profile wired for Ollama (`http://localhost:11434/v1`)
+- [x] Default model profile set to `gemma3:4b` (override via `--model`)
 - [ ] Define minimum hardware requirements for running the model locally
-- [ ] Decide whether the AI backend is bundled, self-hosted externally, or user-supplied
+- [x] AI backend is user-supplied/self-hosted
 
 ### SSH Strategy
 
@@ -38,7 +39,7 @@ These must be resolved before meaningful development can begin.
 ### Scope & Constraints
 
 - [ ] Define the supported scope of issues (services, network, disk, kernel, etc.)
-- [ ] Confirm read-only guarantee — document exactly what "read-only" means in practice
+- [x] Read-only guarantee implemented with command allowlist + blocked shell operator policy
 - [x] **Decision: interactive REPL mode for v0.1, full TUI for v0.2+**
   - v0.1: chat-loop REPL launched from CLI; human can follow up, correct, and redirect the agent
   - v0.2+: `textual`-based TUI with split panes (collected data | AI output | input bar)
@@ -52,7 +53,7 @@ Basic project scaffolding and connectivity.
 
 - [x] Finalise repository structure and language toolchain
 - [x] Set up CI pipeline (linting, tests)
-- [ ] Implement SSH connection module
+- [x] Implement SSH connection module
   - [x] Define SSH config model and probe interface scaffold
   - [x] Connect to remote host
   - [x] Execute read-only commands (e.g. `journalctl`, `systemctl status`, `cat`)
@@ -68,15 +69,15 @@ ______________________________________________________________________
 
 Define what information the agent gathers and how.
 
-- [ ] Identify the canonical set of data sources per issue type:
+- [x] Identify a baseline canonical set of data sources per issue type:
   - Service failures: `journalctl`, `systemctl`, service config files
   - Network issues: `ip`, `ss`, `netstat`, firewall rules
   - Disk issues: `df`, `du`, `dmesg`, `smartctl`
   - General: `/var/log/syslog`, `/var/log/messages`, `dmesg`
-- [ ] Implement pluggable "collector" modules per data source
-- [ ] Implement directory traversal for user-specified paths (read-only)
+- [x] Implement collectors and plan builder for baseline issue categories
+- [x] Implement directory traversal for user-specified paths (read-only)
 - [ ] Add support for per-distro variations (Ubuntu vs RHEL path differences, etc.)
-- [ ] Write tests with mocked SSH output
+- [x] Write tests with mocked SSH output
 
 ______________________________________________________________________
 
@@ -84,12 +85,12 @@ ______________________________________________________________________
 
 Wire collected data into the local AI model.
 
-- [ ] Implement vLLM client module
-- [ ] Design prompt template: system context, collected data, issue description → diagnosis
-- [ ] Implement response parsing and structured output (root cause + suggested steps)
-- [ ] Tune context window usage — handle truncation for large log outputs
-- [ ] Add streaming support for long AI responses
-- [ ] Evaluate and test model output quality on common issue types
+- [x] Implement OpenAI-compatible AI client module
+- [x] Design prompt templates for initial and follow-up analysis
+- [x] Implement response guardrail checks and structured response headings
+- [x] Tune context usage with RAG retrieval and chunk/runbook truncation budgets
+- [x] Implement reliable non-streaming completion path for local backends
+- [ ] Continue output quality tuning and grounding evaluation on real hosts
 
 ______________________________________________________________________
 
@@ -97,11 +98,11 @@ ______________________________________________________________________
 
 Polish the interface for real-world use.
 
-- [ ] Design CLI interface (flags, subcommands, interactive prompts)
-- [ ] Implement structured output: diagnosis, confidence, recommended actions
-- [ ] Add `--verbose` / `--debug` mode showing raw collected data
+- [x] Design CLI interface with run command, interactive prompts, and runbook subcommands
+- [x] Implement structured output sections (Root Cause, Evidence, Recommended Actions)
+- [x] Add RAG debug mode (`--rag-debug`) showing retrieval scores
 - [ ] Support output to file or clipboard
-- [ ] Write man page / `--help` documentation
+- [x] Provide comprehensive `--help` command documentation via Typer options
 
 ______________________________________________________________________
 
@@ -135,19 +136,21 @@ model weights alone. Three tiers of increasing capability, each buildable indepe
 
 | Decision | Options | Recommendation | Status |
 |---|---|---|---|
-| Embedding model | `nomic-embed-text`, `mxbai-embed-large`, `all-minilm` | `nomic-embed-text` via Ollama (local, 274MB, strong perf) | ⬜ Pending |
-| Vector store — Tier 1 | In-memory numpy cosine, `faiss-cpu` | numpy (zero deps) for session scope | ⬜ Pending |
-| Vector store — Tier 2/3 | `chromadb`, `qdrant`, `weaviate`, `pgvector` | `chromadb` (embedded mode, no server needed) or `qdrant` (self-hosted, REST API, production-grade) | ⬜ Pending |
-| Chunking strategy | Fixed token, sentence-aware, command-boundary | Command-boundary splitting (natural unit for diagnostics) | ⬜ Pending |
+| Embedding model | `nomic-embed-text`, `mxbai-embed-large`, `all-minilm` | `nomic-embed-text` via Ollama (local, 274MB, strong perf) | ✅ Implemented |
+| Vector store — Tier 1 | In-memory numpy cosine, `faiss-cpu` | numpy (zero deps) for session scope | ✅ Implemented |
+| Vector store — Tier 2/3 | `chromadb`, `qdrant`, `weaviate`, `pgvector` | `chromadb` embedded mode | ✅ Tier 2 Implemented |
+| Chunking strategy | Fixed token, sentence-aware, command-boundary | Command-boundary splitting (natural unit for diagnostics) | ✅ Implemented |
 | Hybrid retrieval | Semantic only, BM25 only, hybrid | Hybrid (BM25 keyword + cosine semantic) for best recall | ⬜ Pending |
 | Reranking | None, cross-encoder (`ms-marco-MiniLM`), LLM-as-judge | Cross-encoder rerank pass before prompt injection | ⬜ Pending |
-| Runbook format | Markdown, YAML, JSON | Markdown (human-editable, version-controllable) | ⬜ Pending |
+| Runbook format | Markdown, YAML, JSON | Markdown (human-editable, version-controllable) | ✅ Implemented |
 | Session index storage | Local `~/.tai/`, configurable path | `~/.tai/sessions/` with ChromaDB collection | ⬜ Pending |
 
 ---
 
 ### Tier 1 — Diagnostic Chunk Retrieval (in-memory, per-session)
 
+Status: ✅ Implemented
+
 **Problem:** Current flow injects all collected output into the prompt as one block.
 On busy hosts this floods the context window with irrelevant output, degrading quality.
 
@@ -180,6 +183,8 @@ On busy hosts this floods the context window with irrelevant output, degrading q
 
 ### Tier 2 — Runbook Knowledge Base (persistent, ChromaDB)
 
+Status: ✅ Implemented
+
 **Problem:** AI improvises remediation steps from training data, which may be wrong for
 specific environments, distros, or internal conventions.
 
@@ -214,6 +219,8 @@ specific environments, distros, or internal conventions.
 
 ### Tier 3 — Session Memory Index (institutional learning)
 
+Status: ⬜ Pending
+
 **Problem:** Every session starts from zero. Repeat incidents on the same host or
 same issue type get no benefit from past work.
 
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
new file mode 100644
index 0000000..ac0cbee
--- /dev/null
+++ b/docs/ARCHITECTURE.md
@@ -0,0 +1,85 @@
+# Architecture
+
+This document describes tai's current runtime architecture, module responsibilities, and data flow.
+
+## High-Level Flow
+
+1. User runs `tai run` with issue text and target host settings.
+1. CLI validates input and opens a shared SSH session.
+1. Probe and collection run against a read-only command plan.
+1. Collection output is converted into diagnostic chunks.
+1. Optional RAG retrieval selects top-k chunks per question.
+1. Optional runbook retrieval selects top-k runbook chunks from ChromaDB.
+1. Prompt builder composes system + user message.
+1. AI completion returns analysis.
+1. Guardrails validate response quality signals.
+1. Optional session logger writes JSONL events.
+
+## Module Layout
+
+- `src/tai/cli.py`
+  - Command definitions (`run`, `runbooks sync/list/add`)
+  - Orchestration across SSH, collection, RAG, prompts, AI, and logging
+- `src/tai/input_parser.py`
+  - User input validation and request normalization
+- `src/tai/models.py`
+  - Core dataclasses (`TroubleshootRequest`)
+- `src/tai/ssh_client.py`
+  - SSH invocation
+  - Read-only command policy validation
+  - Probe and command execution helpers
+- `src/tai/plan.py`
+  - Issue keyword/service extraction
+  - Command plan generation
+  - Service/subsystem presence probes (unit files, binaries)
+- `src/tai/collectors.py`
+  - Executes command plans and builds `CollectionReport`
+- `src/tai/rag_retriever.py`
+  - Command-output chunking
+  - Embedding wrapper structures
+  - Similarity retrieval and scoring
+- `src/tai/runbook_store.py`
+  - Persistent ChromaDB runbook indexing and querying
+- `src/tai/chroma_telemetry.py`
+  - No-op telemetry adapter for Chroma local usage
+- `src/tai/prompt_builder.py`
+  - Prompt assembly for full-context and retrieved-context paths
+- `src/tai/ai_client.py`
+  - OpenAI-compatible completions and embeddings client
+- `src/tai/ai_guardrails.py`
+  - Lightweight response guardrails and warnings
+- `src/tai/session_log.py`
+  - Optional JSONL event logging
+
+## Data Stores
+
+- Runbook store (Tier 2): local ChromaDB path, default `~/.tai/runbooks`
+- Session logs: optional JSONL file configured by `--log-file`
+
+## Retrieval Layers
+
+- Tier 1 (implemented): in-memory semantic retrieval over diagnostic chunks
+- Tier 2 (implemented): persistent semantic retrieval over runbook corpus
+- Tier 3 (pending): persistent retrieval over prior sessions
+
+## Safety Boundaries
+
+Read-only policy is enforced before each remote command execution.
+
+- Allowed command families are explicitly enumerated.
+- Shell composition operators are blocked.
+- Commands that fail execution are recorded and surfaced to the model as non-evidence.
+
+## Failure and Fallback Behavior
+
+- If RAG indexing fails, analysis falls back to full-context prompts.
+- If runbook store is unavailable, analysis proceeds without runbook context.
+- If AI call fails, CLI exits with non-zero status and displays an error.
+
+## Test Coverage Highlights
+
+- Planner behavior and service detection
+- Prompt formatting and guardrail-sensitive messaging
+- CLI command behavior and interactive loop controls
+- Runbook store parsing/index/query behavior (with mocked Chroma)
+- SSH policy validation and command execution contract
diff --git a/pyproject.toml b/pyproject.toml
index 165d664..9bcd8d5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "tai"
-version = "0.1.0"
+version = "0.4.0"
 description = "Linux AI-driven troubleshooting agent"
 readme = "README.md"
 requires-python = ">=3.11"
diff --git a/runbooks/apparmor.md b/runbooks/apparmor.md
new file mode 100644
index 0000000..7546d0b
--- /dev/null
+++ b/runbooks/apparmor.md
@@ -0,0 +1,86 @@
+---
+service: apparmor
+symptoms: permission denied despite correct unix permissions, apparmor deny logs, service blocked by profile, executable transition denied, path access denied, snap confinement issue, profile in complain mode
+tags: apparmor, security, profile, aa-status, audit, confinement, complain, enforce, snap
+---
+
+## Symptoms
+
+- Application gets `Permission denied` even though Unix permissions look correct
+- Service starts in complain mode but fails in enforce mode
+- Log shows AppArmor `DENIED` entries
+- Binary works when profile is disabled but fails when confinement is enabled
+- Snap or packaged app cannot access expected files or sockets
+
+## Diagnostics
+
+### Check AppArmor status and loaded profiles
+
+```
+aa-status
+systemctl status apparmor
+```
+
+Confirm whether the profile is loaded and whether it is in enforce or complain mode.
+
+### Check denial logs
+
+```
+journalctl -k | grep -i apparmor
+journalctl -b | grep -i DENIED
+dmesg | grep -i apparmor
+```
+
+AppArmor denials usually identify the profile, operation, and path that was blocked.
+
+### Inspect the active profile
+
+```
+find /etc/apparmor.d -maxdepth 2 -type f | sort
+cat /etc/apparmor.d/<profile>
+```
+
+Look for missing file path rules, capability rules, and `ix`/`px` execution transitions.
+
+### Check complain vs enforce mode
+
+```
+aa-status | grep complain
+```
+
+If the issue occurs only in enforce mode, the profile is most likely too restrictive; the application itself is probably not broken.
+
+### Check profile parser and reload
+
+```
+apparmor_parser -r /etc/apparmor.d/<profile>
+aa-status
+```
+
+Syntax or include errors can prevent an updated profile from loading.
+
+## Remediation
+
+**Profile too restrictive:**
+Add the missing path, capability, or network rule to the profile, then reload AppArmor.
+
+If the denial pattern is repetitive, use AppArmor tooling to review and refine the profile instead of disabling confinement globally.
+
+**Need to observe without blocking:**
+Temporarily switch the profile to complain mode:
+```
+aa-complain /etc/apparmor.d/<profile>
+```
+
+**Return to enforcement after fixing rules:**
+```
+aa-enforce /etc/apparmor.d/<profile>
+```
+
+**Profile reload after changes:**
+```
+apparmor_parser -r /etc/apparmor.d/<profile>
+systemctl reload apparmor
+```
+
+Do not disable AppArmor globally when the issue is isolated to a single profile.
diff --git a/runbooks/disk.md b/runbooks/disk.md
new file mode 100644
index 0000000..fcd713d
--- /dev/null
+++ b/runbooks/disk.md
@@ -0,0 +1,106 @@
+---
+service: disk
+symptoms: no space left on device, disk full, inode exhaustion, df shows 100%, du large files, write failed, cannot create file, filesystem read-only, ext4 error
+tags: disk, filesystem, storage, inodes, df, du, ext4, xfs, lvm, partition, full, space
+---
+
+## Symptoms
+
+- `No space left on device` — disk or inode exhaustion
+- `df -h` shows a filesystem at 100% (or near 100%)
+- `df -i` shows inode usage at 100% — file count exhausted even if byte space is free
+- Filesystem remounted read-only — kernel detected errors and protected itself
+- Services failing to write logs, create temp files, or open sockets
+
+## Diagnostics
+
+### Overall disk usage
+
+```
+df -h
+df -i
+```
+
+`df -h` shows byte space; `df -i` shows inode usage. Both can be independently exhausted.
+Note which filesystem is full (`/`, `/var`, `/tmp`, `/home`, etc.).
+
+### Find the large directories
+
+```
+du -sh /* 2>/dev/null | sort -rh | head -20
+du -sh /var/* 2>/dev/null | sort -rh | head -20
+du -sh /var/log/* 2>/dev/null | sort -rh | head -20
+```
+
+### Find large individual files
+
+```
+find / -xdev -type f -size +100M -printf '%s %p\n' 2>/dev/null | sort -rn
+find /var/log -type f -size +50M 2>/dev/null
+```
+
+### Find deleted-but-open files holding space
+
+```
+lsof +L1 2>/dev/null | grep -v "^COMMAND"
+```
+
+Files deleted while a process still has them open do not free space until the process releases the file descriptor.
+
+### Inode exhaustion — find directories with many small files
+
+```
+find / -xdev -printf '%h\n' 2>/dev/null | sort | uniq -c | sort -rn | head -20
+```
+
+### Filesystem errors (after a crash or read-only remount)
+
+```
+dmesg | grep -i 'ext4\|xfs\|btrfs\|error\|corrupt'
+journalctl -k | grep -i 'filesystem\|disk\|io error'
+```
+
+### LVM / partition layout
+
+```
+lsblk
+pvs
+vgs
+lvs
+```
+
+## Remediation
+
+**Large log files — truncate safely (do NOT rm while in use):**
+```
+truncate -s 0 /var/log/<logfile>
+```
+Or configure log rotation in `/etc/logrotate.d/`.
+
+**Old journal logs eating space:**
+```
+journalctl --disk-usage
+journalctl --vacuum-size=500M
+journalctl --vacuum-time=30d
+```
+
+**Deleted-but-open files — restart the holding process to release space:**
+Identify the holding process (PID and command) from `lsof +L1`, determine which service owns it, then:
+```
+systemctl restart <service>
+```
+
+**Inode exhaustion — remove many small files:**
+Common culprits: PHP session files in `/var/lib/php/sessions/`, old apt cache, tmp dirs.
+```
+find /var/lib/php/sessions -type f -mtime +7 -delete
+apt-get clean
+find /tmp -type f -mtime +3 -delete
+```
+
+**Extend LVM volume (if free extents exist in the volume group):**
+```
+lvextend -l +100%FREE /dev/<vg>/<lv>
+resize2fs /dev/<vg>/<lv>      # ext4
+xfs_growfs /mountpoint         # xfs
+```
diff --git a/runbooks/docker.md b/runbooks/docker.md
new file mode 100644
index 0000000..ca75513
--- /dev/null
+++ b/runbooks/docker.md
@@ -0,0 +1,120 @@
+---
+service: docker
+symptoms: cannot connect to docker daemon, docker daemon failed to start, docker socket permission denied, containers cannot resolve dns, docker network broken, daemon.json conflict, docker oom, unable to remove filesystem
+tags: docker, dockerd, containerd, container, daemon, daemon.json, cgroup, dns, docker0, socket, compose
+---
+
+## Symptoms
+
+- `Cannot connect to the Docker daemon. Is the docker daemon running on this host?`
+- `permission denied` on `/var/run/docker.sock`
+- `dockerd` fails to start after a `daemon.json` change
+- Containers cannot resolve DNS or pull images
+- Docker bridge/network disappears or container networking breaks after boot
+- Container or daemon is killed by the kernel OOM killer
+- `Error: Unable to remove filesystem` when removing a container
+
+## Diagnostics
+
+### Check daemon health and client target
+
+```
+docker info
+systemctl is-active docker
+systemctl status docker
+ps -ef | grep dockerd
+env | grep DOCKER_HOST
+```
+
+If `DOCKER_HOST` is set incorrectly, the CLI may be talking to the wrong daemon.
+
+### Check daemon logs and startup failures
+
+```
+journalctl -u docker -n 200
+journalctl -u containerd -n 100
+cat /etc/docker/daemon.json
+systemctl cat docker
+```
+
+Look for conflicts between `daemon.json` keys and systemd startup flags, especially duplicate `hosts` settings.
+
+### Check socket permissions and group access
+
+```
+ls -la /var/run/docker.sock
+id
+getent group docker
+ls -la ~/.docker/
+```
+
+If the user was added to the `docker` group recently, a new login shell may be required.
+
+### Check kernel, cgroups, and memory pressure
+
+```
+uname -r
+free -h
+dmesg | grep -i -E 'docker|cgroup|oom|killed process'
+```
+
+Low memory, missing kernel features, or cgroup issues can stop containers or the daemon.
+
+### Check Docker networking and DNS
+
+```
+docker network ls
+ip addr show docker0
+sysctl net.ipv4.ip_forward
+cat /etc/resolv.conf
+ps aux | grep dnsmasq
+```
+
+Loopback DNS resolvers in `/etc/resolv.conf` often break container DNS unless Docker is given explicit nameservers.
+
+### Check storage and stuck mounts
+
+```
+df -h /var/lib/docker
+docker system df
+lsof /var/lib/docker
+```
+
+Bind-mounting `/var/lib/docker` into other containers can keep container filesystems busy and block removal.
+
+## Remediation
+
+**Daemon not running or client aimed at the wrong host:**
+Unset an incorrect `DOCKER_HOST`, then start the daemon:
+```
+unset DOCKER_HOST
+systemctl restart docker
+```
+
+**`daemon.json` conflicts with systemd flags:**
+Remove duplicate settings or create a systemd override so `dockerd` is started without conflicting flags.
+
+**Permission denied on Docker socket:**
+Add the user to the `docker` group, then re-login:
+```
+sudo usermod -aG docker "$USER"
+newgrp docker
+```
+
+If `~/.docker/` was created by `sudo`, fix ownership:
+```
+sudo chown "$USER":"$USER" "$HOME/.docker" -R
+sudo chmod g+rwx "$HOME/.docker" -R
+```
+
+**Container DNS broken:**
+Configure explicit DNS servers in `/etc/docker/daemon.json`, then restart Docker.
+
+**Docker networking disappears after boot:**
+Stop the host network manager from managing Docker interfaces and confirm `net.ipv4.ip_forward=1`.
+
+**OOM kills:**
+Treat this as host memory pressure first; reduce workload, add memory, or enforce container memory limits.
+
+**Unable to remove filesystem:**
+Find the process holding the path open with `lsof`, then stop that process or the container bind-mounting `/var/lib/docker`.
\ No newline at end of file
diff --git a/runbooks/kernel.md b/runbooks/kernel.md
new file mode 100644
index 0000000..fb42c1c
--- /dev/null
+++ b/runbooks/kernel.md
@@ -0,0 +1,117 @@
+---
+service: kernel
+symptoms: OOM kill, out of memory, high load average, kernel panic, segfault, soft lockup, CPU steal, system unresponsive, zombie processes, NMI watchdog
+tags: kernel, oom, memory, load, cpu, panic, dmesg, segfault, lockup, swap, zombie
+---
+
+## Symptoms
+
+- `Out of memory: Kill process <pid>` in dmesg — OOM killer fired
+- Load average far above CPU count — system overloaded or I/O blocked
+- `kernel: BUG: soft lockup` — CPU stuck in kernel code
+- `segfault at ...` in dmesg — process crashed due to invalid memory access
+- `kernel panic` — unrecoverable kernel error (visible only on console or serial)
+- Many zombie (`Z`) processes in `ps` output
+- High `%steal` in `top`/`vmstat` — hypervisor CPU contention
+
+## Diagnostics
+
+### Recent kernel messages
+
+```
+dmesg -T | tail -100
+dmesg -T | grep -iE 'error|warn|oom|kill|panic|oops|fault|hung|lockup'
+journalctl -k -n 200
+```
+
+### OOM events
+
+```
+dmesg -T | grep -i 'out of memory\|oom_kill\|killed process'
+```
+
+The log shows which process was killed, its RSS at time of kill, and available memory.
+
+### Memory usage
+
+```
+free -h
+cat /proc/meminfo | head -30
+vmstat -s
+```
+
+`MemAvailable` is the key metric. If it is near zero and swap is also exhausted, OOM kills are imminent.
+
+### Swap
+
+```
+swapon --show
+cat /proc/swaps
+vmstat 1 5
+```
+
+High `si`/`so` (swap-in/swap-out) in `vmstat` indicates active swapping and likely memory pressure.
+
+### Load average and CPU
+
+```
+uptime
+top -b -n1 | head -30
+mpstat -P ALL 1 3
+```
+
+Load average above 2× CPU count sustained over 15 minutes is concerning.
+High `%iowait` indicates processes blocked on disk I/O, not CPU-bound load.
+
+### Process memory usage
+
+```
+ps aux --sort=-%mem | head -20
+ps aux --sort=-%cpu | head -20
+```
+
+### Zombie processes
+
+```
+ps aux | awk '$8 ~ /^Z/'
+```
+
+Zombies cannot be killed; the parent must `wait()` for them or be killed itself.
+
+### I/O wait and disk health
+
+```
+iostat -x 1 3
+dmesg -T | grep -iE 'i/o error|hard resetting link|ata.*error|blk_update_request'
+```
+
+Persistent I/O errors alongside high load suggest failing storage.
+
+## Remediation
+
+**Memory pressure / frequent OOM kills:**
+Identify the largest memory consumers from `ps aux --sort=-%mem`.
+Consider increasing swap, adding RAM, tuning `vm.overcommit_memory`, or scaling the workload.
+Do NOT just raise `vm.overcommit_ratio` without understanding the root consumer.
+
+**Adjust OOM killer scoring for critical services (temporary, resets on reboot):**
+```
+echo -17 > /proc/<pid>/oom_adj        # legacy
+echo -1000 > /proc/<pid>/oom_score_adj  # current kernels
+```
+
+**Swap exhausted — add a swapfile:**
+```
+fallocate -l 2G /swapfile
+chmod 600 /swapfile
+mkswap /swapfile
+swapon /swapfile
+```
+
+**High I/O wait — find the I/O-heavy process:**
+```
+iotop -a -o -b -n3
+```
+
+**Zombie reaping — if parent is stuck:**
+Kill the parent process (its zombie children are re-parented to init/systemd, which reaps them), then verify zombies disappear.
diff --git a/runbooks/nginx.md b/runbooks/nginx.md
new file mode 100644
index 0000000..173007f
--- /dev/null
+++ b/runbooks/nginx.md
@@ -0,0 +1,99 @@
+---
+service: nginx
+symptoms: 502 Bad Gateway, 504 Gateway Timeout, upstream connection refused, nginx not starting, failed to bind socket, permission denied reading config, configuration test failed
+tags: nginx, web, http, https, proxy, upstream, reverse-proxy, load-balancer
+---
+
+## Symptoms
+
+- `502 Bad Gateway` — nginx reached the upstream but got an invalid response, or upstream is down
+- `504 Gateway Timeout` — upstream took too long to respond
+- `111: Connection refused` in nginx error log — upstream process is not running or not on the expected port
+- `nginx.service: Start request repeated too quickly` — crash-loop; check error log
+- `[emerg] bind() to 0.0.0.0:80 failed (98: Address already in use)` — port conflict
+- `[emerg] open() ... failed (13: Permission denied)` — file permission issue
+
+## Diagnostics
+
+### Service status
+
+```
+systemctl status nginx
+```
+
+### Config test
+
+```
+nginx -t
+```
+
+A config error is the most common reason for nginx failing to start or reload.
+
+### Error log
+
+```
+journalctl -u nginx -n 100
+tail -n 100 /var/log/nginx/error.log
+```
+
+For 502/504 errors look for: `connect() failed`, `upstream timed out`, `no live upstreams`.
+
+### Access log — recent requests
+
+```
+tail -n 50 /var/log/nginx/access.log
+```
+
+### Check upstream services
+
+For `proxy_pass` targets, verify the upstream is running:
+```
+systemctl status <upstream-service>
+ss -tlnp | grep <upstream-port>
+```
+
+Common upstreams: `gunicorn`, `uwsgi`, `node`, `puma`, `php-fpm`.
+
+### Port binding conflicts
+
+```
+ss -tlnp | grep ':80\|:443'
+```
+
+### Config files
+
+```
+cat /etc/nginx/nginx.conf
+ls /etc/nginx/sites-enabled/
+cat /etc/nginx/sites-enabled/<vhost>
+```
+
+Check `proxy_pass`, `upstream` blocks, `proxy_connect_timeout`, `proxy_read_timeout`.
+
+## Remediation
+
+**Upstream service not running:**
+Start the upstream service, then verify nginx resumes proxying.
+
+**Config syntax error:**
+Fix the error shown by `nginx -t`, then:
+```
+systemctl reload nginx
+```
+
+**Port already in use:**
+Find the conflicting process with `ss -tlnp | grep :80`, stop it, then restart nginx.
+
+**Upstream timeouts — increase timeouts (caution: treat the slow upstream as the root cause):**
+```nginx
+proxy_connect_timeout 10s;
+proxy_read_timeout 60s;
+proxy_send_timeout 60s;
+```
+
+**Permission denied on log or socket file:**
+```
+ls -la /var/log/nginx/
+ls -la /run/nginx.pid
+chown -R www-data:www-data /var/log/nginx/   # service user is "nginx" on RHEL-family systems
+```
diff --git a/runbooks/postgres.md b/runbooks/postgres.md
new file mode 100644
index 0000000..b64f032
--- /dev/null
+++ b/runbooks/postgres.md
@@ -0,0 +1,107 @@
+---
+service: postgres
+symptoms: connection refused port 5432, FATAL password authentication failed, replication lag, disk full, out of shared memory, too many connections, relation does not exist, could not connect to the primary
+tags: postgres, postgresql, database, replication, pg, psql, disk, connections
+---
+
+## Symptoms
+
+- `could not connect to server: Connection refused` — postgres not running or not on port 5432
+- `FATAL:  password authentication failed for user "<user>"` — wrong credentials or pg_hba mismatch
+- `FATAL:  too many connections` — connection pool exhausted
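+- `ERROR:  could not resize shared memory segment` / `out of shared memory` — `/dev/shm` too small for dynamic shared memory, lock table exhausted (`max_locks_per_transaction`), or shared_buffers too high for system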
+- `PANIC:  could not write to file "pg_wal/..."` — disk full on WAL directory
+- Replication lag growing — standby falling behind primary
+- `FATAL:  could not connect to the primary server` — standby cannot reach primary
+
+## Diagnostics
+
+### Service status
+
+```
+systemctl status postgresql
+systemctl status postgresql@<version>-main
+```
+
+### PostgreSQL logs
+
+```
+journalctl -u postgresql -n 100
+tail -n 100 /var/log/postgresql/postgresql-*.log
+```
+
+### Is postgres listening?
+
+```
+ss -tlnp | grep 5432
+```
+
+### Disk space (WAL and data directory are the critical paths)
+
+```
+df -h
+du -sh /var/lib/postgresql/
+du -sh /var/lib/postgresql/*/main/pg_wal/
+```
+
+A full disk on the pg_wal partition causes a PANIC and hard crash.
+
+### Connection count
+
+```sql
+SELECT count(*), state FROM pg_stat_activity GROUP BY state;
+SELECT setting FROM pg_settings WHERE name = 'max_connections';
+```
+
+### Replication lag (run on primary)
+
+```sql
+SELECT client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn,
+       (sent_lsn - replay_lsn) AS lag_bytes
+FROM pg_stat_replication;
+```
+
+### pg_hba.conf — authentication rules
+
+```
+cat /etc/postgresql/*/main/pg_hba.conf
+```
+
+Entries are matched top-to-bottom. `reject` or missing entry for the client IP causes auth failure even with correct credentials.
+
+### Shared memory / kernel settings
+
+```
+cat /proc/sys/kernel/shmmax
+cat /etc/postgresql/*/main/postgresql.conf | grep shared_buffers
+```
+
+`shared_buffers` should generally not exceed ~40% of RAM (25% is the usual starting point); kernel `shmmax` must accommodate it.
+
+## Remediation
+
+**Postgres not running:**
+```
+systemctl start postgresql
+```
+Check logs immediately after start for the failure reason.
+
+**Authentication failure (pg_hba mismatch):**
+Add or update the correct entry in `pg_hba.conf`, then reload:
+```
+systemctl reload postgresql
+```
+
+**Too many connections — increase limit (requires restart):**
+In `postgresql.conf`:
+```
+max_connections = 200
+```
+Or deploy a connection pooler (`pgbouncer`).
+
+**Disk full on WAL:**
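+Free space by removing old base backups or already-archived WAL segments from the backup/archive location, and check for a failing `archive_command` or inactive replication slots retaining WAL under `/var/lib/postgresql/*/main/pg_wal/`.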
+Do NOT delete pg_wal files directly — use `pg_archivecleanup` or let archiving catch up.
+
+**Replication lag — standby too far behind:**
+Check network bandwidth and I/O on standby. If the standby keeps being disconnected on a slow link, increase `wal_sender_timeout` temporarily (`wal_receiver_status_interval` controls how often the standby reports its progress).
diff --git a/runbooks/selinux.md b/runbooks/selinux.md
new file mode 100644
index 0000000..3bef20b
--- /dev/null
+++ b/runbooks/selinux.md
@@ -0,0 +1,112 @@
+---
+service: selinux
+symptoms: permission denied despite correct unix permissions, service blocked by selinux, avc denied, file context mismatch, port binding denied, boolean missing, domain transition failure
+tags: selinux, avc, enforcing, security, policy, restorecon, audit, sealert, semanage
+---
+
+## Symptoms
+
+- Service gets `Permission denied` even though file ownership and mode look correct
+- Process cannot bind to a port or open a file after a config change
+- AVC denials appear in audit logs
+- App works when SELinux is permissive but fails in enforcing mode
+- Newly created files under custom paths are inaccessible to a confined service
+
+## Diagnostics
+
+### Confirm SELinux mode and policy
+
+```
+getenforce
+sestatus
+cat /etc/selinux/config
+```
+
+If SELinux is `Permissive`, denials are logged but not enforced.
+
+### Check AVC denials
+
+```
+auditctl -s
+ausearch -m AVC,USER_AVC,SELINUX_ERR,USER_SELINUX_ERR -ts recent
+journalctl -t setroubleshoot -n 50
+dmesg | grep -i -e type=1300 -e type=1400
+```
+
+AVC denials are the primary source of truth for SELinux policy failures.
+
+If AVCs are missing but SELinux still appears involved, temporarily disable `dontaudit` rules to expose hidden denials:
+```
+semodule -DB
+```
+Re-enable them after reproducing the issue:
+```
+semodule -B
+```
+
+### Inspect file contexts
+
+```
+ls -lZ /path/to/file
+ps -eZ | grep <service>
+matchpathcon -V /path/to/file
+```
+
+A service can have correct Unix permissions and still fail if the SELinux context is wrong.
+
+### Check port labeling and booleans
+
+```
+semanage port -l | grep <port>
+getsebool -a | grep <service-or-feature>
+semanage boolean -l | grep <service-or-feature>
+```
+
+Custom ports often require explicit SELinux port labels.
+
+### Check for relabeling needs
+
+```
+restorecon -nRv /path
+matchpathcon /path/to/file
+sealert -l "*"
+```
+
+`restorecon -n` shows what would change without modifying labels.
+
+`sealert` is often the fastest way to turn a raw AVC into a concrete fix, but treat `audit2allow` suggestions as a last resort, not a first response.
+
+## Remediation
+
+**Wrong file context:**
+Restore the default context:
+```
+restorecon -Rv /path
+```
+
+**Custom application path needs persistent labeling:**
+```
+semanage fcontext -a -t <type> '/custom/path(/.*)?'
+restorecon -Rv /custom/path
+```
+
+**Custom port binding denied:**
+Add the port label required by the service type:
+```
+semanage port -a -t <port_type> -p tcp <port>
+```
+
+**Boolean disabled:**
+Enable the needed boolean persistently:
+```
+setsebool -P <boolean_name> on
+```
+
+**Still unsure whether SELinux is the blocker:**
+Temporarily switch to permissive mode and reproduce the issue:
+```
+setenforce 0
+```
+If the problem still occurs, SELinux is not the root cause. Either way, switch back to enforcing mode afterwards with `setenforce 1`.
+
+Do not disable SELinux or generate custom policy modules as a first response. Fix labels, booleans, or port mappings first.
diff --git a/runbooks/ssh.md b/runbooks/ssh.md
new file mode 100644
index 0000000..2714f21
--- /dev/null
+++ b/runbooks/ssh.md
@@ -0,0 +1,100 @@
+---
+service: ssh
+symptoms: connection refused, authentication failed, host key mismatch, permission denied, timeout connecting, no route to host
+tags: ssh, sshd, openssh, authentication, network, connectivity
+---
+
+## Symptoms
+
+- `ssh: connect to host <hostname> port 22: Connection refused`
+- `Permission denied (publickey)` — key not accepted or wrong user
+- `WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!` — host key mismatch
+- `Connection timed out` — firewall blocking or host unreachable
+- `No route to host` — routing issue or host is down
+
+## Diagnostics
+
+### Is sshd running?
+
+```
+systemctl status sshd
+systemctl status ssh
+```
+
+A stopped or failed sshd is the most common cause of "connection refused".
+
+### Check sshd configuration
+
+```
+sshd -t
+cat /etc/ssh/sshd_config
+```
+
+Look for: `PasswordAuthentication`, `PubkeyAuthentication yes`, `AuthorizedKeysFile`.
+
+### Check authorised keys
+
+```
+ls -la ~/.ssh/
+cat ~/.ssh/authorized_keys
+```
+
+Permissions must be: `~/.ssh` → `700`, `authorized_keys` → `600`.
+Wrong permissions cause silent auth failure even with the correct key.
+
+### Check sshd logs
+
+```
+journalctl -u sshd -n 100
+journalctl -u ssh -n 100
+grep sshd /var/log/auth.log | tail -50
+```
+
+Look for: `Invalid user`, `Failed publickey`, `Connection reset by peer`, `Too many authentication failures`.
+
+### Check listening port
+
+```
+ss -tlnp | grep sshd
+netstat -tlnp | grep :22
+```
+
+If sshd is running but not listening on the expected port, check `Port` in `/etc/ssh/sshd_config`.
+
+### Firewall rules
+
+```
+iptables -L INPUT -n -v
+nft list ruleset
+ufw status verbose
+```
+
+A DROP rule on port 22 causes silent timeouts, not "connection refused".
+
+## Remediation
+
+**sshd not running:**
+```
+systemctl enable --now sshd    # the unit is named "ssh" on Debian/Ubuntu
+```
+
+**Wrong permissions on authorized_keys:**
+```
+chmod 700 ~/.ssh
+chmod 600 ~/.ssh/authorized_keys
+chown -R $USER:$USER ~/.ssh
+```
+
+**sshd config error:**
+Fix the error reported by `sshd -t`, then:
+```
+systemctl restart sshd
+```
+
+**Host key mismatch (expected after reinstall/reprovisioning):**
+Remove the old key from the client:
+```
+ssh-keygen -R <hostname>
+```
+Only do this if you are certain the host was intentionally reprovisioned.
+If the key change is unexpected, treat as a potential MITM and investigate before connecting.
diff --git a/runbooks/sssd.md b/runbooks/sssd.md
new file mode 100644
index 0000000..0a862ae
--- /dev/null
+++ b/runbooks/sssd.md
@@ -0,0 +1,115 @@
+---
+service: sssd
+symptoms: login denied, user not found, id command hangs, sudo rules missing, ldap auth failure, kerberos failure, cache stale, offline authentication not working
+tags: sssd, ldap, kerberos, ad, identity, auth, pam, nss, sudo
+---
+
+## Symptoms
+
+- `id <user>` hangs or returns no such user for a domain account
+- SSH or console login fails for directory-backed users
+- Group membership is missing or incomplete
+- `sudo` rules from LDAP/AD do not appear
+- Authentication works intermittently or only after cache flush
+- Offline authentication fails when the directory is unreachable
+
+## Diagnostics
+
+### Check service health
+
+```
+systemctl status sssd
+sssctl domain-list
+sssctl config-check
+cat /etc/nsswitch.conf
+```
+
+A running daemon with a valid config and `sss` present in `nsswitch.conf` are the first prerequisites.
+
+### Check identity resolution
+
+```
+id <user>
+getent passwd <user>
+getent group <group>
+```
+
+If NSS lookups fail, the issue is often in SSSD configuration, connectivity, or cache.
+
+### Check SSSD logs
+
+```
+journalctl -u sssd -n 100
+ls -la /var/log/sssd/
+tail -n 100 /var/log/sssd/*.log
+sssctl logs-fetch
+```
+
+Look for: backend offline, LDAP bind failures, Kerberos errors, TLS problems, and access provider denials.
+
+If the issue is unclear, raise `debug_level=6` in the relevant `[nss]`, `[pam]`, and `[domain/<name>]` sections. Raising debug only in `[sssd]` is not enough for most real failures.
+
+### Check domain reachability
+
+```
+sssctl domain-status <domain>
+ping <ldap-or-ad-host>
+dig -t SRV _ldap._tcp.<domain>
+cat /etc/resolv.conf
+```
+
+If the identity provider is unreachable, SSSD may serve cached data only or fail entirely.
+
+### Check Kerberos and LDAP configuration
+
+```
+cat /etc/sssd/sssd.conf
+cat /etc/krb5.conf
+kinit <user>
+klist
+ldapsearch -ZZ -x -H ldap://<server> -b <base-dn>
+```
+
+Look for wrong realm names, bad server addresses, TLS settings, and access filters.
+
+For AD or IPA providers, Kerberos and DNS are often the real dependency chain: broken SRV lookup, keytab issues, or a slow KDC will surface as SSSD failures.
+
+### Check cache and permissions
+
+```
+ls -la /var/lib/sss/db/
+sssctl cache-status
+sssctl cache-expire -E
+```
+
+`/etc/sssd/sssd.conf` must usually be mode `600` or SSSD will refuse to start.
+
+Do not wipe cache files blindly on an offline system that depends on cached logins.
+
+## Remediation
+
+**Config syntax or permission issue:**
+Fix `sssd.conf`, set secure permissions, then restart:
+```
+chmod 600 /etc/sssd/sssd.conf
+systemctl restart sssd
+```
+
+**Stale cache:**
+Clear cache carefully, then repopulate with a fresh lookup:
+```
+sss_cache -E
+id <user>
+```
+
+**Kerberos failure:**
+Validate time sync, realm, keytab credentials, and KDC reachability before changing LDAP settings.
+
+**Backend offline or `sdap_async_sys_connect request failed`:**
+Treat as DNS/network first. Validate SRV records and TLS handshake before increasing `ldap_network_timeout` or `ldap_search_timeout`.
+
+**Access denied despite successful lookup:**
+Check `access_provider`, LDAP filters, HBAC rules, or AD group-based access restrictions.
+
+**No `pam_sss` messages at all:**
+The PAM stack is likely misconfigured. Fix the PAM/authselect profile before changing SSSD itself.
diff --git a/runbooks/wayland.md b/runbooks/wayland.md
new file mode 100644
index 0000000..1c7b6aa
--- /dev/null
+++ b/runbooks/wayland.md
@@ -0,0 +1,89 @@
+---
+service: wayland
+symptoms: wayland session fails, gdm falls back to xorg, black screen on login, fractional scaling broken, screen sharing broken, remote desktop broken, wlroots crash, compositor crash
+tags: wayland, compositor, gnome, kde, mutter, wlroots, pipewire, xwayland, graphics
+---
+
+## Symptoms
+
+- User selects a Wayland session but is returned to login
+- GDM or another display manager falls back to Xorg
+- Screen sharing, remote desktop, or clipboard integration is broken
+- Apps requiring XWayland fail while native Wayland apps work
+- Fractional scaling or multi-monitor layout behaves incorrectly
+- Wayland compositor crashes after login
+
+## Diagnostics
+
+### Confirm the active session type
+
+```
+echo $XDG_SESSION_TYPE
+loginctl show-session $XDG_SESSION_ID -p Type
+echo $WAYLAND_DISPLAY
+```
+
+If the session type is `x11`, you are not debugging an active Wayland session.
+
+### Check display manager and compositor logs
+
+```
+systemctl status gdm
+journalctl -b | grep -iE 'wayland|mutter|kwin|wlroots|xwayland'
+journalctl -b | grep -i 'renderer for'
+```
+
+Look for compositor crashes, GPU driver incompatibilities, and forced Xorg fallback messages.
+
+### Check XWayland and PipeWire components
+
+```
+which Xwayland
+systemctl --user status pipewire
+systemctl --user status xdg-desktop-portal
+systemctl --user status xdg-desktop-portal-gnome
+systemctl --user status xdg-desktop-portal-kde
+xlsclients -l
+```
+
+Broken screen sharing is often a PipeWire or portal issue, not a compositor issue.
+
+`xlsclients -l` helps identify apps that are actually running under XWayland rather than native Wayland.
+
+### Check GPU compatibility
+
+```
+lspci -k | grep -A3 -E 'VGA|3D|Display'
+lsmod | grep -E 'nvidia|nouveau|amdgpu|i915'
+```
+
+Wayland support quality depends heavily on the GPU driver stack.
+
+### Check environment and session overrides
+
+```
+env | grep -E 'WAYLAND|XDG|GDK_BACKEND|QT_QPA_PLATFORM'
+cat /etc/gdm/custom.conf
+wayland-info
+```
+
+Environment overrides can force apps onto X11 or disable Wayland entirely.
+
+For NVIDIA systems, confirm the compositor is using a supported buffer path (GBM on current drivers is the expected default).
+
+## Remediation
+
+**Wayland disabled in display manager config:**
+Check `WaylandEnable=false` or similar settings and remove the override if unintended.
+
+**Fallback to Xorg on unsupported GPU stack:**
+Upgrade or change the graphics driver; Wayland stability is often limited by the driver, not the compositor.
+
+**Screen sharing broken:**
+Fix PipeWire and `xdg-desktop-portal` services before changing compositor settings.
+
+**XWayland-only app failures:**
+Treat them separately from native Wayland issues; confirm `Xwayland` is installed and launching.
+
+**Remote desktop, VM, or game input grabbing is broken:**
+This is often a Wayland protocol/compositor support limitation, not a generic keyboard bug. Check compositor support for pointer constraints, relative pointer, and keyboard shortcut inhibit protocols.
diff --git a/runbooks/x2go.md b/runbooks/x2go.md
new file mode 100644
index 0000000..7d88fc7
--- /dev/null
+++ b/runbooks/x2go.md
@@ -0,0 +1,106 @@
+---
+service: x2go
+symptoms: x2go session fails to start, x2go black screen, x2go disconnects immediately, no desktop in session, authentication failure, x2go agent not starting, sound forwarding broken
+tags: x2go, nx, remote-desktop, x2goserver, x2goclient, session, desktop, xauth
+---
+
+## Symptoms
+
+- X2Go login succeeds but the session immediately disconnects
+- Black screen after login
+- Session is created but no desktop appears
+- `x2goruncommand error` or `X2Go Agent got stuck in state`
+- Sound, clipboard, or drive sharing fails while login itself works
+- Authentication works over SSH but X2Go session startup fails
+
+## Diagnostics
+
+### Check X2Go services and packages
+
+```
+systemctl status x2goserver
+systemctl status sshd
+rpm -qa | grep x2go
+apt list --installed | grep x2go
+which x2golistsessions
+```
+
+X2Go depends on working SSH plus installed `x2goserver` and `x2goserver-xsession` components.
+
+### Check X2Go logs
+
+```
+journalctl -u x2goserver -n 100
+journalctl -u sshd -n 100
+ls -la ~/.x2go/
+find ~/.x2go -maxdepth 2 -type f -print
+x2golistsessions
+```
+
+Look for session startup failures, agent crashes, and auth helper errors.
+
+### Check desktop environment startup command
+
+```
+cat /etc/x2go/Xsession
+cat ~/.xsession
+cat ~/.Xclients
+```
+
+A missing or broken desktop session command is a common cause of black screens.
+
+### Check X11 and xauth availability
+
+```
+which xauth
+xauth -V
+ls -la ~/.Xauthority
+which sshfs
+```
+
+X2Go requires a working X11 session setup. Missing `xauth` or a bad `.Xauthority` often breaks startup.
+
+Filesystem and folder-sharing features may also depend on `sshfs` being installed.
+
+### Check session limits and stale sessions
+
+```
+x2golistsessions
+x2gocleansessions
+ulimit -a
+loginctl list-sessions
+```
+
+Stale sessions or per-user process limits can prevent a new desktop from starting.
+
+### Check desktop dependencies
+
+```
+which startxfce4
+which mate-session
+which startplasma-x11
+env | grep -E 'DESKTOP|XDG'
+```
+
+If the selected desktop command does not exist, X2Go may connect and then terminate immediately.
+
+## Remediation
+
+**Missing or broken desktop startup command:**
+Set the session to a known-good desktop such as XFCE and verify the binary exists.
+
+**Corrupt Xauthority or stale X2Go session files:**
+Remove stale session state and regenerate auth files:
+```
+rm -f ~/.Xauthority
+rm -rf ~/.x2go/C-*
+```
+
+**Missing `xauth` or X11 helpers:**
+Install the missing X11 packages, then retry the session.
+
+**Required server packages missing:**
+Install `x2goserver` and `x2goserver-xsession` first, then retry before debugging desktop startup.
+
+**SSH works but X2Go session fails:**
+Treat it as a desktop startup or X11 auth problem, not an SSH transport problem.
diff --git a/runbooks/xorg.md b/runbooks/xorg.md
new file mode 100644
index 0000000..cfa54e5
--- /dev/null
+++ b/runbooks/xorg.md
@@ -0,0 +1,94 @@
+---
+service: xorg
+symptoms: xorg black screen, display manager loop, no screens found, failed to start X server, GPU driver error, xrandr missing outputs, login screen not appearing
+tags: xorg, x11, display, gpu, drm, xrandr, gdm, sddm, lightdm
+---
+
+## Symptoms
+
+- Black screen after graphical boot
+- Display manager loops back to login
+- `no screens found` in Xorg log
+- External monitors are missing or not detected
+- X server fails after a driver update
+- `startx` exits immediately with display or device errors
+
+## Diagnostics
+
+### Check display manager and Xorg service path
+
+```
+systemctl status display-manager
+systemctl status gdm
+systemctl status sddm
+systemctl status lightdm
+```
+
+If the display manager is failing, inspect its logs before focusing on Xorg itself.
+
+### Check Xorg logs
+
+```
+find /var/log -name 'Xorg*.log' -o -name 'Xorg*.log.old'
+grep -E '\(EE\)|\(WW\)' /var/log/Xorg.0.log
+journalctl -b | grep -iE 'xorg|gdm|sddm|lightdm'
+ls -la ~/.local/share/xorg/
+```
+
+Look for: `no screens found`, GPU module load failures, and permission/device access errors.
+
+On rootless Xorg, logs are often under `~/.local/share/xorg/Xorg.0.log` instead of `/var/log/`.
+
+### Check DRM and GPU driver state
+
+```
+lspci -k | grep -A3 -E 'VGA|3D|Display'
+lsmod | grep -E 'nouveau|nvidia|amdgpu|i915'
+dmesg | grep -iE 'drm|gpu|nvidia|amdgpu|i915'
+```
+
+Driver mismatches after kernel updates are a common cause of X startup failures.
+
+### Check monitor detection and permissions
+
+```
+loginctl session-status
+xrandr --query
+ls -la /dev/dri/
+ps -o user= -C Xorg
+```
+
+If `/dev/dri/*` permissions or seat assignment are wrong, X may fail to access the GPU.
+
+### Check X configuration files
+
+```
+find /etc/X11 -maxdepth 3 -type f
+cat /etc/X11/xorg.conf
+cat /etc/X11/xorg.conf.d/*.conf
+ls -la ~/.xinitrc ~/.xserverrc
+```
+
+Custom `Device`, `Monitor`, or `Screen` sections often break auto-detection.
+
+An empty or broken `.xinitrc` can produce a black screen even when the X server itself started correctly.
+
+## Remediation
+
+**Bad static Xorg config:**
+Move custom config aside and let auto-detection work unless the hardware truly needs manual config.
+
+**Driver mismatch after update:**
+Reinstall the GPU driver package matching the running kernel and reboot or restart the display manager.
+
+**`no screens found`:**
+Check whether the correct DRM module loaded and whether the display manager is running on the expected seat.
+
+**Display manager loop:**
+Correlate Xorg errors with PAM/auth logs; some loops are session startup failures, not graphics failures.
+
+**Framebuffer mode failure:**
+If X falls back to `fbdev` and errors with framebuffer/bus ID messages, remove the generic `fbdev` driver package and let Xorg use the proper modesetting or vendor driver.
+
+**`SocketCreateListener() failed`:**
+Check for stale sockets in `/tmp/.X11-unix`, especially after previous root-run Xorg sessions.
diff --git a/src/tai/chroma_telemetry.py b/src/tai/chroma_telemetry.py
new file mode 100644
index 0000000..310c65e
--- /dev/null
+++ b/src/tai/chroma_telemetry.py
@@ -0,0 +1,24 @@
+"""Local no-op telemetry implementation for ChromaDB.
+
+ChromaDB expects a product telemetry client component. Some local package
+combinations emit noisy PostHog errors even when anonymized telemetry is
+disabled, so tai wires ChromaDB to this no-op client instead.
+"""
+
+from __future__ import annotations
+
+from chromadb.config import System
+from chromadb.telemetry.product import ProductTelemetryClient, ProductTelemetryEvent
+from overrides import override
+
+
+class NoOpProductTelemetryClient(ProductTelemetryClient):
+    """Telemetry client that intentionally drops all events."""
+
+    def __init__(self, system: System):
+        super().__init__(system)
+
+    @override
+    def capture(self, event: ProductTelemetryEvent) -> None:
+        del event
+        return None
\ No newline at end of file
diff --git a/src/tai/cli.py b/src/tai/cli.py
index e8b8f31..b6eb5a3 100644
--- a/src/tai/cli.py
+++ b/src/tai/cli.py
@@ -21,16 +21,20 @@ from tai.input_parser import InputValidationError, build_request
 from tai.models import TroubleshootRequest
 from tai.plan import plan_from_request
 from tai.prompt_builder import (
+    build_analysis_message_with_chunks,
     build_followup_message,
     build_message_with_chunks,
     build_system_prompt,
     build_user_message,
 )
 from tai.rag_retriever import EmbeddedChunk, chunk_report, retrieve_scored
+from tai.runbook_store import RunbookChunk, RunbookStore
 from tai.session_log import SessionLogger
 from tai.ssh_client import SSHClient, SSHCommandResult, SSHConnectionConfig, SSHSession
 
 app = typer.Typer(no_args_is_help=True, add_completion=False)
+runbooks_app = typer.Typer(no_args_is_help=True, help="Manage the runbook knowledge base.")
+app.add_typer(runbooks_app, name="runbooks")
 console = Console()
 
 
@@ -98,6 +102,20 @@ def run(
         str,
         typer.Option("--ai-key", help="API key for the AI backend (not needed for Ollama)."),
     ] = "ollama",
+    ai_timeout_seconds: Annotated[
+        float,
+        typer.Option(
+            "--ai-timeout-seconds",
+            help="Timeout for AI requests/generation in seconds.",
+        ),
+    ] = 120.0,
+    ai_max_tokens: Annotated[
+        int,
+        typer.Option(
+            "--ai-max-tokens",
+            help="Upper bound for generated completion tokens.",
+        ),
+    ] = 1024,
     log_file: Annotated[
         str | None,
         typer.Option(
@@ -126,6 +144,13 @@ def run(
             help="Print retrieved chunk names/scores and log per-question retrieval metrics.",
         ),
     ] = False,
+    runbooks_path: Annotated[
+        str | None,
+        typer.Option(
+            "--runbooks",
+            help="Path to a synced runbook ChromaDB store. Enables Tier 2 RAG.",
+        ),
+    ] = None,
 ) -> None:
     """Start an interactive troubleshooting session scaffold."""
     try:
@@ -161,11 +186,27 @@ def run(
     if not (probe or collect or analyze or interactive):
         return  # nothing SSH-related requested
 
-    ai_config = AIConfig(host=ai_host, model=model, api_key=ai_key, embed_model=embed_model)
+    ai_config = AIConfig(
+        host=ai_host,
+        model=model,
+        api_key=ai_key,
+        timeout_seconds=ai_timeout_seconds,
+        max_tokens=ai_max_tokens,
+        embed_model=embed_model,
+    )
     logger = SessionLogger.create(log_file) if log_file else None
     if analyze or interactive:
         console.print(f"[cyan]AI:[/cyan] {AIClient(ai_config).summary()}")
 
+    runbook_store: RunbookStore | None = None
+    if runbooks_path is not None:
+        try:
+            runbook_store = RunbookStore(runbooks_path)
+            rb_count = runbook_store.count()
+            console.print(f"[dim]Runbooks: {rb_count} indexed at {runbooks_path}[/dim]")
+        except Exception as exc:  # noqa: BLE001
+            console.print(f"[yellow]Runbook store unavailable:[/yellow] {exc}")
+
     try:
         asyncio.run(
             _async_main(
@@ -178,6 +219,7 @@ def run(
                 ai_config=ai_config,
                 no_rag=no_rag,
                 rag_debug=rag_debug,
+                runbook_store=runbook_store,
                 logger=logger,
             )
         )
@@ -202,6 +244,7 @@ async def _async_main(
     ai_config: AIConfig,
     no_rag: bool,
     rag_debug: bool,
+    runbook_store: RunbookStore | None,
     logger: SessionLogger | None,
 ) -> None:
     """Open a single SSH session and run probe / collection / analysis through it."""
@@ -249,7 +292,15 @@ async def _async_main(
                 )
 
         if analyze and report is not None:
-            _run_analysis(ai_config, req.issue, report, logger=logger)
+            _run_analysis(
+                ai_config,
+                req.issue,
+                report,
+                no_rag=no_rag,
+                rag_debug=rag_debug,
+                runbook_store=runbook_store,
+                logger=logger,
+            )
 
         if interactive:
             await _interactive_loop(
@@ -259,6 +310,7 @@ async def _async_main(
                 report,
                 no_rag=no_rag,
                 rag_debug=rag_debug,
+                runbook_store=runbook_store,
                 logger=logger,
             )
 
@@ -271,6 +323,7 @@ async def _interactive_loop(
     *,
     no_rag: bool = False,
     rag_debug: bool = False,
+    runbook_store: RunbookStore | None = None,
     logger: SessionLogger | None,
 ) -> None:
     """Run a follow-up loop for collecting and conversational analysis."""
@@ -421,6 +474,7 @@ async def _interactive_loop(
                 prior_questions,
                 embedded_chunks=embedded_chunks,
                 rag_debug=rag_debug,
+                runbook_store=runbook_store,
                 logger=logger,
             )
             prior_questions.append("/analyze")
@@ -477,6 +531,7 @@ async def _interactive_loop(
             prior_questions,
             embedded_chunks=embedded_chunks,
             rag_debug=rag_debug,
+            runbook_store=runbook_store,
             logger=logger,
         )
         prior_questions.append(command)
@@ -539,6 +594,9 @@ def _run_analysis(
     issue: str,
     report: CollectionReport,
     *,
+    no_rag: bool = False,
+    rag_debug: bool = False,
+    runbook_store: RunbookStore | None = None,
     logger: SessionLogger | None,
 ) -> None:
     """Send collected data to the AI and stream the analysis to stdout."""
@@ -547,12 +605,45 @@ def _run_analysis(
     console.print()
     ai = AIClient(ai_config)
     system_prompt = build_system_prompt()
-    user_message = build_user_message(issue, report)
+    runbook_chunks = _query_runbooks(runbook_store, issue, ai, top_k=1)
+
+    user_message: str
+    if no_rag:
+        user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None)
+    else:
+        try:
+            chunks = chunk_report(report)
+            embedded = [EmbeddedChunk(chunk=c, embedding=ai.embed(c.content)) for c in chunks]
+            q_embedding = ai.embed(issue)
+            scored = retrieve_scored(q_embedding, embedded, top_k=3)
+            if rag_debug:
+                pairs = ", ".join(
+                    f"{chunk.name}={score:.3f}" for chunk, score in scored
+                )
+                console.print(f"[dim]RAG retrieve (initial):[/dim] {pairs or 'no matches'}")
+            selected = [chunk for chunk, _score in scored]
+            if selected:
+                user_message = build_analysis_message_with_chunks(
+                    issue,
+                    report.host,
+                    selected,
+                    runbook_chunks=runbook_chunks or None,
+                )
+            else:
+                user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None)
+        except Exception as exc:  # noqa: BLE001
+            console.print(
+                "[yellow]RAG unavailable for initial analysis; using full-context fallback.[/yellow]"
+            )
+            if logger is not None:
+                logger.log_event("rag_index", {"status": "fallback", "error": str(exc)})
+            user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None)
     try:
-        chunks: list[str] = []
-        for chunk in ai.stream(system_prompt, user_message):
-            chunks.append(chunk)
-        response = "".join(chunks)
+        response = _complete_ai_response(
+            ai,
+            system_prompt,
+            user_message,
+        )
         console.print(Markdown(response))
 
         warnings = validate_ai_response(response)
@@ -596,6 +687,7 @@ def _run_followup_analysis(
     *,
     embedded_chunks: list[EmbeddedChunk] | None = None,
     rag_debug: bool = False,
+    runbook_store: RunbookStore | None = None,
     logger: SessionLogger | None,
 ) -> str:
     """Run grounded follow-up analysis re-anchored to current diagnostics.
@@ -609,6 +701,7 @@ def _run_followup_analysis(
     console.print()
     ai = AIClient(ai_config)
     system_prompt = build_system_prompt()
+    runbook_chunks = _query_runbooks(runbook_store, question, ai, top_k=1)
 
     user_message: str
     retrieved_names: list[str] = []
@@ -620,7 +713,7 @@ def _run_followup_analysis(
         retrieval_start = perf_counter()
         try:
             q_embedding = ai.embed(question)
-            scored = retrieve_scored(q_embedding, embedded_chunks, top_k=5)
+            scored = retrieve_scored(q_embedding, embedded_chunks, top_k=3)
             retrieval_ms = (perf_counter() - retrieval_start) * 1000.0
             retrieved_names = [chunk.name for chunk, _score in scored]
             retrieved_scores = [round(score, 4) for _chunk, score in scored]
@@ -630,6 +723,7 @@ def _run_followup_analysis(
                 [chunk for chunk, _score in scored],
                 question,
                 prior_questions,
+                runbook_chunks=runbook_chunks or None,
             )
             if rag_debug:
                 pairs = ", ".join(
@@ -644,10 +738,16 @@ def _run_followup_analysis(
                 "[yellow]RAG unavailable (query embedding failed); using full-context "
                 "fallback.[/yellow]"
             )
-            user_message = build_followup_message(issue, report, question, prior_questions)
+            user_message = build_followup_message(
+                issue, report, question, prior_questions,
+                runbook_chunks=runbook_chunks or None,
+            )
     else:
         fallback_reason = "rag not indexed"
-        user_message = build_followup_message(issue, report, question, prior_questions)
+        user_message = build_followup_message(
+            issue, report, question, prior_questions,
+            runbook_chunks=runbook_chunks or None,
+        )
 
     if logger is not None:
         logger.log_event(
@@ -665,10 +765,11 @@ def _run_followup_analysis(
         )
 
     try:
-        chunks: list[str] = []
-        for chunk in ai.stream(system_prompt, user_message):
-            chunks.append(chunk)
-        response = "".join(chunks)
+        response = _complete_ai_response(
+            ai,
+            system_prompt,
+            user_message,
+        )
         console.print(Markdown(response))
         console.print(Rule(style="dim"))
 
@@ -696,6 +797,150 @@ def _run_followup_analysis(
         raise typer.Exit(code=1) from exc
 
 
+def _complete_ai_response(
+    ai: AIClient,
+    system_prompt: str,
+    user_message: str,
+) -> str:
+    """Return a full AI completion in one request.
+
+    Some local backends intermittently stall on streaming before yielding a first
+    token; using a non-streaming completion path is more reliable for CLI runs.
+    """
+    return ai.complete(system_prompt, user_message).content
+
+
+def _query_runbooks(
+    store: RunbookStore | None,
+    question: str,
+    ai: AIClient,
+    *,
+    top_k: int = 3,
+) -> list[RunbookChunk]:
+    """Query the runbook store silently; returns empty list on any failure."""
+    if store is None:
+        return []
+    try:
+        return store.query(question, ai, top_k=top_k)
+    except Exception:  # noqa: BLE001
+        return []
+
+
+# ---------------------------------------------------------------------------
+# runbooks sub-app
+# ---------------------------------------------------------------------------
+
+
+@runbooks_app.command("sync")
+def runbooks_sync(
+    path: Annotated[
+        str,
+        typer.Option("--path", help="Directory containing runbook Markdown files."),
+    ] = "./runbooks",
+    store_path: Annotated[
+        str,
+        typer.Option("--store", help="ChromaDB store path. Defaults to ~/.tai/runbooks."),
+    ] = "~/.tai/runbooks",
+    ai_host: Annotated[
+        str,
+        typer.Option("--ai-host", help="OpenAI-compatible AI backend URL."),
+    ] = DEFAULT_AI_HOST,
+    embed_model: Annotated[
+        str,
+        typer.Option("--embed-model", help="Embedding model name."),
+    ] = DEFAULT_EMBED_MODEL,
+    ai_key: Annotated[
+        str,
+        typer.Option("--ai-key", help="API key for the AI backend."),
+    ] = "ollama",
+) -> None:
+    """Embed and index all runbooks from PATH into the persistent store."""
+    from pathlib import Path
+
+    runbooks_dir = Path(path).expanduser().resolve()
+    if not runbooks_dir.is_dir():
+        console.print(f"[red]Directory not found:[/red] {runbooks_dir}")
+        raise typer.Exit(code=1)
+
+    ai_config = AIConfig(host=ai_host, model="", api_key=ai_key, embed_model=embed_model)
+    ai = AIClient(ai_config)
+
+    try:
+        store = RunbookStore(store_path)
+        count = store.sync(runbooks_dir, ai)
+        console.print(f"[green]✓ Synced {count} runbook(s)[/green] → {store_path}")
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Sync failed:[/red] {exc}")
+        raise typer.Exit(code=1) from exc
+
+
+@runbooks_app.command("list")
+def runbooks_list(
+    store_path: Annotated[
+        str,
+        typer.Option("--store", help="ChromaDB store path. Defaults to ~/.tai/runbooks."),
+    ] = "~/.tai/runbooks",
+) -> None:
+    """List all indexed runbooks and their metadata."""
+    try:
+        store = RunbookStore(store_path)
+        entries = store.list_indexed()
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Could not open store:[/red] {exc}")
+        raise typer.Exit(code=1) from exc
+
+    if not entries:
+        console.print("[yellow]No runbooks indexed.[/yellow] Run [bold]tai runbooks sync[/bold].")
+        return
+
+    console.print(f"[bold]{len(entries)} indexed runbook(s):[/bold]")
+    for entry in sorted(entries, key=lambda e: e.get("title", "")):
+        title = entry.get("title", "?")
+        service = entry.get("service", "")
+        tags = entry.get("tags", "")
+        console.print(f"  [green]{title}[/green]  service={service}  tags={tags}")
+
+
+@runbooks_app.command("add")
+def runbooks_add(
+    file: Annotated[str, typer.Argument(help="Path to a single runbook Markdown file.")],
+    store_path: Annotated[
+        str,
+        typer.Option("--store", help="ChromaDB store path. Defaults to ~/.tai/runbooks."),
+    ] = "~/.tai/runbooks",
+    ai_host: Annotated[
+        str,
+        typer.Option("--ai-host", help="OpenAI-compatible AI backend URL."),
+    ] = DEFAULT_AI_HOST,
+    embed_model: Annotated[
+        str,
+        typer.Option("--embed-model", help="Embedding model name."),
+    ] = DEFAULT_EMBED_MODEL,
+    ai_key: Annotated[
+        str,
+        typer.Option("--ai-key", help="API key for the AI backend."),
+    ] = "ollama",
+) -> None:
+    """Embed and index a single runbook file into the persistent store."""
+    from pathlib import Path
+
+    runbook_path = Path(file).expanduser().resolve()
+    if not runbook_path.is_file():
+        console.print(f"[red]File not found:[/red] {runbook_path}")
+        raise typer.Exit(code=1)
+
+    ai_config = AIConfig(host=ai_host, model="", api_key=ai_key, embed_model=embed_model)
+    ai = AIClient(ai_config)
+
+    try:
+        store = RunbookStore(store_path)
+        store.sync_single(runbook_path, ai)
+        console.print(f"[green]✓ Indexed[/green] {runbook_path.name} → {store_path}")
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Add failed:[/red] {exc}")
+        raise typer.Exit(code=1) from exc
+
+
 def main() -> None:
     """Console script entrypoint."""
     app()
diff --git a/src/tai/plan.py b/src/tai/plan.py
index e3c76a8..c6d6701 100644
--- a/src/tai/plan.py
+++ b/src/tai/plan.py
@@ -91,6 +91,7 @@ _KNOWN_SERVICES: list[str] = [
     "docker",
     "containerd",
     "kubelet",
+    "sssd",
     "sshd",
     "postfix",
     "dovecot",
@@ -107,6 +108,11 @@ _KNOWN_SERVICES: list[str] = [
     "crond",
     "rsyslog",
     "auditd",
+    "selinux",
+    "apparmor",
+    "xorg",
+    "wayland",
+    "x2go",
     "firewalld",
     "haproxy",
     "varnish",
@@ -121,6 +127,7 @@ _SERVICE_CONFIGS: dict[str, list[str]] = {
     "mysqld": ["/etc/my.cnf"],
     "mariadb": ["/etc/mysql/mariadb.conf.d/50-server.cnf"],
     "postgresql": ["/etc/postgresql"],
+    "sssd": ["/etc/sssd/sssd.conf"],
     "sshd": ["/etc/ssh/sshd_config"],
     "postfix": ["/etc/postfix/main.cf"],
     "haproxy": ["/etc/haproxy/haproxy.cfg"],
@@ -128,6 +135,18 @@ _SERVICE_CONFIGS: dict[str, list[str]] = {
     "redis-server": ["/etc/redis/redis.conf"],
     "fail2ban": ["/etc/fail2ban/jail.conf"],
     "ufw": ["/etc/ufw/ufw.conf"],
+    "x2go": ["/etc/x2go"],
+}
+
+_SERVICE_BINARIES: dict[str, list[str]] = {
+    "docker": ["/usr/bin/docker", "/usr/bin/dockerd"],
+    "sssd": ["/usr/sbin/sssd", "/usr/bin/sssctl"],
+    "sshd": ["/usr/sbin/sshd", "/usr/bin/ssh"],
+    "x2go": ["/usr/bin/x2golistsessions", "/usr/bin/x2goruncommand"],
+    "xorg": ["/usr/bin/Xorg", "/usr/bin/xrandr"],
+    "wayland": ["/usr/bin/wayland-info", "/usr/bin/Xwayland"],
+    "selinux": ["/usr/sbin/getenforce", "/usr/sbin/sestatus"],
+    "apparmor": ["/usr/sbin/aa-status", "/sbin/apparmor_parser"],
 }
 
 # ---------------------------------------------------------------------------
@@ -200,6 +219,12 @@ def plan_from_request(request: TroubleshootRequest) -> CollectionPlan:
         if svc in seen:
             continue
         seen.add(svc)
+        plan.add(
+            f"unit-file-{svc}",
+            f"systemctl list-unit-files {svc}.service --no-pager --no-legend",
+        )
+        for idx, binary_path in enumerate(_SERVICE_BINARIES.get(svc, []), start=1):
+            plan.add(f"binary-{svc}-{idx}", f"ls -l {binary_path}")
         plan.add(f"service-{svc}", f"systemctl status {svc}")
         plan.add(f"journal-{svc}", f"journalctl -u {svc} -n 100 --no-pager")
         for cfg_path in _SERVICE_CONFIGS.get(svc, []):
@@ -242,3 +267,5 @@ def _extract_services(issue: str) -> list[str]:
         if words & svc_words:
             found.append(svc)
     return found
+
+
diff --git a/src/tai/prompt_builder.py b/src/tai/prompt_builder.py
index 6094123..ede0607 100644
--- a/src/tai/prompt_builder.py
+++ b/src/tai/prompt_builder.py
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 from tai.collectors import CollectionReport
 from tai.rag_retriever import Chunk
+from tai.runbook_store import RunbookChunk
 
 _SYSTEM_PROMPT = """\
 You are an expert Linux systems administrator and troubleshooting assistant.
@@ -19,6 +20,8 @@ Important rules:
 - For every root-cause claim, quote at least one exact snippet from collected output in backticks.
 - If a command shows "could not be executed (SSH error)" it means the remote host blocked or
   rejected that specific command — it is not evidence about the service or system state.
+- If service presence checks show a unit, binary, package, or config is missing, treat that as
+    evidence the component may be absent or not installed, not as proof that the component is broken.
 - If there is not enough data to diagnose the issue, say so plainly and list exactly what
   additional commands or log files would be needed.
 - Keep the response short. Skip sections that have nothing useful to say.
@@ -28,18 +31,56 @@ Important rules:
 - Format with clear sections: **Root Cause**, **Evidence**, **Recommended Actions**.
 """
 
+_MAX_RUNBOOK_CHARS = 500
+_MAX_DIAGNOSTIC_CHUNK_CHARS = 700
+
 
 def build_system_prompt() -> str:
     """Return the static system prompt for the troubleshooting agent."""
     return _SYSTEM_PROMPT.strip()
 
 
-def build_user_message(issue: str, report: CollectionReport) -> str:
+def _format_runbook_context(runbook_chunks: list[RunbookChunk]) -> str:
+    """Format retrieved runbook chunks as a Markdown context section."""
+    lines: list[str] = ["## Runbook context\n"]
+    lines.append(
+        "The following runbooks are relevant to this issue. "
+        "Use them to ground your diagnosis and recommendations in known procedures.\n"
+    )
+    for rb in runbook_chunks:
+        tag_str = f" — tags: {', '.join(rb.tags)}" if rb.tags else ""
+        content = rb.content.strip()
+        if len(content) > _MAX_RUNBOOK_CHARS:
+            content = content[:_MAX_RUNBOOK_CHARS].rstrip() + "\n...[truncated runbook context]"
+        lines.append(f"### Runbook: {rb.title} ({rb.service}){tag_str}\n")
+        lines.append(content)
+        lines.append("")
+    return "\n".join(lines)
+
+
+def _format_diagnostic_chunk(content: str) -> str:
+    """Cap diagnostic chunk size before prompt injection."""
+    text = content.strip()
+    if len(text) <= _MAX_DIAGNOSTIC_CHUNK_CHARS:
+        return text
+    return text[:_MAX_DIAGNOSTIC_CHUNK_CHARS].rstrip() + "\n...[truncated diagnostic context]"
+
+
+def build_user_message(
+    issue: str,
+    report: CollectionReport,
+    *,
+    runbook_chunks: list[RunbookChunk] | None = None,
+) -> str:
     """Format *issue* and *report* into the user message sent to the AI."""
     lines: list[str] = []
 
     lines.append(f"## Issue reported\n\n{issue}\n")
     lines.append(f"## Target host\n\n{report.host}\n")
+
+    if runbook_chunks:
+        lines.append(_format_runbook_context(runbook_chunks))
+
     lines.append("## Collected diagnostics\n")
 
     skipped: list[str] = []
@@ -83,9 +124,11 @@ def build_followup_message(
     report: CollectionReport,
     question: str,
     prior_questions: list[str],
+    *,
+    runbook_chunks: list[RunbookChunk] | None = None,
 ) -> str:
     """Build a grounded follow-up message that re-anchors to diagnostics each turn."""
-    base = build_user_message(issue, report)
+    base = build_user_message(issue, report, runbook_chunks=runbook_chunks)
     lines: list[str] = [base, "## Follow-up"]
 
     if prior_questions:
@@ -112,11 +155,15 @@ def build_message_with_chunks(
     chunks: list[Chunk],
     question: str,
     prior_questions: list[str],
+    *,
+    runbook_chunks: list[RunbookChunk] | None = None,
 ) -> str:
     """Build a follow-up message using only semantically retrieved diagnostic chunks.
 
     Used by the RAG path: instead of sending the full report, only the top-k
     most relevant chunks are included, reducing token usage and focusing the AI.
+    If *runbook_chunks* are provided they are injected as a separate context
+    section before the follow-up question.
     """
     lines: list[str] = []
     lines.append(f"## Issue reported\n\n{issue}\n")
@@ -125,9 +172,12 @@ def build_message_with_chunks(
 
     for chunk in chunks:
         lines.append(f"### {chunk.name}\n")
-        lines.append(chunk.content)
+        lines.append(_format_diagnostic_chunk(chunk.content))
         lines.append("")
 
+    if runbook_chunks:
+        lines.append(_format_runbook_context(runbook_chunks))
+
     lines.append("## Follow-up")
 
     if prior_questions:
@@ -146,3 +196,31 @@ def build_message_with_chunks(
         "hypothesis unless newly retrieved evidence directly contradicts it."
     )
     return "\n".join(lines)
+
+
+def build_analysis_message_with_chunks(
+    issue: str,
+    host: str,
+    chunks: list[Chunk],
+    *,
+    runbook_chunks: list[RunbookChunk] | None = None,
+) -> str:
+    """Build an initial analysis message from retrieved diagnostic chunks."""
+    lines: list[str] = []
+    lines.append(f"## Issue reported\n\n{issue}\n")
+    lines.append(f"## Target host\n\n{host}\n")
+
+    if runbook_chunks:
+        lines.append(_format_runbook_context(runbook_chunks))
+
+    lines.append("## Most relevant diagnostics (retrieved by semantic similarity)\n")
+    for chunk in chunks:
+        lines.append(f"### {chunk.name}\n")
+        lines.append(_format_diagnostic_chunk(chunk.content))
+        lines.append("")
+
+    lines.append(
+        "Use the diagnostics above to provide an initial analysis. "
+        "If evidence is insufficient, state exactly what is missing."
+    )
+    return "\n".join(lines)
diff --git a/src/tai/runbook_store.py b/src/tai/runbook_store.py
new file mode 100644
index 0000000..42778e6
--- /dev/null
+++ b/src/tai/runbook_store.py
@@ -0,0 +1,268 @@
+"""Persistent runbook knowledge base backed by ChromaDB (Tier 2).
+
+Runbooks are Markdown files with YAML-style frontmatter describing a service,
+its typical symptoms, and tags used for retrieval matching.  The store embeds
+each runbook via AIClient and persists the collection so that queries across
+sessions are instant (no re-embedding on startup).
+
+Typical flow
+------------
+1. User runs ``tai runbooks sync --path ./runbooks`` once (or after adding files).
+2. On each analysis turn, the store is queried with the user's question and the
+   top-k matching runbooks are injected as ``## Runbook context`` in the prompt.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from tai.ai_client import AIClient
+
+DEFAULT_STORE_PATH = "~/.tai/runbooks"
+_COLLECTION_NAME = "tai_runbooks"
+
+
+# ---------------------------------------------------------------------------
+# Data types
+# ---------------------------------------------------------------------------
+
+
+@dataclass(slots=True)
+class RunbookChunk:
+    """A retrieved runbook document ready for prompt injection."""
+
+    title: str
+    service: str
+    tags: list[str]
+    content: str
+
+
+@dataclass
+class RunbookMeta:
+    """Parsed frontmatter metadata from a runbook file."""
+
+    service: str = ""
+    symptoms: list[str] = field(default_factory=list)
+    tags: list[str] = field(default_factory=list)
+
+
+# ---------------------------------------------------------------------------
+# Frontmatter parsing
+# ---------------------------------------------------------------------------
+
+_FRONTMATTER_RE = re.compile(r"^\s*---\s*\n(.*?)\n---\s*\n", re.DOTALL)
+_KV_RE = re.compile(r"^(\w+)\s*:\s*(.+)$")
+
+
+def _parse_frontmatter(text: str) -> tuple[RunbookMeta, str]:
+    """Split YAML-style frontmatter from Markdown body.
+
+    Supports simple ``key: value`` and ``key: item1, item2`` syntax only.
+    Returns ``(meta, body)``; if no frontmatter found, meta has empty fields.
+    """
+    meta = RunbookMeta()
+    match = _FRONTMATTER_RE.match(text)
+    if not match:
+        return meta, text
+
+    for line in match.group(1).splitlines():
+        kv = _KV_RE.match(line.strip())
+        if not kv:
+            continue
+        key, value = kv.group(1).lower(), kv.group(2).strip()
+        if key == "service":
+            meta.service = value
+        elif key == "symptoms":
+            meta.symptoms = [s.strip() for s in value.split(",") if s.strip()]
+        elif key == "tags":
+            meta.tags = [t.strip() for t in value.split(",") if t.strip()]
+
+    body = text[match.end():]
+    return meta, body
+
+
+# ---------------------------------------------------------------------------
+# RunbookStore
+# ---------------------------------------------------------------------------
+
+
+class RunbookStore:
+    """ChromaDB-backed store for runbook documents.
+
+    Parameters
+    ----------
+    store_path:
+        Directory where ChromaDB persists its data.
+        Defaults to ``~/.tai/runbooks``.
+    """
+
+    def __init__(self, store_path: str | Path = DEFAULT_STORE_PATH) -> None:
+        import chromadb  # optional dep — imported lazily
+
+        path = Path(store_path).expanduser().resolve()
+        path.mkdir(parents=True, exist_ok=True)
+        settings = None
+        try:
+            from chromadb.config import Settings
+
+            settings = Settings(
+                anonymized_telemetry=False,
+                chroma_product_telemetry_impl="tai.chroma_telemetry.NoOpProductTelemetryClient",
+                chroma_telemetry_impl="tai.chroma_telemetry.NoOpProductTelemetryClient",
+            )
+        except (ImportError, ModuleNotFoundError):
+            # Test doubles may replace `chromadb` with a lightweight mock that
+            # does not expose the real config module.
+            settings = None
+
+        if settings is None:
+            self._client = chromadb.PersistentClient(path=str(path))
+        else:
+            self._client = chromadb.PersistentClient(path=str(path), settings=settings)
+        self._collection = self._client.get_or_create_collection(
+            name=_COLLECTION_NAME,
+            metadata={"hnsw:space": "cosine"},
+        )
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def sync(self, runbooks_dir: Path, ai: AIClient) -> int:
+        """Embed and upsert all ``*.md`` files from *runbooks_dir*.
+
+        Every file is re-embedded; documents whose ID (the file stem) already
+        exists are overwritten.  Returns the number of runbooks indexed.
+        """
+        runbooks_dir = Path(runbooks_dir).expanduser().resolve()
+        if not runbooks_dir.is_dir():
+            raise FileNotFoundError(f"Runbooks directory not found: {runbooks_dir}")
+
+        files = sorted(runbooks_dir.glob("*.md"))
+        if not files:
+            return 0
+
+        ids: list[str] = []
+        documents: list[str] = []
+        embeddings: list[list[float]] = []
+        metadatas: list[dict[str, str]] = []
+
+        for path in files:
+            raw = path.read_text(encoding="utf-8")
+            meta, body = _parse_frontmatter(raw)
+
+            # Embed title, metadata and a body excerpt; the full body is stored as the document
+            embed_text = _build_embed_text(path.stem, meta, body)
+            embedding = ai.embed(embed_text)
+
+            ids.append(path.stem)
+            documents.append(body.strip())
+            embeddings.append(embedding)
+            metadatas.append(
+                {
+                    "title": path.stem,
+                    "service": meta.service,
+                    "tags": ", ".join(meta.tags),
+                    "symptoms": ", ".join(meta.symptoms),
+                }
+            )
+
+        self._collection.upsert(
+            ids=ids,
+            documents=documents,
+            embeddings=embeddings,
+            metadatas=metadatas,
+        )
+        return len(ids)
+
+    def sync_single(self, runbook_path: Path, ai: AIClient) -> None:
+        """Embed and upsert a single runbook file."""
+        path = Path(runbook_path).expanduser().resolve()
+        if not path.is_file():
+            raise FileNotFoundError(f"Runbook not found: {path}")
+
+        raw = path.read_text(encoding="utf-8")
+        meta, body = _parse_frontmatter(raw)
+        embed_text = _build_embed_text(path.stem, meta, body)
+        embedding = ai.embed(embed_text)
+
+        self._collection.upsert(
+            ids=[path.stem],
+            documents=[body.strip()],
+            embeddings=[embedding],
+            metadatas=[
+                {
+                    "title": path.stem,
+                    "service": meta.service,
+                    "tags": ", ".join(meta.tags),
+                    "symptoms": ", ".join(meta.symptoms),
+                }
+            ],
+        )
+
+    def query(self, question: str, ai: AIClient, *, top_k: int = 3) -> list[RunbookChunk]:
+        """Return the *top_k* most relevant runbooks for *question*.
+
+        Returns an empty list if the collection is empty.  Exceptions raised by
+        ``ai.embed`` or the ChromaDB query are not caught here — callers handle them.
+        """
+        if self._collection.count() == 0:
+            return []
+
+        q_embedding = ai.embed(question)
+        results = self._collection.query(
+            query_embeddings=[q_embedding],
+            n_results=min(top_k, self._collection.count()),
+            include=["documents", "metadatas"],
+        )
+
+        chunks: list[RunbookChunk] = []
+        docs = results.get("documents") or []
+        metas = results.get("metadatas") or []
+        for doc_list, meta_list in zip(docs, metas, strict=False):
+            for doc, meta in zip(doc_list, meta_list, strict=False):
+                chunks.append(
+                    RunbookChunk(
+                        title=str(meta.get("title", "")),
+                        service=str(meta.get("service", "")),
+                        tags=[t.strip() for t in str(meta.get("tags", "")).split(",") if t.strip()],
+                        content=doc,
+                    )
+                )
+        return chunks
+
+    def list_indexed(self) -> list[dict[str, str]]:
+        """Return metadata for all indexed runbooks."""
+        if self._collection.count() == 0:
+            return []
+        results = self._collection.get(include=["metadatas"])
+        metas = results.get("metadatas") or []
+        return [dict(m) for m in metas]
+
+    def count(self) -> int:
+        """Return the number of indexed runbook documents."""
+        return self._collection.count()
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _build_embed_text(title: str, meta: RunbookMeta, body: str) -> str:
+    """Build the text to embed for a runbook — combines signals for best recall."""
+    parts: list[str] = [f"title: {title}"]
+    if meta.service:
+        parts.append(f"service: {meta.service}")
+    if meta.symptoms:
+        parts.append(f"symptoms: {', '.join(meta.symptoms)}")
+    if meta.tags:
+        parts.append(f"tags: {', '.join(meta.tags)}")
+    # Append a stripped excerpt of the body (first 800 chars) for additional signal
+    body_excerpt = body.strip()[:800]
+    parts.append(body_excerpt)
+    return "\n".join(parts)
diff --git a/tests/test_ai.py b/tests/test_ai.py
index 9446823..f37de7d 100644
--- a/tests/test_ai.py
+++ b/tests/test_ai.py
@@ -174,6 +174,7 @@ def test_build_system_prompt_contains_key_instructions() -> None:
     assert "Evidence" in prompt
     assert "Recommended Actions" in prompt
     assert "read-only" in prompt.lower()
+    assert "absent or not installed" in prompt
 
 
 def test_build_user_message_contains_issue_and_host() -> None:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c2bc72b..2f05dd7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,3 +1,4 @@
+from types import SimpleNamespace
 from unittest.mock import AsyncMock, MagicMock
 
 from typer.testing import CliRunner
@@ -31,7 +32,7 @@ def test_run_command_prints_scaffold_summary() -> None:
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "web01",
             "--port",
@@ -62,7 +63,7 @@ def test_probe_success_prints_remote_output_by_default(monkeypatch) -> None:  #
     runner = CliRunner()
     result = runner.invoke(
         app,
-        ["apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"],
+        ["run", "apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"],
     )
 
     assert result.exit_code == 0
@@ -84,7 +85,7 @@ def test_probe_failure_returns_non_zero(monkeypatch) -> None:  # type: ignore[no
     runner = CliRunner()
     result = runner.invoke(
         app,
-        ["apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"],
+        ["run", "apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"],
     )
 
     assert result.exit_code == 1
@@ -126,7 +127,7 @@ def test_collect_success_prints_summary(monkeypatch) -> None:  # type: ignore[no
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "ssh.archflux.net",
             "--port",
@@ -172,7 +173,7 @@ def test_interactive_collect_then_quit(monkeypatch) -> None:  # type: ignore[no-
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "ssh.archflux.net",
             "--port",
@@ -210,8 +211,8 @@ def test_interactive_unknown_command_prints_hint(monkeypatch) -> None:  # type:
     commands = iter(["what should I check next?", "/quit"])
     monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan)
     monkeypatch.setattr(
-        "tai.cli.AIClient.stream",
-        lambda *_args, **_kwargs: iter(["Check logs."]),
+        "tai.cli.AIClient.complete",
+        lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."),
     )
     monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands))
     monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True)
@@ -220,7 +221,7 @@ def test_interactive_unknown_command_prints_hint(monkeypatch) -> None:  # type:
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "ssh.archflux.net",
             "--port",
@@ -257,7 +258,10 @@ def test_interactive_prints_rag_fallback_notice_on_index_failure(monkeypatch) ->
     commands = iter(["what should I check next?", "/quit"])
     monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan)
     monkeypatch.setattr("tai.cli._try_embed_report", lambda *_args: (None, "embed failed", 1.0))
-    monkeypatch.setattr("tai.cli.AIClient.stream", lambda *_args, **_kwargs: iter(["Check logs."]))
+    monkeypatch.setattr(
+        "tai.cli.AIClient.complete",
+        lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."),
+    )
     monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands))
     monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True)
 
@@ -265,7 +269,7 @@ def test_interactive_prints_rag_fallback_notice_on_index_failure(monkeypatch) ->
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "ssh.archflux.net",
             "--port",
@@ -310,7 +314,10 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None:  #
         ),
     )
     monkeypatch.setattr("tai.cli.AIClient.embed", lambda *_args, **_kwargs: [1.0, 0.0])
-    monkeypatch.setattr("tai.cli.AIClient.stream", lambda *_args, **_kwargs: iter(["Check logs."]))
+    monkeypatch.setattr(
+        "tai.cli.AIClient.complete",
+        lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."),
+    )
     monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands))
     monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True)
 
@@ -318,7 +325,7 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None:  #
     result = runner.invoke(
         app,
         [
-            "apache failed",
+            "run", "apache failed",
             "--host",
             "ssh.archflux.net",
             "--port",
diff --git a/tests/test_plan.py b/tests/test_plan.py
index 94016e2..4fdf556 100644
--- a/tests/test_plan.py
+++ b/tests/test_plan.py
@@ -80,6 +80,7 @@ def test_nginx_in_issue_adds_nginx_service_commands() -> None:
     plan = plan_from_request(_req("nginx is failing to start"))
     names = _names(plan)
     cmds = _commands(plan)
+    assert "unit-file-nginx" in names
     assert "service-nginx" in names
     assert "journal-nginx" in names
     assert any("systemctl status nginx" in c for c in cmds)
@@ -98,6 +99,30 @@ def test_sshd_adds_config_cat() -> None:
     assert any("cat /etc/ssh/sshd_config" in c for c in cmds)
 
 
+def test_sssd_in_issue_adds_presence_service_and_config_commands() -> None:
+    """An sssd issue plans unit-file, binary, service, journal and config probes."""
+    plan = plan_from_request(_req("troubleshoot sssd login failures"))
+    names = _names(plan)
+    cmds = _commands(plan)
+    for probe in ("unit-file-sssd", "binary-sssd-1", "service-sssd", "journal-sssd"):
+        assert probe in names
+    # Each fragment only has to appear as a substring of some planned command.
+    fragments = ("cat /etc/sssd/sssd.conf", "ls -l /usr/sbin/sssd", "list-unit-files sssd.service")
+    for fragment in fragments:
+        assert any(fragment in c for c in cmds)
+
+
+def test_docker_presence_probe_checks_package_and_binary() -> None:
+    """A docker issue probes the unit file plus both the docker and dockerd binaries."""
+    plan = plan_from_request(_req("docker daemon not running"))
+    names = _names(plan)
+    cmds = _commands(plan)
+    assert all(n in names for n in ("unit-file-docker", "binary-docker-1", "binary-docker-2"))
+    # "/usr/bin/docker" is a prefix of "/usr/bin/dockerd", so drop dockerd before matching.
+    assert any("ls -l /usr/bin/docker" in c.replace("/usr/bin/dockerd", "") for c in cmds)
+    assert any("ls -l /usr/bin/dockerd" in c for c in cmds)
+
+
 def test_unknown_service_name_no_config_cat() -> None:
     plan = plan_from_request(_req("myweirdapp service crashed"))
     cmds = _commands(plan)
diff --git a/tests/test_runbook_store.py b/tests/test_runbook_store.py
new file mode 100644
index 0000000..a6afb7b
--- /dev/null
+++ b/tests/test_runbook_store.py
@@ -0,0 +1,253 @@
+"""Tests for runbook_store — no network calls, ChromaDB mocked."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from tai.runbook_store import (
+    RunbookChunk,
+    RunbookMeta,
+    RunbookStore,
+    _build_embed_text,
+    _parse_frontmatter,
+)
+
+# ---------------------------------------------------------------------------
+# _parse_frontmatter
+# ---------------------------------------------------------------------------
+
+
+def test_parse_frontmatter_extracts_service() -> None:
+    """The ``service`` key is read from the frontmatter and the body is returned."""
+    meta, body = _parse_frontmatter("---\nservice: nginx\n---\n## Body\nsome content")
+    assert meta.service == "nginx"
+    assert "## Body" in body
+
+
+def test_parse_frontmatter_extracts_tags_as_list() -> None:
+    """Comma-separated ``tags`` frontmatter is split into a list of trimmed items."""
+    meta, _ = _parse_frontmatter("---\ntags: nginx, web, http\n---\nbody")
+    assert meta.tags == ["nginx", "web", "http"]
+
+
+def test_parse_frontmatter_extracts_symptoms_as_list() -> None:
+    """Comma-separated ``symptoms`` frontmatter is split into a list of trimmed items."""
+    meta, _ = _parse_frontmatter("---\nsymptoms: 502 Bad Gateway, upstream refused\n---\nbody")
+    assert meta.symptoms == ["502 Bad Gateway", "upstream refused"]
+
+
+def test_parse_frontmatter_returns_empty_meta_when_missing() -> None:
+    """Text without a frontmatter block yields empty metadata and keeps its heading."""
+    meta, body = _parse_frontmatter("# Just a heading\nno frontmatter here")
+    assert meta.service == ""
+    assert meta.tags == []
+    assert meta.symptoms == []
+    assert "Just a heading" in body
+
+
+def test_parse_frontmatter_body_strips_delimiter() -> None:
+    """The returned body contains neither the ``---`` delimiters nor the metadata."""
+    _, remainder = _parse_frontmatter("---\nservice: ssh\n---\nBody starts here.")
+    assert remainder.strip() == "Body starts here."
+
+
+# ---------------------------------------------------------------------------
+# _build_embed_text
+# ---------------------------------------------------------------------------
+
+
+def test_build_embed_text_includes_title_and_service() -> None:
+    """Title and service appear as labelled entries in the embed text."""
+    meta = RunbookMeta(service="nginx", symptoms=["502"], tags=["web"])
+    embed_text = _build_embed_text("nginx", meta, "body content")
+    assert "title: nginx" in embed_text and "service: nginx" in embed_text
+
+
+def test_build_embed_text_includes_symptoms_and_tags() -> None:
+    """Symptoms and tags from the metadata are carried into the embed text."""
+    meta = RunbookMeta(service="nginx", symptoms=["502 Bad Gateway"], tags=["web", "http"])
+    embed_text = _build_embed_text("nginx", meta, "body")
+    assert "502 Bad Gateway" in embed_text and "web" in embed_text
+
+
+def test_build_embed_text_includes_body_excerpt() -> None:
+    """A short body is carried into the embed text unchanged."""
+    embed_text = _build_embed_text("disk", RunbookMeta(), "check df -h output")
+    assert "check df -h output" in embed_text
+
+
+def test_build_embed_text_truncates_long_body() -> None:
+    """Only the first 800 characters of a long body reach the embed text."""
+    embed_text = _build_embed_text("disk", RunbookMeta(), "x" * 2000)
+    # The title "disk" and the field labels contain no "x", so every "x" is body excerpt.
+    assert embed_text.count("x") == 800
+    assert len(embed_text) < 1500
+
+
+# ---------------------------------------------------------------------------
+# RunbookStore — unit tests using tmp_path and mocked chromadb
+# ---------------------------------------------------------------------------
+
+
+def _make_chromadb_mock() -> MagicMock:
+    """Build a stand-in ``chromadb`` module whose collection reports zero documents."""
+    fake_collection = MagicMock()
+    fake_collection.count.return_value = 0
+    fake_client = MagicMock()
+    fake_client.get_or_create_collection.return_value = fake_collection
+    fake_module = MagicMock()
+    fake_module.PersistentClient.return_value = fake_client
+    return fake_module
+
+
+def _make_ai_mock(embedding: list[float] | None = None) -> MagicMock:
+    """Return a mock AI client whose ``embed`` returns *embedding* or a default vector."""
+    vector = embedding or [0.1, 0.2, 0.3]
+    return MagicMock(**{"embed.return_value": vector})
+
+
+def test_runbook_store_sync_returns_count(tmp_path: Path) -> None:
+    """``sync`` returns 2 for a directory that holds two runbook files."""
+    runbooks = {
+        "nginx.md": "---\nservice: nginx\ntags: web\nsymptoms: 502\n---\n## Body\ncontent",
+        "ssh.md": "---\nservice: ssh\ntags: ssh\nsymptoms: refused\n---\n## Body\ncontent",
+    }
+    for filename, text in runbooks.items():
+        (tmp_path / filename).write_text(text)
+
+    fake_chroma = _make_chromadb_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        count = store.sync(tmp_path, _make_ai_mock())
+
+    assert count == 2
+
+
+def test_runbook_store_sync_calls_upsert(tmp_path: Path) -> None:
+    """``sync`` upserts exactly once and the upserted ids include ``nginx``."""
+    (tmp_path / "nginx.md").write_text("---\nservice: nginx\n---\nbody")
+
+    fake_chroma = _make_chromadb_mock()
+    target = fake_chroma.PersistentClient.return_value.get_or_create_collection.return_value
+    ai = _make_ai_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        store.sync(tmp_path, ai)
+
+    target.upsert.assert_called_once()
+    assert "nginx" in target.upsert.call_args.kwargs["ids"]
+
+
+def test_runbook_store_sync_empty_dir_returns_zero(tmp_path: Path) -> None:
+    """``sync`` returns 0 when the directory holds no runbook files."""
+    fake_chroma = _make_chromadb_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        indexed = store.sync(tmp_path, _make_ai_mock())
+
+    assert indexed == 0
+
+
+def test_runbook_store_sync_missing_dir_raises(tmp_path: Path) -> None:
+    """``sync`` raises ``FileNotFoundError`` for a directory that does not exist."""
+    fake_chroma = _make_chromadb_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        with pytest.raises(FileNotFoundError):
+            store.sync(tmp_path / "nonexistent", _make_ai_mock())
+
+
+def test_runbook_store_query_returns_empty_when_no_docs(tmp_path: Path) -> None:
+    """``query`` returns an empty list when the collection holds no documents."""
+    # _make_chromadb_mock() leaves collection.count() at 0, i.e. nothing is indexed.
+    fake_chroma = _make_chromadb_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        hits = store.query("disk full", _make_ai_mock())
+
+    assert hits == []
+
+
+def test_runbook_store_query_returns_runbook_chunks(tmp_path: Path) -> None:
+    """``query`` maps each returned document/metadata pair to a ``RunbookChunk``."""
+    fake_chroma = _make_chromadb_mock()
+    collection = fake_chroma.PersistentClient.return_value.get_or_create_collection.return_value
+    collection.count.return_value = 2
+    hit_meta = {"title": "disk", "service": "disk", "tags": "disk, storage", "symptoms": "full"}
+    collection.query.return_value = {
+        "documents": [["## Body\ncheck df -h"]],
+        "metadatas": [[hit_meta]],
+    }
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        results = store.query("disk is full", _make_ai_mock())
+
+    assert len(results) == 1
+    chunk = results[0]
+    assert isinstance(chunk, RunbookChunk)
+    assert chunk.title == "disk"
+    assert chunk.service == "disk"
+    assert "disk" in chunk.tags
+    assert "df -h" in chunk.content
+
+
+def test_runbook_store_list_indexed_returns_metadata(tmp_path: Path) -> None:
+    """``list_indexed`` returns the metadata dicts supplied by the collection."""
+    fake_chroma = _make_chromadb_mock()
+    collection = fake_chroma.PersistentClient.return_value.get_or_create_collection.return_value
+    collection.count.return_value = 1
+    stored = {"title": "nginx", "service": "nginx", "tags": "web", "symptoms": "502"}
+    collection.get.return_value = {"metadatas": [stored]}
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        entries = store.list_indexed()
+
+    assert len(entries) == 1
+    assert entries[0]["title"] == "nginx"
+
+
+def test_runbook_store_count_delegates_to_collection(tmp_path: Path) -> None:
+    """``count`` returns whatever the collection's ``count()`` reports."""
+    fake_chroma = _make_chromadb_mock()
+    collection = fake_chroma.PersistentClient.return_value.get_or_create_collection.return_value
+    collection.count.return_value = 5
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        assert RunbookStore(tmp_path / "store").count() == 5
+
+
+def test_runbook_store_sync_single_upserts_one(tmp_path: Path) -> None:
+    """``sync_single`` performs exactly one upsert whose ids are ``["nginx"]``."""
+    runbook_file = tmp_path / "nginx.md"
+    runbook_file.write_text("---\nservice: nginx\ntags: web\n---\nbody text")
+
+    fake_chroma = _make_chromadb_mock()
+    collection = fake_chroma.PersistentClient.return_value.get_or_create_collection.return_value
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        store.sync_single(runbook_file, _make_ai_mock())
+
+    collection.upsert.assert_called_once()
+    upsert_kwargs = collection.upsert.call_args.kwargs
+    assert upsert_kwargs["ids"] == ["nginx"]
+
+
+def test_runbook_store_sync_single_missing_file_raises(tmp_path: Path) -> None:
+    """``sync_single`` raises ``FileNotFoundError`` for a file that does not exist."""
+    fake_chroma = _make_chromadb_mock()
+
+    with patch.dict("sys.modules", {"chromadb": fake_chroma}):
+        store = RunbookStore(tmp_path / "store")
+        with pytest.raises(FileNotFoundError):
+            store.sync_single(tmp_path / "missing.md", _make_ai_mock())