From 57f4c0efaa0a4b58c42c59f9c53d23546a5cf8fa Mon Sep 17 00:00:00 2001 From: zphinx Date: Wed, 6 May 2026 04:48:41 +0200 Subject: [PATCH] feat: complete RAG runbook workflow and release docs --- CHANGELOG.md | 54 ++++--- README.md | 214 ++++++++++++++++++++-------- ROADMAP.md | 55 ++++---- docs/ARCHITECTURE.md | 85 +++++++++++ pyproject.toml | 2 +- runbooks/apparmor.md | 86 ++++++++++++ runbooks/disk.md | 106 ++++++++++++++ runbooks/docker.md | 120 ++++++++++++++++ runbooks/kernel.md | 117 ++++++++++++++++ runbooks/nginx.md | 99 +++++++++++++ runbooks/postgres.md | 107 ++++++++++++++ runbooks/selinux.md | 112 +++++++++++++++ runbooks/ssh.md | 100 +++++++++++++ runbooks/sssd.md | 115 +++++++++++++++ runbooks/wayland.md | 89 ++++++++++++ runbooks/x2go.md | 106 ++++++++++++++ runbooks/xorg.md | 94 +++++++++++++ src/tai/chroma_telemetry.py | 24 ++++ src/tai/cli.py | 273 ++++++++++++++++++++++++++++++++++-- src/tai/plan.py | 27 ++++ src/tai/prompt_builder.py | 84 ++++++++++- src/tai/runbook_store.py | 268 +++++++++++++++++++++++++++++++++++ tests/test_ai.py | 1 + tests/test_cli.py | 31 ++-- tests/test_plan.py | 25 ++++ tests/test_runbook_store.py | 253 +++++++++++++++++++++++++++++++++ 26 files changed, 2510 insertions(+), 137 deletions(-) create mode 100644 docs/ARCHITECTURE.md create mode 100644 runbooks/apparmor.md create mode 100644 runbooks/disk.md create mode 100644 runbooks/docker.md create mode 100644 runbooks/kernel.md create mode 100644 runbooks/nginx.md create mode 100644 runbooks/postgres.md create mode 100644 runbooks/selinux.md create mode 100644 runbooks/ssh.md create mode 100644 runbooks/sssd.md create mode 100644 runbooks/wayland.md create mode 100644 runbooks/x2go.md create mode 100644 runbooks/xorg.md create mode 100644 src/tai/chroma_telemetry.py create mode 100644 src/tai/runbook_store.py create mode 100644 tests/test_runbook_store.py diff --git a/CHANGELOG.md b/CHANGELOG.md index cfd146d..bbb0180 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ 
-10,27 +10,37 @@ ______________________________________________________________________ ### Added -- `README.md` — project overview, description, example workflow, supported distributions, and suggested tooling -- `ROADMAP.md` — phased development plan covering decisions, data collection, AI integration, CLI design, and hardening -- `CHANGELOG.md` — this file; established changelog tracking for the project -- `.gitea/workflows/ci.yml` — Gitea Actions CI workflow for push and pull request events -- Gitea CI now uses native `git` checkout and system Python setup to avoid host-executor JavaScript action path issues -- Gitea native checkout now uses `CI_GIT_TOKEN` repository secret for authenticated fetch from private repos -- Gitea CI now installs dependencies in a local `.venv` to avoid Debian/PEP 668 externally-managed pip errors -- Python package scaffold with `src` layout and project metadata in `pyproject.toml` -- Initial CLI entrypoint with agreed SSH flags: `--identity-file`, `--jump-host`, and `--ignore-ssh-config` -- Input parsing/validation module and core request model -- SSH configuration scaffold module for upcoming connection/read-only execution work -- Implemented SSH module with real key-based command execution via system `ssh` -- Added explicit SSH port support across CLI, input parsing, request model, and SSH client (`--port`, e.g. 
5566) -- Added live SSH connectivity probe (`uname -a`) enabled by default, with `--no-probe` opt-out and non-zero exit on failure -- Added baseline diagnostics collection via `--collect`, including service, journal, disk, and network checks -- Read-only command policy enforcement (allowlist + blocked shell operators) -- Added byte-limited SSH output capture with truncation markers for large command output -- Test scaffold (`pytest`) with initial parser and CLI coverage -- SSH test coverage for policy checks, SSH argument construction, and config summary behavior -- CI workflow for lint (`ruff`), type-check (`mypy`), and tests (`pytest`) -- CI coverage expanded with Markdown formatting checks (`mdformat --check`) and YAML linting (`yamllint`) +- Nothing yet. + +______________________________________________________________________ + +## [0.4.0] - 2026-05-06 + +### Added + +- `runbooks/` corpus with service troubleshooting guides: `ssh`, `nginx`, `postgres`, `disk`, `kernel`, `docker`, `sssd`, `xorg`, `wayland`, `x2go`, `selinux`, `apparmor` +- Runbook knowledge store module `src/tai/runbook_store.py` (persistent ChromaDB-backed index and query) +- Chroma telemetry no-op client `src/tai/chroma_telemetry.py` to suppress noisy local telemetry errors +- `tai runbooks` command group with: + - `sync` for indexing all Markdown runbooks + - `list` for listing indexed metadata + - `add` for indexing a single runbook file +- `--runbooks` option on `tai run` to enable Tier 2 runbook retrieval +- Initial analysis RAG path using retrieved diagnostic chunks (`build_analysis_message_with_chunks`) +- Follow-up RAG path updates with tighter `top_k` and runbook context injection +- AI runtime controls: + - `--ai-timeout-seconds` + - `--ai-max-tokens` +- Non-streaming AI completion path for improved local backend reliability +- Service/subsystem presence probes in collection plans: + - unit-file checks + - expected binary path checks + - status/journal/config probes for recognized 
services including `sssd` +- Prompt instruction for "component absent or not installed" interpretation when presence signals are missing +- Runbook store unit tests in `tests/test_runbook_store.py` +- CLI tests updated for `tai run` subcommand and non-streaming completion mocks +- README refreshed with current CLI, architecture layout, RAG/runbook workflow, and usage examples +- `docs/ARCHITECTURE.md` with end-to-end flow, module responsibilities, safety boundaries, and fallback behavior ### Removed @@ -44,3 +54,5 @@ ______________________________________________________________________ - SSH bastion support: `--jump-host` flag using SSH native ProxyJump - SSH config behavior: use `~/.ssh/config` by default; allow override via `--ignore-ssh-config` - Interface: **interactive REPL** for v0.1; `textual`-based TUI (split-pane) for v0.2+ +- RAG Tier 1 strategy: semantic diagnostic chunk retrieval with local embeddings +- RAG Tier 2 strategy: Markdown runbooks persisted in embedded ChromaDB diff --git a/README.md b/README.md index d25e954..5571b0c 100644 --- a/README.md +++ b/README.md @@ -1,112 +1,202 @@ -# tai — Linux AI Troubleshooting Agent +# tai - Linux AI Troubleshooting Agent -`tai` is an agentic AI-driven troubleshooting tool for Linux systems. It autonomously investigates issues on remote hosts via SSH, analyzes relevant logs and configuration files, and provides a clear diagnosis along with suggested remediation steps — all without making any changes to the target system. +`tai` is a read-only Linux troubleshooting assistant that connects to remote hosts via SSH, collects diagnostics, and runs grounded AI analysis using local models. -## Overview +The project is designed for operators who want AI speed without losing operational safety or evidence traceability. 
-Given a problem description and a target hostname, `tai` connects to the remote system over SSH, gathers relevant data (logs, configuration files, service status, etc.), and uses a locally-hosted AI model to reason about the root cause and recommend solutions. +## What tai Does -The agent operates in **read-only mode at all times**. It will never modify the target system under any circumstances — all suggestions are presented to the human troubleshooter for review and action. +- Runs safe, read-only remote checks over SSH +- Builds a diagnostics collection plan from issue text +- Supports one-shot analysis and interactive follow-up mode +- Uses local AI backends (OpenAI-compatible endpoint, typically Ollama) +- Uses RAG over collected diagnostics (Tier 1) +- Uses persistent runbook retrieval with ChromaDB (Tier 2) +- Emits structured Markdown analysis with evidence and actions +- Can log session and retrieval telemetry locally as JSONL -## Supported Distributions +## Safety Model -- Ubuntu -- Debian -- RHEL -- Rocky Linux +`tai` enforces read-only command policy on all remote commands. -## Example Workflow +- Allowlist based command validation +- Blocked shell operators (`>`, `>>`, `<`, `|`, `&&`, `||`, `;`) +- No write/mutation actions are executed on target hosts -A troubleshooter receives a ticket reporting that the Apache service on a remote server has failed to start. They provide `tai` with: +The tool may suggest remediation commands in output, but does not execute them. -1. The ticket description or error message -1. The hostname of the affected system -1. Any relevant directories to focus on +## Current Feature Set -`tai` then connects to the host, reads through system logs, service configurations, and any other related files, and returns a structured analysis of the likely cause along with recommended next steps. 
+### Core CLI -## Suggested Tooling +- `tai run ...` main troubleshooting entrypoint +- SSH options: host, port, identity file, jump host, SSH config control +- Live probe mode (`uname -a`) +- Diagnostics collection mode +- AI analysis mode +- Interactive loop with `/collect`, `/analyze`, `/help`, `/quit` -| Component | Tool | -|-----------|------| -| AI inference backend | [Ollama](https://ollama.com) | -| Chat model | `gemma3:4b`, `llama3.1:8b`, or `qwen2.5:7b` | -| Embedding model | `nomic-embed-text` (via Ollama) | -| Vector store | [ChromaDB](https://www.trychroma.com) (embedded, local) | -| Language | Python 3.11+ | +### AI and Prompting -______________________________________________________________________ +- OpenAI-compatible AI client +- Configurable model, timeout, token budget +- Guardrails to keep responses evidence-based +- Initial and follow-up prompts grounded in collected diagnostics +- Non-streaming completion path for local backend reliability -## How-To: Setting Up the AI Backend (Arch Linux + RTX 3080) +### RAG and Knowledge -`tai` uses [Ollama](https://ollama.com) as its local AI backend. It exposes an OpenAI-compatible HTTP API that `tai` talks to — no cloud services, no data leaving your machine. +- Tier 1: semantic retrieval of diagnostic chunks per question +- Tier 2: persistent runbook knowledge base with ChromaDB +- Runbook retrieval injected as separate prompt context +- Retrieval debug output (`--rag-debug`) +- Full-context fallback if retrieval/indexing fails -An RTX 3080 (10 GB VRAM) comfortably runs 7–8B parameter models at 4-bit quantisation. +### Runbook Management -### 1. 
Install CUDA and Ollama +- `tai runbooks sync --path ./runbooks --store ~/.tai/runbooks` +- `tai runbooks list --store ~/.tai/runbooks` +- `tai runbooks add <runbook.md> --store ~/.tai/runbooks` -```bash -# CUDA runtime (skip if already installed) -sudo pacman -S cuda +### Presence and Absence Signals -# Ollama with CUDA support from the AUR -yay -S ollama-cuda -# or: paru -S ollama-cuda +For recognized services/subsystems (for example `sssd`, `docker`, `x2go`, `xorg`, `wayland`, `selinux`, `apparmor`), collection includes: -# Enable and start the service -sudo systemctl enable --now ollama +- service unit-file discovery (`systemctl list-unit-files ...`) +- binary presence checks via `ls -l <path>` +- service status and journals +- selected config path probes where defined + +This improves analysis quality for "component missing/not installed" scenarios. + +## Repository Layout + +```text +src/tai/ + cli.py # CLI commands and orchestration + ssh_client.py # SSH execution + read-only policy + collectors.py # execution of collection plans + plan.py # issue -> command plan builder + ai_client.py # OpenAI-compatible AI + embeddings client + ai_guardrails.py # response guardrails/validation + prompt_builder.py # prompt composition + rag_retriever.py # diagnostic chunk retrieval + runbook_store.py # persistent ChromaDB runbook index/query + chroma_telemetry.py # no-op Chroma telemetry client + session_log.py # JSONL session logging + input_parser.py # CLI input validation + models.py # domain request models + +runbooks/ + *.md # Markdown runbooks with frontmatter + +tests/ + test_*.py # unit and CLI coverage ``` -### 2. Pull a chat model +## Installation ```bash -ollama pull gemma3:4b # ~3 GB — fast, good for sysadmin tasks -ollama pull llama3.1:8b # ~5 GB — stronger reasoning -ollama pull qwen2.5:7b # ~4.5 GB — strong structured output +python -m venv .venv +source .venv/bin/activate +pip install -e . ``` -### 3. 
Pull the embedding model - -`tai` uses `nomic-embed-text` to embed diagnostic data and runbooks for semantic retrieval (RAG). Pull it on the same host as Ollama: +RAG runbook storage requires optional dependencies: ```bash -ollama pull nomic-embed-text # ~274 MB +pip install -e .[rag] ``` -Verify it loaded: +Development dependencies: ```bash -curl http://localhost:11434/api/embeddings \ - -d '{"model":"nomic-embed-text","prompt":"test"}' +pip install -e .[dev] ``` -A JSON response with an `"embedding"` array confirms it is ready. +## AI Backend Setup (Ollama) -### 4. Verify the chat model works +`tai` expects an OpenAI-compatible API endpoint, defaulting to `http://localhost:11434/v1`. ```bash -ollama run gemma3:4b "what causes a systemd service to enter failed state?" +ollama pull gemma3:4b +ollama pull nomic-embed-text ``` -### 5. Verify the HTTP API is running - -`tai` communicates with Ollama over its OpenAI-compatible REST API: +Quick backend check: ```bash curl http://localhost:11434/api/generate \ -d '{"model":"gemma3:4b","prompt":"hello","stream":false}' ``` -A JSON response with a `response` field confirms everything is working. +## Usage -### 6. Point tai at your Ollama instance - -Once `tai` AI integration is complete, use these flags: +### Basic Probe and Collect ```bash -tai "nginx failing to start" --host web01 \ - --ai-host http://localhost:11434 \ - --model gemma3:4b +tai run "nginx failing to start" \ + --host web01 \ + --probe \ + --collect ``` -The default values for `--ai-host` and `--model` will be `http://localhost:11434` and `gemma3:4b` respectively, so for local use you won't need to specify them explicitly. +### Analyze with RAG and Runbooks + +```bash +tai run "why isn't sssd working?" 
\ + --host ssh.archflux.net \ + --port 5566 \ + --probe --collect --analyze \ + --runbooks ~/.tai/runbooks \ + --rag-debug \ + --ai-timeout-seconds 45 \ + --ai-max-tokens 300 +``` + +### Interactive Session + +```bash +tai run "docker daemon keeps failing" \ + --host app01 \ + --collect \ + --interactive \ + --runbooks ~/.tai/runbooks +``` + +## Runbook Workflow + +1. Write Markdown runbooks in `runbooks/` with frontmatter keys: `service`, `symptoms`, `tags`. +1. Sync the store. +1. Pass `--runbooks <store-path>` to `tai run`. + +Example: + +```bash +tai runbooks sync --path ./runbooks --store ~/.tai/runbooks +tai runbooks list --store ~/.tai/runbooks +``` + +## Testing + +```bash +pytest +``` + +Focused suites: + +```bash +pytest tests/test_plan.py tests/test_ai.py tests/test_cli.py +``` + +## Known Limits + +- Service-specific presence checks currently apply to recognized service/subsystem names. +- Package-manager-level presence checks are not yet in the default read-only command allowlist. +- Tier 3 persistent session memory is not implemented yet. + +## Changelog and Roadmap + +- See `CHANGELOG.md` for release history. +- See `ROADMAP.md` for phase status and next milestones. +- See `docs/ARCHITECTURE.md` for module-level architecture and data flow. diff --git a/ROADMAP.md b/ROADMAP.md index 208ae12..6a1e8ef 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -18,10 +18,11 @@ These must be resolved before meaningful development can begin. 
### AI Backend & Model -- [ ] Confirm use of [vLLM](https://github.com/vllm-project/vllm) as the inference backend -- [ ] Confirm `gemma4:a4b` as the default model (or select an alternative) +- [x] OpenAI-compatible backend client implemented (`AIClient`) +- [x] Default local backend profile wired for Ollama (`http://localhost:11434/v1`) +- [x] Default model profile set to `gemma3:4b` (override via `--model`) - [ ] Define minimum hardware requirements for running the model locally -- [ ] Decide whether the AI backend is bundled, self-hosted externally, or user-supplied +- [x] AI backend is user-supplied/self-hosted ### SSH Strategy @@ -38,7 +39,7 @@ These must be resolved before meaningful development can begin. ### Scope & Constraints - [ ] Define the supported scope of issues (services, network, disk, kernel, etc.) -- [ ] Confirm read-only guarantee — document exactly what "read-only" means in practice +- [x] Read-only guarantee implemented with command allowlist + blocked shell operator policy - [x] **Decision: interactive REPL mode for v0.1, full TUI for v0.2+** - v0.1: chat-loop REPL launched from CLI; human can follow up, correct, and redirect the agent - v0.2+: `textual`-based TUI with split panes (collected data | AI output | input bar) @@ -52,7 +53,7 @@ Basic project scaffolding and connectivity. - [x] Finalise repository structure and language toolchain - [x] Set up CI pipeline (linting, tests) -- [ ] Implement SSH connection module +- [x] Implement SSH connection module - [x] Define SSH config model and probe interface scaffold - [x] Connect to remote host - [x] Execute read-only commands (e.g. `journalctl`, `systemctl status`, `cat`) @@ -68,15 +69,15 @@ ______________________________________________________________________ Define what information the agent gathers and how. 
-- [ ] Identify the canonical set of data sources per issue type: +- [x] Identify a baseline canonical set of data sources per issue type: - Service failures: `journalctl`, `systemctl`, service config files - Network issues: `ip`, `ss`, `netstat`, firewall rules - Disk issues: `df`, `du`, `dmesg`, `smartctl` - General: `/var/log/syslog`, `/var/log/messages`, `dmesg` -- [ ] Implement pluggable "collector" modules per data source -- [ ] Implement directory traversal for user-specified paths (read-only) +- [x] Implement collectors and plan builder for baseline issue categories +- [x] Implement directory traversal for user-specified paths (read-only) - [ ] Add support for per-distro variations (Ubuntu vs RHEL path differences, etc.) -- [ ] Write tests with mocked SSH output +- [x] Write tests with mocked SSH output ______________________________________________________________________ @@ -84,12 +85,12 @@ ______________________________________________________________________ Wire collected data into the local AI model. 
-- [ ] Implement vLLM client module -- [ ] Design prompt template: system context, collected data, issue description → diagnosis -- [ ] Implement response parsing and structured output (root cause + suggested steps) -- [ ] Tune context window usage — handle truncation for large log outputs -- [ ] Add streaming support for long AI responses -- [ ] Evaluate and test model output quality on common issue types +- [x] Implement OpenAI-compatible AI client module +- [x] Design prompt templates for initial and follow-up analysis +- [x] Implement response guardrail checks and structured response headings +- [x] Tune context usage with RAG retrieval and chunk/runbook truncation budgets +- [x] Implement reliable non-streaming completion path for local backends +- [ ] Continue output quality tuning and grounding evaluation on real hosts ______________________________________________________________________ @@ -97,11 +98,11 @@ ______________________________________________________________________ Polish the interface for real-world use. -- [ ] Design CLI interface (flags, subcommands, interactive prompts) -- [ ] Implement structured output: diagnosis, confidence, recommended actions -- [ ] Add `--verbose` / `--debug` mode showing raw collected data +- [x] Design CLI interface with run command, interactive prompts, and runbook subcommands +- [x] Implement structured output sections (Root Cause, Evidence, Recommended Actions) +- [x] Add RAG debug mode (`--rag-debug`) showing retrieval scores - [ ] Support output to file or clipboard -- [ ] Write man page / `--help` documentation +- [x] Provide comprehensive `--help` command documentation via Typer options ______________________________________________________________________ @@ -135,19 +136,21 @@ model weights alone. 
Three tiers of increasing capability, each buildable indepe | Decision | Options | Recommendation | Status | |---|---|---|---| -| Embedding model | `nomic-embed-text`, `mxbai-embed-large`, `all-minilm` | `nomic-embed-text` via Ollama (local, 274MB, strong perf) | ⬜ Pending | -| Vector store — Tier 1 | In-memory numpy cosine, `faiss-cpu` | numpy (zero deps) for session scope | ⬜ Pending | -| Vector store — Tier 2/3 | `chromadb`, `qdrant`, `weaviate`, `pgvector` | `chromadb` (embedded mode, no server needed) or `qdrant` (self-hosted, REST API, production-grade) | ⬜ Pending | -| Chunking strategy | Fixed token, sentence-aware, command-boundary | Command-boundary splitting (natural unit for diagnostics) | ⬜ Pending | +| Embedding model | `nomic-embed-text`, `mxbai-embed-large`, `all-minilm` | `nomic-embed-text` via Ollama (local, 274MB, strong perf) | ✅ Implemented | +| Vector store — Tier 1 | In-memory numpy cosine, `faiss-cpu` | numpy (zero deps) for session scope | ✅ Implemented | +| Vector store — Tier 2/3 | `chromadb`, `qdrant`, `weaviate`, `pgvector` | `chromadb` embedded mode | ✅ Tier 2 Implemented | +| Chunking strategy | Fixed token, sentence-aware, command-boundary | Command-boundary splitting (natural unit for diagnostics) | ✅ Implemented | | Hybrid retrieval | Semantic only, BM25 only, hybrid | Hybrid (BM25 keyword + cosine semantic) for best recall | ⬜ Pending | | Reranking | None, cross-encoder (`ms-marco-MiniLM`), LLM-as-judge | Cross-encoder rerank pass before prompt injection | ⬜ Pending | -| Runbook format | Markdown, YAML, JSON | Markdown (human-editable, version-controllable) | ⬜ Pending | +| Runbook format | Markdown, YAML, JSON | Markdown (human-editable, version-controllable) | ✅ Implemented | | Session index storage | Local `~/.tai/`, configurable path | `~/.tai/sessions/` with ChromaDB collection | ⬜ Pending | --- ### Tier 1 — Diagnostic Chunk Retrieval (in-memory, per-session) +Status: ✅ Implemented + **Problem:** Current flow injects all 
collected output into the prompt as one block. On busy hosts this floods the context window with irrelevant output, degrading quality. @@ -180,6 +183,8 @@ On busy hosts this floods the context window with irrelevant output, degrading q ### Tier 2 — Runbook Knowledge Base (persistent, ChromaDB) +Status: ✅ Implemented + **Problem:** AI improvises remediation steps from training data, which may be wrong for specific environments, distros, or internal conventions. @@ -214,6 +219,8 @@ specific environments, distros, or internal conventions. ### Tier 3 — Session Memory Index (institutional learning) +Status: ⬜ Pending + **Problem:** Every session starts from zero. Repeat incidents on the same host or same issue type get no benefit from past work. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..ac0cbee --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,85 @@ +# Architecture + +This document describes tai's current runtime architecture, module responsibilities, and data flow. + +## High-Level Flow + +1. User runs `tai run` with issue text and target host settings. +1. CLI validates input and opens a shared SSH session. +1. Probe and collection run against a read-only command plan. +1. Collection output is converted into diagnostic chunks. +1. Optional RAG retrieval selects top-k chunks per question. +1. Optional runbook retrieval selects top-k runbook chunks from ChromaDB. +1. Prompt builder composes system + user message. +1. AI completion returns analysis. +1. Guardrails validate response quality signals. +1. Optional session logger writes JSONL events. 
+ +## Module Layout + +- `src/tai/cli.py` + - Command definitions (`run`, `runbooks sync/list/add`) + - Orchestration across SSH, collection, RAG, prompts, AI, and logging +- `src/tai/input_parser.py` + - User input validation and request normalization +- `src/tai/models.py` + - Core dataclasses (`TroubleshootRequest`) +- `src/tai/ssh_client.py` + - SSH invocation + - Read-only command policy validation + - Probe and command execution helpers +- `src/tai/plan.py` + - Issue keyword/service extraction + - Command plan generation + - Service/subsystem presence probes (unit files, binaries) +- `src/tai/collectors.py` + - Executes command plans and builds `CollectionReport` +- `src/tai/rag_retriever.py` + - Command-output chunking + - Embedding wrapper structures + - Similarity retrieval and scoring +- `src/tai/runbook_store.py` + - Persistent ChromaDB runbook indexing and querying +- `src/tai/chroma_telemetry.py` + - No-op telemetry adapter for Chroma local usage +- `src/tai/prompt_builder.py` + - Prompt assembly for full-context and retrieved-context paths +- `src/tai/ai_client.py` + - OpenAI-compatible completions and embeddings client +- `src/tai/ai_guardrails.py` + - Lightweight response guardrails and warnings +- `src/tai/session_log.py` + - Optional JSONL event logging + +## Data Stores + +- Runbook store (Tier 2): local ChromaDB path, default `~/.tai/runbooks` +- Session logs: optional JSONL file configured by `--log-file` + +## Retrieval Layers + +- Tier 1 (implemented): in-memory semantic retrieval over diagnostic chunks +- Tier 2 (implemented): persistent semantic retrieval over runbook corpus +- Tier 3 (pending): persistent retrieval over prior sessions + +## Safety Boundaries + +Read-only policy is enforced before each remote command execution. + +- Allowed command families are explicitly enumerated. +- Shell composition operators are blocked. +- Commands that fail execution are recorded and surfaced to the model as non-evidence. 
+ +## Failure and Fallback Behavior + +- If RAG indexing fails, analysis falls back to full-context prompts. +- If runbook store is unavailable, analysis proceeds without runbook context. +- If AI call fails, CLI exits with non-zero status and displays an error. + +## Test Coverage Highlights + +- Planner behavior and service detection +- Prompt formatting and guardrail-sensitive messaging +- CLI command behavior and interactive loop controls +- Runbook store parsing/index/query behavior (with mocked Chroma) +- SSH policy validation and command execution contract diff --git a/pyproject.toml b/pyproject.toml index 165d664..9bcd8d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "tai" -version = "0.1.0" +version = "0.4.0" description = "Linux AI-driven troubleshooting agent" readme = "README.md" requires-python = ">=3.11" diff --git a/runbooks/apparmor.md b/runbooks/apparmor.md new file mode 100644 index 0000000..7546d0b --- /dev/null +++ b/runbooks/apparmor.md @@ -0,0 +1,86 @@ +--- +service: apparmor +symptoms: permission denied despite correct unix permissions, apparmor deny logs, service blocked by profile, executable transition denied, path access denied, snap confinement issue, profile in complain mode +tags: apparmor, security, profile, aa-status, audit, confinement, complain, enforce, snap +--- + +## Symptoms + +- Application gets `Permission denied` even though Unix permissions look correct +- Service starts in complain mode but fails in enforce mode +- Log shows AppArmor `DENIED` entries +- Binary works when profile is disabled but fails when confinement is enabled +- Snap or packaged app cannot access expected files or sockets + +## Diagnostics + +### Check AppArmor status and loaded profiles + +``` +aa-status +systemctl status apparmor +``` + +Confirm whether the profile is loaded and whether it is in enforce or complain mode. 
+ +### Check denial logs + +``` +journalctl -k | grep -i apparmor +journalctl -b | grep -i DENIED +dmesg | grep -i apparmor +``` + +AppArmor denials usually identify the profile, operation, and path that was blocked. + +### Inspect the active profile + +``` +find /etc/apparmor.d -maxdepth 2 -type f | sort +cat /etc/apparmor.d/<profile> +``` + +Look for missing file path rules, capability rules, and `ix`/`px` execution transitions. + +### Check complain vs enforce mode + +``` +aa-status | grep complain +``` + +If the issue only occurs in enforce mode, the profile is too restrictive rather than the app being broken. + +### Check profile parser and reload + +``` +apparmor_parser -r /etc/apparmor.d/<profile> +aa-status +``` + +Syntax or include errors can prevent an updated profile from loading. + +## Remediation + +**Profile too restrictive:** +Add the missing path, capability, or network rule to the profile, then reload AppArmor. + +If the denial pattern is repetitive, use AppArmor tooling to review and refine the profile instead of disabling confinement globally. + +**Need to observe without blocking:** +Temporarily switch the profile to complain mode: +``` +aa-complain /etc/apparmor.d/<profile> +``` + +**Return to enforcement after fixing rules:** +``` +aa-enforce /etc/apparmor.d/<profile> +``` + +**Profile reload after changes:** +``` +apparmor_parser -r /etc/apparmor.d/<profile> +systemctl reload apparmor +``` + +Do not disable AppArmor globally when the issue is isolated to a single profile. 
diff --git a/runbooks/disk.md b/runbooks/disk.md new file mode 100644 index 0000000..fcd713d --- /dev/null +++ b/runbooks/disk.md @@ -0,0 +1,106 @@ +--- +service: disk +symptoms: no space left on device, disk full, inode exhaustion, df shows 100%, du large files, write failed, cannot create file, filesystem read-only, ext4 error +tags: disk, filesystem, storage, inodes, df, du, ext4, xfs, lvm, partition, full, space +--- + +## Symptoms + +- `No space left on device` — disk or inode exhaustion +- `df -h` shows a filesystem at 100% (or near 100%) +- `df -i` shows inode usage at 100% — file count exhausted even if byte space is free +- Filesystem remounted read-only — kernel detected errors and protected itself +- Services failing to write logs, create temp files, or open sockets + +## Diagnostics + +### Overall disk usage + +``` +df -h +df -i +``` + +`df -h` shows byte space; `df -i` shows inode usage. Both can be independently exhausted. +Note which filesystem is full (`/`, `/var`, `/tmp`, `/home`, etc.). + +### Find the large directories + +``` +du -sh /* 2>/dev/null | sort -rh | head -20 +du -sh /var/* 2>/dev/null | sort -rh | head -20 +du -sh /var/log/* 2>/dev/null | sort -rh | head -20 +``` + +### Find large individual files + +``` +find / -xdev -type f -size +100M 2>/dev/null | sort -k5 -rn +find /var/log -type f -size +50M 2>/dev/null +``` + +### Find deleted-but-open files holding space + +``` +lsof +L1 2>/dev/null | grep -v "^COMMAND" +``` + +Files deleted while a process still has them open do not free space until the process releases the file descriptor. 
+ +### Inode exhaustion — find directories with many small files + +``` +find / -xdev -printf '%h\n' 2>/dev/null | sort | uniq -c | sort -rn | head -20 +``` + +### Filesystem errors (after a crash or read-only remount) + +``` +dmesg | grep -i 'ext4\|xfs\|btrfs\|error\|corrupt' +journalctl -k | grep -i 'filesystem\|disk\|io error' +``` + +### LVM / partition layout + +``` +lsblk +pvs +vgs +lvs +``` + +## Remediation + +**Large log files — truncate safely (do NOT rm while in use):** +``` +truncate -s 0 /var/log/<logfile> +``` +Or configure log rotation in `/etc/logrotate.d/`. + +**Old journal logs eating space:** +``` +journalctl --disk-usage +journalctl --vacuum-size=500M +journalctl --vacuum-time=30d +``` + +**Deleted-but-open files — restart the holding process to release space:** +Identify the PID from `lsof +L1`, then: +``` +systemctl restart <service> +``` + +**Inode exhaustion — remove many small files:** +Common culprits: PHP session files in `/var/lib/php/sessions/`, old apt cache, tmp dirs. +``` +find /var/lib/php/sessions -type f -mtime +7 -delete +apt-get clean +find /tmp -type f -mtime +3 -delete +``` + +**Extend LVM volume (if free extents exist in the volume group):** +``` +lvextend -l +100%FREE /dev/<vg>/<lv> +resize2fs /dev/<vg>/<lv> # ext4 +xfs_growfs /mountpoint # xfs +``` diff --git a/runbooks/docker.md b/runbooks/docker.md new file mode 100644 index 0000000..ca75513 --- /dev/null +++ b/runbooks/docker.md @@ -0,0 +1,120 @@ +--- +service: docker +symptoms: cannot connect to docker daemon, docker daemon failed to start, docker socket permission denied, containers cannot resolve dns, docker network broken, daemon.json conflict, docker oom, unable to remove filesystem +tags: docker, dockerd, containerd, container, daemon, daemon.json, cgroup, dns, docker0, socket, compose +--- + +## Symptoms + +- `Cannot connect to the Docker daemon. 
Is the docker daemon running on this host?` +- `permission denied` on `/var/run/docker.sock` +- `dockerd` fails to start after a `daemon.json` change +- Containers cannot resolve DNS or pull images +- Docker bridge/network disappears or container networking breaks after boot +- Container or daemon is killed by the kernel OOM killer +- `Error: Unable to remove filesystem` when removing a container + +## Diagnostics + +### Check daemon health and client target + +``` +docker info +systemctl is-active docker +systemctl status docker +ps -ef | grep dockerd +env | grep DOCKER_HOST +``` + +If `DOCKER_HOST` is set incorrectly, the CLI may be talking to the wrong daemon. + +### Check daemon logs and startup failures + +``` +journalctl -u docker -n 200 +journalctl -u containerd -n 100 +cat /etc/docker/daemon.json +systemctl cat docker +``` + +Look for conflicts between `daemon.json` keys and systemd startup flags, especially duplicate `hosts` settings. + +### Check socket permissions and group access + +``` +ls -la /var/run/docker.sock +id +getent group docker +ls -la ~/.docker/ +``` + +If the user was added to the `docker` group recently, a new login shell may be required. + +### Check kernel, cgroups, and memory pressure + +``` +uname -r +free -h +dmesg | grep -i -E 'docker|cgroup|oom|killed process' +``` + +Low memory, missing kernel features, or cgroup issues can stop containers or the daemon. + +### Check Docker networking and DNS + +``` +docker network ls +ip addr show docker0 +sysctl net.ipv4.ip_forward +cat /etc/resolv.conf +ps aux | grep dnsmasq +``` + +Loopback DNS resolvers in `/etc/resolv.conf` often break container DNS unless Docker is given explicit nameservers. + +### Check storage and stuck mounts + +``` +df -h /var/lib/docker +docker system df +lsof /var/lib/docker +``` + +Bind-mounting `/var/lib/docker` into other containers can keep container filesystems busy and block removal. 
+ +## Remediation + +**Daemon not running or client aimed at the wrong host:** +Unset an incorrect `DOCKER_HOST`, then start the daemon: +``` +unset DOCKER_HOST +systemctl restart docker +``` + +**`daemon.json` conflicts with systemd flags:** +Remove duplicate settings or create a systemd override so `dockerd` is started without conflicting flags. + +**Permission denied on Docker socket:** +Add the user to the `docker` group, then re-login: +``` +usermod -aG docker $USER +newgrp docker +``` + +If `~/.docker/` was created by `sudo`, fix ownership: +``` +sudo chown "$USER":"$USER" "$HOME/.docker" -R +sudo chmod g+rwx "$HOME/.docker" -R +``` + +**Container DNS broken:** +Configure explicit DNS servers in `/etc/docker/daemon.json`, then restart Docker. + +**Docker networking disappears after boot:** +Stop the host network manager from managing Docker interfaces and confirm `net.ipv4.ip_forward=1`. + +**OOM kills:** +Treat this as host memory pressure first; reduce workload, add memory, or enforce container memory limits. + +**Unable to remove filesystem:** +Find the process holding the path open with `lsof`, then stop that process or the container bind-mounting `/var/lib/docker`. 
\ No newline at end of file diff --git a/runbooks/kernel.md b/runbooks/kernel.md new file mode 100644 index 0000000..fb42c1c --- /dev/null +++ b/runbooks/kernel.md @@ -0,0 +1,117 @@ +--- +service: kernel +symptoms: OOM kill, out of memory, high load average, kernel panic, segfault, soft lockup, CPU steal, system unresponsive, zombie processes, NMI watchdog +tags: kernel, oom, memory, load, cpu, panic, dmesg, segfault, lockup, swap, zombie +--- + +## Symptoms + +- `Out of memory: Kill process ` in dmesg — OOM killer fired +- Load average far above CPU count — system overloaded or I/O blocked +- `kernel: BUG: soft lockup` — CPU stuck in kernel code +- `segfault at ...` in dmesg — process crashed due to invalid memory access +- `kernel panic` — unrecoverable kernel error (visible only on console or serial) +- Many zombie (`Z`) processes in `ps` output +- High `%steal` in `top`/`vmstat` — hypervisor CPU contention + +## Diagnostics + +### Recent kernel messages + +``` +dmesg -T | tail -100 +dmesg -T | grep -iE 'error|warn|oom|kill|panic|oops|fault|hung|lockup' +journalctl -k -n 200 +``` + +### OOM events + +``` +dmesg -T | grep -i 'out of memory\|oom_kill\|killed process' +``` + +The log shows which process was killed, its RSS at time of kill, and available memory. + +### Memory usage + +``` +free -h +cat /proc/meminfo | head -30 +vmstat -s +``` + +`MemAvailable` is the key metric. If it is near zero and swap is also exhausted, OOM kills are imminent. + +### Swap + +``` +swapon --show +cat /proc/swaps +vmstat 1 5 +``` + +High `si`/`so` (swap-in/swap-out) in `vmstat` indicates active swapping and likely memory pressure. + +### Load average and CPU + +``` +uptime +top -b -n1 | head -30 +mpstat -P ALL 1 3 +``` + +Load average above 2× CPU count sustained over 15 minutes is concerning. +High `%iowait` indicates processes blocked on disk I/O, not CPU-bound load. 
+ +### Process memory usage + +``` +ps aux --sort=-%mem | head -20 +ps aux --sort=-%cpu | head -20 +``` + +### Zombie processes + +``` +ps aux | awk '$8=="Z"' +``` + +Zombies cannot be killed; the parent must `wait()` for them or be killed itself. + +### I/O wait and disk health + +``` +iostat -x 1 3 +dmesg -T | grep -iE 'i/o error|hard resetting link|ata.*error|blk_update_request' +``` + +Persistent I/O errors alongside high load suggest failing storage. + +## Remediation + +**Memory pressure / frequent OOM kills:** +Identify the largest memory consumers from `ps aux --sort=-%mem`. +Consider increasing swap, adding RAM, tuning `vm.overcommit_memory`, or scaling the workload. +Do NOT just raise `vm.overcommit_ratio` without understanding the root consumer. + +**Adjust OOM killer scoring for critical services (temporary, resets on reboot):** +``` +echo -17 > /proc//oom_adj # legacy +echo -1000 > /proc//oom_score_adj # current kernels +``` + +**Swap exhausted — add a swapfile:** +``` +fallocate -l 2G /swapfile +chmod 600 /swapfile +mkswap /swapfile +swapon /swapfile +``` + +**High I/O wait — find the I/O-heavy process:** +``` +iotop -a -o -b -n3 +``` + +**Zombie reaping — if parent is stuck:** +Kill the parent process (it will reap children on exit), then verify zombies disappear. 
diff --git a/runbooks/nginx.md b/runbooks/nginx.md new file mode 100644 index 0000000..173007f --- /dev/null +++ b/runbooks/nginx.md @@ -0,0 +1,99 @@ +--- +service: nginx +symptoms: 502 Bad Gateway, 504 Gateway Timeout, upstream connection refused, nginx not starting, failed to bind socket, permission denied reading config, configuration test failed +tags: nginx, web, http, https, proxy, upstream, reverse-proxy, load-balancer +--- + +## Symptoms + +- `502 Bad Gateway` — nginx reached the upstream but got an invalid response, or upstream is down +- `504 Gateway Timeout` — upstream took too long to respond +- `111: Connection refused` in nginx error log — upstream process is not running or not on the expected port +- `nginx.service: Start request repeated too quickly` — crash-loop; check error log +- `[emerg] bind() to 0.0.0.0:80 failed (98: Address already in use)` — port conflict +- `[emerg] open() ... failed (13: Permission denied)` — file permission issue + +## Diagnostics + +### Service status + +``` +systemctl status nginx +``` + +### Config test + +``` +nginx -t +``` + +A config error is the most common reason for nginx failing to start or reload. + +### Error log + +``` +journalctl -u nginx -n 100 +tail -n 100 /var/log/nginx/error.log +``` + +For 502/504 errors look for: `connect() failed`, `upstream timed out`, `no live upstreams`. + +### Access log — recent requests + +``` +tail -n 50 /var/log/nginx/access.log +``` + +### Check upstream services + +For `proxy_pass` targets, verify the upstream is running: +``` +systemctl status +ss -tlnp | grep +``` + +Common upstreams: `gunicorn`, `uwsgi`, `node`, `puma`, `php-fpm`. + +### Port binding conflicts + +``` +ss -tlnp | grep ':80\|:443' +``` + +### Config files + +``` +cat /etc/nginx/nginx.conf +ls /etc/nginx/sites-enabled/ +cat /etc/nginx/sites-enabled/ +``` + +Check `proxy_pass`, `upstream` blocks, `proxy_connect_timeout`, `proxy_read_timeout`. 
+ +## Remediation + +**Upstream service not running:** +Start the upstream service, then verify nginx resumes proxying. + +**Config syntax error:** +Fix the error shown by `nginx -t`, then: +``` +systemctl reload nginx +``` + +**Port already in use:** +Find the conflicting process with `ss -tlnp | grep :80`, stop it, then restart nginx. + +**Upstream timeouts — increase timeouts (caution: treat the slow upstream as the root cause):** +```nginx +proxy_connect_timeout 10s; +proxy_read_timeout 60s; +proxy_send_timeout 60s; +``` + +**Permission denied on log or socket file:** +``` +ls -la /var/log/nginx/ +ls -la /run/nginx.pid +chown -R www-data:www-data /var/log/nginx/ +``` diff --git a/runbooks/postgres.md b/runbooks/postgres.md new file mode 100644 index 0000000..b64f032 --- /dev/null +++ b/runbooks/postgres.md @@ -0,0 +1,107 @@ +--- +service: postgres +symptoms: connection refused port 5432, FATAL password authentication failed, replication lag, disk full, out of shared memory, too many connections, relation does not exist, could not connect to the primary +tags: postgres, postgresql, database, replication, pg, psql, disk, connections +--- + +## Symptoms + +- `could not connect to server: Connection refused` — postgres not running or not on port 5432 +- `FATAL: password authentication failed for user ""` — wrong credentials or pg_hba mismatch +- `FATAL: too many connections` — connection pool exhausted +- `ERROR: could not resize shared memory segment` / `out of shared memory` — shared_buffers too high for system +- `PANIC: could not write to file "pg_wal/..."` — disk full on WAL directory +- Replication lag growing — standby falling behind primary +- `FATAL: could not connect to the primary server` — standby cannot reach primary + +## Diagnostics + +### Service status + +``` +systemctl status postgresql +systemctl status postgresql@-main +``` + +### PostgreSQL logs + +``` +journalctl -u postgresql -n 100 +tail -n 100 /var/log/postgresql/postgresql-*.log +``` + 
+### Is postgres listening? + +``` +ss -tlnp | grep 5432 +``` + +### Disk space (WAL and data directory are the critical paths) + +``` +df -h +du -sh /var/lib/postgresql/ +du -sh /var/lib/postgresql/*/main/pg_wal/ +``` + +A full disk on the pg_wal partition causes a PANIC and hard crash. + +### Connection count + +```sql +SELECT count(*), state FROM pg_stat_activity GROUP BY state; +SELECT setting FROM pg_settings WHERE name = 'max_connections'; +``` + +### Replication lag (run on primary) + +```sql +SELECT client_addr, state, sent_lsn, write_lsn, flush_lsn, replay_lsn, + (sent_lsn - replay_lsn) AS lag_bytes +FROM pg_stat_replication; +``` + +### pg_hba.conf — authentication rules + +``` +cat /etc/postgresql/*/main/pg_hba.conf +``` + +Entries are matched top-to-bottom; the first matching entry wins. A `reject` rule or a missing entry for the client IP causes auth failure even with correct credentials. + +### Shared memory / kernel settings + +``` +cat /proc/sys/kernel/shmmax +cat /etc/postgresql/*/main/postgresql.conf | grep shared_buffers +``` + +`shared_buffers` must not exceed ~40% of RAM; kernel `shmmax` must accommodate it. + +## Remediation + +**Postgres not running:** +``` +systemctl start postgresql +``` +Check logs immediately after start for the failure reason. + +**Authentication failure (pg_hba mismatch):** +Add or update the correct entry in `pg_hba.conf`, then reload: +``` +systemctl reload postgresql +``` + +**Too many connections — increase limit (requires restart):** +In `postgresql.conf`: +``` +max_connections = 200 +``` +Or deploy a connection pooler (`pgbouncer`). + +**Disk full on WAL:** +Free space outside the live WAL directory first: remove old base backups and already-archived WAL segments from the backup/archive location. +Do NOT delete files under `/var/lib/postgresql/*/main/pg_wal/` directly — use `pg_archivecleanup` on the archive, or let archiving catch up. + +**Replication lag — standby too far behind:** +Check network bandwidth and I/O on standby. If the primary keeps dropping the standby because status updates (sent every `wal_receiver_status_interval`) arrive too slowly, increase `wal_sender_timeout` temporarily. 
diff --git a/runbooks/selinux.md b/runbooks/selinux.md new file mode 100644 index 0000000..3bef20b --- /dev/null +++ b/runbooks/selinux.md @@ -0,0 +1,112 @@ +--- +service: selinux +symptoms: permission denied despite correct unix permissions, service blocked by selinux, avc denied, file context mismatch, port binding denied, boolean missing, domain transition failure +tags: selinux, avc, enforcing, security, policy, restorecon, audit, sealert, semanage +--- + +## Symptoms + +- Service gets `Permission denied` even though file ownership and mode look correct +- Process cannot bind to a port or open a file after a config change +- AVC denials appear in audit logs +- App works when SELinux is permissive but fails in enforcing mode +- Newly created files under custom paths are inaccessible to a confined service + +## Diagnostics + +### Confirm SELinux mode and policy + +``` +getenforce +sestatus +cat /etc/selinux/config +``` + +If SELinux is `Permissive`, denials are logged but not enforced. + +### Check AVC denials + +``` +auditctl -s +ausearch -m AVC,USER_AVC,SELINUX_ERR,USER_SELINUX_ERR -ts recent +journalctl -t setroubleshoot -n 50 +dmesg | grep -i -e type=1300 -e type=1400 +``` + +AVC denials are the primary source of truth for SELinux policy failures. + +If AVCs are missing but SELinux still appears involved, temporarily disable `dontaudit` rules to expose hidden denials: +``` +semodule -DB +``` +Re-enable them after reproducing the issue: +``` +semodule -B +``` + +### Inspect file contexts + +``` +ls -lZ /path/to/file +ps -eZ | grep +matchpathcon -V /path/to/file +``` + +A service can have correct Unix permissions and still fail if the SELinux context is wrong. + +### Check port labeling and booleans + +``` +semanage port -l | grep +getsebool -a | grep +semanage boolean -l | grep +``` + +Custom ports often require explicit SELinux port labels. 
+ +### Check for relabeling needs + +``` +restorecon -nRv /path +matchpathcon /path/to/file +sealert -l "*" +``` + +`restorecon -n` shows what would change without modifying labels. + +`sealert` is often the fastest way to turn a raw AVC into a concrete fix, but treat `audit2allow` suggestions as a last resort, not a first response. + +## Remediation + +**Wrong file context:** +Restore the default context: +``` +restorecon -Rv /path +``` + +**Custom application path needs persistent labeling:** +``` +semanage fcontext -a -t '/custom/path(/.*)?' +restorecon -Rv /custom/path +``` + +**Custom port binding denied:** +Add the port label required by the service type: +``` +semanage port -a -t -p tcp +``` + +**Boolean disabled:** +Enable the needed boolean persistently: +``` +setsebool -P on +``` + +**Still unsure whether SELinux is the blocker:** +Temporarily switch to permissive mode and reproduce the issue: +``` +setenforce 0 +``` +If the problem still occurs, SELinux is not the root cause. + +Do not disable SELinux or generate custom policy modules as a first response. Fix labels, booleans, or port mappings first. diff --git a/runbooks/ssh.md b/runbooks/ssh.md new file mode 100644 index 0000000..2714f21 --- /dev/null +++ b/runbooks/ssh.md @@ -0,0 +1,100 @@ +--- +service: ssh +symptoms: connection refused, authentication failed, host key mismatch, permission denied, timeout connecting, no route to host +tags: ssh, sshd, openssh, authentication, network, connectivity +--- + +## Symptoms + +- `ssh: connect to host port 22: Connection refused` +- `Permission denied (publickey)` — key not accepted or wrong user +- `WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!` — host key mismatch +- `Connection timed out` — firewall blocking or host unreachable +- `No route to host` — routing issue or host is down + +## Diagnostics + +### Is sshd running? 
+ +``` +systemctl status sshd +systemctl status ssh +``` + +A stopped or failed sshd is the most common cause of "connection refused". + +### Check sshd configuration + +``` +sshd -t +cat /etc/ssh/sshd_config +``` + +Look for: `PasswordAuthentication`, `PubkeyAuthentication yes`, `AuthorizedKeysFile`. + +### Check authorised keys + +``` +ls -la ~/.ssh/ +cat ~/.ssh/authorized_keys +``` + +Permissions must be: `~/.ssh` → `700`, `authorized_keys` → `600`. +Wrong permissions cause silent auth failure even with the correct key. + +### Check sshd logs + +``` +journalctl -u sshd -n 100 +journalctl -u ssh -n 100 +grep sshd /var/log/auth.log | tail -50 +``` + +Look for: `Invalid user`, `Failed publickey`, `Connection reset by peer`, `Too many authentication failures`. + +### Check listening port + +``` +ss -tlnp | grep sshd +netstat -tlnp | grep :22 +``` + +If sshd is running but not listening on the expected port, check `Port` in `/etc/ssh/sshd_config`. + +### Firewall rules + +``` +iptables -L INPUT -n -v +nft list ruleset +ufw status verbose +``` + +A DROP rule on port 22 causes silent timeouts, not "connection refused". + +## Remediation + +**sshd not running:** +``` +systemctl enable --now sshd +``` + +**Wrong permissions on authorized_keys:** +``` +chmod 700 ~/.ssh +chmod 600 ~/.ssh/authorized_keys +chown -R $USER:$USER ~/.ssh +``` + +**sshd config error:** +Fix the error reported by `sshd -t`, then: +``` +systemctl restart sshd +``` + +**Host key mismatch (expected after reinstall/reprovisioning):** +Remove the old key from the client: +``` +ssh-keygen -R +``` +Only do this if you are certain the host was intentionally reprovisioned. +If the key change is unexpected, treat as a potential MITM and investigate before connecting. 
diff --git a/runbooks/sssd.md b/runbooks/sssd.md new file mode 100644 index 0000000..0a862ae --- /dev/null +++ b/runbooks/sssd.md @@ -0,0 +1,115 @@ +--- +service: sssd +symptoms: login denied, user not found, id command hangs, sudo rules missing, ldap auth failure, kerberos failure, cache stale, offline authentication not working +tags: sssd, ldap, kerberos, ad, identity, auth, pam, nss, sudo +--- + +## Symptoms + +- `id ` hangs or returns no such user for a domain account +- SSH or console login fails for directory-backed users +- Group membership is missing or incomplete +- `sudo` rules from LDAP/AD do not appear +- Authentication works intermittently or only after cache flush +- Offline authentication fails when the directory is unreachable + +## Diagnostics + +### Check service health + +``` +systemctl status sssd +sssctl domain-list +sssctl config-check +cat /etc/nsswitch.conf +``` + +A running daemon with a valid config and `sss` present in `nsswitch.conf` are the first prerequisites. + +### Check identity resolution + +``` +id +getent passwd +getent group +``` + +If NSS lookups fail, the issue is often in SSSD configuration, connectivity, or cache. + +### Check SSSD logs + +``` +journalctl -u sssd -n 100 +ls -la /var/log/sssd/ +tail -n 100 /var/log/sssd/*.log +sssctl logs-fetch +``` + +Look for: backend offline, LDAP bind failures, Kerberos errors, TLS problems, and access provider denials. + +If the issue is unclear, raise `debug_level=6` in the relevant `[nss]`, `[pam]`, and `[domain/]` sections. Raising debug only in `[sssd]` is not enough for most real failures. + +### Check domain reachability + +``` +sssctl domain-status +ping +dig -t SRV _ldap._tcp. +cat /etc/resolv.conf +``` + +If the identity provider is unreachable, SSSD may serve cached data only or fail entirely. 
+ +### Check Kerberos and LDAP configuration + +``` +cat /etc/sssd/sssd.conf +cat /etc/krb5.conf +kinit +klist +ldapsearch -ZZ -x -H ldap:// -b +``` + +Look for wrong realm names, bad server addresses, TLS settings, and access filters. + +For AD or IPA providers, Kerberos and DNS are often the real dependency chain: broken SRV lookup, keytab issues, or a slow KDC will surface as SSSD failures. + +### Check cache and permissions + +``` +ls -la /var/lib/sss/db/ +sssctl cache-status +sssctl cache-expire -E +``` + +`/etc/sssd/sssd.conf` must usually be mode `600` or SSSD will refuse to start. + +Do not wipe cache files blindly on an offline system that depends on cached logins. + +## Remediation + +**Config syntax or permission issue:** +Fix `sssd.conf`, set secure permissions, then restart: +``` +chmod 600 /etc/sssd/sssd.conf +systemctl restart sssd +``` + +**Stale cache:** +Clear cache carefully, then repopulate with a fresh lookup: +``` +sss_cache -E +id +``` + +**Kerberos failure:** +Validate time sync, realm, keytab credentials, and KDC reachability before changing LDAP settings. + +**Backend offline or `sdap_async_sys_connect request failed`:** +Treat as DNS/network first. Validate SRV records and TLS handshake before increasing `ldap_network_timeout` or `ldap_search_timeout`. + +**Access denied despite successful lookup:** +Check `access_provider`, LDAP filters, HBAC rules, or AD group-based access restrictions. + +**No `pam_sss` messages at all:** +The PAM stack is likely misconfigured. Fix the PAM/authselect profile before changing SSSD itself. 
diff --git a/runbooks/wayland.md b/runbooks/wayland.md new file mode 100644 index 0000000..1c7b6aa --- /dev/null +++ b/runbooks/wayland.md @@ -0,0 +1,89 @@ +--- +service: wayland +symptoms: wayland session fails, gdm falls back to xorg, black screen on login, fractional scaling broken, screen sharing broken, remote desktop broken, wlroots crash, compositor crash +tags: wayland, compositor, gnome, kde, mutter, wlroots, pipewire, xwayland, graphics +--- + +## Symptoms + +- User selects a Wayland session but is returned to login +- GDM or another display manager falls back to Xorg +- Screen sharing, remote desktop, or clipboard integration is broken +- Apps requiring XWayland fail while native Wayland apps work +- Fractional scaling or multi-monitor layout behaves incorrectly +- Wayland compositor crashes after login + +## Diagnostics + +### Confirm the active session type + +``` +echo $XDG_SESSION_TYPE +loginctl show-session $XDG_SESSION_ID -p Type +echo $WAYLAND_DISPLAY +``` + +If the session type is `x11`, you are not debugging an active Wayland session. + +### Check display manager and compositor logs + +``` +systemctl status gdm +journalctl -b | grep -iE 'wayland|mutter|kwin|wlroots|xwayland' +journalctl -b | grep -i 'renderer for' +``` + +Look for compositor crashes, GPU driver incompatibilities, and forced Xorg fallback messages. + +### Check XWayland and PipeWire components + +``` +which Xwayland +systemctl --user status pipewire +systemctl --user status xdg-desktop-portal +systemctl --user status xdg-desktop-portal-gnome +systemctl --user status xdg-desktop-portal-kde +xlsclients -l +``` + +Broken screen sharing is often a PipeWire or portal issue, not a compositor issue. + +`xlsclients -l` helps identify apps that are actually running under XWayland rather than native Wayland. 
+ +### Check GPU compatibility + +``` +lspci -k | grep -A3 -E 'VGA|3D|Display' +lsmod | grep -E 'nvidia|nouveau|amdgpu|i915' +``` + +Wayland support quality depends heavily on the GPU driver stack. + +### Check environment and session overrides + +``` +env | grep -E 'WAYLAND|XDG|GDK_BACKEND|QT_QPA_PLATFORM' +cat /etc/gdm/custom.conf +wayland-info +``` + +Environment overrides can force apps onto X11 or disable Wayland entirely. + +For NVIDIA systems, confirm the compositor is using a supported buffer path (GBM on current drivers is the expected default). + +## Remediation + +**Wayland disabled in display manager config:** +Check `WaylandEnable=false` or similar settings and remove the override if unintended. + +**Fallback to Xorg on unsupported GPU stack:** +Upgrade or change the graphics driver; Wayland stability is often limited by the driver, not the compositor. + +**Screen sharing broken:** +Fix PipeWire and `xdg-desktop-portal` services before changing compositor settings. + +**XWayland-only app failures:** +Treat them separately from native Wayland issues; confirm `Xwayland` is installed and launching. + +**Remote desktop, VM, or game input grabbing is broken:** +This is often a Wayland protocol/compositor support limitation, not a generic keyboard bug. Check compositor support for pointer constraints, relative pointer, and keyboard shortcut inhibit protocols. 
diff --git a/runbooks/x2go.md b/runbooks/x2go.md new file mode 100644 index 0000000..7d88fc7 --- /dev/null +++ b/runbooks/x2go.md @@ -0,0 +1,106 @@ +--- +service: x2go +symptoms: x2go session fails to start, x2go black screen, x2go disconnects immediately, no desktop in session, authentication failure, x2go agent not starting, sound forwarding broken +tags: x2go, nx, remote-desktop, x2goserver, x2goclient, session, desktop, xauth +--- + +## Symptoms + +- X2Go login succeeds but the session immediately disconnects +- Black screen after login +- Session is created but no desktop appears +- `x2goruncommand error` or `X2Go Agent got stuck in state` +- Sound, clipboard, or drive sharing fails while login itself works +- Authentication works over SSH but X2Go session startup fails + +## Diagnostics + +### Check X2Go services and packages + +``` +systemctl status x2goserver +systemctl status sshd +rpm -qa | grep x2go +apt list --installed | grep x2go +which x2golistsessions +``` + +X2Go depends on working SSH plus installed `x2goserver` and `x2goserver-xsession` components. + +### Check X2Go logs + +``` +journalctl -u x2goserver -n 100 +journalctl -u sshd -n 100 +ls -la ~/.x2go/ +find ~/.x2go -type f -maxdepth 2 -print +x2golistsessions +``` + +Look for session startup failures, agent crashes, and auth helper errors. + +### Check desktop environment startup command + +``` +cat /etc/x2go/Xsession +cat ~/.xsession +cat ~/.Xclients +``` + +A missing or broken desktop session command is a common cause of black screens. + +### Check X11 and xauth availability + +``` +which xauth +xauth -V +ls -la ~/.Xauthority +which sshfs +``` + +X2Go requires a working X11 session setup. Missing `xauth` or a bad `.Xauthority` often breaks startup. + +Filesystem and folder-sharing features may also depend on `sshfs` being installed. 
+ +### Check session limits and stale sessions + +``` +x2golistsessions +x2gocleansessions +ulimit -a +loginctl list-sessions +``` + +Stale sessions or per-user process limits can prevent a new desktop from starting. + +### Check desktop dependencies + +``` +which startxfce4 +which mate-session +which startplasma-x11 +env | grep -E 'DESKTOP|XDG' +``` + +If the selected desktop command does not exist, X2Go may connect and then terminate immediately. + +## Remediation + +**Missing or broken desktop startup command:** +Set the session to a known-good desktop such as XFCE and verify the binary exists. + +**Corrupt Xauthority or stale X2Go session files:** +Remove stale session state and regenerate auth files: +``` +rm -f ~/.Xauthority +rm -rf ~/.x2go/C-* +``` + +**Missing `xauth` or X11 helpers:** +Install the missing X11 packages, then retry the session. + +**Required server packages missing:** +Install `x2goserver` and `x2goserver-xsession` first, then retry before debugging desktop startup. + +**SSH works but X2Go session fails:** +Treat it as a desktop startup or X11 auth problem, not an SSH transport problem. 
diff --git a/runbooks/xorg.md b/runbooks/xorg.md new file mode 100644 index 0000000..cfa54e5 --- /dev/null +++ b/runbooks/xorg.md @@ -0,0 +1,94 @@ +--- +service: xorg +symptoms: xorg black screen, display manager loop, no screens found, failed to start X server, GPU driver error, xrandr missing outputs, login screen not appearing +tags: xorg, x11, display, gpu, drm, xrandr, gdm, sddm, lightdm +--- + +## Symptoms + +- Black screen after graphical boot +- Display manager loops back to login +- `no screens found` in Xorg log +- External monitors are missing or not detected +- X server fails after a driver update +- `startx` exits immediately with display or device errors + +## Diagnostics + +### Check display manager and Xorg service path + +``` +systemctl status display-manager +systemctl status gdm +systemctl status sddm +systemctl status lightdm +``` + +If the display manager is failing, inspect its logs before focusing on Xorg itself. + +### Check Xorg logs + +``` +find /var/log -name 'Xorg*.log' -o -name 'Xorg.*.log' +grep -E '\(EE\)|\(WW\)' /var/log/Xorg.0.log +journalctl -b | grep -iE 'xorg|gdm|sddm|lightdm' +ls -la ~/.local/share/xorg/ +``` + +Look for: `no screens found`, GPU module load failures, and permission/device access errors. + +On rootless Xorg, logs are often under `~/.local/share/xorg/Xorg.0.log` instead of `/var/log/`. + +### Check DRM and GPU driver state + +``` +lspci -k | grep -A3 -E 'VGA|3D|Display' +lsmod | grep -E 'nouveau|nvidia|amdgpu|i915' +dmesg | grep -iE 'drm|gpu|nvidia|amdgpu|i915' +``` + +Driver mismatches after kernel updates are a common cause of X startup failures. + +### Check monitor detection and permissions + +``` +loginctl session-status +xrandr --query +ls -la /dev/dri/ +ps -o user= -C Xorg +``` + +If `/dev/dri/*` permissions or seat assignment are wrong, X may fail to access the GPU. 
+ +### Check X configuration files + +``` +find /etc/X11 -maxdepth 3 -type f +cat /etc/X11/xorg.conf +cat /etc/X11/xorg.conf.d/*.conf +ls -la ~/.xinitrc ~/.xserverrc +``` + +Custom `Device`, `Monitor`, or `Screen` sections often break auto-detection. + +An empty or broken `.xinitrc` can produce a black screen even when the X server itself started correctly. + +## Remediation + +**Bad static Xorg config:** +Move custom config aside and let auto-detection work unless the hardware truly needs manual config. + +**Driver mismatch after update:** +Reinstall the GPU driver package matching the running kernel and reboot or restart the display manager. + +**`no screens found`:** +Check whether the correct DRM module loaded and whether the display manager is running on the expected seat. + +**Display manager loop:** +Correlate Xorg errors with PAM/auth logs; some loops are session startup failures, not graphics failures. + +**Framebuffer mode failure:** +If X falls back to `fbdev` and errors with framebuffer/bus ID messages, remove the generic `fbdev` driver package and let Xorg use the proper modesetting or vendor driver. + +**`SocketCreateListener() failed`:** +Check for stale sockets in `/tmp/.X11-unix`, especially after previous root-run Xorg sessions. diff --git a/src/tai/chroma_telemetry.py b/src/tai/chroma_telemetry.py new file mode 100644 index 0000000..310c65e --- /dev/null +++ b/src/tai/chroma_telemetry.py @@ -0,0 +1,24 @@ +"""Local no-op telemetry implementation for ChromaDB. + +ChromaDB expects a product telemetry client component. Some local package +combinations emit noisy PostHog errors even when anonymized telemetry is +disabled, so tai wires ChromaDB to this no-op client instead. 
+""" + +from __future__ import annotations + +from chromadb.config import System +from chromadb.telemetry.product import ProductTelemetryClient, ProductTelemetryEvent +from overrides import override + + +class NoOpProductTelemetryClient(ProductTelemetryClient): + """Telemetry client that intentionally drops all events.""" + + def __init__(self, system: System): + super().__init__(system) + + @override + def capture(self, event: ProductTelemetryEvent) -> None: + del event + return None \ No newline at end of file diff --git a/src/tai/cli.py b/src/tai/cli.py index e8b8f31..b6eb5a3 100644 --- a/src/tai/cli.py +++ b/src/tai/cli.py @@ -21,16 +21,20 @@ from tai.input_parser import InputValidationError, build_request from tai.models import TroubleshootRequest from tai.plan import plan_from_request from tai.prompt_builder import ( + build_analysis_message_with_chunks, build_followup_message, build_message_with_chunks, build_system_prompt, build_user_message, ) from tai.rag_retriever import EmbeddedChunk, chunk_report, retrieve_scored +from tai.runbook_store import RunbookChunk, RunbookStore from tai.session_log import SessionLogger from tai.ssh_client import SSHClient, SSHCommandResult, SSHConnectionConfig, SSHSession app = typer.Typer(no_args_is_help=True, add_completion=False) +runbooks_app = typer.Typer(no_args_is_help=True, help="Manage the runbook knowledge base.") +app.add_typer(runbooks_app, name="runbooks") console = Console() @@ -98,6 +102,20 @@ def run( str, typer.Option("--ai-key", help="API key for the AI backend (not needed for Ollama)."), ] = "ollama", + ai_timeout_seconds: Annotated[ + float, + typer.Option( + "--ai-timeout-seconds", + help="Timeout for AI requests/generation in seconds.", + ), + ] = 120.0, + ai_max_tokens: Annotated[ + int, + typer.Option( + "--ai-max-tokens", + help="Upper bound for generated completion tokens.", + ), + ] = 1024, log_file: Annotated[ str | None, typer.Option( @@ -126,6 +144,13 @@ def run( help="Print retrieved chunk 
names/scores and log per-question retrieval metrics.", ), ] = False, + runbooks_path: Annotated[ + str | None, + typer.Option( + "--runbooks", + help="Path to a synced runbook ChromaDB store. Enables Tier 2 RAG.", + ), + ] = None, ) -> None: """Start an interactive troubleshooting session scaffold.""" try: @@ -161,11 +186,27 @@ def run( if not (probe or collect or analyze or interactive): return # nothing SSH-related requested - ai_config = AIConfig(host=ai_host, model=model, api_key=ai_key, embed_model=embed_model) + ai_config = AIConfig( + host=ai_host, + model=model, + api_key=ai_key, + timeout_seconds=ai_timeout_seconds, + max_tokens=ai_max_tokens, + embed_model=embed_model, + ) logger = SessionLogger.create(log_file) if log_file else None if analyze or interactive: console.print(f"[cyan]AI:[/cyan] {AIClient(ai_config).summary()}") + runbook_store: RunbookStore | None = None + if runbooks_path is not None: + try: + runbook_store = RunbookStore(runbooks_path) + rb_count = runbook_store.count() + console.print(f"[dim]Runbooks: {rb_count} indexed at {runbooks_path}[/dim]") + except Exception as exc: # noqa: BLE001 + console.print(f"[yellow]Runbook store unavailable:[/yellow] {exc}") + try: asyncio.run( _async_main( @@ -178,6 +219,7 @@ def run( ai_config=ai_config, no_rag=no_rag, rag_debug=rag_debug, + runbook_store=runbook_store, logger=logger, ) ) @@ -202,6 +244,7 @@ async def _async_main( ai_config: AIConfig, no_rag: bool, rag_debug: bool, + runbook_store: RunbookStore | None, logger: SessionLogger | None, ) -> None: """Open a single SSH session and run probe / collection / analysis through it.""" @@ -249,7 +292,15 @@ async def _async_main( ) if analyze and report is not None: - _run_analysis(ai_config, req.issue, report, logger=logger) + _run_analysis( + ai_config, + req.issue, + report, + no_rag=no_rag, + rag_debug=rag_debug, + runbook_store=runbook_store, + logger=logger, + ) if interactive: await _interactive_loop( @@ -259,6 +310,7 @@ async def _async_main( 
report, no_rag=no_rag, rag_debug=rag_debug, + runbook_store=runbook_store, logger=logger, ) @@ -271,6 +323,7 @@ async def _interactive_loop( *, no_rag: bool = False, rag_debug: bool = False, + runbook_store: RunbookStore | None = None, logger: SessionLogger | None, ) -> None: """Run a follow-up loop for collecting and conversational analysis.""" @@ -421,6 +474,7 @@ async def _interactive_loop( prior_questions, embedded_chunks=embedded_chunks, rag_debug=rag_debug, + runbook_store=runbook_store, logger=logger, ) prior_questions.append("/analyze") @@ -477,6 +531,7 @@ async def _interactive_loop( prior_questions, embedded_chunks=embedded_chunks, rag_debug=rag_debug, + runbook_store=runbook_store, logger=logger, ) prior_questions.append(command) @@ -539,6 +594,9 @@ def _run_analysis( issue: str, report: CollectionReport, *, + no_rag: bool = False, + rag_debug: bool = False, + runbook_store: RunbookStore | None = None, logger: SessionLogger | None, ) -> None: """Send collected data to the AI and stream the analysis to stdout.""" @@ -547,12 +605,45 @@ def _run_analysis( console.print() ai = AIClient(ai_config) system_prompt = build_system_prompt() - user_message = build_user_message(issue, report) + runbook_chunks = _query_runbooks(runbook_store, issue, ai, top_k=1) + + user_message: str + if no_rag: + user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None) + else: + try: + chunks = chunk_report(report) + embedded = [EmbeddedChunk(chunk=c, embedding=ai.embed(c.content)) for c in chunks] + q_embedding = ai.embed(issue) + scored = retrieve_scored(q_embedding, embedded, top_k=3) + if rag_debug: + pairs = ", ".join( + f"{chunk.name}={score:.3f}" for chunk, score in scored + ) + console.print(f"[dim]RAG retrieve (initial):[/dim] {pairs or 'no matches'}") + selected = [chunk for chunk, _score in scored] + if selected: + user_message = build_analysis_message_with_chunks( + issue, + report.host, + selected, + runbook_chunks=runbook_chunks or None, 
+ ) + else: + user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None) + except Exception as exc: # noqa: BLE001 + console.print( + "[yellow]RAG unavailable for initial analysis; using full-context fallback.[/yellow]" + ) + if logger is not None: + logger.log_event("rag_index", {"status": "fallback", "error": str(exc)}) + user_message = build_user_message(issue, report, runbook_chunks=runbook_chunks or None) try: - chunks: list[str] = [] - for chunk in ai.stream(system_prompt, user_message): - chunks.append(chunk) - response = "".join(chunks) + response = _complete_ai_response( + ai, + system_prompt, + user_message, + ) console.print(Markdown(response)) warnings = validate_ai_response(response) @@ -596,6 +687,7 @@ def _run_followup_analysis( *, embedded_chunks: list[EmbeddedChunk] | None = None, rag_debug: bool = False, + runbook_store: RunbookStore | None = None, logger: SessionLogger | None, ) -> str: """Run grounded follow-up analysis re-anchored to current diagnostics. 
@@ -609,6 +701,7 @@ def _run_followup_analysis( console.print() ai = AIClient(ai_config) system_prompt = build_system_prompt() + runbook_chunks = _query_runbooks(runbook_store, question, ai, top_k=1) user_message: str retrieved_names: list[str] = [] @@ -620,7 +713,7 @@ def _run_followup_analysis( retrieval_start = perf_counter() try: q_embedding = ai.embed(question) - scored = retrieve_scored(q_embedding, embedded_chunks, top_k=5) + scored = retrieve_scored(q_embedding, embedded_chunks, top_k=3) retrieval_ms = (perf_counter() - retrieval_start) * 1000.0 retrieved_names = [chunk.name for chunk, _score in scored] retrieved_scores = [round(score, 4) for _chunk, score in scored] @@ -630,6 +723,7 @@ def _run_followup_analysis( [chunk for chunk, _score in scored], question, prior_questions, + runbook_chunks=runbook_chunks or None, ) if rag_debug: pairs = ", ".join( @@ -644,10 +738,16 @@ def _run_followup_analysis( "[yellow]RAG unavailable (query embedding failed); using full-context " "fallback.[/yellow]" ) - user_message = build_followup_message(issue, report, question, prior_questions) + user_message = build_followup_message( + issue, report, question, prior_questions, + runbook_chunks=runbook_chunks or None, + ) else: fallback_reason = "rag not indexed" - user_message = build_followup_message(issue, report, question, prior_questions) + user_message = build_followup_message( + issue, report, question, prior_questions, + runbook_chunks=runbook_chunks or None, + ) if logger is not None: logger.log_event( @@ -665,10 +765,11 @@ def _run_followup_analysis( ) try: - chunks: list[str] = [] - for chunk in ai.stream(system_prompt, user_message): - chunks.append(chunk) - response = "".join(chunks) + response = _complete_ai_response( + ai, + system_prompt, + user_message, + ) console.print(Markdown(response)) console.print(Rule(style="dim")) @@ -696,6 +797,150 @@ def _run_followup_analysis( raise typer.Exit(code=1) from exc +def _complete_ai_response( + ai: AIClient, + 
system_prompt: str, + user_message: str, +) -> str: + """Return a full AI completion in one request. + + Some local backends intermittently stall on streaming before yielding a first + token; using a non-streaming completion path is more reliable for CLI runs. + """ + return ai.complete(system_prompt, user_message).content + + +def _query_runbooks( + store: RunbookStore | None, + question: str, + ai: AIClient, + *, + top_k: int = 3, +) -> list[RunbookChunk]: + """Query the runbook store silently; returns empty list on any failure.""" + if store is None: + return [] + try: + return store.query(question, ai, top_k=top_k) + except Exception: # noqa: BLE001 + return [] + + +# --------------------------------------------------------------------------- +# runbooks sub-app +# --------------------------------------------------------------------------- + + +@runbooks_app.command("sync") +def runbooks_sync( + path: Annotated[ + str, + typer.Option("--path", help="Directory containing runbook Markdown files."), + ] = "./runbooks", + store_path: Annotated[ + str, + typer.Option("--store", help="ChromaDB store path. 
Defaults to ~/.tai/runbooks."), + ] = "~/.tai/runbooks", + ai_host: Annotated[ + str, + typer.Option("--ai-host", help="OpenAI-compatible AI backend URL."), + ] = DEFAULT_AI_HOST, + embed_model: Annotated[ + str, + typer.Option("--embed-model", help="Embedding model name."), + ] = DEFAULT_EMBED_MODEL, + ai_key: Annotated[ + str, + typer.Option("--ai-key", help="API key for the AI backend."), + ] = "ollama", +) -> None: + """Embed and index all runbooks from PATH into the persistent store.""" + from pathlib import Path + + runbooks_dir = Path(path).expanduser().resolve() + if not runbooks_dir.is_dir(): + console.print(f"[red]Directory not found:[/red] {runbooks_dir}") + raise typer.Exit(code=1) + + ai_config = AIConfig(host=ai_host, model="", api_key=ai_key, embed_model=embed_model) + ai = AIClient(ai_config) + + try: + store = RunbookStore(store_path) + count = store.sync(runbooks_dir, ai) + console.print(f"[green]✓ Synced {count} runbook(s)[/green] → {store_path}") + except Exception as exc: # noqa: BLE001 + console.print(f"[red]Sync failed:[/red] {exc}") + raise typer.Exit(code=1) from exc + + +@runbooks_app.command("list") +def runbooks_list( + store_path: Annotated[ + str, + typer.Option("--store", help="ChromaDB store path. 
Defaults to ~/.tai/runbooks."), + ] = "~/.tai/runbooks", +) -> None: + """List all indexed runbooks and their metadata.""" + try: + store = RunbookStore(store_path) + entries = store.list_indexed() + except Exception as exc: # noqa: BLE001 + console.print(f"[red]Could not open store:[/red] {exc}") + raise typer.Exit(code=1) from exc + + if not entries: + console.print("[yellow]No runbooks indexed.[/yellow] Run [bold]tai runbooks sync[/bold].") + return + + console.print(f"[bold]{len(entries)} indexed runbook(s):[/bold]") + for entry in sorted(entries, key=lambda e: e.get("title", "")): + title = entry.get("title", "?") + service = entry.get("service", "") + tags = entry.get("tags", "") + console.print(f" [green]{title}[/green] service={service} tags={tags}") + + +@runbooks_app.command("add") +def runbooks_add( + file: Annotated[str, typer.Argument(help="Path to a single runbook Markdown file.")], + store_path: Annotated[ + str, + typer.Option("--store", help="ChromaDB store path. Defaults to ~/.tai/runbooks."), + ] = "~/.tai/runbooks", + ai_host: Annotated[ + str, + typer.Option("--ai-host", help="OpenAI-compatible AI backend URL."), + ] = DEFAULT_AI_HOST, + embed_model: Annotated[ + str, + typer.Option("--embed-model", help="Embedding model name."), + ] = DEFAULT_EMBED_MODEL, + ai_key: Annotated[ + str, + typer.Option("--ai-key", help="API key for the AI backend."), + ] = "ollama", +) -> None: + """Embed and index a single runbook file into the persistent store.""" + from pathlib import Path + + runbook_path = Path(file).expanduser().resolve() + if not runbook_path.is_file(): + console.print(f"[red]File not found:[/red] {runbook_path}") + raise typer.Exit(code=1) + + ai_config = AIConfig(host=ai_host, model="", api_key=ai_key, embed_model=embed_model) + ai = AIClient(ai_config) + + try: + store = RunbookStore(store_path) + store.sync_single(runbook_path, ai) + console.print(f"[green]✓ Indexed[/green] {runbook_path.name} → {store_path}") + except Exception as exc: 
# noqa: BLE001 + console.print(f"[red]Add failed:[/red] {exc}") + raise typer.Exit(code=1) from exc + + def main() -> None: """Console script entrypoint.""" app() diff --git a/src/tai/plan.py b/src/tai/plan.py index e3c76a8..c6d6701 100644 --- a/src/tai/plan.py +++ b/src/tai/plan.py @@ -91,6 +91,7 @@ _KNOWN_SERVICES: list[str] = [ "docker", "containerd", "kubelet", + "sssd", "sshd", "postfix", "dovecot", @@ -107,6 +108,11 @@ _KNOWN_SERVICES: list[str] = [ "crond", "rsyslog", "auditd", + "selinux", + "apparmor", + "xorg", + "wayland", + "x2go", "firewalld", "haproxy", "varnish", @@ -121,6 +127,7 @@ _SERVICE_CONFIGS: dict[str, list[str]] = { "mysqld": ["/etc/my.cnf"], "mariadb": ["/etc/mysql/mariadb.conf.d/50-server.cnf"], "postgresql": ["/etc/postgresql"], + "sssd": ["/etc/sssd/sssd.conf"], "sshd": ["/etc/ssh/sshd_config"], "postfix": ["/etc/postfix/main.cf"], "haproxy": ["/etc/haproxy/haproxy.cfg"], @@ -128,6 +135,18 @@ _SERVICE_CONFIGS: dict[str, list[str]] = { "redis-server": ["/etc/redis/redis.conf"], "fail2ban": ["/etc/fail2ban/jail.conf"], "ufw": ["/etc/ufw/ufw.conf"], + "x2go": ["/etc/x2go"], +} + +_SERVICE_BINARIES: dict[str, list[str]] = { + "docker": ["/usr/bin/docker", "/usr/bin/dockerd"], + "sssd": ["/usr/sbin/sssd", "/usr/bin/sssctl"], + "sshd": ["/usr/sbin/sshd", "/usr/bin/ssh"], + "x2go": ["/usr/bin/x2golistsessions", "/usr/bin/x2goruncommand"], + "xorg": ["/usr/bin/Xorg", "/usr/bin/xrandr"], + "wayland": ["/usr/bin/wayland-info", "/usr/bin/Xwayland"], + "selinux": ["/usr/sbin/getenforce", "/usr/sbin/sestatus"], + "apparmor": ["/usr/sbin/aa-status", "/sbin/apparmor_parser"], } # --------------------------------------------------------------------------- @@ -200,6 +219,12 @@ def plan_from_request(request: TroubleshootRequest) -> CollectionPlan: if svc in seen: continue seen.add(svc) + plan.add( + f"unit-file-{svc}", + f"systemctl list-unit-files {svc}.service --no-pager --no-legend", + ) + for idx, binary_path in enumerate(_SERVICE_BINARIES.get(svc, 
[]), start=1): + plan.add(f"binary-{svc}-{idx}", f"ls -l {binary_path}") plan.add(f"service-{svc}", f"systemctl status {svc}") plan.add(f"journal-{svc}", f"journalctl -u {svc} -n 100 --no-pager") for cfg_path in _SERVICE_CONFIGS.get(svc, []): @@ -242,3 +267,5 @@ def _extract_services(issue: str) -> list[str]: if words & svc_words: found.append(svc) return found + + diff --git a/src/tai/prompt_builder.py b/src/tai/prompt_builder.py index 6094123..ede0607 100644 --- a/src/tai/prompt_builder.py +++ b/src/tai/prompt_builder.py @@ -4,6 +4,7 @@ from __future__ import annotations from tai.collectors import CollectionReport from tai.rag_retriever import Chunk +from tai.runbook_store import RunbookChunk _SYSTEM_PROMPT = """\ You are an expert Linux systems administrator and troubleshooting assistant. @@ -19,6 +20,8 @@ Important rules: - For every root-cause claim, quote at least one exact snippet from collected output in backticks. - If a command shows "could not be executed (SSH error)" it means the remote host blocked or rejected that specific command — it is not evidence about the service or system state. +- If service presence checks show a unit, binary, package, or config is missing, treat that as + evidence the component may be absent or not installed, not as proof that the component is broken. - If there is not enough data to diagnose the issue, say so plainly and list exactly what additional commands or log files would be needed. - Keep the response short. Skip sections that have nothing useful to say. @@ -28,18 +31,56 @@ Important rules: - Format with clear sections: **Root Cause**, **Evidence**, **Recommended Actions**. 
""" +_MAX_RUNBOOK_CHARS = 500 +_MAX_DIAGNOSTIC_CHUNK_CHARS = 700 + def build_system_prompt() -> str: """Return the static system prompt for the troubleshooting agent.""" return _SYSTEM_PROMPT.strip() -def build_user_message(issue: str, report: CollectionReport) -> str: +def _format_runbook_context(runbook_chunks: list[RunbookChunk]) -> str: + """Format retrieved runbook chunks as a Markdown context section.""" + lines: list[str] = ["## Runbook context\n"] + lines.append( + "The following runbooks are relevant to this issue. " + "Use them to ground your diagnosis and recommendations in known procedures.\n" + ) + for rb in runbook_chunks: + tag_str = f" — tags: {', '.join(rb.tags)}" if rb.tags else "" + content = rb.content.strip() + if len(content) > _MAX_RUNBOOK_CHARS: + content = content[:_MAX_RUNBOOK_CHARS].rstrip() + "\n...[truncated runbook context]" + lines.append(f"### Runbook: {rb.title} ({rb.service}){tag_str}\n") + lines.append(content) + lines.append("") + return "\n".join(lines) + + +def _format_diagnostic_chunk(content: str) -> str: + """Cap diagnostic chunk size before prompt injection.""" + text = content.strip() + if len(text) <= _MAX_DIAGNOSTIC_CHUNK_CHARS: + return text + return text[:_MAX_DIAGNOSTIC_CHUNK_CHARS].rstrip() + "\n...[truncated diagnostic context]" + + +def build_user_message( + issue: str, + report: CollectionReport, + *, + runbook_chunks: list[RunbookChunk] | None = None, +) -> str: """Format *issue* and *report* into the user message sent to the AI.""" lines: list[str] = [] lines.append(f"## Issue reported\n\n{issue}\n") lines.append(f"## Target host\n\n{report.host}\n") + + if runbook_chunks: + lines.append(_format_runbook_context(runbook_chunks)) + lines.append("## Collected diagnostics\n") skipped: list[str] = [] @@ -83,9 +124,11 @@ def build_followup_message( report: CollectionReport, question: str, prior_questions: list[str], + *, + runbook_chunks: list[RunbookChunk] | None = None, ) -> str: """Build a grounded follow-up 
message that re-anchors to diagnostics each turn.""" - base = build_user_message(issue, report) + base = build_user_message(issue, report, runbook_chunks=runbook_chunks) lines: list[str] = [base, "## Follow-up"] if prior_questions: @@ -112,11 +155,15 @@ def build_message_with_chunks( chunks: list[Chunk], question: str, prior_questions: list[str], + *, + runbook_chunks: list[RunbookChunk] | None = None, ) -> str: """Build a follow-up message using only semantically retrieved diagnostic chunks. Used by the RAG path: instead of sending the full report, only the top-k most relevant chunks are included, reducing token usage and focusing the AI. + If *runbook_chunks* are provided they are injected as a separate context + section before the follow-up question. """ lines: list[str] = [] lines.append(f"## Issue reported\n\n{issue}\n") @@ -125,9 +172,12 @@ def build_message_with_chunks( for chunk in chunks: lines.append(f"### {chunk.name}\n") - lines.append(chunk.content) + lines.append(_format_diagnostic_chunk(chunk.content)) lines.append("") + if runbook_chunks: + lines.append(_format_runbook_context(runbook_chunks)) + lines.append("## Follow-up") if prior_questions: @@ -146,3 +196,31 @@ def build_message_with_chunks( "hypothesis unless newly retrieved evidence directly contradicts it." 
) return "\n".join(lines) + + +def build_analysis_message_with_chunks( + issue: str, + host: str, + chunks: list[Chunk], + *, + runbook_chunks: list[RunbookChunk] | None = None, +) -> str: + """Build an initial analysis message from retrieved diagnostic chunks.""" + lines: list[str] = [] + lines.append(f"## Issue reported\n\n{issue}\n") + lines.append(f"## Target host\n\n{host}\n") + + if runbook_chunks: + lines.append(_format_runbook_context(runbook_chunks)) + + lines.append("## Most relevant diagnostics (retrieved by semantic similarity)\n") + for chunk in chunks: + lines.append(f"### {chunk.name}\n") + lines.append(_format_diagnostic_chunk(chunk.content)) + lines.append("") + + lines.append( + "Use the diagnostics above to provide an initial analysis. " + "If evidence is insufficient, state exactly what is missing." + ) + return "\n".join(lines) diff --git a/src/tai/runbook_store.py b/src/tai/runbook_store.py new file mode 100644 index 0000000..42778e6 --- /dev/null +++ b/src/tai/runbook_store.py @@ -0,0 +1,268 @@ +"""Persistent runbook knowledge base backed by ChromaDB (Tier 2). + +Runbooks are Markdown files with YAML-style frontmatter describing a service, +its typical symptoms, and tags used for retrieval matching. The store embeds +each runbook via AIClient and persists the collection so that queries across +sessions are instant (no re-embedding on startup). + +Typical flow +------------ +1. User runs ``tai runbooks sync --path ./runbooks`` once (or after adding files). +2. On each analysis turn, the store is queried with the user's question and the + top-k matching runbooks are injected as ``## Runbook context`` in the prompt. 
+""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from tai.ai_client import AIClient + +DEFAULT_STORE_PATH = "~/.tai/runbooks" +_COLLECTION_NAME = "tai_runbooks" + + +# --------------------------------------------------------------------------- +# Data types +# --------------------------------------------------------------------------- + + +@dataclass(slots=True) +class RunbookChunk: + """A retrieved runbook document ready for prompt injection.""" + + title: str + service: str + tags: list[str] + content: str + + +@dataclass +class RunbookMeta: + """Parsed frontmatter metadata from a runbook file.""" + + service: str = "" + symptoms: list[str] = field(default_factory=list) + tags: list[str] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Frontmatter parsing +# --------------------------------------------------------------------------- + +_FRONTMATTER_RE = re.compile(r"^\s*---\s*\n(.*?)\n---\s*\n", re.DOTALL) +_KV_RE = re.compile(r"^(\w+)\s*:\s*(.+)$") + + +def _parse_frontmatter(text: str) -> tuple[RunbookMeta, str]: + """Split YAML-style frontmatter from Markdown body. + + Supports simple ``key: value`` and ``key: item1, item2`` syntax only. + Returns ``(meta, body)``; if no frontmatter found, meta has empty fields. 
+ """ + meta = RunbookMeta() + match = _FRONTMATTER_RE.match(text) + if not match: + return meta, text + + for line in match.group(1).splitlines(): + kv = _KV_RE.match(line.strip()) + if not kv: + continue + key, value = kv.group(1).lower(), kv.group(2).strip() + if key == "service": + meta.service = value + elif key == "symptoms": + meta.symptoms = [s.strip() for s in value.split(",") if s.strip()] + elif key == "tags": + meta.tags = [t.strip() for t in value.split(",") if t.strip()] + + body = text[match.end():] + return meta, body + + +# --------------------------------------------------------------------------- +# RunbookStore +# --------------------------------------------------------------------------- + + +class RunbookStore: + """ChromaDB-backed store for runbook documents. + + Parameters + ---------- + store_path: + Directory where ChromaDB persists its data. + Defaults to ``~/.tai/runbooks``. + """ + + def __init__(self, store_path: str | Path = DEFAULT_STORE_PATH) -> None: + import chromadb # optional dep — imported lazily + + path = Path(store_path).expanduser().resolve() + path.mkdir(parents=True, exist_ok=True) + settings = None + try: + from chromadb.config import Settings + + settings = Settings( + anonymized_telemetry=False, + chroma_product_telemetry_impl="tai.chroma_telemetry.NoOpProductTelemetryClient", + chroma_telemetry_impl="tai.chroma_telemetry.NoOpProductTelemetryClient", + ) + except (ImportError, ModuleNotFoundError): + # Test doubles may replace `chromadb` with a lightweight mock that + # does not expose the real config module. 
+ settings = None + + if settings is None: + self._client = chromadb.PersistentClient(path=str(path)) + else: + self._client = chromadb.PersistentClient(path=str(path), settings=settings) + self._collection = self._client.get_or_create_collection( + name=_COLLECTION_NAME, + metadata={"hnsw:space": "cosine"}, + ) + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def sync(self, runbooks_dir: Path, ai: AIClient) -> int: + """Embed and upsert all ``*.md`` files from *runbooks_dir*. + + Existing documents with the same ID are updated if the file content + has changed. Returns the number of runbooks successfully indexed. + """ + runbooks_dir = Path(runbooks_dir).expanduser().resolve() + if not runbooks_dir.is_dir(): + raise FileNotFoundError(f"Runbooks directory not found: {runbooks_dir}") + + files = sorted(runbooks_dir.glob("*.md")) + if not files: + return 0 + + ids: list[str] = [] + documents: list[str] = [] + embeddings: list[list[float]] = [] + metadatas: list[dict[str, str]] = [] + + for path in files: + raw = path.read_text(encoding="utf-8") + meta, body = _parse_frontmatter(raw) + + # Embed the full document (frontmatter stripped) for semantic recall + embed_text = _build_embed_text(path.stem, meta, body) + embedding = ai.embed(embed_text) + + ids.append(path.stem) + documents.append(body.strip()) + embeddings.append(embedding) + metadatas.append( + { + "title": path.stem, + "service": meta.service, + "tags": ", ".join(meta.tags), + "symptoms": ", ".join(meta.symptoms), + } + ) + + self._collection.upsert( + ids=ids, + documents=documents, + embeddings=embeddings, + metadatas=metadatas, + ) + return len(ids) + + def sync_single(self, runbook_path: Path, ai: AIClient) -> None: + """Embed and upsert a single runbook file.""" + path = Path(runbook_path).expanduser().resolve() + if not path.is_file(): + raise FileNotFoundError(f"Runbook not found: 
{path}") + + raw = path.read_text(encoding="utf-8") + meta, body = _parse_frontmatter(raw) + embed_text = _build_embed_text(path.stem, meta, body) + embedding = ai.embed(embed_text) + + self._collection.upsert( + ids=[path.stem], + documents=[body.strip()], + embeddings=[embedding], + metadatas=[ + { + "title": path.stem, + "service": meta.service, + "tags": ", ".join(meta.tags), + "symptoms": ", ".join(meta.symptoms), + } + ], + ) + + def query(self, question: str, ai: AIClient, *, top_k: int = 3) -> list[RunbookChunk]: + """Return the *top_k* most relevant runbooks for *question*. + + Returns an empty list if the collection is empty. Exceptions raised by + ``ai.embed`` or ChromaDB are not caught here — callers must handle them. + """ + if self._collection.count() == 0: + return [] + + q_embedding = ai.embed(question) + results = self._collection.query( + query_embeddings=[q_embedding], + n_results=min(top_k, self._collection.count()), + include=["documents", "metadatas"], + ) + + chunks: list[RunbookChunk] = [] + docs = results.get("documents") or [] + metas = results.get("metadatas") or [] + for doc_list, meta_list in zip(docs, metas, strict=False): + for doc, meta in zip(doc_list, meta_list, strict=False): + chunks.append( + RunbookChunk( + title=str(meta.get("title", "")), + service=str(meta.get("service", "")), + tags=[t.strip() for t in str(meta.get("tags", "")).split(",") if t.strip()], + content=doc, + ) + ) + return chunks + + def list_indexed(self) -> list[dict[str, str]]: + """Return metadata for all indexed runbooks.""" + if self._collection.count() == 0: + return [] + results = self._collection.get(include=["metadatas"]) + metas = results.get("metadatas") or [] + return [dict(m) for m in metas] + + def count(self) -> int: + """Return the number of indexed runbook documents.""" + return self._collection.count() + + +# --------------------------------------------------------------------------- +# Helpers +# 
--------------------------------------------------------------------------- + + +def _build_embed_text(title: str, meta: RunbookMeta, body: str) -> str: + """Build the text to embed for a runbook — combines signals for best recall.""" + parts: list[str] = [f"title: {title}"] + if meta.service: + parts.append(f"service: {meta.service}") + if meta.symptoms: + parts.append(f"symptoms: {', '.join(meta.symptoms)}") + if meta.tags: + parts.append(f"tags: {', '.join(meta.tags)}") + # Append a stripped excerpt of the body for additional signal + body_excerpt = body.strip()[:800] + parts.append(body_excerpt) + return "\n".join(parts) diff --git a/tests/test_ai.py b/tests/test_ai.py index 9446823..f37de7d 100644 --- a/tests/test_ai.py +++ b/tests/test_ai.py @@ -174,6 +174,7 @@ def test_build_system_prompt_contains_key_instructions() -> None: assert "Evidence" in prompt assert "Recommended Actions" in prompt assert "read-only" in prompt.lower() + assert "absent or not installed" in prompt def test_build_user_message_contains_issue_and_host() -> None: diff --git a/tests/test_cli.py b/tests/test_cli.py index c2bc72b..2f05dd7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace from unittest.mock import AsyncMock, MagicMock from typer.testing import CliRunner @@ -31,7 +32,7 @@ def test_run_command_prints_scaffold_summary() -> None: result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "web01", "--port", @@ -62,7 +63,7 @@ def test_probe_success_prints_remote_output_by_default(monkeypatch) -> None: # runner = CliRunner() result = runner.invoke( app, - ["apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"], + ["run", "apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"], ) assert result.exit_code == 0 @@ -84,7 +85,7 @@ def test_probe_failure_returns_non_zero(monkeypatch) -> None: # type: ignore[no runner = CliRunner() result = runner.invoke( app, - 
["apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"], + ["run", "apache failed", "--host", "ssh.archflux.net", "--port", "5566", "--probe"], ) assert result.exit_code == 1 @@ -126,7 +127,7 @@ def test_collect_success_prints_summary(monkeypatch) -> None: # type: ignore[no result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "ssh.archflux.net", "--port", @@ -172,7 +173,7 @@ def test_interactive_collect_then_quit(monkeypatch) -> None: # type: ignore[no- result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "ssh.archflux.net", "--port", @@ -210,8 +211,8 @@ def test_interactive_unknown_command_prints_hint(monkeypatch) -> None: # type: commands = iter(["what should I check next?", "/quit"]) monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan) monkeypatch.setattr( - "tai.cli.AIClient.stream", - lambda *_args, **_kwargs: iter(["Check logs."]), + "tai.cli.AIClient.complete", + lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."), ) monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands)) monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True) @@ -220,7 +221,7 @@ def test_interactive_unknown_command_prints_hint(monkeypatch) -> None: # type: result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "ssh.archflux.net", "--port", @@ -257,7 +258,10 @@ def test_interactive_prints_rag_fallback_notice_on_index_failure(monkeypatch) -> commands = iter(["what should I check next?", "/quit"]) monkeypatch.setattr("tai.cli.collect_from_plan", fake_collect_from_plan) monkeypatch.setattr("tai.cli._try_embed_report", lambda *_args: (None, "embed failed", 1.0)) - monkeypatch.setattr("tai.cli.AIClient.stream", lambda *_args, **_kwargs: iter(["Check logs."])) + monkeypatch.setattr( + "tai.cli.AIClient.complete", + lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."), + ) 
monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands)) monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True) @@ -265,7 +269,7 @@ def test_interactive_prints_rag_fallback_notice_on_index_failure(monkeypatch) -> result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "ssh.archflux.net", "--port", @@ -310,7 +314,10 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None: # ), ) monkeypatch.setattr("tai.cli.AIClient.embed", lambda *_args, **_kwargs: [1.0, 0.0]) - monkeypatch.setattr("tai.cli.AIClient.stream", lambda *_args, **_kwargs: iter(["Check logs."])) + monkeypatch.setattr( + "tai.cli.AIClient.complete", + lambda *_args, **_kwargs: SimpleNamespace(content="Check logs."), + ) monkeypatch.setattr("tai.cli.console.input", lambda _prompt: next(commands)) monkeypatch.setattr("tai.cli._stdin_is_tty", lambda: True) @@ -318,7 +325,7 @@ def test_interactive_rag_debug_prints_retrieval_scores(monkeypatch) -> None: # result = runner.invoke( app, [ - "apache failed", + "run", "apache failed", "--host", "ssh.archflux.net", "--port", diff --git a/tests/test_plan.py b/tests/test_plan.py index 94016e2..4fdf556 100644 --- a/tests/test_plan.py +++ b/tests/test_plan.py @@ -80,6 +80,7 @@ def test_nginx_in_issue_adds_nginx_service_commands() -> None: plan = plan_from_request(_req("nginx is failing to start")) names = _names(plan) cmds = _commands(plan) + assert "unit-file-nginx" in names assert "service-nginx" in names assert "journal-nginx" in names assert any("systemctl status nginx" in c for c in cmds) @@ -98,6 +99,30 @@ def test_sshd_adds_config_cat() -> None: assert any("cat /etc/ssh/sshd_config" in c for c in cmds) +def test_sssd_in_issue_adds_presence_service_and_config_commands() -> None: + plan = plan_from_request(_req("troubleshoot sssd login failures")) + names = _names(plan) + cmds = _commands(plan) + assert "unit-file-sssd" in names + assert "binary-sssd-1" in names + assert "service-sssd" in 
names + assert "journal-sssd" in names + assert any("cat /etc/sssd/sssd.conf" in c for c in cmds) + assert any("ls -l /usr/sbin/sssd" in c for c in cmds) + assert any("list-unit-files sssd.service" in c for c in cmds) + + +def test_docker_presence_probe_checks_package_and_binary() -> None: + plan = plan_from_request(_req("docker daemon not running")) + names = _names(plan) + cmds = _commands(plan) + assert "unit-file-docker" in names + assert "binary-docker-1" in names + assert "binary-docker-2" in names + assert any("ls -l /usr/bin/docker" in c for c in cmds) + assert any("ls -l /usr/bin/dockerd" in c for c in cmds) + + def test_unknown_service_name_no_config_cat() -> None: plan = plan_from_request(_req("myweirdapp service crashed")) cmds = _commands(plan) diff --git a/tests/test_runbook_store.py b/tests/test_runbook_store.py new file mode 100644 index 0000000..a6afb7b --- /dev/null +++ b/tests/test_runbook_store.py @@ -0,0 +1,253 @@ +"""Tests for runbook_store — no network calls, ChromaDB mocked.""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from tai.runbook_store import ( + RunbookChunk, + RunbookMeta, + RunbookStore, + _build_embed_text, + _parse_frontmatter, +) + +# --------------------------------------------------------------------------- +# _parse_frontmatter +# --------------------------------------------------------------------------- + + +def test_parse_frontmatter_extracts_service() -> None: + text = "---\nservice: nginx\n---\n## Body\nsome content" + meta, body = _parse_frontmatter(text) + assert meta.service == "nginx" + assert "## Body" in body + + +def test_parse_frontmatter_extracts_tags_as_list() -> None: + text = "---\ntags: nginx, web, http\n---\nbody" + meta, body = _parse_frontmatter(text) + assert meta.tags == ["nginx", "web", "http"] + + +def test_parse_frontmatter_extracts_symptoms_as_list() -> None: + text = "---\nsymptoms: 502 Bad Gateway, upstream 
refused\n---\nbody" + meta, body = _parse_frontmatter(text) + assert meta.symptoms == ["502 Bad Gateway", "upstream refused"] + + +def test_parse_frontmatter_returns_empty_meta_when_missing() -> None: + text = "# Just a heading\nno frontmatter here" + meta, body = _parse_frontmatter(text) + assert meta.service == "" + assert meta.tags == [] + assert meta.symptoms == [] + assert "Just a heading" in body + + +def test_parse_frontmatter_body_strips_delimiter() -> None: + text = "---\nservice: ssh\n---\nBody starts here." + _, body = _parse_frontmatter(text) + assert body.strip() == "Body starts here." + + +# --------------------------------------------------------------------------- +# _build_embed_text +# --------------------------------------------------------------------------- + + +def test_build_embed_text_includes_title_and_service() -> None: + meta = RunbookMeta(service="nginx", symptoms=["502"], tags=["web"]) + result = _build_embed_text("nginx", meta, "body content") + assert "title: nginx" in result + assert "service: nginx" in result + + +def test_build_embed_text_includes_symptoms_and_tags() -> None: + meta = RunbookMeta(service="nginx", symptoms=["502 Bad Gateway"], tags=["web", "http"]) + result = _build_embed_text("nginx", meta, "body") + assert "502 Bad Gateway" in result + assert "web" in result + + +def test_build_embed_text_includes_body_excerpt() -> None: + meta = RunbookMeta() + result = _build_embed_text("disk", meta, "check df -h output") + assert "check df -h output" in result + + +def test_build_embed_text_truncates_long_body() -> None: + meta = RunbookMeta() + long_body = "x" * 2000 + result = _build_embed_text("disk", meta, long_body) + # Body excerpt is capped at 800 chars + assert len(result) < 1500 + + +# --------------------------------------------------------------------------- +# RunbookStore — unit tests using tmp_path and mocked chromadb +# --------------------------------------------------------------------------- + + +def 
_make_chromadb_mock() -> MagicMock: + """Return a chromadb mock that satisfies RunbookStore internals.""" + collection = MagicMock() + collection.count.return_value = 0 + client = MagicMock() + client.get_or_create_collection.return_value = collection + chroma_mod = MagicMock() + chroma_mod.PersistentClient.return_value = client + return chroma_mod + + +def _make_ai_mock(embedding: list[float] | None = None) -> MagicMock: + ai = MagicMock() + ai.embed.return_value = embedding or [0.1, 0.2, 0.3] + return ai + + +def test_runbook_store_sync_returns_count(tmp_path: Path) -> None: + (tmp_path / "nginx.md").write_text( + "---\nservice: nginx\ntags: web\nsymptoms: 502\n---\n## Body\ncontent" + ) + (tmp_path / "ssh.md").write_text( + "---\nservice: ssh\ntags: ssh\nsymptoms: refused\n---\n## Body\ncontent" + ) + + chroma_mock = _make_chromadb_mock() + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + count = store.sync(tmp_path, ai) + + assert count == 2 + + +def test_runbook_store_sync_calls_upsert(tmp_path: Path) -> None: + (tmp_path / "nginx.md").write_text("---\nservice: nginx\n---\nbody") + + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + store.sync(tmp_path, ai) + + collection.upsert.assert_called_once() + call_kwargs = collection.upsert.call_args.kwargs + assert "nginx" in call_kwargs["ids"] + + +def test_runbook_store_sync_empty_dir_returns_zero(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + count = store.sync(tmp_path, ai) + + assert count == 0 + + +def test_runbook_store_sync_missing_dir_raises(tmp_path: Path) -> None: + 
chroma_mock = _make_chromadb_mock() + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + with pytest.raises(FileNotFoundError): + store.sync(tmp_path / "nonexistent", ai) + + +def test_runbook_store_query_returns_empty_when_no_docs(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + # collection.count() returns 0 by default in our mock + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + results = store.query("disk full", ai) + + assert results == [] + + +def test_runbook_store_query_returns_runbook_chunks(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + collection.count.return_value = 2 + collection.query.return_value = { + "documents": [["## Body\ncheck df -h"]], + "metadatas": [ + [{"title": "disk", "service": "disk", "tags": "disk, storage", "symptoms": "full"}] + ], + } + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + results = store.query("disk is full", ai) + + assert len(results) == 1 + assert isinstance(results[0], RunbookChunk) + assert results[0].title == "disk" + assert results[0].service == "disk" + assert "disk" in results[0].tags + assert "df -h" in results[0].content + + +def test_runbook_store_list_indexed_returns_metadata(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + collection.count.return_value = 1 + collection.get.return_value = { + "metadatas": [{"title": "nginx", "service": "nginx", "tags": "web", "symptoms": "502"}] + } + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + entries = store.list_indexed() + + assert 
len(entries) == 1 + assert entries[0]["title"] == "nginx" + + +def test_runbook_store_count_delegates_to_collection(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + collection.count.return_value = 5 + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + assert store.count() == 5 + + +def test_runbook_store_sync_single_upserts_one(tmp_path: Path) -> None: + runbook = tmp_path / "nginx.md" + runbook.write_text("---\nservice: nginx\ntags: web\n---\nbody text") + + chroma_mock = _make_chromadb_mock() + collection = chroma_mock.PersistentClient.return_value.get_or_create_collection.return_value + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + store.sync_single(runbook, ai) + + collection.upsert.assert_called_once() + call_kwargs = collection.upsert.call_args.kwargs + assert call_kwargs["ids"] == ["nginx"] + + +def test_runbook_store_sync_single_missing_file_raises(tmp_path: Path) -> None: + chroma_mock = _make_chromadb_mock() + ai = _make_ai_mock() + + with patch.dict("sys.modules", {"chromadb": chroma_mock}): + store = RunbookStore(tmp_path / "store") + with pytest.raises(FileNotFoundError): + store.sync_single(tmp_path / "missing.md", ai)