From b58a4e46920b494b3fd5b490e864b30683819d38 Mon Sep 17 00:00:00 2001 From: Storme-bit Date: Mon, 27 Apr 2026 20:17:05 -0700 Subject: [PATCH] minor clean up --- docs/roadmap.md | 2 +- package-lock.json | 3 +- packages/embedding-service/CLAUDE.md | 64 ++++++++++++++++ packages/embedding-service/package.json | 3 +- packages/embedding-service/src/index.js | 12 ++- packages/inference-service/CLAUDE.md | 75 +++++++++++++++++++ packages/inference-service/src/index.js | 2 +- .../inference-service/src/providers/ollama.js | 11 ++- .../inference-service/src/routes/inference.js | 4 +- .../memory-service/src/entities/extraction.js | 1 + packages/memory-service/src/episodic/index.js | 8 +- packages/memory-service/src/index.js | 2 +- packages/orchestration-service/src/index.js | 2 +- 13 files changed, 171 insertions(+), 18 deletions(-) create mode 100644 packages/embedding-service/CLAUDE.md create mode 100644 packages/inference-service/CLAUDE.md diff --git a/docs/roadmap.md b/docs/roadmap.md index 680c5ae..37735b5 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -57,7 +57,7 @@ ## Phase 2 — Memory System Upgrades *The core intelligence layer* -### 1. Knowledge Graph (SQLite) +### 1. Knowledge Graph (SQLite) ✅ The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things." 
- [x] Graph schema — `nodes` and `edges` tables with typed relationships - [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2) diff --git a/package-lock.json b/package-lock.json index 559afc9..a9e9063 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4224,8 +4224,7 @@ "dependencies": { "@nexusai/shared": "^1.0.0", "dotenv": "^17.4.0", - "express": "^5.2.1", - "ollama": "^0.6.3" + "express": "^5.2.1" } }, "packages/inference-service": { diff --git a/packages/embedding-service/CLAUDE.md b/packages/embedding-service/CLAUDE.md new file mode 100644 index 0000000..d132b05 --- /dev/null +++ b/packages/embedding-service/CLAUDE.md @@ -0,0 +1,64 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout. + +## Running This Service + +```bash +npm run embedding # From repo root +npm -w packages/embedding-service run dev # With --watch +``` + +Default port: **3003**. Requires Ollama to be reachable at `OLLAMA_URL`. + +## Single-File Service + +The entire service is `src/index.js` — no subdirectory structure. All routes, the Ollama helper, and startup are in one file. + +## Environment Variables + +| Variable | Default | Description | +|---|---|---| +| `PORT` | `3003` | Port to listen on | +| `OLLAMA_URL` | `http://localhost:11434` | Ollama instance URL | +| `EMBEDDING_MODEL` | `nomic-embed-text` | Model passed to Ollama `/api/embed` | + +Note: the env var name is `EMBEDDING_MODEL`, not `EMBED_MODEL` — the internal constant is `EMBED_MODEL` but the lookup key is different. + +## Ollama API Details + +Uses Ollama's `/api/embed` endpoint (not `/api/embeddings`). Request shape: + +```json +{ "model": "nomic-embed-text", "input": "text to embed" } +``` + +Ollama returns `{ "embeddings": [[...]] }` — an array of arrays even for a single input. 
The helper takes `data.embeddings[0]` to return the single vector. + +The `ollama` npm package is **not used** and is no longer listed as a dependency — all calls are raw `fetch`. Do not refactor to use the package without checking the API shape matches. + +## Batch Endpoint + +`POST /embed/batch` embeds items **sequentially** in a for-loop, not in parallel. This is deliberate: Ollama doesn't parallelise embedding calls, so parallel requests would queue internally anyway. Do not change to `Promise.all` without verifying Ollama behaviour. + +## Error Responses + +| Condition | Status | Notes | +|---|---|---| +| Missing/empty `text` | 400 | | +| Ollama call fails | 502 | Upstream failure — correct status | +| Empty `texts` array | 400 | | + +## Known Issue (Resolved) + +The 400 error message for `/embed` previously read `"text is required and must be empty"` — the word "not" was missing. It now reads `"text is required and must not be empty"`. + +## API Endpoints + +| Method | Path | Notes | +|---|---|---| +| GET | `/health` | Static response — does not verify Ollama is reachable | +| POST | `/embed` | Body: `{ text: string }`. Returns `{ embedding, model, dimensions }` | +| POST | `/embed/batch` | Body: `{ texts: string[] }`. 
Returns `{ embeddings, model, dimensions, count }` | diff --git a/packages/embedding-service/package.json b/packages/embedding-service/package.json index d9d7104..f72ff46 100644 --- a/packages/embedding-service/package.json +++ b/packages/embedding-service/package.json @@ -9,7 +9,6 @@ "dependencies": { "@nexusai/shared": "^1.0.0", "dotenv": "^17.4.0", - "express": "^5.2.1", - "ollama": "^0.6.3" + "express": "^5.2.1" } } diff --git a/packages/embedding-service/src/index.js b/packages/embedding-service/src/index.js index 74e4077..bd4fbd5 100644 --- a/packages/embedding-service/src/index.js +++ b/packages/embedding-service/src/index.js @@ -3,7 +3,7 @@ const express = require('express'); const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared'); const app = express(); -app.use(express.json()); +app.use(express.json({ limit: '1mb' })); // limit request body to 1mb to prevent abuse - embedding requests should be small const PORT = getEnv('PORT', PORTS.EMBEDDING); const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL); @@ -14,7 +14,8 @@ async function embedText(text) { const res = await fetch(`${OLLAMA_URL}/api/embed`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ model: EMBED_MODEL, input: text }) + body: JSON.stringify({ model: EMBED_MODEL, input: text }), + signal: AbortSignal.timeout(30_000), }); if (!res.ok) { @@ -37,7 +38,7 @@ app.get('/health', (req,res) => { app.post('/embed', async (req, res) => { const { text } = req.body; if (!text || typeof text !== 'string' || text.trim() === '') { - return res.status(400).json({ error: 'text is required and must be empty' }); + return res.status(400).json({ error: 'text is required and must not be empty' }); } try { @@ -60,7 +61,10 @@ app.post('/embed/batch', async (req, res) => { } try { - //sequential embedding for now, Ollama doesn't natively parallize embeddings + const invalid = texts.findIndex(t => !t || typeof t !== 'string' || t.trim() === ''); + if (invalid 
!== -1) + return res.status(400).json({ error: `texts[${invalid}] is empty or not a string` }); + const embeddings = []; for (const text of texts) { embeddings.push(await embedText(text.trim())); diff --git a/packages/inference-service/CLAUDE.md b/packages/inference-service/CLAUDE.md new file mode 100644 index 0000000..c781eba --- /dev/null +++ b/packages/inference-service/CLAUDE.md @@ -0,0 +1,75 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout. + +## Running This Service + +```bash +npm run inference # From repo root +npm -w packages/inference-service run dev # With --watch +``` + +Default port: **3001**. Set `INFERENCE_PROVIDER` to select the backend. + +## Provider Pattern + +`src/infer.js` reads `INFERENCE_PROVIDER` at startup and loads one of two providers: + +| `INFERENCE_PROVIDER` | Module | Backend | +|---|---|---| +| `ollama` (default) | `src/providers/ollama.js` | Ollama npm client → `/api/generate` | +| `llamacpp` | `src/providers/llamacpp.js` | Raw fetch → `/v1/chat/completions` (OpenAI-compatible) | + +An unknown provider throws immediately at startup — fail-fast, not at request time. + +Both providers export the same interface: `complete(prompt, options)` and `completeStream(prompt, options)`. + +## Environment Variables + +| Variable | Default | Description | +|---|---|---| +| `PORT` | `3001` | Port to listen on | +| `INFERENCE_PROVIDER` | `ollama` | `ollama` or `llamacpp` | +| `INFERENCE_URL` | `http://localhost:11434` (Ollama) / `http://localhost:8080` (llama.cpp) | Backend URL | +| `DEFAULT_MODEL` | Provider-specific | Model name passed to backend | + +`INFERENCE_URL` defaults differ per provider — Ollama uses the Ollama default URL, llama.cpp uses the llama-server default. 
+ +## Options Resolution + +Both providers use `resolveOptions(options)` to merge caller-supplied options with `INFERENCE_DEFAULTS` from shared constants. Any option not supplied by the caller falls back to the constant. + +## Streaming Chunk Format + +The two providers yield differently shaped chunks — the route in `src/routes/inference.js` normalises them: + +**Ollama** yields raw Ollama generate chunks (`{ response, done, model, ... }`) per token, then a normalised final chunk `{ response: '', done: true, model, tokenCount }` where token count is the sum of `eval_count + prompt_eval_count` + +**llama.cpp** yields: +- Per-token: `{ response: delta, done: false }` +- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `completion_tokens + prompt_tokens` from the usage chunk + +The route checks `chunk.response` to stream text and `chunk.done` to capture metadata. The route only reads `chunk.tokenCount` from the done chunk, so the Ollama provider converts Ollama's `eval_count`/`prompt_eval_count` into `tokenCount` itself; before that conversion was added, Ollama streaming calls always reported `tokenCount: 0` to the client. + +## Resolved Issue: `maxTokens` Missing from Streaming Route + +`POST /complete` destructures `maxTokens` from the request body and passes it through. `POST /complete/stream` previously did **not** — it omitted `maxTokens` from its destructuring, so streaming completions always used `INFERENCE_DEFAULTS.MAX_TOKENS` regardless of what the caller sent, giving `/chat/stream` a different effective token ceiling than `/chat`. The streaming route now destructures and forwards `maxTokens` as well. + +## SSE Format (route → caller) + +``` +data: {"response":"Hello"} ← per token +data: {"response":" world"} +data: {"done":true,"model":"...","tokenCount":42} ← final metadata +data: [DONE] ← sentinel +``` + +## API Endpoints + +| Method | Path | Notes | +|---|---|---| +| GET | `/health` | Returns `{ service, status, provider, model }` | +| POST | `/complete` | Body: `{ prompt, model?, temperature?, maxTokens?, topP?, topK?, repeatPenalty? 
}` | +| POST | `/complete/stream` | Same body as `/complete`, including `maxTokens` | diff --git a/packages/inference-service/src/index.js b/packages/inference-service/src/index.js index e85dac8..55169cc 100644 --- a/packages/inference-service/src/index.js +++ b/packages/inference-service/src/index.js @@ -4,7 +4,7 @@ const {getEnv, PORTS, OLLAMA, logger} = require('@nexusai/shared'); const inferenceRouter = require('./routes/inference'); const app = express(); -app.use(express.json()); +app.use(express.json({ limit: '8mb' })); // prompts include full context window const PORT = getEnv('PORT', PORTS.INFERENCE); const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama'); diff --git a/packages/inference-service/src/providers/ollama.js b/packages/inference-service/src/providers/ollama.js index 7bda6f2..2355583 100644 --- a/packages/inference-service/src/providers/ollama.js +++ b/packages/inference-service/src/providers/ollama.js @@ -57,7 +57,16 @@ async function* completeStream(prompt, options = {} ) { }); for await (const chunk of stream) { - yield chunk; + if (chunk.done) { + yield { + response: '', + done: true, + model: chunk.model, + tokenCount: (chunk.eval_count ?? 0) + (chunk.prompt_eval_count ?? 
0), + }; + } else { + yield chunk; + } } } diff --git a/packages/inference-service/src/routes/inference.js b/packages/inference-service/src/routes/inference.js index f245e4d..3442bfd 100644 --- a/packages/inference-service/src/routes/inference.js +++ b/packages/inference-service/src/routes/inference.js @@ -23,7 +23,7 @@ router.post('/complete', async (req, res) => { // Streaming completion endpoint - sends partial responses as they arrive router.post('/complete/stream', async (req, res) => { - const { prompt, model, temperature, topP, topK, repeatPenalty } = req.body; + const { prompt, model, temperature, maxTokens, topP, topK, repeatPenalty } = req.body; if (!prompt) return res.status(400).json({ error: 'prompt is required' }); @@ -35,7 +35,7 @@ router.post('/complete/stream', async (req, res) => { let lastModel = model; let tokenCount = 0; - for await (const chunk of completeStream(prompt, { model, temperature, topP, topK, repeatPenalty })) { + for await (const chunk of completeStream(prompt, { model, temperature, maxTokens,topP, topK, repeatPenalty })) { if (chunk.response) { res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`); } diff --git a/packages/memory-service/src/entities/extraction.js b/packages/memory-service/src/entities/extraction.js index ff860f3..cd6a1f7 100644 --- a/packages/memory-service/src/entities/extraction.js +++ b/packages/memory-service/src/entities/extraction.js @@ -87,6 +87,7 @@ async function extractAndStoreEntities(userMessage, aiResponse, episodeId=null, num_predict: ENTITIES.NUM_PREDICT, }, }), + signal: AbortSignal.timeout(60_000), }); if (!res.ok) throw new Error(`Ollama responded ${res.status}`); diff --git a/packages/memory-service/src/episodic/index.js b/packages/memory-service/src/episodic/index.js index c90c347..0d73f80 100644 --- a/packages/memory-service/src/episodic/index.js +++ b/packages/memory-service/src/episodic/index.js @@ -170,6 +170,7 @@ function getRecentEpisodes(sessionId, limit = 
EPISODIC.DEFAULT_RECENT_LIMIT) { // Searches episodes using FTS5 full-text search, ordered by relevance, with a limit function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) { const db = getDB(); + const safeQuery = `"${query.replace(/"/g, '""')}"`; if (sessionIds && sessionIds.length > 0) { const ph = sessionIds.map(() => '?').join(','); return db.prepare(` @@ -179,7 +180,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds AND e.session_id IN (${ph}) ORDER BY rank LIMIT ? - `).all(query, ...sessionIds, limit).map(parseRow); + `).all(safeQuery, ...sessionIds, limit).map(parseRow); } return db.prepare(` SELECT e.* FROM episodes e @@ -187,7 +188,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds WHERE episodes_fts MATCH ? ORDER BY rank LIMIT ? - `).all(query, limit).map(parseRow); + `).all(safeQuery, limit).map(parseRow); } // Deletes an episode by its ID @@ -206,7 +207,8 @@ async function getEpisodeEmbedding(userMessage, aiResponse){ const res = await fetch(`${url}/embed`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ text }) + body: JSON.stringify({ text }), + signal: AbortSignal.timeout(30_000), }) if (!res.ok) { diff --git a/packages/memory-service/src/index.js b/packages/memory-service/src/index.js index 91c670d..9b6f054 100644 --- a/packages/memory-service/src/index.js +++ b/packages/memory-service/src/index.js @@ -12,7 +12,7 @@ const semantic = require('./semantic'); const entities = require('./entities'); const app = express(); -app.use(express.json()); +app.use(express.json({ limit: '2mb' })); const PORT = getEnv('PORT', PORTS.MEMORY); diff --git a/packages/orchestration-service/src/index.js b/packages/orchestration-service/src/index.js index 5b9102f..85354de 100644 --- a/packages/orchestration-service/src/index.js +++ b/packages/orchestration-service/src/index.js @@ -15,7 +15,7 @@ const summariesRouter = 
require('./routes/summaries') const cors = require('cors'); const app = express(); -app.use(express.json()); +app.use(express.json({ limit: '2mb' })); app.use(cors({ origin: [