From b58a4e46920b494b3fd5b490e864b30683819d38 Mon Sep 17 00:00:00 2001
From: Storme-bit <tk.stomre@gmail.com>
Date: Mon, 27 Apr 2026 20:17:05 -0700
Subject: [PATCH] minor clean up

---
 docs/roadmap.md                               |  2 +-
 package-lock.json                             |  3 +-
 packages/embedding-service/CLAUDE.md          | 64 ++++++++++++++++
 packages/embedding-service/package.json       |  3 +-
 packages/embedding-service/src/index.js       | 12 ++-
 packages/inference-service/CLAUDE.md          | 75 +++++++++++++++++++
 packages/inference-service/src/index.js       |  2 +-
 .../inference-service/src/providers/ollama.js | 11 ++-
 .../inference-service/src/routes/inference.js |  4 +-
 .../memory-service/src/entities/extraction.js |  1 +
 packages/memory-service/src/episodic/index.js |  8 +-
 packages/memory-service/src/index.js          |  2 +-
 packages/orchestration-service/src/index.js   |  2 +-
 13 files changed, 171 insertions(+), 18 deletions(-)
 create mode 100644 packages/embedding-service/CLAUDE.md
 create mode 100644 packages/inference-service/CLAUDE.md

diff --git a/docs/roadmap.md b/docs/roadmap.md
index 680c5ae..37735b5 100644
--- a/docs/roadmap.md
+++ b/docs/roadmap.md
@@ -57,7 +57,7 @@
 ## Phase 2 — Memory System Upgrades
 *The core intelligence layer*
 
-### 1. Knowledge Graph (SQLite)
+### 1. Knowledge Graph (SQLite) ✅
 The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things."
 - [x] Graph schema — `nodes` and `edges` tables with typed relationships
 - [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2)
diff --git a/package-lock.json b/package-lock.json
index 559afc9..a9e9063 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -4224,8 +4224,7 @@
       "dependencies": {
         "@nexusai/shared": "^1.0.0",
         "dotenv": "^17.4.0",
-        "express": "^5.2.1",
-        "ollama": "^0.6.3"
+        "express": "^5.2.1"
       }
     },
     "packages/inference-service": {
diff --git a/packages/embedding-service/CLAUDE.md b/packages/embedding-service/CLAUDE.md
new file mode 100644
index 0000000..d132b05
--- /dev/null
+++ b/packages/embedding-service/CLAUDE.md
@@ -0,0 +1,64 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
+
+## Running This Service
+
+```bash
+npm run embedding                          # From repo root
+npm -w packages/embedding-service run dev  # With --watch
+```
+
+Default port: **3003**. Requires Ollama to be reachable at `OLLAMA_URL`.
+
+## Single-File Service
+
+The entire service is `src/index.js` — no subdirectory structure. All routes, the Ollama helper, and startup are in one file.
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `PORT` | `3003` | Port to listen on |
+| `OLLAMA_URL` | `http://localhost:11434` | Ollama instance URL |
+| `EMBEDDING_MODEL` | `nomic-embed-text` | Model passed to Ollama `/api/embed` |
+
+Note: the env var name is `EMBEDDING_MODEL`, not `EMBED_MODEL` — the internal constant is `EMBED_MODEL` but the lookup key is different.
+
+## Ollama API Details
+
+Uses Ollama's `/api/embed` endpoint (not `/api/embeddings`). Request shape:
+
+```json
+{ "model": "nomic-embed-text", "input": "text to embed" }
+```
+
+Ollama returns `{ "embeddings": [[...]] }` — an array of arrays even for a single input. The helper takes `data.embeddings[0]` to return the single vector.
+
+The `ollama` npm package is **not used** and is no longer listed as a dependency — all calls are raw `fetch`, each with a 30-second `AbortSignal.timeout`. Do not refactor to use the package without checking the API shape matches.
+
+## Batch Endpoint
+
+`POST /embed/batch` first validates every item (any empty or non-string entry is rejected with a 400), then embeds items **sequentially** in a for-loop, not in parallel. The reason: Ollama doesn't parallelise embedding calls, so parallel requests would queue internally anyway. Do not change to `Promise.all` without verifying Ollama behaviour.
+
+## Error Responses
+
+| Condition | Status | Notes |
+|---|---|---|
+| Missing/empty `text` | 400 | |
+| Ollama call fails | 502 | Upstream failure — correct status |
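+| Empty `texts` array, or any empty/non-string item in it | 400 | Item errors name the offending index, e.g. `texts[2] is empty or not a string` |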
+
+## Resolved Issue
+
+The 400 error message for `/embed` previously read `"text is required and must be empty"` — the word "not" was missing. It now reads `"text is required and must not be empty"`.
+
+## API Endpoints
+
+| Method | Path | Notes |
+|---|---|---|
+| GET | `/health` | Static response — does not verify Ollama is reachable |
+| POST | `/embed` | Body: `{ text: string }`. Returns `{ embedding, model, dimensions }` |
+| POST | `/embed/batch` | Body: `{ texts: string[] }`. Returns `{ embeddings, model, dimensions, count }` |
diff --git a/packages/embedding-service/package.json b/packages/embedding-service/package.json
index d9d7104..f72ff46 100644
--- a/packages/embedding-service/package.json
+++ b/packages/embedding-service/package.json
@@ -9,7 +9,6 @@
   "dependencies": {
     "@nexusai/shared": "^1.0.0",
     "dotenv": "^17.4.0",
-    "express": "^5.2.1",
-    "ollama": "^0.6.3"
+    "express": "^5.2.1"
   }
 }
diff --git a/packages/embedding-service/src/index.js b/packages/embedding-service/src/index.js
index 74e4077..bd4fbd5 100644
--- a/packages/embedding-service/src/index.js
+++ b/packages/embedding-service/src/index.js
@@ -3,7 +3,7 @@ const express = require('express');
 const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared');
 
 const app = express();
-app.use(express.json());
+app.use(express.json({ limit: '1mb' }));    // limit request body to 1mb to prevent abuse - embedding requests should be small
 
 const PORT          = getEnv('PORT',            PORTS.EMBEDDING);  
 const OLLAMA_URL    = getEnv('OLLAMA_URL',      OLLAMA.DEFAULT_URL); 
@@ -14,7 +14,8 @@ async function embedText(text) {
     const res = await fetch(`${OLLAMA_URL}/api/embed`, {
         method: 'POST',
         headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({ model: EMBED_MODEL, input: text })
+        body: JSON.stringify({ model: EMBED_MODEL, input: text }),
+        signal: AbortSignal.timeout(30_000),
     });
 
     if (!res.ok) {
@@ -37,7 +38,7 @@ app.get('/health', (req,res) => {
 app.post('/embed', async (req, res) => {
     const { text } = req.body;
     if (!text || typeof text !== 'string' || text.trim() === '') {
-        return res.status(400).json({ error: 'text is required and must be empty' });
+        return res.status(400).json({ error: 'text is required and must not be empty' });
     }
 
     try {
@@ -60,7 +61,10 @@ app.post('/embed/batch', async (req, res) => {
     }
 
     try {
-        //sequential embedding for now, Ollama doesn't natively parallize embeddings
+        const invalid = texts.findIndex(t => !t || typeof t !== 'string' || t.trim() === '');
+        if (invalid !== -1)
+            return res.status(400).json({ error: `texts[${invalid}] is empty or not a string` });
+
         const embeddings = [];
         for (const text of texts) {
             embeddings.push(await embedText(text.trim()));
diff --git a/packages/inference-service/CLAUDE.md b/packages/inference-service/CLAUDE.md
new file mode 100644
index 0000000..c781eba
--- /dev/null
+++ b/packages/inference-service/CLAUDE.md
@@ -0,0 +1,75 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
+
+## Running This Service
+
+```bash
+npm run inference                          # From repo root
+npm -w packages/inference-service run dev  # With --watch
+```
+
+Default port: **3001**. Set `INFERENCE_PROVIDER` to select the backend.
+
+## Provider Pattern
+
+`src/infer.js` reads `INFERENCE_PROVIDER` at startup and loads one of two providers:
+
+| `INFERENCE_PROVIDER` | Module | Backend |
+|---|---|---|
+| `ollama` (default) | `src/providers/ollama.js` | Ollama npm client → `/api/generate` |
+| `llamacpp` | `src/providers/llamacpp.js` | Raw fetch → `/v1/chat/completions` (OpenAI-compatible) |
+
+An unknown provider throws immediately at startup — fail-fast, not at request time.
+
+Both providers export the same interface: `complete(prompt, options)` and `completeStream(prompt, options)`.
+
+## Environment Variables
+
+| Variable | Default | Description |
+|---|---|---|
+| `PORT` | `3001` | Port to listen on |
+| `INFERENCE_PROVIDER` | `ollama` | `ollama` or `llamacpp` |
+| `INFERENCE_URL` | `http://localhost:11434` (Ollama) / `http://localhost:8080` (llama.cpp) | Backend URL |
+| `DEFAULT_MODEL` | Provider-specific | Model name passed to backend |
+
+`INFERENCE_URL` defaults differ per provider — Ollama uses the Ollama default URL, llama.cpp uses the llama-server default.
+
+## Options Resolution
+
+Both providers use `resolveOptions(options)` to merge caller-supplied options with `INFERENCE_DEFAULTS` from shared constants. Any option not supplied by the caller falls back to the constant.
+
+## Streaming Chunk Format
+
+The two providers yield differently shaped chunks — the route in `src/routes/inference.js` normalises them:
+
+**Ollama** yields raw Ollama generate chunks per token (`{ response, done, model, ... }`); the final `done` chunk is normalised by the provider to `{ response: '', done: true, model, tokenCount }`, where `tokenCount` is `eval_count + prompt_eval_count` (each treated as 0 when absent)
+
+**llama.cpp** yields:
+- Per-token: `{ response: delta, done: false }`
+- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `completion_tokens + prompt_tokens` from the usage chunk
+
+The route checks `chunk.response` to stream text and `chunk.done` to capture metadata. It reads `chunk.tokenCount` from the done chunk; because the Ollama provider now fills that field from `eval_count`/`prompt_eval_count`, both providers report a token count to the client. Previously Ollama streaming calls always reported `tokenCount: 0`.
+
+## Resolved Issue: `maxTokens` in Streaming Route
+
+`POST /complete` and `POST /complete/stream` both destructure `maxTokens` from the request body and pass it through to the provider. Previously the streaming route omitted `maxTokens` from its destructuring, so streaming completions always used `INFERENCE_DEFAULTS.MAX_TOKENS` regardless of what the caller sent, which gave `/chat/stream` a different effective token ceiling than `/chat`.
+
+## SSE Format (route → caller)
+
+```
+data: {"response":"Hello"}        ← per token
+data: {"response":" world"}
+data: {"done":true,"model":"...","tokenCount":42}  ← final metadata
+data: [DONE]                       ← sentinel
+```
+
+## API Endpoints
+
+| Method | Path | Notes |
+|---|---|---|
+| GET | `/health` | Returns `{ service, status, provider, model }` |
+| POST | `/complete` | Body: `{ prompt, model?, temperature?, maxTokens?, topP?, topK?, repeatPenalty? }` |
+| POST | `/complete/stream` | Same body as `/complete` (including `maxTokens`); responds as Server-Sent Events — see "SSE Format" above |
diff --git a/packages/inference-service/src/index.js b/packages/inference-service/src/index.js
index e85dac8..55169cc 100644
--- a/packages/inference-service/src/index.js
+++ b/packages/inference-service/src/index.js
@@ -4,7 +4,7 @@ const {getEnv, PORTS, OLLAMA, logger} = require('@nexusai/shared');
 const inferenceRouter = require('./routes/inference');
 
 const app = express();
-app.use(express.json());
+app.use(express.json({ limit: '8mb' }));  // prompts include full context window
 
 const PORT      = getEnv('PORT', PORTS.INFERENCE);
 const PROVIDER  = getEnv('INFERENCE_PROVIDER',   'ollama');
diff --git a/packages/inference-service/src/providers/ollama.js b/packages/inference-service/src/providers/ollama.js
index 7bda6f2..2355583 100644
--- a/packages/inference-service/src/providers/ollama.js
+++ b/packages/inference-service/src/providers/ollama.js
@@ -57,7 +57,16 @@ async function* completeStream(prompt, options = {} ) {
     });
 
     for await (const chunk of stream) {
-        yield chunk;
+        if (chunk.done) {
+            yield {
+                response:   '',
+                done:       true,
+                model:      chunk.model,
+                tokenCount: (chunk.eval_count ?? 0) + (chunk.prompt_eval_count ?? 0),
+            };
+        } else {
+            yield chunk;
+        }
     }
 }
 
diff --git a/packages/inference-service/src/routes/inference.js b/packages/inference-service/src/routes/inference.js
index f245e4d..3442bfd 100644
--- a/packages/inference-service/src/routes/inference.js
+++ b/packages/inference-service/src/routes/inference.js
@@ -23,7 +23,7 @@ router.post('/complete', async (req, res) => {
 
 // Streaming completion endpoint - sends partial responses as they arrive
 router.post('/complete/stream', async (req, res) => {
-    const { prompt, model, temperature, topP, topK, repeatPenalty } = req.body;
+    const { prompt, model, temperature, maxTokens, topP, topK, repeatPenalty } = req.body;
 
     if (!prompt) return res.status(400).json({ error: 'prompt is required' });
 
@@ -35,7 +35,7 @@ router.post('/complete/stream', async (req, res) => {
         let lastModel = model;
         let tokenCount = 0;
 
-        for await (const chunk of completeStream(prompt, { model, temperature, topP, topK, repeatPenalty })) {
+        for await (const chunk of completeStream(prompt, { model, temperature, maxTokens,topP, topK, repeatPenalty })) {
             if (chunk.response) {
                 res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
             }
diff --git a/packages/memory-service/src/entities/extraction.js b/packages/memory-service/src/entities/extraction.js
index ff860f3..cd6a1f7 100644
--- a/packages/memory-service/src/entities/extraction.js
+++ b/packages/memory-service/src/entities/extraction.js
@@ -87,6 +87,7 @@ async function extractAndStoreEntities(userMessage, aiResponse, episodeId=null,
                     num_predict: ENTITIES.NUM_PREDICT,
                 },
             }),
+            signal: AbortSignal.timeout(60_000),
         });
 
         if (!res.ok) throw new Error(`Ollama responded ${res.status}`);
diff --git a/packages/memory-service/src/episodic/index.js b/packages/memory-service/src/episodic/index.js
index c90c347..0d73f80 100644
--- a/packages/memory-service/src/episodic/index.js
+++ b/packages/memory-service/src/episodic/index.js
@@ -170,6 +170,7 @@ function getRecentEpisodes(sessionId, limit = EPISODIC.DEFAULT_RECENT_LIMIT) {
 // Searches episodes using FTS5 full-text search, ordered by relevance, with a limit
 function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) {
   const db = getDB();
+  const safeQuery = `"${query.replace(/"/g, '""')}"`;
   if (sessionIds && sessionIds.length > 0) {
     const ph = sessionIds.map(() => '?').join(',');
     return db.prepare(`
@@ -179,7 +180,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
       AND e.session_id IN (${ph})
       ORDER BY rank
       LIMIT ?
-    `).all(query, ...sessionIds, limit).map(parseRow);
+    `).all(safeQuery, ...sessionIds, limit).map(parseRow);
   }
   return db.prepare(`
     SELECT e.* FROM episodes e
@@ -187,7 +188,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
     WHERE episodes_fts MATCH ?
     ORDER BY rank
     LIMIT ?
-  `).all(query, limit).map(parseRow);
+  `).all(safeQuery, limit).map(parseRow);
 }
 
 // Deletes an episode by its ID
@@ -206,7 +207,8 @@ async function getEpisodeEmbedding(userMessage, aiResponse){
   const res = await fetch(`${url}/embed`, {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({ text })  
+    body: JSON.stringify({ text }),  
+    signal: AbortSignal.timeout(30_000),
   })
 
   if (!res.ok) {
diff --git a/packages/memory-service/src/index.js b/packages/memory-service/src/index.js
index 91c670d..9b6f054 100644
--- a/packages/memory-service/src/index.js
+++ b/packages/memory-service/src/index.js
@@ -12,7 +12,7 @@ const semantic = require('./semantic');
 const entities = require('./entities');
 
 const app = express();
-app.use(express.json());
+app.use(express.json({ limit: '2mb' }));
 
 const  PORT = getEnv('PORT', PORTS.MEMORY);
 
diff --git a/packages/orchestration-service/src/index.js b/packages/orchestration-service/src/index.js
index 5b9102f..85354de 100644
--- a/packages/orchestration-service/src/index.js
+++ b/packages/orchestration-service/src/index.js
@@ -15,7 +15,7 @@ const summariesRouter = require('./routes/summaries')
 const cors = require('cors');
 
 const app = express();
-app.use(express.json());
+app.use(express.json({ limit: '2mb' }));
 
 app.use(cors({
     origin: [