minor clean up

This commit is contained in:
Storme-bit
2026-04-27 20:17:05 -07:00
parent 055683424d
commit b58a4e4692
13 changed files with 171 additions and 18 deletions

View File

@@ -57,7 +57,7 @@
## Phase 2 — Memory System Upgrades ## Phase 2 — Memory System Upgrades
*The core intelligence layer* *The core intelligence layer*
### 1. Knowledge Graph (SQLite) ### 1. Knowledge Graph (SQLite)
The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things." The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things."
- [x] Graph schema — `nodes` and `edges` tables with typed relationships - [x] Graph schema — `nodes` and `edges` tables with typed relationships
- [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2) - [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2)

3
package-lock.json generated
View File

@@ -4224,8 +4224,7 @@
"dependencies": { "dependencies": {
"@nexusai/shared": "^1.0.0", "@nexusai/shared": "^1.0.0",
"dotenv": "^17.4.0", "dotenv": "^17.4.0",
"express": "^5.2.1", "express": "^5.2.1"
"ollama": "^0.6.3"
} }
}, },
"packages/inference-service": { "packages/inference-service": {

View File

@@ -0,0 +1,64 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
## Running This Service
```bash
npm run embedding # From repo root
npm -w packages/embedding-service run dev # With --watch
```
Default port: **3003**. Requires Ollama to be reachable at `OLLAMA_URL`.
## Single-File Service
The entire service is `src/index.js` — no subdirectory structure. All routes, the Ollama helper, and startup are in one file.
## Environment Variables
| Variable | Default | Description |
|---|---|---|
| `PORT` | `3003` | Port to listen on |
| `OLLAMA_URL` | `http://localhost:11434` | Ollama instance URL |
| `EMBEDDING_MODEL` | `nomic-embed-text` | Model passed to Ollama `/api/embed` |
Note: the env var name is `EMBEDDING_MODEL`, not `EMBED_MODEL` — the internal constant is `EMBED_MODEL` but the lookup key is different.
## Ollama API Details
Uses Ollama's `/api/embed` endpoint (not `/api/embeddings`). Request shape:
```json
{ "model": "nomic-embed-text", "input": "text to embed" }
```
Ollama returns `{ "embeddings": [[...]] }` — an array of arrays even for a single input. The helper takes `data.embeddings[0]` to return the single vector.
The `ollama` npm package is **not used** by this service and is no longer listed as a dependency — all calls are raw `fetch`. Do not refactor to use the package without checking the API shape matches.
## Batch Endpoint
`POST /embed/batch` embeds items **sequentially** in a for-loop, not in parallel. The reason: Ollama doesn't parallelise embedding calls, so parallel requests would queue internally anyway. Do not change to `Promise.all` without verifying Ollama behaviour.
## Error Responses
| Condition | Status | Notes |
|---|---|---|
| Missing/empty `text` | 400 | |
| Ollama call fails | 502 | Upstream failure — correct status |
| Empty `texts` array, or any `texts[i]` that is empty or not a string | 400 | |
## Known Issue
Resolved: the 400 error message for `/embed` previously read `"text is required and must be empty"` — the word "not" was missing. It now reads `"text is required and must not be empty"`.
## API Endpoints
| Method | Path | Notes |
|---|---|---|
| GET | `/health` | Static response — does not verify Ollama is reachable |
| POST | `/embed` | Body: `{ text: string }`. Returns `{ embedding, model, dimensions }` |
| POST | `/embed/batch` | Body: `{ texts: string[] }`. Returns `{ embeddings, model, dimensions, count }` |

View File

@@ -9,7 +9,6 @@
"dependencies": { "dependencies": {
"@nexusai/shared": "^1.0.0", "@nexusai/shared": "^1.0.0",
"dotenv": "^17.4.0", "dotenv": "^17.4.0",
"express": "^5.2.1", "express": "^5.2.1"
"ollama": "^0.6.3"
} }
} }

View File

@@ -3,7 +3,7 @@ const express = require('express');
const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared'); const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared');
const app = express(); const app = express();
app.use(express.json()); app.use(express.json({ limit: '1mb' })); // limit request body to 1mb to prevent abuse - embedding requests should be small
const PORT = getEnv('PORT', PORTS.EMBEDDING); const PORT = getEnv('PORT', PORTS.EMBEDDING);
const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL); const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL);
@@ -14,7 +14,8 @@ async function embedText(text) {
const res = await fetch(`${OLLAMA_URL}/api/embed`, { const res = await fetch(`${OLLAMA_URL}/api/embed`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ model: EMBED_MODEL, input: text }) body: JSON.stringify({ model: EMBED_MODEL, input: text }),
signal: AbortSignal.timeout(30_000),
}); });
if (!res.ok) { if (!res.ok) {
@@ -37,7 +38,7 @@ app.get('/health', (req,res) => {
app.post('/embed', async (req, res) => { app.post('/embed', async (req, res) => {
const { text } = req.body; const { text } = req.body;
if (!text || typeof text !== 'string' || text.trim() === '') { if (!text || typeof text !== 'string' || text.trim() === '') {
return res.status(400).json({ error: 'text is required and must be empty' }); return res.status(400).json({ error: 'text is required and must not be empty' });
} }
try { try {
@@ -60,7 +61,10 @@ app.post('/embed/batch', async (req, res) => {
} }
try { try {
//sequential embedding for now, Ollama doesn't natively parallize embeddings const invalid = texts.findIndex(t => !t || typeof t !== 'string' || t.trim() === '');
if (invalid !== -1)
return res.status(400).json({ error: `texts[${invalid}] is empty or not a string` });
const embeddings = []; const embeddings = [];
for (const text of texts) { for (const text of texts) {
embeddings.push(await embedText(text.trim())); embeddings.push(await embedText(text.trim()));

View File

@@ -0,0 +1,75 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
## Running This Service
```bash
npm run inference # From repo root
npm -w packages/inference-service run dev # With --watch
```
Default port: **3001**. Set `INFERENCE_PROVIDER` to select the backend.
## Provider Pattern
`src/infer.js` reads `INFERENCE_PROVIDER` at startup and loads one of two providers:
| `INFERENCE_PROVIDER` | Module | Backend |
|---|---|---|
| `ollama` (default) | `src/providers/ollama.js` | Ollama npm client → `/api/generate` |
| `llamacpp` | `src/providers/llamacpp.js` | Raw fetch → `/v1/chat/completions` (OpenAI-compatible) |
An unknown provider throws immediately at startup — fail-fast, not at request time.
Both providers export the same interface: `complete(prompt, options)` and `completeStream(prompt, options)`.
## Environment Variables
| Variable | Default | Description |
|---|---|---|
| `PORT` | `3001` | Port to listen on |
| `INFERENCE_PROVIDER` | `ollama` | `ollama` or `llamacpp` |
| `INFERENCE_URL` | `http://localhost:11434` (Ollama) / `http://localhost:8080` (llama.cpp) | Backend URL |
| `DEFAULT_MODEL` | Provider-specific | Model name passed to backend |
`INFERENCE_URL` defaults differ per provider — Ollama uses the Ollama default URL, llama.cpp uses the llama-server default.
## Options Resolution
Both providers use `resolveOptions(options)` to merge caller-supplied options with `INFERENCE_DEFAULTS` from shared constants. Any option not supplied by the caller falls back to the constant.
## Streaming Chunk Format
The two providers yield differently shaped chunks — the route in `src/routes/inference.js` normalises them:
**Ollama** yields:
- Per-token: raw Ollama generate chunks `{ response, done: false, model, ... }`
- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `eval_count + prompt_eval_count` from Ollama's done chunk (each treated as 0 when absent)
**llama.cpp** yields:
- Per-token: `{ response: delta, done: false }`
- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `completion_tokens + prompt_tokens` from the usage chunk
The route checks `chunk.response` to stream text and `chunk.done` to capture metadata. Both providers now set `tokenCount` on the done chunk, so the route's read of `chunk.tokenCount` yields a real count for Ollama streaming as well as llama.cpp, rather than always reporting `tokenCount: 0` for Ollama.
## Known Issue: `maxTokens` Missing from Streaming Route
Resolved: `POST /complete` destructures `maxTokens` from the request body and passes it through. `POST /complete/stream` previously did **not** — it omitted `maxTokens` from its destructuring, so streaming completions always used `INFERENCE_DEFAULTS.MAX_TOKENS` regardless of what the caller sent, which gave `/chat/stream` a different effective token ceiling than `/chat`. The streaming route now destructures `maxTokens` and forwards it to `completeStream`, so both routes honour the caller's value.
## SSE Format (route → caller)
```
data: {"response":"Hello"} ← per token
data: {"response":" world"}
data: {"done":true,"model":"...","tokenCount":42} ← final metadata
data: [DONE] ← sentinel
```
## API Endpoints
| Method | Path | Notes |
|---|---|---|
| GET | `/health` | Returns `{ service, status, provider, model }` |
| POST | `/complete` | Body: `{ prompt, model?, temperature?, maxTokens?, topP?, topK?, repeatPenalty? }` |
| POST | `/complete/stream` | Same body as `/complete` (including `maxTokens`); responds as an SSE stream |

View File

@@ -4,7 +4,7 @@ const {getEnv, PORTS, OLLAMA, logger} = require('@nexusai/shared');
const inferenceRouter = require('./routes/inference'); const inferenceRouter = require('./routes/inference');
const app = express(); const app = express();
app.use(express.json()); app.use(express.json({ limit: '8mb' })); // prompts include full context window
const PORT = getEnv('PORT', PORTS.INFERENCE); const PORT = getEnv('PORT', PORTS.INFERENCE);
const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama'); const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');

View File

@@ -57,8 +57,17 @@ async function* completeStream(prompt, options = {} ) {
}); });
for await (const chunk of stream) { for await (const chunk of stream) {
if (chunk.done) {
yield {
response: '',
done: true,
model: chunk.model,
tokenCount: (chunk.eval_count ?? 0) + (chunk.prompt_eval_count ?? 0),
};
} else {
yield chunk; yield chunk;
} }
} }
}
module.exports = { complete, completeStream }; module.exports = { complete, completeStream };

View File

@@ -23,7 +23,7 @@ router.post('/complete', async (req, res) => {
// Streaming completion endpoint - sends partial responses as they arrive // Streaming completion endpoint - sends partial responses as they arrive
router.post('/complete/stream', async (req, res) => { router.post('/complete/stream', async (req, res) => {
const { prompt, model, temperature, topP, topK, repeatPenalty } = req.body; const { prompt, model, temperature, maxTokens, topP, topK, repeatPenalty } = req.body;
if (!prompt) return res.status(400).json({ error: 'prompt is required' }); if (!prompt) return res.status(400).json({ error: 'prompt is required' });
@@ -35,7 +35,7 @@ router.post('/complete/stream', async (req, res) => {
let lastModel = model; let lastModel = model;
let tokenCount = 0; let tokenCount = 0;
for await (const chunk of completeStream(prompt, { model, temperature, topP, topK, repeatPenalty })) { for await (const chunk of completeStream(prompt, { model, temperature, maxTokens,topP, topK, repeatPenalty })) {
if (chunk.response) { if (chunk.response) {
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`); res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
} }

View File

@@ -87,6 +87,7 @@ async function extractAndStoreEntities(userMessage, aiResponse, episodeId=null,
num_predict: ENTITIES.NUM_PREDICT, num_predict: ENTITIES.NUM_PREDICT,
}, },
}), }),
signal: AbortSignal.timeout(60_000),
}); });
if (!res.ok) throw new Error(`Ollama responded ${res.status}`); if (!res.ok) throw new Error(`Ollama responded ${res.status}`);

View File

@@ -170,6 +170,7 @@ function getRecentEpisodes(sessionId, limit = EPISODIC.DEFAULT_RECENT_LIMIT) {
// Searches episodes using FTS5 full-text search, ordered by relevance, with a limit // Searches episodes using FTS5 full-text search, ordered by relevance, with a limit
function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) { function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) {
const db = getDB(); const db = getDB();
const safeQuery = `"${query.replace(/"/g, '""')}"`;
if (sessionIds && sessionIds.length > 0) { if (sessionIds && sessionIds.length > 0) {
const ph = sessionIds.map(() => '?').join(','); const ph = sessionIds.map(() => '?').join(',');
return db.prepare(` return db.prepare(`
@@ -179,7 +180,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
AND e.session_id IN (${ph}) AND e.session_id IN (${ph})
ORDER BY rank ORDER BY rank
LIMIT ? LIMIT ?
`).all(query, ...sessionIds, limit).map(parseRow); `).all(safeQuery, ...sessionIds, limit).map(parseRow);
} }
return db.prepare(` return db.prepare(`
SELECT e.* FROM episodes e SELECT e.* FROM episodes e
@@ -187,7 +188,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
WHERE episodes_fts MATCH ? WHERE episodes_fts MATCH ?
ORDER BY rank ORDER BY rank
LIMIT ? LIMIT ?
`).all(query, limit).map(parseRow); `).all(safeQuery, limit).map(parseRow);
} }
// Deletes an episode by its ID // Deletes an episode by its ID
@@ -206,7 +207,8 @@ async function getEpisodeEmbedding(userMessage, aiResponse){
const res = await fetch(`${url}/embed`, { const res = await fetch(`${url}/embed`, {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ text }) body: JSON.stringify({ text }),
signal: AbortSignal.timeout(30_000),
}) })
if (!res.ok) { if (!res.ok) {

View File

@@ -12,7 +12,7 @@ const semantic = require('./semantic');
const entities = require('./entities'); const entities = require('./entities');
const app = express(); const app = express();
app.use(express.json()); app.use(express.json({ limit: '2mb' }));
const PORT = getEnv('PORT', PORTS.MEMORY); const PORT = getEnv('PORT', PORTS.MEMORY);

View File

@@ -15,7 +15,7 @@ const summariesRouter = require('./routes/summaries')
const cors = require('cors'); const cors = require('cors');
const app = express(); const app = express();
app.use(express.json()); app.use(express.json({ limit: '2mb' }));
app.use(cors({ app.use(cors({
origin: [ origin: [