minor clean up
This commit is contained in:
@@ -57,7 +57,7 @@
|
|||||||
## Phase 2 — Memory System Upgrades
|
## Phase 2 — Memory System Upgrades
|
||||||
*The core intelligence layer*
|
*The core intelligence layer*
|
||||||
|
|
||||||
### 1. Knowledge Graph (SQLite)
|
### 1. Knowledge Graph (SQLite) ✅
|
||||||
The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things."
|
The highest-leverage memory upgrade. Transforms NexusAI from "remembers conversations" to "understands relationships between things."
|
||||||
- [x] Graph schema — `nodes` and `edges` tables with typed relationships
|
- [x] Graph schema — `nodes` and `edges` tables with typed relationships
|
||||||
- [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2)
|
- [x] Entity → node promotion pipeline (`mention_count` tracked; threshold gating deferred to Phase 2)
|
||||||
|
|||||||
3
package-lock.json
generated
3
package-lock.json
generated
@@ -4224,8 +4224,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@nexusai/shared": "^1.0.0",
|
"@nexusai/shared": "^1.0.0",
|
||||||
"dotenv": "^17.4.0",
|
"dotenv": "^17.4.0",
|
||||||
"express": "^5.2.1",
|
"express": "^5.2.1"
|
||||||
"ollama": "^0.6.3"
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"packages/inference-service": {
|
"packages/inference-service": {
|
||||||
|
|||||||
64
packages/embedding-service/CLAUDE.md
Normal file
64
packages/embedding-service/CLAUDE.md
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# CLAUDE.md
|
||||||
|
|
||||||
|
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||||
|
|
||||||
|
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
|
||||||
|
|
||||||
|
## Running This Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run embedding # From repo root
|
||||||
|
npm -w packages/embedding-service run dev # With --watch
|
||||||
|
```
|
||||||
|
|
||||||
|
Default port: **3003**. Requires Ollama to be reachable at `OLLAMA_URL`.
|
||||||
|
|
||||||
|
## Single-File Service
|
||||||
|
|
||||||
|
The entire service is `src/index.js` — no subdirectory structure. All routes, the Ollama helper, and startup are in one file.
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `PORT` | `3003` | Port to listen on |
|
||||||
|
| `OLLAMA_URL` | `http://localhost:11434` | Ollama instance URL |
|
||||||
|
| `EMBEDDING_MODEL` | `nomic-embed-text` | Model passed to Ollama `/api/embed` |
|
||||||
|
|
||||||
|
Note: the env var name is `EMBEDDING_MODEL`, not `EMBED_MODEL` — the internal constant is `EMBED_MODEL` but the lookup key is different.
|
||||||
|
|
||||||
|
## Ollama API Details
|
||||||
|
|
||||||
|
Uses Ollama's `/api/embed` endpoint (not `/api/embeddings`). Request shape:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{ "model": "nomic-embed-text", "input": "text to embed" }
|
||||||
|
```
|
||||||
|
|
||||||
|
Ollama returns `{ "embeddings": [[...]] }` — an array of arrays even for a single input. The helper takes `data.embeddings[0]` to return the single vector.
|
||||||
|
|
||||||
|
The `ollama` npm package is **not used** and is no longer listed as a dependency — all calls are raw `fetch`. Do not introduce the package without checking that its API shape matches.
|
||||||
|
|
||||||
|
## Batch Endpoint
|
||||||
|
|
||||||
|
`POST /embed/batch` embeds items **sequentially** in a for-loop, not in parallel. This is deliberate: Ollama doesn't parallelise embedding calls, so parallel requests would queue internally anyway. Do not change to `Promise.all` without verifying Ollama behaviour.
|
||||||
|
|
||||||
|
## Error Responses
|
||||||
|
|
||||||
|
| Condition | Status | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| Missing/empty `text` | 400 | |
|
||||||
|
| Ollama call fails | 502 | Upstream failure — correct status |
|
||||||
|
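| Empty `texts` array | 400 | Also returned when any element of `texts` is empty or not a string (`texts[i] is empty or not a string`) |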
|
||||||
|
|
||||||
|
## Known Issue (Resolved)
|
||||||
|
|
||||||
|
The 400 error message for `/embed` previously read `"text is required and must be empty"` — the word "not" was missing. It now reads `"text is required and must not be empty"`.
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
| Method | Path | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| GET | `/health` | Static response — does not verify Ollama is reachable |
|
||||||
|
| POST | `/embed` | Body: `{ text: string }`. Returns `{ embedding, model, dimensions }` |
|
||||||
|
| POST | `/embed/batch` | Body: `{ texts: string[] }`. Returns `{ embeddings, model, dimensions, count }` |
|
||||||
@@ -9,7 +9,6 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@nexusai/shared": "^1.0.0",
|
"@nexusai/shared": "^1.0.0",
|
||||||
"dotenv": "^17.4.0",
|
"dotenv": "^17.4.0",
|
||||||
"express": "^5.2.1",
|
"express": "^5.2.1"
|
||||||
"ollama": "^0.6.3"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ const express = require('express');
|
|||||||
const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared');
|
const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared');
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
app.use(express.json());
|
app.use(express.json({ limit: '1mb' })); // limit request body to 1mb to prevent abuse - embedding requests should be small
|
||||||
|
|
||||||
const PORT = getEnv('PORT', PORTS.EMBEDDING);
|
const PORT = getEnv('PORT', PORTS.EMBEDDING);
|
||||||
const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL);
|
const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL);
|
||||||
@@ -14,7 +14,8 @@ async function embedText(text) {
|
|||||||
const res = await fetch(`${OLLAMA_URL}/api/embed`, {
|
const res = await fetch(`${OLLAMA_URL}/api/embed`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ model: EMBED_MODEL, input: text })
|
body: JSON.stringify({ model: EMBED_MODEL, input: text }),
|
||||||
|
signal: AbortSignal.timeout(30_000),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
@@ -37,7 +38,7 @@ app.get('/health', (req,res) => {
|
|||||||
app.post('/embed', async (req, res) => {
|
app.post('/embed', async (req, res) => {
|
||||||
const { text } = req.body;
|
const { text } = req.body;
|
||||||
if (!text || typeof text !== 'string' || text.trim() === '') {
|
if (!text || typeof text !== 'string' || text.trim() === '') {
|
||||||
return res.status(400).json({ error: 'text is required and must be empty' });
|
return res.status(400).json({ error: 'text is required and must not be empty' });
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -60,7 +61,10 @@ app.post('/embed/batch', async (req, res) => {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
//sequential embedding for now, Ollama doesn't natively parallize embeddings
|
const invalid = texts.findIndex(t => !t || typeof t !== 'string' || t.trim() === '');
|
||||||
|
if (invalid !== -1)
|
||||||
|
return res.status(400).json({ error: `texts[${invalid}] is empty or not a string` });
|
||||||
|
|
||||||
const embeddings = [];
|
const embeddings = [];
|
||||||
for (const text of texts) {
|
for (const text of texts) {
|
||||||
embeddings.push(await embedText(text.trim()));
|
embeddings.push(await embedText(text.trim()));
|
||||||
|
|||||||
75
packages/inference-service/CLAUDE.md
Normal file
75
packages/inference-service/CLAUDE.md
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
# CLAUDE.md
|
||||||
|
|
||||||
|
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||||
|
|
||||||
|
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
|
||||||
|
|
||||||
|
## Running This Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm run inference # From repo root
|
||||||
|
npm -w packages/inference-service run dev # With --watch
|
||||||
|
```
|
||||||
|
|
||||||
|
Default port: **3001**. Set `INFERENCE_PROVIDER` to select the backend.
|
||||||
|
|
||||||
|
## Provider Pattern
|
||||||
|
|
||||||
|
`src/infer.js` reads `INFERENCE_PROVIDER` at startup and loads one of two providers:
|
||||||
|
|
||||||
|
| `INFERENCE_PROVIDER` | Module | Backend |
|
||||||
|
|---|---|---|
|
||||||
|
| `ollama` (default) | `src/providers/ollama.js` | Ollama npm client → `/api/generate` |
|
||||||
|
| `llamacpp` | `src/providers/llamacpp.js` | Raw fetch → `/v1/chat/completions` (OpenAI-compatible) |
|
||||||
|
|
||||||
|
An unknown provider throws immediately at startup — fail-fast, not at request time.
|
||||||
|
|
||||||
|
Both providers export the same interface: `complete(prompt, options)` and `completeStream(prompt, options)`.
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `PORT` | `3001` | Port to listen on |
|
||||||
|
| `INFERENCE_PROVIDER` | `ollama` | `ollama` or `llamacpp` |
|
||||||
|
| `INFERENCE_URL` | `http://localhost:11434` (Ollama) / `http://localhost:8080` (llama.cpp) | Backend URL |
|
||||||
|
| `DEFAULT_MODEL` | Provider-specific | Model name passed to backend |
|
||||||
|
|
||||||
|
`INFERENCE_URL` defaults differ per provider — Ollama uses the Ollama default URL, llama.cpp uses the llama-server default.
|
||||||
|
|
||||||
|
## Options Resolution
|
||||||
|
|
||||||
|
Both providers use `resolveOptions(options)` to merge caller-supplied options with `INFERENCE_DEFAULTS` from shared constants. Any option not supplied by the caller falls back to the constant.
|
||||||
|
|
||||||
|
## Streaming Chunk Format
|
||||||
|
|
||||||
|
The two providers yield differently shaped chunks — the route in `src/routes/inference.js` normalises them:
|
||||||
|
|
||||||
|
**Ollama** yields raw Ollama generate chunks per token (`{ response, done, model, ... }`) and a normalised final chunk: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `eval_count + prompt_eval_count` from Ollama's done chunk
|
||||||
|
|
||||||
|
**llama.cpp** yields:
|
||||||
|
- Per-token: `{ response: delta, done: false }`
|
||||||
|
- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `completion_tokens + prompt_tokens` from the usage chunk
|
||||||
|
|
||||||
|
The route checks `chunk.response` to stream text and `chunk.done` to capture metadata. The route only reads `chunk.tokenCount`; both providers now supply that field on their final chunk (the Ollama provider derives it from `eval_count`/`prompt_eval_count`), so Ollama streaming calls no longer always report `tokenCount: 0` to the client.
|
||||||
|
|
||||||
|
## Resolved Issue: `maxTokens` Missing from Streaming Route
|
||||||
|
|
||||||
|
`POST /complete` destructures `maxTokens` from the request body and passes it through. `POST /complete/stream` previously did **not** — it omitted `maxTokens` from its destructuring, so streaming completions always used `INFERENCE_DEFAULTS.MAX_TOKENS` regardless of what the caller sent, which gave `/chat/stream` a different effective token ceiling than `/chat`. The streaming route now destructures `maxTokens` and forwards it to `completeStream`, so both routes honour the caller's value.
|
||||||
|
|
||||||
|
## SSE Format (route → caller)
|
||||||
|
|
||||||
|
```
|
||||||
|
data: {"response":"Hello"} ← per token
|
||||||
|
data: {"response":" world"}
|
||||||
|
data: {"done":true,"model":"...","tokenCount":42} ← final metadata
|
||||||
|
data: [DONE] ← sentinel
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
| Method | Path | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| GET | `/health` | Returns `{ service, status, provider, model }` |
|
||||||
|
| POST | `/complete` | Body: `{ prompt, model?, temperature?, maxTokens?, topP?, topK?, repeatPenalty? }` |
|
||||||
|
| POST | `/complete/stream` | Same body as `/complete` (including `maxTokens`); response is streamed as SSE |
|
||||||
@@ -4,7 +4,7 @@ const {getEnv, PORTS, OLLAMA, logger} = require('@nexusai/shared');
|
|||||||
const inferenceRouter = require('./routes/inference');
|
const inferenceRouter = require('./routes/inference');
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
app.use(express.json());
|
app.use(express.json({ limit: '8mb' })); // prompts include full context window
|
||||||
|
|
||||||
const PORT = getEnv('PORT', PORTS.INFERENCE);
|
const PORT = getEnv('PORT', PORTS.INFERENCE);
|
||||||
const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');
|
const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');
|
||||||
|
|||||||
@@ -57,8 +57,17 @@ async function* completeStream(prompt, options = {} ) {
|
|||||||
});
|
});
|
||||||
|
|
||||||
for await (const chunk of stream) {
|
for await (const chunk of stream) {
|
||||||
|
if (chunk.done) {
|
||||||
|
yield {
|
||||||
|
response: '',
|
||||||
|
done: true,
|
||||||
|
model: chunk.model,
|
||||||
|
tokenCount: (chunk.eval_count ?? 0) + (chunk.prompt_eval_count ?? 0),
|
||||||
|
};
|
||||||
|
} else {
|
||||||
yield chunk;
|
yield chunk;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
module.exports = { complete, completeStream };
|
module.exports = { complete, completeStream };
|
||||||
@@ -23,7 +23,7 @@ router.post('/complete', async (req, res) => {
|
|||||||
|
|
||||||
// Streaming completion endpoint - sends partial responses as they arrive
|
// Streaming completion endpoint - sends partial responses as they arrive
|
||||||
router.post('/complete/stream', async (req, res) => {
|
router.post('/complete/stream', async (req, res) => {
|
||||||
const { prompt, model, temperature, topP, topK, repeatPenalty } = req.body;
|
const { prompt, model, temperature, maxTokens, topP, topK, repeatPenalty } = req.body;
|
||||||
|
|
||||||
if (!prompt) return res.status(400).json({ error: 'prompt is required' });
|
if (!prompt) return res.status(400).json({ error: 'prompt is required' });
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@ router.post('/complete/stream', async (req, res) => {
|
|||||||
let lastModel = model;
|
let lastModel = model;
|
||||||
let tokenCount = 0;
|
let tokenCount = 0;
|
||||||
|
|
||||||
for await (const chunk of completeStream(prompt, { model, temperature, topP, topK, repeatPenalty })) {
|
for await (const chunk of completeStream(prompt, { model, temperature, maxTokens,topP, topK, repeatPenalty })) {
|
||||||
if (chunk.response) {
|
if (chunk.response) {
|
||||||
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
|
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ async function extractAndStoreEntities(userMessage, aiResponse, episodeId=null,
|
|||||||
num_predict: ENTITIES.NUM_PREDICT,
|
num_predict: ENTITIES.NUM_PREDICT,
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
|
signal: AbortSignal.timeout(60_000),
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!res.ok) throw new Error(`Ollama responded ${res.status}`);
|
if (!res.ok) throw new Error(`Ollama responded ${res.status}`);
|
||||||
|
|||||||
@@ -170,6 +170,7 @@ function getRecentEpisodes(sessionId, limit = EPISODIC.DEFAULT_RECENT_LIMIT) {
|
|||||||
// Searches episodes using FTS5 full-text search, ordered by relevance, with a limit
|
// Searches episodes using FTS5 full-text search, ordered by relevance, with a limit
|
||||||
function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) {
|
function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) {
|
||||||
const db = getDB();
|
const db = getDB();
|
||||||
|
const safeQuery = `"${query.replace(/"/g, '""')}"`;
|
||||||
if (sessionIds && sessionIds.length > 0) {
|
if (sessionIds && sessionIds.length > 0) {
|
||||||
const ph = sessionIds.map(() => '?').join(',');
|
const ph = sessionIds.map(() => '?').join(',');
|
||||||
return db.prepare(`
|
return db.prepare(`
|
||||||
@@ -179,7 +180,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
|
|||||||
AND e.session_id IN (${ph})
|
AND e.session_id IN (${ph})
|
||||||
ORDER BY rank
|
ORDER BY rank
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
`).all(query, ...sessionIds, limit).map(parseRow);
|
`).all(safeQuery, ...sessionIds, limit).map(parseRow);
|
||||||
}
|
}
|
||||||
return db.prepare(`
|
return db.prepare(`
|
||||||
SELECT e.* FROM episodes e
|
SELECT e.* FROM episodes e
|
||||||
@@ -187,7 +188,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
|
|||||||
WHERE episodes_fts MATCH ?
|
WHERE episodes_fts MATCH ?
|
||||||
ORDER BY rank
|
ORDER BY rank
|
||||||
LIMIT ?
|
LIMIT ?
|
||||||
`).all(query, limit).map(parseRow);
|
`).all(safeQuery, limit).map(parseRow);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deletes an episode by its ID
|
// Deletes an episode by its ID
|
||||||
@@ -206,7 +207,8 @@ async function getEpisodeEmbedding(userMessage, aiResponse){
|
|||||||
const res = await fetch(`${url}/embed`, {
|
const res = await fetch(`${url}/embed`, {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
headers: { 'Content-Type': 'application/json' },
|
headers: { 'Content-Type': 'application/json' },
|
||||||
body: JSON.stringify({ text })
|
body: JSON.stringify({ text }),
|
||||||
|
signal: AbortSignal.timeout(30_000),
|
||||||
})
|
})
|
||||||
|
|
||||||
if (!res.ok) {
|
if (!res.ok) {
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ const semantic = require('./semantic');
|
|||||||
const entities = require('./entities');
|
const entities = require('./entities');
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
app.use(express.json());
|
app.use(express.json({ limit: '2mb' }));
|
||||||
|
|
||||||
const PORT = getEnv('PORT', PORTS.MEMORY);
|
const PORT = getEnv('PORT', PORTS.MEMORY);
|
||||||
|
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ const summariesRouter = require('./routes/summaries')
|
|||||||
const cors = require('cors');
|
const cors = require('cors');
|
||||||
|
|
||||||
const app = express();
|
const app = express();
|
||||||
app.use(express.json());
|
app.use(express.json({ limit: '2mb' }));
|
||||||
|
|
||||||
app.use(cors({
|
app.use(cors({
|
||||||
origin: [
|
origin: [
|
||||||
|
|||||||
Reference in New Issue
Block a user