minor clean up
This commit is contained in:
64
packages/embedding-service/CLAUDE.md
Normal file
64
packages/embedding-service/CLAUDE.md
Normal file
@@ -0,0 +1,64 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
|
||||
|
||||
## Running This Service
|
||||
|
||||
```bash
|
||||
npm run embedding # From repo root
|
||||
npm -w packages/embedding-service run dev # With --watch
|
||||
```
|
||||
|
||||
Default port: **3003**. Requires Ollama to be reachable at `OLLAMA_URL`.
|
||||
|
||||
## Single-File Service
|
||||
|
||||
The entire service is `src/index.js` — no subdirectory structure. All routes, the Ollama helper, and startup are in one file.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|---|---|---|
|
||||
| `PORT` | `3003` | Port to listen on |
|
||||
| `OLLAMA_URL` | `http://localhost:11434` | Ollama instance URL |
|
||||
| `EMBEDDING_MODEL` | `nomic-embed-text` | Model passed to Ollama `/api/embed` |
|
||||
|
||||
Note: the env var name is `EMBEDDING_MODEL`, not `EMBED_MODEL` — the internal constant is `EMBED_MODEL` but the lookup key is different.
|
||||
|
||||
## Ollama API Details
|
||||
|
||||
Uses Ollama's `/api/embed` endpoint (not `/api/embeddings`). Request shape:
|
||||
|
||||
```json
|
||||
{ "model": "nomic-embed-text", "input": "text to embed" }
|
||||
```
|
||||
|
||||
Ollama returns `{ "embeddings": [[...]] }` — an array of arrays even for a single input. The helper takes `data.embeddings[0]` to return the single vector.
|
||||
|
||||
The `ollama` npm package is **not used** by this service — all calls are raw `fetch` — and it has been removed from the `package.json` dependencies. Do not refactor to use the package without checking the API shape matches.
|
||||
|
||||
## Batch Endpoint
|
||||
|
||||
`POST /embed/batch` embeds items **sequentially** in a for-loop, not in parallel. The comment explains this: Ollama doesn't parallelise embedding calls, so parallel requests would queue internally anyway. Do not change to `Promise.all` without verifying Ollama behaviour.
|
||||
|
||||
## Error Responses
|
||||
|
||||
| Condition | Status | Notes |
|
||||
|---|---|---|
|
||||
| Missing/empty `text` | 400 | |
|
||||
| Ollama call fails | 502 | Upstream failure — correct status |
|
||||
| Empty `texts` array | 400 | |
|
||||
|
||||
## Known Issue
|
||||
|
||||
Resolved: the 400 error message for `/embed` previously read `"text is required and must be empty"` — the word "not" was missing. It now reads `"text is required and must not be empty"`.
|
||||
|
||||
## API Endpoints
|
||||
|
||||
| Method | Path | Notes |
|
||||
|---|---|---|
|
||||
| GET | `/health` | Static response — does not verify Ollama is reachable |
|
||||
| POST | `/embed` | Body: `{ text: string }`. Returns `{ embedding, model, dimensions }` |
|
||||
| POST | `/embed/batch` | Body: `{ texts: string[] }`. Returns `{ embeddings, model, dimensions, count }` |
|
||||
@@ -9,7 +9,6 @@
|
||||
"dependencies": {
|
||||
"@nexusai/shared": "^1.0.0",
|
||||
"dotenv": "^17.4.0",
|
||||
"express": "^5.2.1",
|
||||
"ollama": "^0.6.3"
|
||||
"express": "^5.2.1"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ const express = require('express');
|
||||
const {getEnv, OLLAMA, PORTS, logger} = require('@nexusai/shared');
|
||||
|
||||
const app = express();
|
||||
app.use(express.json());
|
||||
app.use(express.json({ limit: '1mb' })); // limit request body to 1mb to prevent abuse - embedding requests should be small
|
||||
|
||||
const PORT = getEnv('PORT', PORTS.EMBEDDING);
|
||||
const OLLAMA_URL = getEnv('OLLAMA_URL', OLLAMA.DEFAULT_URL);
|
||||
@@ -14,7 +14,8 @@ async function embedText(text) {
|
||||
const res = await fetch(`${OLLAMA_URL}/api/embed`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ model: EMBED_MODEL, input: text })
|
||||
body: JSON.stringify({ model: EMBED_MODEL, input: text }),
|
||||
signal: AbortSignal.timeout(30_000),
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
@@ -37,7 +38,7 @@ app.get('/health', (req,res) => {
|
||||
app.post('/embed', async (req, res) => {
|
||||
const { text } = req.body;
|
||||
if (!text || typeof text !== 'string' || text.trim() === '') {
|
||||
return res.status(400).json({ error: 'text is required and must be empty' });
|
||||
return res.status(400).json({ error: 'text is required and must not be empty' });
|
||||
}
|
||||
|
||||
try {
|
||||
@@ -60,7 +61,10 @@ app.post('/embed/batch', async (req, res) => {
|
||||
}
|
||||
|
||||
try {
|
||||
//sequential embedding for now, Ollama doesn't natively parallelize embeddings
|
||||
const invalid = texts.findIndex(t => !t || typeof t !== 'string' || t.trim() === '');
|
||||
if (invalid !== -1)
|
||||
return res.status(400).json({ error: `texts[${invalid}] is empty or not a string` });
|
||||
|
||||
const embeddings = [];
|
||||
for (const text of texts) {
|
||||
embeddings.push(await embedText(text.trim()));
|
||||
|
||||
75
packages/inference-service/CLAUDE.md
Normal file
75
packages/inference-service/CLAUDE.md
Normal file
@@ -0,0 +1,75 @@
|
||||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
See the root [CLAUDE.md](../../CLAUDE.md) for overall architecture, service roles, and deployment layout.
|
||||
|
||||
## Running This Service
|
||||
|
||||
```bash
|
||||
npm run inference # From repo root
|
||||
npm -w packages/inference-service run dev # With --watch
|
||||
```
|
||||
|
||||
Default port: **3001**. Set `INFERENCE_PROVIDER` to select the backend.
|
||||
|
||||
## Provider Pattern
|
||||
|
||||
`src/infer.js` reads `INFERENCE_PROVIDER` at startup and loads one of two providers:
|
||||
|
||||
| `INFERENCE_PROVIDER` | Module | Backend |
|
||||
|---|---|---|
|
||||
| `ollama` (default) | `src/providers/ollama.js` | Ollama npm client → `/api/generate` |
|
||||
| `llamacpp` | `src/providers/llamacpp.js` | Raw fetch → `/v1/chat/completions` (OpenAI-compatible) |
|
||||
|
||||
An unknown provider throws immediately at startup — fail-fast, not at request time.
|
||||
|
||||
Both providers export the same interface: `complete(prompt, options)` and `completeStream(prompt, options)`.
|
||||
|
||||
## Environment Variables
|
||||
|
||||
| Variable | Default | Description |
|
||||
|---|---|---|
|
||||
| `PORT` | `3001` | Port to listen on |
|
||||
| `INFERENCE_PROVIDER` | `ollama` | `ollama` or `llamacpp` |
|
||||
| `INFERENCE_URL` | `http://localhost:11434` (Ollama) / `http://localhost:8080` (llama.cpp) | Backend URL |
|
||||
| `DEFAULT_MODEL` | Provider-specific | Model name passed to backend |
|
||||
|
||||
`INFERENCE_URL` defaults differ per provider — Ollama uses the Ollama default URL, llama.cpp uses the llama-server default.
|
||||
|
||||
## Options Resolution
|
||||
|
||||
Both providers use `resolveOptions(options)` to merge caller-supplied options with `INFERENCE_DEFAULTS` from shared constants. Any option not supplied by the caller falls back to the constant.
|
||||
|
||||
## Streaming Chunk Format
|
||||
|
||||
The two providers yield differently shaped chunks — the route in `src/routes/inference.js` normalises them:
|
||||
|
||||
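**Ollama** yields raw Ollama generate chunks while streaming: `{ response, done, model, ... }`. The final (`done: true`) chunk is replaced by a normalised `{ response: '', done: true, model, tokenCount }`, where `tokenCount` is `eval_count + prompt_eval_count`.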
|
||||
|
||||
**llama.cpp** yields:
|
||||
- Per-token: `{ response: delta, done: false }`
|
||||
- Final: `{ response: '', done: true, model, tokenCount }` — token count is the sum of `completion_tokens + prompt_tokens` from the usage chunk
|
||||
|
||||
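The route checks `chunk.response` to stream text and `chunk.done` to capture metadata, reading `chunk.tokenCount` from the done chunk. Ollama's raw done chunk carries `eval_count`/`prompt_eval_count` rather than `tokenCount`, so the Ollama provider converts it before yielding: it emits `{ response: '', done: true, model, tokenCount }` with `tokenCount` set to `eval_count + prompt_eval_count` (each treated as 0 when absent). Ollama streaming calls therefore report a real token count to the client instead of `tokenCount: 0`.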
|
||||
|
||||
## Known Issue: `maxTokens` Missing from Streaming Route
|
||||
|
||||
Resolved: `POST /complete` destructures `maxTokens` from the request body and passes it through. `POST /complete/stream` previously did **not** — it omitted `maxTokens` from its destructuring, so streaming completions always used `INFERENCE_DEFAULTS.MAX_TOKENS` regardless of what the caller sent, giving `/chat/stream` a different effective token ceiling than `/chat`. The streaming route now destructures `maxTokens` and forwards it to `completeStream`, so both routes honour the caller's value.
|
||||
|
||||
## SSE Format (route → caller)
|
||||
|
||||
```
|
||||
data: {"response":"Hello"} ← per token
|
||||
data: {"response":" world"}
|
||||
data: {"done":true,"model":"...","tokenCount":42} ← final metadata
|
||||
data: [DONE] ← sentinel
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
| Method | Path | Notes |
|
||||
|---|---|---|
|
||||
| GET | `/health` | Returns `{ service, status, provider, model }` |
|
||||
| POST | `/complete` | Body: `{ prompt, model?, temperature?, maxTokens?, topP?, topK?, repeatPenalty? }` |
|
||||
| POST | `/complete/stream` | Same body as `/complete` (including `maxTokens`); responds as an SSE stream |
|
||||
@@ -4,7 +4,7 @@ const {getEnv, PORTS, OLLAMA, logger} = require('@nexusai/shared');
|
||||
const inferenceRouter = require('./routes/inference');
|
||||
|
||||
const app = express();
|
||||
app.use(express.json());
|
||||
app.use(express.json({ limit: '8mb' })); // prompts include full context window
|
||||
|
||||
const PORT = getEnv('PORT', PORTS.INFERENCE);
|
||||
const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');
|
||||
|
||||
@@ -57,7 +57,16 @@ async function* completeStream(prompt, options = {} ) {
|
||||
});
|
||||
|
||||
for await (const chunk of stream) {
|
||||
yield chunk;
|
||||
if (chunk.done) {
|
||||
yield {
|
||||
response: '',
|
||||
done: true,
|
||||
model: chunk.model,
|
||||
tokenCount: (chunk.eval_count ?? 0) + (chunk.prompt_eval_count ?? 0),
|
||||
};
|
||||
} else {
|
||||
yield chunk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ router.post('/complete', async (req, res) => {
|
||||
|
||||
// Streaming completion endpoint - sends partial responses as they arrive
|
||||
router.post('/complete/stream', async (req, res) => {
|
||||
const { prompt, model, temperature, topP, topK, repeatPenalty } = req.body;
|
||||
const { prompt, model, temperature, maxTokens, topP, topK, repeatPenalty } = req.body;
|
||||
|
||||
if (!prompt) return res.status(400).json({ error: 'prompt is required' });
|
||||
|
||||
@@ -35,7 +35,7 @@ router.post('/complete/stream', async (req, res) => {
|
||||
let lastModel = model;
|
||||
let tokenCount = 0;
|
||||
|
||||
for await (const chunk of completeStream(prompt, { model, temperature, topP, topK, repeatPenalty })) {
|
||||
for await (const chunk of completeStream(prompt, { model, temperature, maxTokens,topP, topK, repeatPenalty })) {
|
||||
if (chunk.response) {
|
||||
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
|
||||
}
|
||||
|
||||
@@ -87,6 +87,7 @@ async function extractAndStoreEntities(userMessage, aiResponse, episodeId=null,
|
||||
num_predict: ENTITIES.NUM_PREDICT,
|
||||
},
|
||||
}),
|
||||
signal: AbortSignal.timeout(60_000),
|
||||
});
|
||||
|
||||
if (!res.ok) throw new Error(`Ollama responded ${res.status}`);
|
||||
|
||||
@@ -170,6 +170,7 @@ function getRecentEpisodes(sessionId, limit = EPISODIC.DEFAULT_RECENT_LIMIT) {
|
||||
// Searches episodes using FTS5 full-text search, ordered by relevance, with a limit
|
||||
function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds = null) {
|
||||
const db = getDB();
|
||||
const safeQuery = `"${query.replace(/"/g, '""')}"`;
|
||||
if (sessionIds && sessionIds.length > 0) {
|
||||
const ph = sessionIds.map(() => '?').join(',');
|
||||
return db.prepare(`
|
||||
@@ -179,7 +180,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
|
||||
AND e.session_id IN (${ph})
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
`).all(query, ...sessionIds, limit).map(parseRow);
|
||||
`).all(safeQuery, ...sessionIds, limit).map(parseRow);
|
||||
}
|
||||
return db.prepare(`
|
||||
SELECT e.* FROM episodes e
|
||||
@@ -187,7 +188,7 @@ function searchEpisodes(query, limit = EPISODIC.DEFAULT_SEARCH_LIMIT, sessionIds
|
||||
WHERE episodes_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
`).all(query, limit).map(parseRow);
|
||||
`).all(safeQuery, limit).map(parseRow);
|
||||
}
|
||||
|
||||
// Deletes an episode by its ID
|
||||
@@ -206,7 +207,8 @@ async function getEpisodeEmbedding(userMessage, aiResponse){
|
||||
const res = await fetch(`${url}/embed`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ text })
|
||||
body: JSON.stringify({ text }),
|
||||
signal: AbortSignal.timeout(30_000),
|
||||
})
|
||||
|
||||
if (!res.ok) {
|
||||
|
||||
@@ -12,7 +12,7 @@ const semantic = require('./semantic');
|
||||
const entities = require('./entities');
|
||||
|
||||
const app = express();
|
||||
app.use(express.json());
|
||||
app.use(express.json({ limit: '2mb' }));
|
||||
|
||||
const PORT = getEnv('PORT', PORTS.MEMORY);
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ const summariesRouter = require('./routes/summaries')
|
||||
const cors = require('cors');
|
||||
|
||||
const app = express();
|
||||
app.use(express.json());
|
||||
app.use(express.json({ limit: '2mb' }));
|
||||
|
||||
app.use(cors({
|
||||
origin: [
|
||||
|
||||
Reference in New Issue
Block a user