chat client clean up and switch to llama.cpp with models folder network sharing

2026-04-09 04:13:21 -07:00
parent 541e664da1
commit 5c6e027fc1
15 changed files with 305 additions and 305 deletions
--- a/packages/inference-service/src/providers/llamacpp.js
+++ b/packages/inference-service/src/providers/llamacpp.js
@@ -1,80 +1,90 @@
-const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require('@nexusai/shared');
+const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require("@nexusai/shared");

-const BASE_URL      = getEnv('INFERENCE_URL', LLAMACPP.DEFAULT_URL);
-const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', LLAMACPP.DEFAULT_MODEL);
+// Server base URL; requests in this module target `${BASE_URL}/v1/chat/completions`.
+// NOTE(review): presumably getEnv returns the env value or the given fallback — confirm in @nexusai/shared.
+const BASE_URL = getEnv("INFERENCE_URL", LLAMACPP.DEFAULT_URL);
+// Model name placed in the request payload when `options.model` is falsy.
+const DEFAULT_MODEL = getEnv("DEFAULT_MODEL", LLAMACPP.DEFAULT_MODEL);

+/**
+ * Resolves sampling options, falling back to the matching INFERENCE_DEFAULTS
+ * entry for any field that is null or undefined. Other explicit values
+ * (including 0) are kept because the fallback uses `??`.
+ *
+ * @param {object} options - Caller options; must be an object, its properties are read directly.
+ * @returns {object} `{ temperature, maxTokens, topP, topK, repeatPenalty, seed }`.
+ */
 function resolveOptions(options) {
-    return {
-        temperature:    options.temperature    ?? INFERENCE_DEFAULTS.TEMPERATURE,
-        maxTokens:      options.maxTokens      ?? INFERENCE_DEFAULTS.MAX_TOKENS,
-        topP:           options.topP           ?? INFERENCE_DEFAULTS.TOP_P,
-        topK:           options.topK           ?? INFERENCE_DEFAULTS.TOP_K,
-        repeatPenalty:  options.repeatPenalty  ?? INFERENCE_DEFAULTS.REPEAT_PENALTY,
-        seed:           options.seed           ?? INFERENCE_DEFAULTS.SEED,
-    };
+  return {
+    temperature: options.temperature ?? INFERENCE_DEFAULTS.TEMPERATURE,
+    maxTokens: options.maxTokens ?? INFERENCE_DEFAULTS.MAX_TOKENS,
+    topP: options.topP ?? INFERENCE_DEFAULTS.TOP_P,
+    topK: options.topK ?? INFERENCE_DEFAULTS.TOP_K,
+    repeatPenalty: options.repeatPenalty ?? INFERENCE_DEFAULTS.REPEAT_PENALTY,
+    seed: options.seed ?? INFERENCE_DEFAULTS.SEED,
+  };
 }

-function buildPayload(prompt, options, stream = false){
-    const opts = resolveOptions(options);
+/**
+ * Builds the JSON body for a `/v1/chat/completions` request.
+ *
+ * The prompt is sent as a single `user` message, and the camelCase options are
+ * mapped to the snake_case field names used in the request body.
+ *
+ * @param {string} prompt - Content of the single user message.
+ * @param {object} options - Caller options; `options.model` overrides DEFAULT_MODEL when truthy.
+ * @param {boolean} [stream=false] - Value of the `stream` field in the payload.
+ * @returns {object} Request payload, ready for JSON.stringify.
+ */
+function buildPayload(prompt, options, stream = false) {
+  const opts = resolveOptions(options);

-    return {
-        model: options.model || DEFAULT_MODEL,
-        messages: [{ role: 'user', content: prompt }],
-        temperature:    opts.temperature,
-        max_tokens:     opts.maxTokens,
-        top_p:          opts.topP,
-        top_k:          opts.topK,
-        repeat_penalty: opts.repeatPenalty,
-        stream,
-        ...(opts.seed !== null && { seed: opts.seed }),
-    };
+  return {
+    model: options.model || DEFAULT_MODEL,
+    messages: [{ role: "user", content: prompt }],
+    temperature: opts.temperature,
+    max_tokens: opts.maxTokens,
+    top_p: opts.topP,
+    top_k: opts.topK,
+    repeat_penalty: opts.repeatPenalty,
+    stream,
+    // Add `seed` only when the resolved value is not null; spreading `false` adds nothing.
+    ...(opts.seed !== null && { seed: opts.seed }),
+  };
 }

-async function complete(prompt, options = {} ) {
-    const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify(buildPayload(prompt, options, false))
-    })
+/**
+ * Requests a non-streaming chat completion and returns the first choice.
+ *
+ * @param {string} prompt - Sent as the single user message.
+ * @param {object} [options] - Sampling options and optional `model`, passed to buildPayload.
+ * @returns {Promise<object>} `{ text, model, done, evalCount, promptEvalCount }`;
+ *   `done` is true only when the choice's finish_reason is "stop", and the two
+ *   counts are undefined when the response carries no `usage` object.
+ * @throws {Error} When the HTTP response status is not ok.
+ */
+async function complete(prompt, options = {}) {
+  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(buildPayload(prompt, options, false)),
+  });

-    if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
-    
-    const data = await res.json();
-    const choice = data.choices[0];
-    
-    return {
-        text: choice.message.content,
-        model: data.model,
-        done: choice.finish_reason === 'stop',
-        evalCount: data.usage?.completion_tokens,
-        promptEvalCount: data.usage?.prompt_tokens,
-    }
+  if (!res.ok)
+    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
+
+  const data = await res.json();
+  // Only the first choice is read.
+  const choice = data.choices[0];
+
+  return {
+    text: choice.message.content,
+    model: data.model,
+    done: choice.finish_reason === "stop",
+    evalCount: data.usage?.completion_tokens,
+    promptEvalCount: data.usage?.prompt_tokens,
+  };
 }

 async function* completeStream(prompt, options = {}) {
-    const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
-        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify(buildPayload(prompt, options, true))
-    });
+  let finalModel = DEFAULT_MODEL;
+  let finalTokenCount = 0;

-    if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
+  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(buildPayload(prompt, options, true)),
+  });

-    //OpenAI streaming sends newline-delimited JSON (NDJSON) with "data: " prefix for each chunk
-    //Example chunk: data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
-    //we parse each chunk as it arrives
-    for await (const chunk of res.body){
-        const lines = Buffer.from(chunk).toString('utf8')
-            .split('\n')
-            .filter(l => l.startsWith('data: ') && l !== 'data: [DONE]');
-        
-        for (const line of lines) {
-            const json = JSON.parse(line.slice(6)); //remove 'data: ' prefix
-            const delta = json.choices?.[0]?.delta?.content;
-            if (delta) yield {response: delta, done: false};
-        }        
+  if (!res.ok)
+    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
+
+  for await (const chunk of res.body) {
+    const lines = Buffer.from(chunk)
+      .toString("utf8")
+      .split("\n")
+      .filter((l) => l.startsWith("data: ") && l !== "data: [DONE]");
+
+    for (const line of lines) {
+      const json = JSON.parse(line.slice(6));
+      const delta = json.choices?.[0]?.delta?.content;
+
+      // Capture final metadata from the stop chunk
+      if (json.choices?.[0]?.finish_reason === "stop") {
+        finalModel = json.model ?? finalModel;
+        finalTokenCount = json.usage?.completion_tokens ?? finalTokenCount;
+      }
+
+      if (delta) yield { response: delta, done: false };
    }
-    yield { response: '', done: true}; //signal completion at the end of the stream
+  }
+  yield { response: '', done: true, model: finalModel, tokenCount: finalTokenCount };
 }

-module.exports = { complete, completeStream };
+module.exports = { complete, completeStream };
--- a/packages/inference-service/src/routes/inference.js
+++ b/packages/inference-service/src/routes/inference.js
@@ -24,22 +24,34 @@ router.post('/complete', async (req, res) => {
 router.post('/complete/stream', async (req, res) => {
    const { prompt, model, temperature } = req.body;

-    if (!prompt) {
-        return res.status(400).json({error: 'prompt is required'});
-    }
+    if (!prompt) return res.status(400).json({ error: 'prompt is required' });

    res.setHeader('Content-Type', 'text/event-stream');
    res.setHeader('Cache-Control', 'no-cache');
    res.setHeader('Connection', 'keep-alive');

    try {
-        for await (const chunk of completeStream(prompt, {model, temperature})) {
-            res.write(`data: ${JSON.stringify(chunk)}\n\n`);
+        let lastModel = model;
+        let tokenCount = 0;
+
+        for await (const chunk of completeStream(prompt, { model, temperature })) {
+            if (chunk.response) {
+                res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
+            }
+            if (chunk.done) {
+                // capture final metadata from the done signal
+                lastModel  = chunk.model      ?? lastModel;
+                tokenCount = chunk.tokenCount ?? tokenCount;
+            }
        }
+
+        // Send a single done event with metadata after stream closes
+        res.write(`data: ${JSON.stringify({ done: true, model: lastModel, tokenCount })}\n\n`);
        res.write('data: [DONE]\n\n');
-    } catch (error) {
-        console.error('[Inference] Streaming error:', error.message);
-        res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
+
+    } catch (err) {
+        console.error('[Inference] Streaming error:', err.message);
+        res.write(`data: ${JSON.stringify({ error: err.message })}\n\n`);
    } finally {
        res.end();
    }