chat client cleanup and switch to llama.cpp with network-shared models folder

2026-04-09 04:13:21 -07:00
parent 541e664da1
commit 5c6e027fc1
15 changed files with 305 additions and 305 deletions
--- a/packages/orchestration-service/src/chat/index.js
+++ b/packages/orchestration-service/src/chat/index.js
@@ -109,31 +109,35 @@ async function chatStream(externalId, userMessage, onChunk, options = {} ) {
    let tokenCount = 0;

    // 5. Parse SSE chunks
-    for await (const chunk of res.body){
+    // Each chunk may contain several newline-separated SSE lines; only "data: " lines carry payload.
+    for await (const chunk of res.body) {
        const lines = chunk.toString().split('\n');

        for (const line of lines) {
            if (!line.startsWith('data: ')) continue;
            const raw = line.slice(6).trim();
-            if (raw === '[DONE]') continue //stream closed sentinel
+            if (raw === '[DONE]') continue;

            try {
                const data = JSON.parse(raw);
-                if (data.model) model = data.model

+                // llama.cpp provider shape: { response, done }
                if (data.response) {
                    fullText += data.response;
                    onChunk(data.response);
                }

-                if (data.done && data.eval_count !== undefined) {
-                    tokenCount = (data.eval_count || 0) + (data.prompt_eval_count || 0)
-                }
-            } catch {
-                //partial chunk
-                //skip and wait for next
-            }
+                // model comes through on done chunk from inference route
+                if (data.model) model = data.model;

+                // token count — inference.js route sends this on the done chunk
+                if (data.done && data.tokenCount !== undefined) {
+                    tokenCount = data.tokenCount;
+                }
+
+            } catch {
+                // JSON.parse failed (e.g. a line split across chunks) — the line is dropped, not buffered
+            }
        }
    }