diff --git a/packages/inference-service/src/providers/llamacpp.js b/packages/inference-service/src/providers/llamacpp.js index 9503f24..69eb6ac 100644 --- a/packages/inference-service/src/providers/llamacpp.js +++ b/packages/inference-service/src/providers/llamacpp.js @@ -73,16 +73,19 @@ async function* completeStream(prompt, options = {}) { .filter((l) => l.startsWith("data: ") && l !== "data: [DONE]"); for (const line of lines) { - const json = JSON.parse(line.slice(6)); - const delta = json.choices?.[0]?.delta?.content; + const json = JSON.parse(line.slice(6)); + const delta = json.choices?.[0]?.delta?.content; - // Capture final metadata from the stop chunk - if (json.choices?.[0]?.finish_reason === "stop") { - finalModel = json.model ?? finalModel; - finalTokenCount = json.usage?.completion_tokens ?? finalTokenCount; - } + if (json.choices?.[0]?.finish_reason === 'stop') { + finalModel = json.model ?? finalModel; + } - if (delta) yield { response: delta, done: false }; + // usage arrives in a separate final chunk with empty choices array + if (json.usage) { + finalTokenCount = (json.usage.completion_tokens ?? 0) + (json.usage.prompt_tokens ?? 0); + } + + if (delta) yield { response: delta, done: false }; } } yield { response: '', done: true, model: finalModel, tokenCount: finalTokenCount };