chat client clean up and switch to llama.cpp with models folder network sharing

This commit is contained in:
Storme-bit
2026-04-09 04:13:21 -07:00
parent 541e664da1
commit 5c6e027fc1
15 changed files with 305 additions and 305 deletions

View File

@@ -1,80 +1,90 @@
// Shared project helpers: env lookup, llama.cpp defaults, and sampling defaults.
const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require("@nexusai/shared");

// Base URL that every request in this module is sent to (the path
// `/v1/chat/completions` is appended to it). Presumably read from the
// INFERENCE_URL environment variable with LLAMACPP.DEFAULT_URL as the
// fallback -- confirm against getEnv in @nexusai/shared.
const BASE_URL = getEnv("INFERENCE_URL", LLAMACPP.DEFAULT_URL);

// Model name used when the caller does not supply `options.model`.
const DEFAULT_MODEL = getEnv("DEFAULT_MODEL", LLAMACPP.DEFAULT_MODEL);
/**
 * Fills in sampling options the caller did not provide with the shared
 * INFERENCE_DEFAULTS values.
 *
 * Nullish coalescing is used on purpose: an explicit `0` (for example
 * `temperature: 0`) is kept, only `null`/`undefined` fall back to the default.
 *
 * @param {object} [options] - Caller-supplied overrides; may be omitted.
 * @returns {{temperature: *, maxTokens: *, topP: *, topK: *, repeatPenalty: *, seed: *}}
 *   The fully-resolved option set.
 */
function resolveOptions(options) {
  // Tolerate a missing/null options argument instead of throwing a TypeError.
  const overrides = options ?? {};

  return {
    temperature: overrides.temperature ?? INFERENCE_DEFAULTS.TEMPERATURE,
    maxTokens: overrides.maxTokens ?? INFERENCE_DEFAULTS.MAX_TOKENS,
    topP: overrides.topP ?? INFERENCE_DEFAULTS.TOP_P,
    topK: overrides.topK ?? INFERENCE_DEFAULTS.TOP_K,
    repeatPenalty: overrides.repeatPenalty ?? INFERENCE_DEFAULTS.REPEAT_PENALTY,
    seed: overrides.seed ?? INFERENCE_DEFAULTS.SEED,
  };
}
/**
 * Builds the JSON request body for the `/v1/chat/completions` endpoint.
 *
 * The prompt is sent as a single `user` message. Sampling options are resolved
 * through `resolveOptions` and renamed to the snake_case keys used on the wire.
 *
 * @param {string} prompt - Text placed in the single user message.
 * @param {object} [options] - Sampling overrides plus an optional `model` name.
 * @param {boolean} [stream=false] - Value of the `stream` flag in the payload.
 * @returns {object} The request payload, ready for `JSON.stringify`.
 */
function buildPayload(prompt, options = {}, stream = false) {
  const requested = options ?? {};
  const opts = resolveOptions(requested);

  const payload = {
    model: requested.model || DEFAULT_MODEL,
    messages: [{ role: "user", content: prompt }],
    temperature: opts.temperature,
    max_tokens: opts.maxTokens,
    top_p: opts.topP,
    top_k: opts.topK,
    repeat_penalty: opts.repeatPenalty,
    stream,
  };

  // Only send a seed when one is actually set (neither null nor undefined),
  // so the key is absent from the payload otherwise.
  if (opts.seed != null) {
    payload.seed = opts.seed;
  }

  return payload;
}
/**
 * Sends a single, non-streaming chat completion request and returns the
 * generated text with basic metadata.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Sampling overrides and optional `model`, passed
 *   through to `buildPayload`.
 * @returns {Promise<{text: *, model: *, done: boolean, evalCount: *, promptEvalCount: *}>}
 *   `done` is true only when the server reported `finish_reason === "stop"`;
 *   the two count fields are `undefined` when the response has no `usage`.
 * @throws {Error} When the HTTP status is not OK, or the response body
 *   contains no choices.
 */
async function complete(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, false)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();
  const choice = data.choices?.[0];

  // Fail with a clear message instead of "cannot read properties of undefined"
  // when the server answers 200 but returns an empty/missing choices array.
  if (!choice) {
    throw new Error("llama.cpp error: response contained no choices");
  }

  return {
    text: choice.message.content,
    model: data.model,
    done: choice.finish_reason === "stop",
    evalCount: data.usage?.completion_tokens,
    promptEvalCount: data.usage?.prompt_tokens,
  };
}
/**
 * Extracts the JSON text carried by one line of a Server-Sent Events stream.
 *
 * @param {string} line - A single line of the response body (no newline).
 * @returns {string|null} The payload after the `data:` prefix, or `null` for
 *   blank lines, non-data lines, and the `[DONE]` terminator.
 */
function extractSseData(line) {
  const trimmed = line.trim();
  if (!trimmed.startsWith("data:")) return null;

  const payload = trimmed.slice("data:".length).trim();
  return payload === "" || payload === "[DONE]" ? null : payload;
}

/**
 * Streams a chat completion, yielding each piece of generated text as it
 * arrives and a final summary record when the stream ends.
 *
 * The endpoint streams Server-Sent Events: each event is a line of the form
 * `data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null}]}`
 * and the stream is terminated by `data: [DONE]`.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Sampling overrides and optional `model`, passed
 *   through to `buildPayload`.
 * @yields {{response: string, done: false}} One record per non-empty delta.
 * @yields {{response: '', done: true, model: *, tokenCount: *}} Exactly one
 *   final record. `model` falls back to DEFAULT_MODEL and `tokenCount` to 0
 *   when the stream never reports them.
 * @throws {Error} When the HTTP status is not OK.
 * @throws {SyntaxError} When a complete `data:` line is not valid JSON.
 */
async function* completeStream(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, true)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }

  let finalModel = DEFAULT_MODEL;
  let finalTokenCount = 0;

  // Parses one complete line, records any metadata it carries, and returns
  // the text delta ("" when the line has none).
  const handleLine = (line) => {
    const payload = extractSseData(line);
    if (payload === null) return "";

    const event = JSON.parse(payload);
    // Metadata is taken from whichever chunk carries it, so it is captured
    // whether the stream finishes with "stop", "length", or a usage-only chunk.
    finalModel = event.model ?? finalModel;
    finalTokenCount = event.usage?.completion_tokens ?? finalTokenCount;
    return event.choices?.[0]?.delta?.content ?? "";
  };

  // Network chunks do not align with line or character boundaries. The
  // streaming decoder keeps split multi-byte UTF-8 sequences intact, and
  // `pending` holds the trailing partial line until the rest arrives.
  const decoder = new TextDecoder("utf-8");
  let pending = "";

  for await (const chunk of res.body) {
    pending +=
      typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });

    const lines = pending.split("\n");
    pending = lines.pop();

    for (const line of lines) {
      const delta = handleLine(line);
      if (delta) yield { response: delta, done: false };
    }
  }

  // Flush the decoder and handle a last line that had no trailing newline.
  pending += decoder.decode();
  const tail = handleLine(pending);
  if (tail) yield { response: tail, done: false };

  yield { response: "", done: true, model: finalModel, tokenCount: finalTokenCount };
}
// Public API of this module: one-shot and streaming chat completion.
module.exports = { complete, completeStream };