chat client clean up and switch to llama.cpp with models folder network sharing

This commit is contained in:
Storme-bit
2026-04-09 04:13:21 -07:00
parent 541e664da1
commit 5c6e027fc1
15 changed files with 305 additions and 305 deletions

View File

@@ -1,80 +1,90 @@
// Shared helpers and llama.cpp constants provided by the workspace package.
const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require("@nexusai/shared");

// Base URL of the inference server. Presumably getEnv reads the INFERENCE_URL
// environment variable and falls back to the second argument -- confirm
// against @nexusai/shared.
const BASE_URL = getEnv("INFERENCE_URL", LLAMACPP.DEFAULT_URL);

// Model name sent when a request does not name one (same getEnv assumption).
const DEFAULT_MODEL = getEnv("DEFAULT_MODEL", LLAMACPP.DEFAULT_MODEL);
/**
 * Resolves the sampling options for a request.
 *
 * Every field that is `null` or `undefined` in `options` is replaced by the
 * matching entry of `INFERENCE_DEFAULTS`; falsy-but-valid values such as `0`
 * are kept as given.
 *
 * @param {object} [options] - Caller-supplied overrides (may be omitted).
 * @returns {{temperature: *, maxTokens: *, topP: *, topK: *, repeatPenalty: *, seed: *}}
 *   The six sampling settings with defaults applied.
 */
function resolveOptions(options = {}) {
  return {
    temperature: options.temperature ?? INFERENCE_DEFAULTS.TEMPERATURE,
    maxTokens: options.maxTokens ?? INFERENCE_DEFAULTS.MAX_TOKENS,
    topP: options.topP ?? INFERENCE_DEFAULTS.TOP_P,
    topK: options.topK ?? INFERENCE_DEFAULTS.TOP_K,
    repeatPenalty: options.repeatPenalty ?? INFERENCE_DEFAULTS.REPEAT_PENALTY,
    seed: options.seed ?? INFERENCE_DEFAULTS.SEED,
  };
}
/**
 * Builds the JSON body for a `/v1/chat/completions` request.
 *
 * The prompt is sent as a single `user` message. Sampling settings come from
 * `resolveOptions(options)` and are renamed to the snake_case keys used in
 * the request body.
 *
 * @param {string} prompt - Text placed in the single user message.
 * @param {object} [options] - Optional `model` plus sampling overrides.
 * @param {boolean} [stream=false] - Value of the request's `stream` flag.
 * @returns {object} Request payload ready for `JSON.stringify`.
 */
function buildPayload(prompt, options = {}, stream = false) {
  const opts = resolveOptions(options);

  const payload = {
    // `||` (not `??`) so an empty-string model also falls back to the default.
    model: options.model || DEFAULT_MODEL,
    messages: [{ role: "user", content: prompt }],
    temperature: opts.temperature,
    max_tokens: opts.maxTokens,
    top_p: opts.topP,
    top_k: opts.topK,
    repeat_penalty: opts.repeatPenalty,
    stream,
  };

  // Only send a seed when one is actually set; `!= null` skips both null and
  // undefined while still allowing a seed of 0.
  if (opts.seed != null) {
    payload.seed = opts.seed;
  }

  return payload;
}
/**
 * Requests a single, non-streaming chat completion.
 *
 * POSTs the payload built by `buildPayload(prompt, options, false)` to
 * `${BASE_URL}/v1/chat/completions` and maps the first choice of the JSON
 * response to a flat result object.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Optional `model` and sampling overrides.
 * @returns {Promise<{text: *, model: *, done: boolean, evalCount: *, promptEvalCount: *}>}
 *   `done` is true only when the choice's `finish_reason` is `"stop"`; the two
 *   counts are `undefined` when the response carries no `usage` object.
 * @throws {Error} When the HTTP status is not OK or the response has no choices.
 */
async function complete(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, false)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();

  // Guard explicitly so an empty/missing `choices` array produces a readable
  // error instead of "Cannot read properties of undefined".
  const choice = data?.choices?.[0];
  if (!choice) {
    throw new Error("llama.cpp error: response contained no choices");
  }

  return {
    text: choice.message.content,
    model: data.model,
    done: choice.finish_reason === "stop",
    evalCount: data.usage?.completion_tokens,
    promptEvalCount: data.usage?.prompt_tokens,
  };
}
/**
 * Parses one line of the streamed response body.
 *
 * @param {string} rawLine - A single line, possibly with a trailing "\r".
 * @returns {object|null} The decoded JSON payload of a `data:` line, or `null`
 *   for blank lines, non-data lines and the `[DONE]` sentinel.
 * @throws {SyntaxError} When a `data:` line does not contain valid JSON.
 */
function parseSseDataLine(rawLine) {
  const line = rawLine.trim();
  if (!line.startsWith("data:")) return null;

  const payload = line.slice("data:".length).trim();
  if (payload === "" || payload === "[DONE]") return null;

  return JSON.parse(payload);
}

/**
 * Requests a streaming chat completion and yields text as it arrives.
 *
 * The response body is read as a Server-Sent-Events style stream: each event
 * is a line of the form `data: {json}`, and the stream ends with
 * `data: [DONE]`. Example event:
 *   data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
 *
 * Network chunks do not align with line boundaries, so bytes are decoded with
 * a streaming TextDecoder and incomplete trailing lines are carried over to
 * the next chunk before parsing.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Optional `model` and sampling overrides.
 * @yields {{response: string, done: false}} One item per non-empty content delta.
 * @yields {{response: '', done: true, model: *, tokenCount: number}} Exactly one
 *   final item after the stream closes. `model` falls back to `DEFAULT_MODEL`
 *   and `tokenCount` to 0 when the stream never reported them.
 * @throws {Error} When the HTTP status is not OK.
 */
async function* completeStream(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, true)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }

  let finalModel = DEFAULT_MODEL;
  let finalTokenCount = 0;

  // Records metadata from an event and returns its content delta (if any).
  // Metadata is taken from whichever event carries it, so it is not lost when
  // generation ends for a reason other than "stop".
  const consume = (rawLine) => {
    const json = parseSseDataLine(rawLine);
    if (json === null) return undefined;
    finalModel = json.model ?? finalModel;
    finalTokenCount = json.usage?.completion_tokens ?? finalTokenCount;
    return json.choices?.[0]?.delta?.content;
  };

  const decoder = new TextDecoder("utf-8");
  let pending = ""; // text of the last, not-yet-terminated line

  for await (const chunk of res.body) {
    // `stream: true` keeps a multi-byte character split across chunks intact.
    pending += typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });

    const lines = pending.split("\n");
    pending = lines.pop(); // last piece has no newline yet; keep it for later

    for (const line of lines) {
      const delta = consume(line);
      if (delta) yield { response: delta, done: false };
    }
  }

  // Flush the decoder and handle a final line that had no trailing newline.
  pending += decoder.decode();
  if (pending.trim() !== "") {
    const delta = consume(pending);
    if (delta) yield { response: delta, done: false };
  }

  yield { response: "", done: true, model: finalModel, tokenCount: finalTokenCount };
}
// Public API of this client: one-shot and streaming chat completion.
module.exports = { complete, completeStream };

View File

@@ -24,22 +24,34 @@ router.post('/complete', async (req, res) => {
router.post('/complete/stream', async (req, res) => {
const { prompt, model, temperature } = req.body;
if (!prompt) {
return res.status(400).json({error: 'prompt is required'});
}
if (!prompt) return res.status(400).json({ error: 'prompt is required' });
res.setHeader('Content-Type', 'text/event-stream');
res.setHeader('Cache-Control', 'no-cache');
res.setHeader('Connection', 'keep-alive');
try {
for await (const chunk of completeStream(prompt, {model, temperature})) {
res.write(`data: ${JSON.stringify(chunk)}\n\n`);
let lastModel = model;
let tokenCount = 0;
for await (const chunk of completeStream(prompt, { model, temperature })) {
if (chunk.response) {
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
}
if (chunk.done) {
// capture final metadata from the done signal
lastModel = chunk.model ?? lastModel;
tokenCount = chunk.tokenCount ?? tokenCount;
}
}
// Send a single done event with metadata after stream closes
res.write(`data: ${JSON.stringify({ done: true, model: lastModel, tokenCount })}\n\n`);
res.write('data: [DONE]\n\n');
} catch (error) {
console.error('[Inference] Streaming error:', error.message);
res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
} catch (err) {
console.error('[Inference] Streaming error:', err.message);
res.write(`data: ${JSON.stringify({ error: err.message })}\n\n`);
} finally {
res.end();
}