chat client clean up and switch to llama.cpp with models folder network sharing
This commit is contained in:
@@ -1,80 +1,90 @@
|
||||
const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require('@nexusai/shared');
|
||||
const { getEnv, LLAMACPP, INFERENCE_DEFAULTS } = require("@nexusai/shared");
|
||||
|
||||
// Base URL for inference requests. Read via getEnv("INFERENCE_URL", ...);
// presumably getEnv returns its second argument when the variable is unset — confirm in @nexusai/shared.
const BASE_URL = getEnv("INFERENCE_URL", LLAMACPP.DEFAULT_URL);

// Model name used when a request does not name one (see `options.model || DEFAULT_MODEL`).
const DEFAULT_MODEL = getEnv("DEFAULT_MODEL", LLAMACPP.DEFAULT_MODEL);
|
||||
|
||||
/**
 * Fill in every sampling option the caller left unset with the shared default.
 *
 * Only `null`/`undefined` fall back (`??`), so explicit falsy values such as
 * `temperature: 0` are preserved.
 *
 * @param {object} [options] - Caller overrides; may be omitted or `null`.
 * @param {number} [options.temperature]
 * @param {number} [options.maxTokens]
 * @param {number} [options.topP]
 * @param {number} [options.topK]
 * @param {number} [options.repeatPenalty]
 * @param {number|null} [options.seed]
 * @returns {{temperature: *, maxTokens: *, topP: *, topK: *, repeatPenalty: *, seed: *}}
 *   The resolved option set in camelCase.
 */
function resolveOptions(options = {}) {
  // The default parameter covers `undefined`; this also covers an explicit `null`.
  const overrides = options ?? {};

  return {
    temperature: overrides.temperature ?? INFERENCE_DEFAULTS.TEMPERATURE,
    maxTokens: overrides.maxTokens ?? INFERENCE_DEFAULTS.MAX_TOKENS,
    topP: overrides.topP ?? INFERENCE_DEFAULTS.TOP_P,
    topK: overrides.topK ?? INFERENCE_DEFAULTS.TOP_K,
    repeatPenalty: overrides.repeatPenalty ?? INFERENCE_DEFAULTS.REPEAT_PENALTY,
    seed: overrides.seed ?? INFERENCE_DEFAULTS.SEED,
  };
}
|
||||
|
||||
/**
 * Build the JSON body for a `/v1/chat/completions` request.
 *
 * The prompt is sent as a single `user` message and the resolved sampling
 * options are mapped from camelCase to the snake_case request fields.
 *
 * @param {string} prompt - Text placed in the single user message.
 * @param {object} [options] - Caller overrides; `options.model` selects the model,
 *   the rest is passed to `resolveOptions`. May be omitted or `null`.
 * @param {boolean} [stream=false] - Value of the request's `stream` field.
 * @returns {object} The request payload. `seed` is included only when one is set.
 */
function buildPayload(prompt, options = {}, stream = false) {
  // Normalise here so neither the model lookup nor resolveOptions sees null/undefined.
  const overrides = options ?? {};
  const opts = resolveOptions(overrides);

  const payload = {
    model: overrides.model || DEFAULT_MODEL,
    messages: [{ role: "user", content: prompt }],
    temperature: opts.temperature,
    max_tokens: opts.maxTokens,
    top_p: opts.topP,
    top_k: opts.topK,
    repeat_penalty: opts.repeatPenalty,
    stream,
  };

  // `!= null` skips both null and undefined, so an unset seed never becomes a key.
  if (opts.seed != null) {
    payload.seed = opts.seed;
  }

  return payload;
}
|
||||
|
||||
/**
 * Request a single, non-streaming chat completion for `prompt`.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Forwarded to `buildPayload` (model and sampling overrides).
 * @returns {Promise<{text: *, model: *, done: boolean, evalCount: *, promptEvalCount: *}>}
 *   `text` is the first choice's message content, `done` is true when that
 *   choice's `finish_reason` is `"stop"`, and the two counts come from the
 *   response's `usage` object (undefined when the server omits it).
 * @throws {Error} When the HTTP status is not OK, or the response has no choices.
 */
async function complete(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, false)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }

  const data = await res.json();
  const choice = data?.choices?.[0];

  // A 200 response without choices would otherwise fail below with an
  // unhelpful "cannot read properties of undefined" TypeError.
  if (!choice) {
    throw new Error("llama.cpp error: response contained no choices");
  }

  return {
    text: choice.message.content,
    model: data.model,
    done: choice.finish_reason === "stop",
    evalCount: data.usage?.completion_tokens,
    promptEvalCount: data.usage?.prompt_tokens,
  };
}
|
||||
|
||||
/**
 * Parse one line of a server-sent-event stream into its JSON payload.
 *
 * @param {string} line - A single line with the trailing newline already removed.
 * @returns {object|null} The parsed payload, or `null` for blank lines,
 *   non-`data:` fields and the terminating `[DONE]` sentinel.
 * @throws {SyntaxError} When a complete `data:` line does not hold valid JSON.
 */
function parseSseDataLine(line) {
  const text = line.trim(); // also strips the "\r" of CRLF line endings
  if (!text.startsWith("data:")) return null;

  const payload = text.slice("data:".length).trim();
  if (payload === "" || payload === "[DONE]") return null;

  return JSON.parse(payload);
}

/**
 * Stream a chat completion for `prompt`, yielding text as it arrives.
 *
 * The server replies with server-sent events, one `data: {json}` line per
 * event, e.g.
 *   data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
 * and ends with `data: [DONE]`.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Forwarded to `buildPayload` (model and sampling overrides).
 * @yields {{response: string, done: false}} One item per non-empty content delta.
 * @yields {{response: '', done: true, model: *, tokenCount: number}} Exactly one
 *   final item after the stream ends. `model` falls back to the requested
 *   model (or `DEFAULT_MODEL`) and `tokenCount` to 0 when the stream never
 *   reported them.
 * @throws {Error} When the HTTP status is not OK or the response has no body.
 */
async function* completeStream(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildPayload(prompt, options, true)),
  });

  if (!res.ok) {
    throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  }
  if (!res.body) {
    throw new Error("llama.cpp error: streaming response has no body");
  }

  let finalModel = options.model || DEFAULT_MODEL;
  let finalTokenCount = 0;

  // Records any metadata carried by the event and returns its text delta ('' if none).
  // Metadata is taken from every event, not only the one with finish_reason "stop",
  // so it is still captured when generation ends for another reason (e.g. "length").
  const readDelta = (line) => {
    const event = parseSseDataLine(line);
    if (event === null) return "";
    finalModel = event.model ?? finalModel;
    finalTokenCount = event.usage?.completion_tokens ?? finalTokenCount;
    return event.choices?.[0]?.delta?.content ?? "";
  };

  // Network chunks do not align with event boundaries: a chunk may end in the
  // middle of a line, or in the middle of a multi-byte UTF-8 character. Decode
  // in streaming mode and only parse lines once their newline has arrived.
  const decoder = new TextDecoder("utf-8");
  let pending = "";

  for await (const chunk of res.body) {
    pending += typeof chunk === "string" ? chunk : decoder.decode(chunk, { stream: true });

    const lines = pending.split("\n");
    pending = lines.pop(); // last piece has no newline yet; keep it for the next chunk

    for (const line of lines) {
      const delta = readDelta(line);
      if (delta) yield { response: delta, done: false };
    }
  }

  // Flush the decoder and handle a final line that was not newline-terminated.
  pending += decoder.decode();
  const lastDelta = readDelta(pending);
  if (lastDelta) yield { response: lastDelta, done: false };

  yield { response: "", done: true, model: finalModel, tokenCount: finalTokenCount };
}
|
||||
|
||||
// Public API: one-shot completion and streaming completion.
module.exports = { complete, completeStream };
|
||||
|
||||
@@ -24,22 +24,34 @@ router.post('/complete', async (req, res) => {
|
||||
router.post('/complete/stream', async (req, res) => {
|
||||
const { prompt, model, temperature } = req.body;
|
||||
|
||||
if (!prompt) {
|
||||
return res.status(400).json({error: 'prompt is required'});
|
||||
}
|
||||
if (!prompt) return res.status(400).json({ error: 'prompt is required' });
|
||||
|
||||
res.setHeader('Content-Type', 'text/event-stream');
|
||||
res.setHeader('Cache-Control', 'no-cache');
|
||||
res.setHeader('Connection', 'keep-alive');
|
||||
|
||||
try {
|
||||
for await (const chunk of completeStream(prompt, {model, temperature})) {
|
||||
res.write(`data: ${JSON.stringify(chunk)}\n\n`);
|
||||
let lastModel = model;
|
||||
let tokenCount = 0;
|
||||
|
||||
for await (const chunk of completeStream(prompt, { model, temperature })) {
|
||||
if (chunk.response) {
|
||||
res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
|
||||
}
|
||||
if (chunk.done) {
|
||||
// capture final metadata from the done signal
|
||||
lastModel = chunk.model ?? lastModel;
|
||||
tokenCount = chunk.tokenCount ?? tokenCount;
|
||||
}
|
||||
}
|
||||
|
||||
// Send a single done event with metadata after stream closes
|
||||
res.write(`data: ${JSON.stringify({ done: true, model: lastModel, tokenCount })}\n\n`);
|
||||
res.write('data: [DONE]\n\n');
|
||||
} catch (error) {
|
||||
console.error('[Inference] Streaming error:', error.message);
|
||||
res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
|
||||
|
||||
} catch (err) {
|
||||
console.error('[Inference] Streaming error:', err.message);
|
||||
res.write(`data: ${JSON.stringify({ error: err.message })}\n\n`);
|
||||
} finally {
|
||||
res.end();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user