implemented inference service
This commit is contained in:
63
packages/inference-service/src/providers/llamacpp.js
Normal file
63
packages/inference-service/src/providers/llamacpp.js
Normal file
@@ -0,0 +1,63 @@
|
||||
const { getEnv } = require('@nexusai/shared');
|
||||
|
||||
// Base URL that the llama.cpp requests below are sent to, read from INFERENCE_URL.
// NOTE(review): the second argument is presumably getEnv's fallback value — confirm in @nexusai/shared.
const BASE_URL = getEnv('INFERENCE_URL', 'http://localhost:8080');
|
||||
// Model name placed in the request payload when the caller does not pass options.model.
// NOTE(review): 'local-model' is presumably the fallback when DEFAULT_MODEL is unset — confirm getEnv semantics.
const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'local-model');
|
||||
|
||||
/**
 * Build the JSON body for a POST to the `/v1/chat/completions` endpoint.
 *
 * The token limit may be given as `num_predict` (the name this provider has
 * always read) or as `maxTokens` (the name the Ollama provider reads), so the
 * same options object works with either provider. `num_predict` wins when
 * both are present, which keeps existing callers' behaviour unchanged.
 *
 * @param {string} prompt - Text sent as a single `user` message.
 * @param {object} [options] - Per-request overrides.
 * @param {string} [options.model] - Model name; DEFAULT_MODEL is used when falsy.
 * @param {number} [options.temperature=0.7] - Sampling temperature (0 is honoured).
 * @param {number} [options.num_predict] - Completion token limit.
 * @param {number} [options.maxTokens] - Alternative name for the token limit.
 * @param {boolean} [stream=false] - Value of the payload's `stream` flag.
 * @returns {{model: string, messages: Array<{role: string, content: string}>,
 *   temperature: number, max_tokens: number, stream: boolean}} Request payload.
 */
function buildPayload(prompt, options = {}, stream = false) {
  return {
    model: options.model || DEFAULT_MODEL,
    messages: [{ role: 'user', content: prompt }],
    temperature: options.temperature ?? 0.7,
    max_tokens: options.num_predict ?? options.maxTokens ?? 1024,
    stream,
  };
}
|
||||
|
||||
/**
 * Request a single, non-streamed chat completion from the llama.cpp server.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Forwarded unchanged to buildPayload.
 * @returns {Promise<{text: string, model: string, done: boolean,
 *   evalCount: (number|undefined), promptEvalCount: (number|undefined)}>}
 *   The first choice's text, plus token counts when the server reports `usage`.
 * @throws {Error} When the HTTP status is not OK, or the response has no choices.
 */
async function complete(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(buildPayload(prompt, options, false)),
  });

  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);

  const data = await res.json();

  // Fail with a descriptive error instead of a TypeError when the server
  // answers 200 but sends no usable choice.
  const choice = data?.choices?.[0];
  if (!choice) {
    throw new Error('llama.cpp error: response contained no choices');
  }

  return {
    // A choice may carry null content; callers get an empty string instead.
    text: choice.message?.content ?? '',
    model: data.model,
    // NOTE(review): only 'stop' counts as done; a reply cut off by max_tokens
    // ('length') reports done: false — confirm that is intended.
    done: choice.finish_reason === 'stop',
    evalCount: data.usage?.completion_tokens,
    promptEvalCount: data.usage?.prompt_tokens,
  };
}
|
||||
|
||||
/**
 * Extract the text delta from one line of a streamed chat-completion response.
 *
 * @param {string} line - One line of the response body, without its newline.
 * @returns {(string|undefined)} The delta content, or undefined for lines that
 *   carry none (blank lines, non-`data:` fields, the `[DONE]` sentinel).
 * @throws {SyntaxError} When a `data:` line holds malformed JSON.
 */
function parseSseDelta(line) {
  const trimmed = line.trim(); // also strips the '\r' of CRLF line endings
  if (!trimmed.startsWith('data:')) return undefined;

  const payload = trimmed.slice('data:'.length).trim();
  if (payload === '' || payload === '[DONE]') return undefined;

  return JSON.parse(payload)?.choices?.[0]?.delta?.content;
}

/**
 * Stream a chat completion from the llama.cpp server.
 *
 * The response is a Server-Sent Events stream: each event is a line such as
 *   data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
 * and the stream ends with `data: [DONE]`.
 *
 * Network chunks do not align with line boundaries, so the undecoded tail of
 * each chunk is kept and prepended to the next one. Without that, a line or a
 * multi-byte character split across two chunks would be lost or corrupted.
 *
 * @param {string} prompt - Text sent as the user message.
 * @param {object} [options] - Forwarded unchanged to buildPayload.
 * @yields {{response: string, done: boolean}} One item per non-empty delta,
 *   then a final `{ response: '', done: true }`.
 * @throws {Error} When the HTTP status is not OK.
 */
async function* completeStream(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(buildPayload(prompt, options, true)),
  });

  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);

  // Streaming mode holds back an incomplete UTF-8 sequence until its
  // remaining bytes arrive in a later chunk.
  const decoder = new TextDecoder('utf-8');
  let pending = '';

  for await (const chunk of res.body) {
    pending += decoder.decode(chunk, { stream: true });

    const lines = pending.split('\n');
    pending = lines.pop(); // last piece has no newline yet; keep it for the next chunk

    for (const line of lines) {
      const delta = parseSseDelta(line);
      if (delta) yield { response: delta, done: false };
    }
  }

  // Flush whatever remains if the stream ended without a trailing newline.
  pending += decoder.decode();
  const lastDelta = parseSseDelta(pending);
  if (lastDelta) yield { response: lastDelta, done: false };

  yield { response: '', done: true }; // signal completion at the end of the stream
}
|
||||
|
||||
// Exported provider functions: complete (one-shot) and completeStream (async generator).
module.exports = { complete, completeStream };
|
||||
42
packages/inference-service/src/providers/ollama.js
Normal file
42
packages/inference-service/src/providers/ollama.js
Normal file
@@ -0,0 +1,42 @@
|
||||
const { Ollama } = require('ollama');
|
||||
const { getEnv } = require('@nexusai/shared');
|
||||
|
||||
// Ollama client used by both functions below; its host is read from INFERENCE_URL.
// NOTE(review): 'http://localhost:11434' is presumably getEnv's fallback value — confirm in @nexusai/shared.
const client = new Ollama({ host: getEnv('INFERENCE_URL', 'http://localhost:11434') });
|
||||
// Model name sent to Ollama when the caller does not pass options.model.
// NOTE(review): 'companion:latest' is presumably the fallback when DEFAULT_MODEL is unset — confirm getEnv semantics.
const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'companion:latest');
|
||||
|
||||
/**
 * Request a single, non-streamed completion from Ollama.
 *
 * The token limit may be given as `maxTokens` (the name this provider has
 * always read) or as `num_predict` (the name the llama.cpp provider reads), so
 * the same options object works with either provider. `maxTokens` wins when
 * both are present, which keeps existing callers' behaviour unchanged.
 *
 * @param {string} prompt - Prompt text passed to `client.generate`.
 * @param {object} [options] - Per-request overrides.
 * @param {string} [options.model] - Model name; DEFAULT_MODEL is used when falsy.
 * @param {number} [options.temperature=0.7] - Sampling temperature (0 is honoured).
 * @param {number} [options.maxTokens] - Completion token limit.
 * @param {number} [options.num_predict] - Alternative name for the token limit.
 * @returns {Promise<{text: string, model: string, done: boolean,
 *   evalCount: number, promptEvalCount: number}>} Fields copied from the
 *   Ollama response under provider-neutral names.
 */
async function complete(prompt, options = {}) {
  const response = await client.generate({
    model: options.model || DEFAULT_MODEL,
    prompt,
    stream: false,
    options: {
      temperature: options.temperature ?? 0.7,
      num_predict: options.maxTokens ?? options.num_predict ?? 1024,
    },
  });

  return {
    text: response.response,
    model: response.model,
    done: response.done,
    evalCount: response.eval_count,
    promptEvalCount: response.prompt_eval_count,
  };
}
|
||||
|
||||
/**
 * Stream a completion from Ollama, yielding each chunk as the client emits it.
 *
 * The token limit is forwarded with the same option names and 1024 default as
 * the non-streaming `complete`; earlier the streaming path sent no limit, so a
 * caller's `maxTokens` was ignored here.
 *
 * @param {string} prompt - Prompt text passed to `client.generate`.
 * @param {object} [options] - Per-request overrides.
 * @param {string} [options.model] - Model name; DEFAULT_MODEL is used when falsy.
 * @param {number} [options.temperature=0.7] - Sampling temperature (0 is honoured).
 * @param {number} [options.maxTokens] - Completion token limit.
 * @param {number} [options.num_predict] - Alternative name for the token limit.
 * @yields {object} Each chunk produced by the Ollama client, unmodified.
 */
async function* completeStream(prompt, options = {}) {
  const stream = await client.generate({
    model: options.model || DEFAULT_MODEL,
    prompt,
    stream: true,
    options: {
      temperature: options.temperature ?? 0.7,
      num_predict: options.maxTokens ?? options.num_predict ?? 1024,
    },
  });

  // Delegate directly to the client's async iterable; chunks pass through as-is.
  yield* stream;
}
|
||||
|
||||
// Exported provider functions: complete (one-shot) and completeStream (async generator).
module.exports = { complete, completeStream };
|
||||
Reference in New Issue
Block a user