implemented inference service

Storme-bit
2026-04-05 04:18:05 -07:00
parent 2bbdd21bd3
commit a449d570ea
6 changed files with 180 additions and 2 deletions

View File

@@ -1,6 +1,7 @@
require('dotenv').config();
const express = require('express');
const { getEnv } = require('@nexusai/shared');
const inferenceRouter = require('./routes/inference');
const app = express();
app.use(express.json());
@@ -9,9 +10,16 @@ const PORT = getEnv('PORT', '3001'); // Default to 3001 if PORT is not set
// Health check endpoint
app.get('/health', (req, res) => {
res.json({ service: 'Inference Service', status: 'healthy' });
res.json({
service: 'Inference Service',
status: 'healthy',
provider: getEnv('INFERENCE_PROVIDER', 'ollama'),
model: getEnv('DEFAULT_MODEL', 'llama3.2')
});
});
app.use('/', inferenceRouter);
// Start the server
app.listen(PORT, () => {
console.log(`Inference Service is running on port ${PORT}`);

View File

@@ -0,0 +1,17 @@
// packages/inference-service/src/infer.js
// Selects the inference backend once at startup based on INFERENCE_PROVIDER.
const { getEnv } = require('@nexusai/shared');

const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');

// Each provider is required lazily, so only the selected backend's module
// (and its dependencies) is actually loaded.
const providers = {
  ollama: () => require('./providers/ollama.js'),
  llamacpp: () => require('./providers/llamacpp.js'),
};

if (!providers[PROVIDER]) {
  throw new Error(`Unknown inference provider: "${PROVIDER}". Valid options: ${Object.keys(providers).join(', ')}`);
}

const { complete, completeStream } = providers[PROVIDER]();

module.exports = { complete, completeStream };
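For context, everything else in the service goes through this module, so switching backends is purely an environment change. A minimal usage sketch (the script name, prompt, and option values are illustrative assumptions, not part of this commit):

// Hypothetical caller, run e.g. as:
//   INFERENCE_PROVIDER=llamacpp INFERENCE_URL=http://localhost:8080 node try-infer.js
const { complete } = require('./infer');

(async () => {
  const result = await complete('Say hello in one short sentence.', { temperature: 0.2 });
  console.log(result.text);
})();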

View File

@@ -0,0 +1,63 @@
// packages/inference-service/src/providers/llamacpp.js (path inferred from the require in infer.js)
const { getEnv } = require('@nexusai/shared');

const BASE_URL = getEnv('INFERENCE_URL', 'http://localhost:8080');
const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'local-model');

// Builds an OpenAI-compatible chat-completion request body.
function buildPayload(prompt, options, stream = false) {
  return {
    model: options.model || DEFAULT_MODEL,
    messages: [{ role: 'user', content: prompt }],
    temperature: options.temperature ?? 0.7,
    // The router passes maxTokens; map it onto the OpenAI-style max_tokens field
    max_tokens: options.maxTokens ?? 1024,
    stream,
  };
}

async function complete(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(buildPayload(prompt, options, false)),
  });
  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  const data = await res.json();
  const choice = data.choices[0];
  return {
    text: choice.message.content,
    model: data.model,
    done: choice.finish_reason === 'stop',
    evalCount: data.usage?.completion_tokens,
    promptEvalCount: data.usage?.prompt_tokens,
  };
}

async function* completeStream(prompt, options = {}) {
  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(buildPayload(prompt, options, true)),
  });
  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
  // OpenAI-compatible streaming is sent as server-sent events: each event is a line
  // prefixed with "data: ", and the stream ends with "data: [DONE]".
  // Example chunk: data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
  // A network chunk can split an event across boundaries, so buffer any partial line.
  let buffer = '';
  for await (const chunk of res.body) {
    buffer += Buffer.from(chunk).toString('utf8');
    const lines = buffer.split('\n');
    buffer = lines.pop(); // keep the trailing partial line for the next chunk
    for (const line of lines) {
      if (!line.startsWith('data: ') || line.startsWith('data: [DONE]')) continue;
      const json = JSON.parse(line.slice(6)); // strip the 'data: ' prefix
      const delta = json.choices?.[0]?.delta?.content;
      if (delta) yield { response: delta, done: false };
    }
  }
  yield { response: '', done: true }; // signal completion at the end of the stream
}

module.exports = { complete, completeStream };
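For illustration, a consumer of this provider only ever sees the normalized { response, done } chunks, never the raw SSE lines. A rough sketch, assuming a llama.cpp server is reachable at INFERENCE_URL and that the script lives next to infer.js (the prompt is just an example):

const { completeStream } = require('./providers/llamacpp.js');

(async () => {
  for await (const chunk of completeStream('Write a one-line greeting.')) {
    if (!chunk.done) process.stdout.write(chunk.response); // print tokens as they arrive
  }
  process.stdout.write('\n');
})();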

View File

@@ -0,0 +1,42 @@
// packages/inference-service/src/providers/ollama.js (path inferred from the require in infer.js)
const { Ollama } = require('ollama');
const { getEnv } = require('@nexusai/shared');

const client = new Ollama({ host: getEnv('INFERENCE_URL', 'http://localhost:11434') });
const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'companion:latest');

async function complete(prompt, options = {}) {
  const response = await client.generate({
    model: options.model || DEFAULT_MODEL,
    prompt,
    stream: false,
    options: {
      temperature: options.temperature ?? 0.7,
      num_predict: options.maxTokens ?? 1024,
    },
  });
  return {
    text: response.response,
    model: response.model,
    done: response.done,
    evalCount: response.eval_count,
    promptEvalCount: response.prompt_eval_count,
  };
}

async function* completeStream(prompt, options = {}) {
  const stream = await client.generate({
    model: options.model || DEFAULT_MODEL,
    prompt,
    stream: true,
    options: {
      temperature: options.temperature ?? 0.7,
    },
  });
  // Ollama's streaming chunks already carry `response` and `done` fields, matching the
  // shape the llama.cpp provider normalizes to, so they are passed through unchanged.
  for await (const chunk of stream) {
    yield chunk;
  }
}

module.exports = { complete, completeStream };

View File

@@ -0,0 +1,48 @@
// packages/inference-service/src/routes/inference.js (path inferred from the require in the service entry point)
const { Router } = require('express');
const { complete, completeStream } = require('../infer');

const router = Router();

// Standard completion endpoint - returns the full response once generation is done
router.post('/complete', async (req, res) => {
  const { prompt, model, temperature, maxTokens } = req.body;
  if (!prompt) {
    return res.status(400).json({ error: 'prompt is required' });
  }
  try {
    const result = await complete(prompt, { model, temperature, maxTokens });
    res.json(result);
  } catch (error) {
    console.error('[Inference] Completion error:', error.message);
    res.status(500).json({ error: error.message });
  }
});

// Streaming completion endpoint - sends partial responses as server-sent events while they arrive
router.post('/complete/stream', async (req, res) => {
  const { prompt, model, temperature } = req.body;
  if (!prompt) {
    return res.status(400).json({ error: 'prompt is required' });
  }
  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  res.setHeader('Connection', 'keep-alive');
  try {
    for await (const chunk of completeStream(prompt, { model, temperature })) {
      res.write(`data: ${JSON.stringify(chunk)}\n\n`);
    }
    res.write('data: [DONE]\n\n');
  } catch (error) {
    console.error('[Inference] Streaming error:', error.message);
    res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
  } finally {
    res.end();
  }
});

module.exports = router;
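To close the loop, a client of the streaming endpoint has to split the response on blank lines and strip the data: prefix written above. A hedged client sketch (host, port, and prompt are assumptions; 3001 is only the service's default PORT):

(async () => {
  const res = await fetch('http://localhost:3001/complete/stream', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ prompt: 'Write a haiku about local inference.' }),
  });
  let buffer = '';
  for await (const part of res.body) {
    buffer += Buffer.from(part).toString('utf8');
    const events = buffer.split('\n\n');
    buffer = events.pop(); // keep any partial event for the next read
    for (const event of events) {
      if (!event.startsWith('data: ') || event === 'data: [DONE]') continue;
      const payload = JSON.parse(event.slice(6));
      if (payload.error) throw new Error(payload.error);
      if (!payload.done) process.stdout.write(payload.response);
    }
  }
  process.stdout.write('\n');
})();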