From a449d570ea7df433dfd6b854f26f1e6642b6ee1b Mon Sep 17 00:00:00 2001
From: Storme-bit
Date: Sun, 5 Apr 2026 04:18:05 -0700
Subject: [PATCH] Implement inference service

---
 packages/inference-service/package.json            |  2 +-
 packages/inference-service/src/index.js            | 10 ++-
 packages/inference-service/src/infer.js            | 17 +++++
 .../src/providers/llamacpp.js                      | 64 +++++++++++++++++++
 .../inference-service/src/providers/ollama.js      | 44 +++++++++++++
 .../inference-service/src/routes/inference.js      | 48 ++++++++++++++
 6 files changed, 183 insertions(+), 2 deletions(-)
 create mode 100644 packages/inference-service/src/infer.js
 create mode 100644 packages/inference-service/src/providers/llamacpp.js
 create mode 100644 packages/inference-service/src/providers/ollama.js
 create mode 100644 packages/inference-service/src/routes/inference.js
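Notes (this section, between the diffstat and the first diff, is not applied
by git am):

The provider is picked once at process startup from the environment. A
minimal example .env, using only the variables this patch reads (values are
illustrative; each falls back to the default shown in the code):

    INFERENCE_PROVIDER=ollama             # or: llamacpp
    INFERENCE_URL=http://localhost:11434  # llama.cpp provider defaults to http://localhost:8080
    DEFAULT_MODEL=llama3.2
    PORT=3001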
diff --git a/packages/inference-service/package.json b/packages/inference-service/package.json
index b541d0e..0d33185 100644
--- a/packages/inference-service/package.json
+++ b/packages/inference-service/package.json
@@ -6,7 +6,7 @@
     "start": "node src/index.js",
     "dev": "node --watch src/index.js"
   },
-  "dependencies": { 
+  "dependencies": {
     "@nexusai/shared": "^1.0.0",
     "dotenv": "^17.4.0",
     "express": "^5.2.1",
diff --git a/packages/inference-service/src/index.js b/packages/inference-service/src/index.js
index cf5b215..fe222f7 100644
--- a/packages/inference-service/src/index.js
+++ b/packages/inference-service/src/index.js
@@ -1,6 +1,7 @@
 require ('dotenv').config();
 const express = require('express');
 const {getEnv} = require('@nexusai/shared');
+const inferenceRouter = require('./routes/inference');
 
 const app = express();
 app.use(express.json());
@@ -9,9 +9,16 @@ const PORT = getEnv('PORT', '3001'); // Default to 3001 if PORT is not set
 
 // Health check endpoint
 app.get('/health', (req, res) => {
-  res.json({ service: 'Inference Service', status: 'healthy' });
+  res.json({
+    service: 'Inference Service',
+    status: 'healthy',
+    provider: getEnv('INFERENCE_PROVIDER', 'ollama'),
+    model: getEnv('DEFAULT_MODEL', 'llama3.2')
+  });
 });
 
+app.use('/', inferenceRouter);
+
 // Start the server
 app.listen(PORT, () => {
   console.log(`Inference Service is running on port ${PORT}`);
diff --git a/packages/inference-service/src/infer.js b/packages/inference-service/src/infer.js
new file mode 100644
index 0000000..42a343e
--- /dev/null
+++ b/packages/inference-service/src/infer.js
@@ -0,0 +1,17 @@
+// packages/inference-service/src/infer.js
+const { getEnv } = require('@nexusai/shared');
+
+const PROVIDER = getEnv('INFERENCE_PROVIDER', 'ollama');
+
+const providers = {
+  ollama: () => require('./providers/ollama.js'),
+  llamacpp: () => require('./providers/llamacpp.js'),
+};
+
+if (!providers[PROVIDER]) {
+  throw new Error(`Unknown inference provider: "${PROVIDER}". Valid options: ${Object.keys(providers).join(', ')}`);
+}
+
+const { complete, completeStream } = providers[PROVIDER]();
+
+module.exports = { complete, completeStream };
\ No newline at end of file
diff --git a/packages/inference-service/src/providers/llamacpp.js b/packages/inference-service/src/providers/llamacpp.js
new file mode 100644
index 0000000..ef5e32b
--- /dev/null
+++ b/packages/inference-service/src/providers/llamacpp.js
@@ -0,0 +1,64 @@
+const { getEnv } = require('@nexusai/shared');
+
+const BASE_URL = getEnv('INFERENCE_URL', 'http://localhost:8080');
+const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'local-model');
+
+function buildPayload(prompt, options, stream = false) {
+  return {
+    model: options.model || DEFAULT_MODEL,
+    messages: [{ role: 'user', content: prompt }],
+    temperature: options.temperature ?? 0.7,
+    max_tokens: options.maxTokens ?? 1024,
+    stream,
+  };
+}
+
+async function complete(prompt, options = {}) {
+  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(buildPayload(prompt, options, false))
+  });
+
+  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
+
+  const data = await res.json();
+  const choice = data.choices[0];
+
+  return {
+    text: choice.message.content,
+    model: data.model,
+    done: choice.finish_reason === 'stop',
+    evalCount: data.usage?.completion_tokens,
+    promptEvalCount: data.usage?.prompt_tokens,
+  };
+}
+
+async function* completeStream(prompt, options = {}) {
+  const res = await fetch(`${BASE_URL}/v1/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify(buildPayload(prompt, options, true))
+  });
+
+  if (!res.ok) throw new Error(`llama.cpp error: ${res.status} ${res.statusText}`);
+
+  // The OpenAI-compatible endpoint streams Server-Sent Events (SSE), one "data: " line per event.
+  // Example event: data: {"choices":[{"delta":{"content":"Hello"},"finish_reason":null,"index":0}]}
+  // An event can be split across network reads, so buffer any partial line between chunks.
+  let buffer = '';
+  for await (const chunk of res.body) {
+    buffer += Buffer.from(chunk).toString('utf8');
+    const lines = buffer.split('\n');
+    buffer = lines.pop(); // keep a trailing partial line for the next read
+
+    for (const line of lines.filter(l => l.startsWith('data: ') && l !== 'data: [DONE]')) {
+      const json = JSON.parse(line.slice(6)); // remove 'data: ' prefix
+      const delta = json.choices?.[0]?.delta?.content;
+      if (delta) yield { response: delta, done: false };
+    }
+  }
+  yield { response: '', done: true }; // signal completion at the end of the stream
+}
+
+module.exports = { complete, completeStream };
\ No newline at end of file
diff --git a/packages/inference-service/src/providers/ollama.js b/packages/inference-service/src/providers/ollama.js
new file mode 100644
index 0000000..bc3f42f
--- /dev/null
+++ b/packages/inference-service/src/providers/ollama.js
@@ -0,0 +1,44 @@
+const { Ollama } = require('ollama');
+const { getEnv } = require('@nexusai/shared');
+
+const client = new Ollama({ host: getEnv('INFERENCE_URL', 'http://localhost:11434') });
+const DEFAULT_MODEL = getEnv('DEFAULT_MODEL', 'companion:latest');
+
+async function complete(prompt, options = {}) {
+  const response = await client.generate({
+    model: options.model || DEFAULT_MODEL,
+    prompt,
+    stream: false,
+    options: {
+      temperature: options.temperature ?? 0.7,
+      num_predict: options.maxTokens ?? 1024,
+    }
+  });
+
+  return {
+    text: response.response,
+    model: response.model,
+    done: response.done,
+    evalCount: response.eval_count,
+    promptEvalCount: response.prompt_eval_count,
+  };
+}
+
+async function* completeStream(prompt, options = {}) {
+  const stream = await client.generate({
+    model: options.model || DEFAULT_MODEL,
+    prompt,
+    stream: true,
+    options: {
+      temperature: options.temperature ?? 0.7,
+    },
+  });
+
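+  // Each chunk the ollama client yields is already shaped like { response, done, ... },
+  // which is exactly what the SSE route serializes, so chunks pass through unchanged.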
+  for await (const chunk of stream) {
+    yield chunk;
+  }
+}
+
+module.exports = { complete, completeStream };
\ No newline at end of file
diff --git a/packages/inference-service/src/routes/inference.js b/packages/inference-service/src/routes/inference.js
new file mode 100644
index 0000000..92f966b
--- /dev/null
+++ b/packages/inference-service/src/routes/inference.js
@@ -0,0 +1,48 @@
+const { Router } = require('express');
+const { complete, completeStream } = require('../infer');
+
+const router = Router();
+
+// Standard completion endpoint - returns the full response once generation finishes
+router.post('/complete', async (req, res) => {
+  const { prompt, model, temperature, maxTokens } = req.body;
+
+  if (!prompt) {
+    return res.status(400).json({ error: 'prompt is required' });
+  }
+
+  try {
+    const result = await complete(prompt, { model, temperature, maxTokens });
+    res.json(result);
+  } catch (error) {
+    console.error('[Inference] Completion error:', error.message);
+    res.status(500).json({ error: error.message });
+  }
+});
+
+// Streaming completion endpoint - sends partial responses as they arrive
+router.post('/complete/stream', async (req, res) => {
+  const { prompt, model, temperature } = req.body;
+
+  if (!prompt) {
+    return res.status(400).json({ error: 'prompt is required' });
+  }
+
+  res.setHeader('Content-Type', 'text/event-stream');
+  res.setHeader('Cache-Control', 'no-cache');
+  res.setHeader('Connection', 'keep-alive');
+
+  try {
+    for await (const chunk of completeStream(prompt, { model, temperature })) {
+      res.write(`data: ${JSON.stringify(chunk)}\n\n`);
+    }
+    res.write('data: [DONE]\n\n');
+  } catch (error) {
+    console.error('[Inference] Streaming error:', error.message);
+    res.write(`data: ${JSON.stringify({ error: error.message })}\n\n`);
+  } finally {
+    res.end();
+  }
+});
+
+module.exports = router;
\ No newline at end of file
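-- 
Quick smoke test (everything after the "-- " signature marker is ignored by
git am). Assumes the service is running locally on its default port 3001 and
that the configured provider is reachable:

    node -e 'fetch("http://localhost:3001/complete", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ prompt: "Say hi", maxTokens: 32 })
    }).then(r => r.json()).then(console.log)'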