const { Router } = require('express');
const { complete, completeStream } = require('../infer');
const { logger } = require('@nexusai/shared');

const router = Router();

/**
 * Picks the generation options that both endpoints forward to the inference
 * layer. Keeping this in one place guarantees the streaming and non-streaming
 * routes accept the same set of options.
 *
 * @param {object} body - Parsed request body (may be an empty object).
 * @returns {{model: *, temperature: *, maxTokens: *, topP: *, topK: *, repeatPenalty: *}}
 *   The options as supplied by the caller; absent ones are `undefined`.
 */
function pickGenerationOptions(body) {
  const { model, temperature, maxTokens, topP, topK, repeatPenalty } = body;
  return { model, temperature, maxTokens, topP, topK, repeatPenalty };
}

/**
 * POST /complete
 *
 * Standard completion endpoint: awaits `complete()` and returns its result as
 * a single JSON response.
 *
 * Responses:
 *   400 `{ error: 'prompt is required' }` when the body has no (truthy) prompt.
 *   500 `{ error: <message> }` when `complete()` rejects.
 */
router.post('/complete', async (req, res) => {
  // req.body is undefined when no body parser matched the request; fall back
  // to an empty object so the request is rejected with a 400 instead of
  // throwing outside the try block.
  const body = req.body ?? {};
  const { prompt } = body;

  if (!prompt) {
    return res.status(400).json({ error: 'prompt is required' });
  }

  try {
    const result = await complete(prompt, pickGenerationOptions(body));
    res.json(result);
  } catch (error) {
    logger.error('[Inference] Completion error:', error.message);
    res.status(500).json({ error: error.message });
  }
});

/**
 * POST /complete/stream
 *
 * Streaming completion endpoint: relays partial responses from
 * `completeStream()` as server-sent events while they arrive.
 *
 * Events written, in order:
 *   `data: {"response": ...}`                        for every chunk carrying text
 *   `data: {"done": true, "model", "tokenCount"}`    once the stream has finished
 *   `data: [DONE]`                                   terminator
 * If the stream throws, a `data: {"error": <message>}` event is written
 * instead of the final two events.
 *
 * Responses:
 *   400 `{ error: 'prompt is required' }` (plain JSON, not SSE) when the body
 *   has no (truthy) prompt.
 */
router.post('/complete/stream', async (req, res) => {
  // See /complete: tolerate a missing body and answer with a 400.
  const body = req.body ?? {};
  const { prompt, model } = body;

  if (!prompt) {
    return res.status(400).json({ error: 'prompt is required' });
  }

  res.setHeader('Content-Type', 'text/event-stream');
  res.setHeader('Cache-Control', 'no-cache');
  res.setHeader('Connection', 'keep-alive');

  // 'close' also fires after a normal res.end(); only a close that happens
  // before we ended the response means the client went away mid-stream.
  let clientDisconnected = false;
  res.on('close', () => {
    if (!res.writableEnded) {
      clientDisconnected = true;
    }
  });

  try {
    let lastModel = model;
    let tokenCount = 0;

    for await (const chunk of completeStream(prompt, pickGenerationOptions(body))) {
      // Stop consuming once nobody is listening. The check runs when the next
      // chunk arrives; breaking out of for-await calls the iterator's return()
      // so the producer gets a chance to clean up.
      if (clientDisconnected) {
        break;
      }

      if (chunk.response) {
        res.write(`data: ${JSON.stringify({ response: chunk.response })}\n\n`);
      }

      if (chunk.done) {
        // Capture final metadata from the done signal, keeping the previous
        // values when the chunk does not provide them.
        lastModel = chunk.model ?? lastModel;
        tokenCount = chunk.tokenCount ?? tokenCount;
        logger.info('[inference router] tokenCount from chunk:', chunk.tokenCount, '→', tokenCount);
      }
    }

    if (!clientDisconnected) {
      // Send a single done event with metadata after the stream closes.
      res.write(`data: ${JSON.stringify({ done: true, model: lastModel, tokenCount })}\n\n`);
      res.write('data: [DONE]\n\n');
    }
  } catch (err) {
    logger.error('[Inference] Streaming error:', err.message);
    // The status line may already be on the wire, so the failure is reported
    // in-band as an SSE event rather than as an HTTP error status.
    if (!clientDisconnected) {
      res.write(`data: ${JSON.stringify({ error: err.message })}\n\n`);
    }
  } finally {
    res.end();
  }
});

module.exports = router;