133 lines
5.3 KiB
JavaScript
133 lines
5.3 KiB
JavaScript
const semantic = require('../semantic')
|
|
const { getEnv, SERVICES, formatEpisodeText, ENTITIES } = require('@nexusai/shared');
|
|
const { upsertEntity } = require('./index');
|
|
|
|
// Base URL of the generation server queried for entity extraction
// (this file POSTs to `${EXTRACTION_URL}/api/generate`).
const EXTRACTION_URL = getEnv('EXTRACTION_URL', 'http://localhost:11434');

// Model name sent in the extraction request body.
const EXTRACTION_MODEL = getEnv('EXTRACTION_MODEL', 'qwen2.5:3b');

// Base URL of the embedding service (this file POSTs to `${EMBEDDING_SERVICE_URL}/embed`).
const EMBEDDING_SERVICE_URL = getEnv('EMBEDDING_SERVICE_URL', SERVICES.EMBEDDING_URL);

// Entity types the extractor is asked for; anything else returned by the model is dropped.
// Frozen so no code path can accidentally mutate the shared list.
const ENTITY_TYPES = Object.freeze(['person', 'place', 'project', 'technology', 'concept', 'organization']);

// Lower-case phrases that must never be stored as entities even if the model returns them.
const IGNORED_NAMES = Object.freeze(['good morning', 'good night', 'hello', 'goodbye', 'thanks', 'thank you']);
|
|
|
|
/**
 * Build the ChatML-formatted prompt that asks the extraction model to list the
 * named entities mentioned in one user/assistant exchange.
 *
 * @param {string} userMessage - The user's side of the exchange.
 * @param {string} aiResponse - The assistant's reply.
 * @param {Array<{name: string, type: string}>|null} [knownEntities=[]] - Entities that
 *   are already stored; they are listed in the prompt so the model reuses the
 *   same name/type pair. `null`/`undefined` is treated as an empty list.
 * @returns {string} The prompt, with lines joined by "\n".
 */
function buildExtractionPrompt(userMessage, aiResponse, knownEntities = []) {
  // Interpolated text is untrusted: a literal <|im_start|> / <|im_end|> inside it
  // would close the user turn early and let the text pose as instructions.
  const stripControlTokens = (value) =>
    String(value ?? '').replace(/<\|im_(?:start|end)\|>/g, '');

  // A default parameter only covers `undefined`; guard so `null` does not crash on `.length`.
  const known = Array.isArray(knownEntities) ? knownEntities : [];

  const knownBlock = known.length > 0
    ? [
        'Already known entities (use these exact name and type values if the same entity appears):',
        ...known.map((e) => `- "${stripControlTokens(e.name)}" (${stripControlTokens(e.type)})`),
        '',
      ].join('\n')
    : '';

  return [
    '<|im_start|>system',
    'You are a named entity extractor. You output only valid JSON.',
    '<|im_end|>',
    '<|im_start|>user',
    'Read the conversation below and extract every named entity mentioned.',
    `Entity types to extract: ${ENTITY_TYPES.join(', ')}`,
    'For each entity found, provide: name, type, and a one-sentence notes field.',
    'Return your answer as: { "entities": [ ... ] }',
    'For each entity found, you MUST provide a non-empty notes field describing it based on the conversation.',
    'For each entity found, provide:',
    '  "name": short proper noun only (max 4 words, e.g. "Sydney", "NexusAI", "Tim")',
    '  "type": one of the valid types',
    '  "notes": one specific sentence about this entity based on the conversation (not generic)',
    '',
    knownBlock,
    '--- CONVERSATION ---', // clear delimiter helps smaller models
    `User: ${stripControlTokens(userMessage)}`,
    `Assistant: ${stripControlTokens(aiResponse)}`,
    '--- END CONVERSATION ---',
    '<|im_end|>',
    '<|im_start|>assistant',
  ].join('\n');
}
|
|
|
|
/**
 * Request an embedding vector for an entity from the embedding service.
 *
 * @param {{name: string, type: string, notes?: string|null}} entity - Entity to embed.
 * @returns {Promise<number[]>} The `embedding` array from the service response.
 * @throws {Error} When the service answers with a non-2xx status or the response
 *   body carries no embedding array.
 */
async function embedEntity(entity) {
  // Combine name, type and notes into a single descriptive string for embedding.
  // Blank notes carry no information, so they fall back to the name just like missing notes.
  const hasNotes = entity.notes != null && String(entity.notes).trim() !== '';
  const text = `${entity.name} (${entity.type}): ${hasNotes ? entity.notes : entity.name}`;

  const res = await fetch(`${EMBEDDING_SERVICE_URL}/embed`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text }),
  });

  if (!res.ok) throw new Error(`Embedding service error: ${res.status}`);

  const data = await res.json();
  const embedding = data?.embedding;

  // Fail here rather than hand `undefined` to the caller, which would pass it
  // on to the vector store as if it were a real vector.
  if (!Array.isArray(embedding) || embedding.length === 0) {
    throw new Error('Embedding service returned no embedding vector');
  }
  return embedding;
}
|
|
|
|
async function extractAndStoreEntities(userMessage, aiResponse, projectId=null) {
|
|
console.log('[entities] Extraction triggered')
|
|
try {
|
|
// Fetch existing entities to guide the model toward consistent name/type pairs
|
|
const db = require('../db').getDB();
|
|
const knownEntities = db.prepare(`SELECT name, type FROM entities ORDER BY rowid DESC LIMIT 20`).all();
|
|
const prompt = buildExtractionPrompt(userMessage, aiResponse, knownEntities);
|
|
|
|
|
|
const res = await fetch(`${EXTRACTION_URL}/api/generate`, {
|
|
method: 'POST',
|
|
headers: { 'Content-Type': 'application/json' },
|
|
body: JSON.stringify({
|
|
model: EXTRACTION_MODEL,
|
|
prompt: prompt,
|
|
stream: false,
|
|
format: 'json',
|
|
options: {
|
|
temperature: ENTITIES.TEMPERATURE,
|
|
num_predict: ENTITIES.NUM_PREDICT,
|
|
},
|
|
}),
|
|
});
|
|
|
|
if (!res.ok) throw new Error(`Ollama responded ${res.status}`);
|
|
|
|
const data = await res.json();
|
|
const raw = data.response?.trim() ?? '';
|
|
|
|
const parsed = JSON.parse(raw);
|
|
const entities = Array.isArray(parsed.entities) ? parsed.entities : [];
|
|
if (entities.length === 0) {
|
|
console.log('[entities] No entities found in this exchange — skipping');
|
|
return; // not an error, just nothing to extract
|
|
}
|
|
|
|
if (!Array.isArray(entities)) throw new Error('Response was not a JSON array');
|
|
|
|
let saved = 0;
|
|
|
|
|
|
for (const { name, type, notes } of entities) {
|
|
|
|
if (!name || !type || !ENTITY_TYPES.includes(type)) continue;
|
|
if (IGNORED_NAMES.includes(name.toLowerCase())) continue;
|
|
|
|
const entity = upsertEntity(name, type, notes ?? null);
|
|
console.log('[entities] Upserted entity:', entity);
|
|
|
|
// Embed and upsert to Qdrant fire-and-forget
|
|
embedEntity(entity)
|
|
.then(vector => semantic.upsertEntity(entity.id, vector, {
|
|
name: entity.name,
|
|
type: entity.type,
|
|
notes: entity.notes,
|
|
projectId: projectId ?? null,
|
|
}))
|
|
.catch(err => {
|
|
console.warn(`[entities] Failed to embed entity "${entity.name}":`, err.message);
|
|
});
|
|
|
|
saved++;
|
|
}
|
|
|
|
if (saved > 0) console.log(`[entities] Extracted and stored ${saved} entities`);
|
|
|
|
} catch (err) {
|
|
// Non-critical — log and move on, episode is already saved
|
|
console.warn('[entities] Extraction failed:', err.message);
|
|
}
|
|
}
|
|
|
|
// Public API of this module: only the extraction entry point is exported.
module.exports = { extractAndStoreEntities };