// nexusAI/packages/memory-service/src/entities/extraction.js

const semantic = require('../semantic')
const { getEnv, SERVICES, formatEpisodeText, ENTITIES } = require('@nexusai/shared');
const { upsertEntity } = require('./index');

// Base URL of the text-generation server used for extraction; requests are
// POSTed to `${EXTRACTION_URL}/api/generate`.
// NOTE(review): the second getEnv argument is presumably the fallback used when
// the variable is unset — confirm against @nexusai/shared.
const EXTRACTION_URL = getEnv('EXTRACTION_URL', 'http://localhost:11434');
// Model name sent in the `model` field of the generation request.
const EXTRACTION_MODEL = getEnv('EXTRACTION_MODEL', 'qwen2.5:3b');
// Base URL of the embedding service; embedEntity POSTs to `${EMBEDDING_SERVICE_URL}/embed`.
const EMBEDDING_SERVICE_URL = getEnv('EMBEDDING_SERVICE_URL', SERVICES.EMBEDDING_URL);

// Entity types the model is asked to extract; entities reported with any other
// type are skipped before storage.
const ENTITY_TYPES = ['person', 'place', 'project', 'technology', 'concept', 'organization'];
// Names (compared lower-cased) that are skipped even when the model reports
// them as entities — greetings and pleasantries rather than real entities.
const IGNORED_NAMES = ['good morning', 'good night', 'hello', 'goodbye', 'thanks', 'thank you'];

/**
 * Builds the prompt text sent to the extraction model for one exchange.
 *
 * The prompt is a system turn, a user turn (instructions, optional list of
 * already-known entities, then the delimited conversation) and an opened
 * assistant turn, all wrapped in `<|im_start|>` / `<|im_end|>` markers and
 * joined with newlines.
 *
 * @param {string} userMessage - Text of the user's message.
 * @param {string} aiResponse - Text of the assistant's reply.
 * @param {Array<{name: string, type: string}>} [knownEntities=[]] - Entities
 *   listed in the prompt so the model reuses their exact name/type values.
 * @returns {string} The complete prompt.
 */
function buildExtractionPrompt(userMessage, aiResponse, knownEntities = []) {
    // Hint section; stays an empty string (one blank prompt line) when nothing is known yet.
    let knownSection = '';
    if (knownEntities.length > 0) {
        const bullets = knownEntities.map(known => `- "${known.name}" (${known.type})`);
        knownSection = [
            'Already known entities (use these exact name and type values if the same entity appears):',
            ...bullets,
            '',
        ].join('\n');
    }

    const systemTurn = [
        '<|im_start|>system',
        'You are a named entity extractor. You output only valid JSON.',
        '<|im_end|>',
    ];

    const instructions = [
        'Read the conversation below and extract every named entity mentioned.',
        `Entity types to extract: ${ENTITY_TYPES.join(', ')}`,
        'For each entity found, provide: name, type, and a one-sentence notes field.',
        'Return your answer as: { "entities": [ ... ] }',
        'For each entity found, you MUST provide a non-empty notes field describing it based on the conversation.',
        'For each entity found, provide:',
        '  "name": short proper noun only (max 4 words, e.g. "Sydney", "NexusAI", "Tim")',
        '  "type": one of the valid types',
        '  "notes": one specific sentence about this entity based on the conversation (not generic)',
    ];

    // Explicit start/end delimiters help smaller models locate the conversation.
    const conversation = [
        '--- CONVERSATION ---',
        `User: ${userMessage}`,
        `Assistant: ${aiResponse}`,
        '--- END CONVERSATION ---',
    ];

    return [
        ...systemTurn,
        '<|im_start|>user',
        ...instructions,
        '',
        knownSection,
        ...conversation,
        '<|im_end|>',
        '<|im_start|>assistant',
    ].join('\n');
}

/**
 * Requests an embedding vector for an entity from the embedding service.
 *
 * The text embedded is `"<name> (<type>): <notes>"`; when `notes` is null or
 * undefined the entity name is used in its place.
 *
 * @param {{name: string, type: string, notes?: (string|null)}} entity - Entity to embed.
 * @returns {Promise<*>} The `embedding` field of the service's JSON response.
 * @throws {Error} When the service responds with a non-OK HTTP status.
 */
async function embedEntity(entity) {
    // One descriptive string so the vector reflects name, type and notes together.
    const description = entity.notes ?? entity.name;
    const payload = { text: `${entity.name} (${entity.type}): ${description}` };

    const response = await fetch(`${EMBEDDING_SERVICE_URL}/embed`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload),
    });

    if (!response.ok) {
        throw new Error(`Embedding service error: ${response.status}`);
    }

    const body = await response.json();
    return body.embedding;
}

async function extractAndStoreEntities(userMessage, aiResponse, projectId=null) {
    console.log('[entities] Extraction triggered')
    try {
        // Fetch existing entities to guide the model toward consistent name/type pairs
        const db = require('../db').getDB();
        const knownEntities = db.prepare(`SELECT name, type FROM entities ORDER BY rowid DESC LIMIT 20`).all();
        const prompt = buildExtractionPrompt(userMessage, aiResponse, knownEntities);


        const res = await fetch(`${EXTRACTION_URL}/api/generate`, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({
                model: EXTRACTION_MODEL,
                prompt: prompt,
                stream: false,
                format: 'json',
                options: {
                    temperature: ENTITIES.TEMPERATURE,
                    num_predict: ENTITIES.NUM_PREDICT,
                },
            }),
        });

        if (!res.ok) throw new Error(`Ollama responded ${res.status}`);

        const data = await res.json();
        const raw = data.response?.trim() ?? '';

        const parsed = JSON.parse(raw);
        const entities = Array.isArray(parsed.entities) ? parsed.entities : [];
        if (entities.length === 0) {
            console.log('[entities] No entities found in this exchange — skipping');
            return;  // not an error, just nothing to extract
        }

        if (!Array.isArray(entities)) throw new Error('Response was not a JSON array');

        let saved = 0;


        for (const { name, type, notes } of entities) {

            if (!name || !type || !ENTITY_TYPES.includes(type)) continue;
            if (IGNORED_NAMES.includes(name.toLowerCase())) continue;

            const entity = upsertEntity(name, type, notes ?? null);
            console.log('[entities] Upserted entity:', entity);

            // Embed and upsert to Qdrant fire-and-forget
            embedEntity(entity)
                .then(vector => semantic.upsertEntity(entity.id, vector, {
                    name: entity.name,
                    type: entity.type,
                    notes: entity.notes,
                    projectId: projectId ?? null,
                }))
                .catch(err => {
                    console.warn(`[entities] Failed to embed entity "${entity.name}":`, err.message);
                });

            saved++;
        }

        if (saved > 0) console.log(`[entities] Extracted and stored ${saved} entities`);

    } catch (err) {
        // Non-critical — log and move on, episode is already saved
        console.warn('[entities] Extraction failed:', err.message);
    }
}

module.exports = { extractAndStoreEntities };