97 lines
3.2 KiB
TypeScript
97 lines
3.2 KiB
TypeScript
/**
 * Vector search over the V2 embeddings
 * Cosine similarity + top-K
 *
 * Used by: server/api/chatbot-v2.post.ts
 * Data: server/data/embeddings-v2.json (generated by scripts/vectorize-v2.js)
 */
|
|
|
|
import { readFileSync, existsSync } from 'fs'
|
|
import { fileURLToPath } from 'url'
|
|
import { resolve, dirname } from 'path'
|
|
|
|
// ── Types ──────────────────────────────────────────────────────────────────────
|
|
|
|
/**
 * One stored entry of the V2 embeddings data set, as read from
 * `embeddings-v2.json` by `loadEmbeddingsV2`.
 */
export interface EmbeddingEntry {
  /** Identifier of the source record ("fiche"). */
  fiche_id: string
  /** Name of the record. */
  nom: string
  /** Numeric family of the record. NOTE(review): meaning of the number is not visible here — confirm against the generator script. */
  famille: number
  /** Hashtags attached to the record. */
  hashtags: string[]
  /** Embedding vector; compared against the query vector with cosine similarity. */
  embedding: number[]
  /** Preview text carried through to search results. Presumably an excerpt of the embedded text — confirm against the generator script. */
  text_preview: string
}
|
|
|
|
/**
 * One ranked hit returned by the top-K search: the metadata of an embedding
 * entry (without its vector) plus its similarity score.
 */
export interface SearchResult {
  /** Identifier of the matched record ("fiche"). */
  fiche_id: string
  /** Name of the matched record. */
  nom: string
  /** Numeric family of the matched record. */
  famille: number
  /** Hashtags attached to the matched record. */
  hashtags: string[]
  /** Cosine similarity between the record's embedding and the query embedding. */
  score: number
  /** Preview text copied from the matched entry. */
  text_preview: string
}
|
|
|
|
// ── Cosine similarity ──────────────────────────────────────────────────────────
|
|
|
|
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of the Euclidean norms.
 *   Returns `0` when the vectors have different lengths or when either norm
 *   is zero (which includes two empty vectors).
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  // Vectors of different dimensions are not comparable: report "no similarity" instead of throwing.
  if (a.length !== b.length) return 0

  let dot = 0
  let normA = 0
  let normB = 0
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i]
    normA += a[i] * a[i]
    normB += b[i] * b[i]
  }

  // A zero denominator means at least one vector has zero norm; return 0 rather than NaN.
  const denom = Math.sqrt(normA) * Math.sqrt(normB)
  return denom === 0 ? 0 : dot / denom
}
|
|
|
|
// ── Top-K search ───────────────────────────────────────────────────────────────
|
|
|
|
/**
 * Scores every embedding entry against a query vector with `cosineSimilarity`
 * and returns the best matches, highest score first.
 *
 * The input array is not modified: scoring builds a new array, and only that
 * new array is sorted.
 *
 * @param embeddings - Candidate entries to rank.
 * @param queryEmbedding - Query vector compared against each entry's `embedding`.
 * @param k - Maximum number of results to return (default 5). Zero or negative
 *   values yield an empty list.
 * @returns At most `k` results sorted by descending score.
 */
export function topKSearch(
  embeddings: EmbeddingEntry[],
  queryEmbedding: number[],
  k: number = 5
): SearchResult[] {
  // `slice(0, k)` treats a negative end as an offset from the end of the array,
  // so an unguarded negative k would return "all but the last |k|" results.
  const limit = Math.max(0, k)
  if (limit === 0) return []

  const scored: SearchResult[] = embeddings.map(entry => ({
    fiche_id: entry.fiche_id,
    nom: entry.nom,
    famille: entry.famille,
    hashtags: entry.hashtags,
    score: cosineSimilarity(entry.embedding, queryEmbedding),
    text_preview: entry.text_preview
  }))

  return scored.sort((left, right) => right.score - left.score).slice(0, limit)
}
|
|
|
|
// ── Chargement lazy des embeddings (cache module-level) ────────────────────────
|
|
|
|
// Module-level cache: `null` until the first load attempt, then the loaded
// entries (or an empty array when the file is missing or unreadable).
let _embeddingsV2: EmbeddingEntry[] | null = null

/**
 * Lazily loads the V2 embeddings from `server/data/embeddings-v2.json`
 * (resolved from `process.cwd()`) and caches the result for later calls.
 *
 * Never throws: when the file is missing, unreadable, not valid JSON, or its
 * `embeddings` field is not an array, a warning is logged and an empty array
 * is cached and returned, so the failed load is not retried.
 * Individual entries are not validated.
 *
 * @returns The cached embedding entries (possibly empty).
 */
export function loadEmbeddingsV2(): EmbeddingEntry[] {
  if (_embeddingsV2 !== null) return _embeddingsV2

  let loaded: EmbeddingEntry[] = []

  try {
    // Path resolution: process.cwd() points to the project root in Nitro dev/prod
    // (import.meta.url breaks in the compiled .nuxt bundle).
    const embPath = resolve(process.cwd(), 'server', 'data', 'embeddings-v2.json')

    if (!existsSync(embPath)) {
      console.warn('[vectorSearch] embeddings-v2.json absent - V2 vector search désactivé')
      console.warn('[vectorSearch] Lancer : MISTRAL_API_KEY=xxx node scripts/vectorize-v2.js')
    } else {
      const raw = readFileSync(embPath, 'utf-8')
      const data = JSON.parse(raw) as { embeddings?: unknown; meta?: { model?: unknown } }

      // A missing field means "no embeddings"; any other non-array value is a
      // malformed file and must not be cached as if it were a list of entries.
      const candidate: unknown = data.embeddings ?? []
      if (!Array.isArray(candidate)) {
        throw new TypeError('le champ "embeddings" doit être un tableau')
      }

      loaded = candidate as EmbeddingEntry[]
      console.log(`[vectorSearch] ${loaded.length} embeddings V2 chargés (${data.meta?.model ?? 'unknown'})`)
    }
  } catch (err: unknown) {
    console.warn('[vectorSearch] Erreur chargement embeddings-v2.json :', err instanceof Error ? err.message : err)
    loaded = []
  }

  _embeddingsV2 = loaded
  return loaded
}
|