astro-site-cerveau/scripts/build-carte-o.js

#!/usr/bin/env node
// Scrape AEP/Articles/ thematic folders -> public/data/carte-o.json
// Two frontmatter formats supported :
//   1. YAML standard between --- delimiters
//   2. Legacy "MOC : [[X]]\nSource : ...\nTags : ...\nDate : ...\n***" header
//
// Wikilinks [[X]] in body -> edges (resolved by label match against scraped nodes).
// Family inferred by keyword heuristics over title, theme, path, tags and the start of the body (5 AEP families).
// V1 cap : keep the top 150 nodes by degree when the scrape yields more than 150 nodes.

import fs from 'node:fs/promises'
import path from 'node:path'
import { fileURLToPath } from 'node:url'
import matter from 'gray-matter'
import { globby } from 'globby'

// ES modules have no built-in __filename / __dirname : rebuild them from import.meta.url.
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)

// Repository root = parent of the directory that contains this script.
const REPO_ROOT = path.resolve(__dirname, '..')
// Folder scanned for markdown notes. The default is a machine-specific path ;
// set the CARTE_O_ARTICLES_ROOT environment variable to scan another location
// (other machine, CI) without editing this file.
const ARTICLES_ROOT = process.env.CARTE_O_ARTICLES_ROOT
  || 'C:/Users/jules/Dropbox/ATIS - IPCJRA/2 CASQUETTES/Penseur politique/AEP/Articles'
// Generated graph file, written by main().
const OUTPUT = path.join(REPO_ROOT, 'public/data/carte-o.json')
// Maximum number of nodes kept in the output (highest-degree nodes win).
const NODE_CAP_V1 = 150

// 5 AEP families : palette refined after first scrape.
// Keys are the family names returned by inferFamily() ; the map is copied as-is into the output meta.
const FAMILY_COLORS = {
  penseur:   '#3b82f6',  // blue
  concept:   '#10b981',  // green
  methode:   '#f59e0b',  // amber
  collectif: '#ef4444',  // red
  ressource: '#8b5cf6',  // violet
}

/**
 * Turn an arbitrary label into a URL-safe identifier.
 *
 * Steps : strip diacritics (NFD decomposition + removal of combining marks),
 * lowercase, collapse every run of non [a-z0-9] characters into a single '-',
 * trim leading/trailing '-', cap the length at 80 characters.
 *
 * @param {*} str - Value to slugify ; falsy values are treated as an empty string.
 * @returns {string} The slug, or 'untitled' when nothing usable is left.
 */
function slugify(str) {
  const slug = String(str || '')
    .normalize('NFD')
    // Combining diacritical marks block, written with escapes so the range
    // survives editors / tools that normalize or hide combining characters.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80)
    // The 80-char cut can land right after a separator : trim again so a
    // truncated slug never ends with '-'.
    .replace(/-+$/, '')
  return slug || 'untitled'
}

/**
 * Guess which of the 5 AEP families a note belongs to, using keyword heuristics.
 *
 * The title, theme, path, tags and the first 800 characters of the content are
 * concatenated into one lowercase string ; the first rule whose pattern matches
 * that string decides the family.
 *
 * @param {{title?: string, theme?: string, path?: string, tags?: string[]|string, content?: string}} signals
 * @returns {'methode'|'penseur'|'collectif'|'concept'|'ressource'}
 */
function inferFamily(signals) {
  const { title, theme, path: notePath, tags, content } = signals
  const tagText = Array.isArray(tags) ? tags.join(' ') : tags
  const contentHead = (content || '').slice(0, 800)

  const searchable = [title, theme, notePath, tagText, contentHead]
    .filter(Boolean)
    .join(' ')
    .toLowerCase()

  // Evaluated top to bottom : the order is part of the heuristic, earlier rules shadow later ones.
  const rules = [
    // METHODE : process, tools, how-to
    ['methode', /m[ée]thode|outil|pratique|community.organizing|alinsky\b|comment\b|process|protocole|recette|guide|how.to|chantier|d[ée]marche/],
    // PENSEUR : proper names, authors, figures
    ['penseur', /penseur|auteur|figure|harari|alinsky|piven|chouard|branco|mamdani|shift|graeber|bourdieu|lordon|stiegler|d[ée]bord|illich|gorz|servigne|vidal|haupt|pisani|lalo|rosa/],
    // COLLECTIF : organisations, movements, networks
    ['collectif', /collectif|r[ée]seau|asso|union|coop|mouvement|piraterie|sociale|syndicat|comit[ée]|crise.de.la.profession|nyc|mamdani|chantier/],
    // CONCEPT : notions, theories, critiques
    ['concept', /concept|notion|th[ée]orie|critique|fiction|kakistocratie|imp[ée]rialisme|robustesse|sycophan|d[ée]construction|paradoxe|dialectique|ontologie|capitalisme|n[ée]olib[ée]ral|d[ée]mocratie|biais|illusion/],
  ]

  const hit = rules.find(([, pattern]) => pattern.test(searchable))
  // RESSOURCE : default bucket when no rule matches (drafts, ideas, agendas).
  return hit ? hit[0] : 'ressource'
}

/**
 * Fallback parser for legacy "MOC : [[X]]\nSource : ...\nDate : ...\n***\n" headers.
 *
 * Scans at most the first 30 lines for "Key : value" pairs until a line made
 * only of "***" is met. Keys are lowercased ; `tags` is turned into an array of
 * "#tag" tokens.
 *
 * A header is recognised only when the "***" delimiter is found AND at least
 * one "Key : value" line precedes it. Otherwise a plain markdown horizontal
 * rule near the top of a note would be mistaken for a header and the text
 * above it would be dropped from the body.
 *
 * @param {string} raw - Full text of the note.
 * @returns {{data: Object, content: string}} Parsed fields and the body after the
 *   delimiter ; `{ data: {}, content: raw }` when no legacy header is recognised.
 */
function parseLegacyHeader(raw) {
  const lines = raw.split(/\r?\n/)
  const fm = {}
  const scanLimit = Math.min(lines.length, 30)
  let delimiterIndex = -1

  for (let i = 0; i < scanLimit; i++) {
    const line = lines[i]
    if (/^\s*\*\*\*\s*$/.test(line)) {
      delimiterIndex = i
      break
    }
    const m = line.match(/^([A-Za-zÀ-ÿ ]+)\s*:\s*(.+)$/)
    if (m) {
      const key = m[1].trim().toLowerCase()
      // A line such as "  : value" yields an empty key : ignore it.
      if (key) fm[key] = m[2].trim()
    }
  }

  const hasHeader = delimiterIndex !== -1 && Object.keys(fm).length > 0
  if (!hasHeader) return { data: {}, content: raw }

  // Parse tags (space-separated #tag tokens). Unicode-aware so accented tags
  // such as "#démocratie" are kept whole (ASCII-only \w would stop at "#d").
  if (fm.tags) {
    fm.tags = fm.tags.match(/#[\p{L}\p{M}\p{N}_\/-]+/gu) || []
  }
  return {
    data: fm,
    // Lines were split on \r?\n, so the body is re-joined with plain LF.
    content: lines.slice(delimiterIndex + 1).join('\n'),
  }
}

/**
 * Parse a note's metadata, preferring YAML frontmatter over the legacy header.
 *
 * The gray-matter result is used only when it yields at least one field ;
 * if it yields none, or if parsing throws, the legacy header parser is used.
 *
 * @param {string} raw - Full text of the note.
 * @returns {{data: Object, content: string}} Result of matter() or of parseLegacyHeader().
 */
function safeParseFrontmatter(raw) {
  let yamlResult = null
  try {
    const candidate = matter(raw)
    const fieldCount = Object.keys(candidate.data).length
    if (fieldCount > 0) {
      yamlResult = candidate
    }
  } catch {
    // YAML could not be parsed : deliberately ignored, the legacy parser takes over.
  }

  if (yamlResult !== null) {
    return yamlResult
  }
  return parseLegacyHeader(raw)
}

/**
 * Build a short plain-text excerpt from a markdown body.
 *
 * Line endings are normalized to LF first : without it, CRLF notes never match
 * the blank-line paragraph split nor the frontmatter pattern, and the whole
 * note collapses into a single paragraph.
 *
 * @param {string} content - Markdown body (a leading `---` block is stripped if present).
 * @param {number} [maxLen=220] - Maximum length of the returned excerpt.
 * @returns {string} The excerpt, or '' when no usable paragraph is found.
 */
function extractFirstParagraph(content, maxLen = 220) {
  // Skip frontmatter, code blocks, headings, rules and tables.
  const cleaned = String(content ?? '')
    .replace(/\r\n?/g, '\n')
    .replace(/^---[\s\S]*?---\n/, '')
    .replace(/```[\s\S]*?```/g, '')
    // A "blank" line may still hold spaces or tabs.
    .split(/\n\s*\n/)
    .map(p => p.trim())
    .filter(p => p && !p.startsWith('#') && !p.startsWith('---') && !p.startsWith('|'))
  // Prefer the first paragraph that looks like prose : longer than 30 chars and
  // not starting with a list / quote marker, a digit, '+' or '.'.
  const prose = cleaned.find(p => !/^\s*[->*•\d+\.\s]/.test(p) && p.length > 30)
  // Otherwise fall back to the first remaining paragraph, whatever it is.
  const first = prose || cleaned[0] || ''
  // Strip wikilinks (keeping the target text), emphasis and quote markers, then collapse whitespace.
  return first
    .replace(/\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g, '$1')
    .replace(/[*_>]+/g, '')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, maxLen)
}

/**
 * List the targets of every wikilink found in a markdown body.
 *
 * Recognised forms : [[Target]], [[Target|alias]], [[Target#section]] ; only the
 * target part is returned, trimmed. Embeds written ![[...]] are ignored, and so
 * are links whose target is blank.
 *
 * @param {string} content - Markdown body.
 * @returns {string[]} Targets in order of appearance (duplicates kept).
 */
function extractWikilinks(content) {
  const wikilinkPattern = /(?<!!)\[\[([^\]|#]+)(?:[#|][^\]]*)?\]\]/g
  const targets = []
  for (const [, rawTarget] of content.matchAll(wikilinkPattern)) {
    const target = rawTarget.trim()
    if (target) {
      targets.push(target)
    }
  }
  return targets
}

/**
 * Derive a note's theme from the segments of its path relative to ARTICLES_ROOT.
 *
 * E.g. "AEP ARTICLES, BROUILLON/AEP IA/file.md" -> "AEP IA"
 *      "AEP ARTICLES, BROUILLON/file.md"        -> "AEP ARTICLES, BROUILLON"
 *      "Livre - le nouveau contrat social/file.md" -> "Livre - le nouveau contrat social"
 *
 * @param {string[]} segments - Path split on '/'.
 * @returns {string} Second segment for notes nested at least one folder below an
 *   "AEP ARTICLES…" folder, first segment otherwise.
 */
function deriveTheme(segments) {
  const isNestedDraft = segments.length >= 3 && segments[0].startsWith('AEP ARTICLES')
  return isNestedDraft ? segments[1] : segments[0]
}

/**
 * Resolve raw wikilink targets into undirected, de-duplicated edges between known nodes.
 *
 * A target matches a node when its slug, or its lowercased text, equals the
 * slug / lowercased label of that node. Unresolved targets and self-links are dropped.
 *
 * @param {{id: string, label: string}[]} nodes
 * @param {{source: string, targetLabel: string}[]} edgesRaw
 * @returns {{source: string, target: string}[]}
 */
function resolveEdges(nodes, edgesRaw) {
  const labelToId = new Map()
  for (const n of nodes) {
    labelToId.set(slugify(n.label), n.id)
    labelToId.set(n.label.toLowerCase(), n.id)
  }

  const seenPairs = new Set()
  const resolved = []
  for (const { source, targetLabel } of edgesRaw) {
    const targetId = labelToId.get(slugify(targetLabel)) ?? labelToId.get(targetLabel.toLowerCase())
    if (!targetId || targetId === source) continue
    // Order the pair so A→B and B→A share one key (the graph is undirected).
    const key = source < targetId ? `${source}→${targetId}` : `${targetId}→${source}`
    if (seenPairs.has(key)) continue
    seenPairs.add(key)
    resolved.push({ source, target: targetId })
  }
  return resolved
}

/**
 * Keep at most `cap` nodes, preferring those with the highest degree.
 *
 * @param {{id: string}[]} nodes
 * @param {{source: string, target: string}[]} edges
 * @param {number} cap
 * @returns {Object[]} `nodes` itself when it already fits, else a new array of
 *   `cap` nodes sorted by decreasing degree (ties keep their input order).
 */
function selectTopNodesByDegree(nodes, edges, cap) {
  if (nodes.length <= cap) return nodes

  const degree = new Map()
  for (const { source, target } of edges) {
    degree.set(source, (degree.get(source) || 0) + 1)
    degree.set(target, (degree.get(target) || 0) + 1)
  }
  return [...nodes]
    .sort((a, b) => (degree.get(b.id) || 0) - (degree.get(a.id) || 0))
    .slice(0, cap)
}

/**
 * Scrape every markdown note under ARTICLES_ROOT and write the graph to OUTPUT.
 *
 * Pipeline : glob -> read + parse each note into a node and raw wikilinks ->
 * merge nodes sharing a slug (first one wins) -> resolve wikilinks into edges ->
 * cap to NODE_CAP_V1 nodes by degree -> write JSON (meta, nodes, edges).
 *
 * Unreadable files are skipped with a warning. Any other failure rejects the
 * returned promise.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('[carte-o] Scraping', ARTICLES_ROOT)

  // Glob all .md recursively under Articles/.
  const globbed = await globby(['**/*.md'], {
    cwd: ARTICLES_ROOT,
    absolute: true,
    gitignore: false,
  })
  // Sort so that the duplicate-slug winner and the degree tie-break do not
  // depend on the order in which the glob walker happens to yield files.
  const mdFiles = [...globbed].sort()
  console.log(`[carte-o] Found ${mdFiles.length} markdown files`)
  if (mdFiles.length === 0) {
    console.warn('[carte-o] No markdown file found : check ARTICLES_ROOT, the output will be an empty graph')
  }

  const nodes = []
  const edgesRaw = []
  // Counts every scanned note per theme, before slug merging and capping.
  const themeStats = {}

  for (const mdFile of mdFiles) {
    let raw
    try {
      raw = await fs.readFile(mdFile, 'utf-8')
    } catch (e) {
      // Best effort : one unreadable note must not abort the build, but say why it was skipped.
      console.warn(`[carte-o] Skip unreadable ${mdFile}`, e?.message ?? e)
      continue
    }

    const relPath = path.relative(ARTICLES_ROOT, mdFile).replace(/\\/g, '/')
    const theme = deriveTheme(relPath.split('/'))
    const baseName = path.basename(mdFile, '.md')

    const { data: fm, content } = safeParseFrontmatter(raw)
    // YAML may type a title as a number or a date (e.g. "title: 2024") :
    // coerce to string so label.toLowerCase() cannot throw during edge resolution.
    const title = String(fm.titre || fm.title || baseName.replace(/^!\s*/, '').trim())
    const slug = slugify(title)
    const family = inferFamily({
      title,
      theme,
      path: relPath,
      tags: fm.tags,
      content,
    })
    const intention = fm.intention || extractFirstParagraph(content)

    nodes.push({
      id: slug,
      label: title,
      family,
      intention,
      slug,
      theme,
      path: relPath,
    })

    themeStats[theme] = (themeStats[theme] || 0) + 1

    // Collect wikilinks ; they are resolved once every node is known.
    for (const target of extractWikilinks(content)) {
      edgesRaw.push({ source: slug, targetLabel: target })
    }
  }

  // Deduplicate nodes by id (collisions on same slug) : the first note wins.
  // Wikilinks of the dropped notes keep the shared slug as source, so they end
  // up attached to the surviving node.
  const nodeById = new Map()
  for (const n of nodes) {
    if (!nodeById.has(n.id)) {
      nodeById.set(n.id, n)
    }
  }
  const dedupNodes = [...nodeById.values()]
  if (dedupNodes.length < nodes.length) {
    console.warn(`[carte-o] Merged ${nodes.length - dedupNodes.length} notes with colliding slugs`)
  }

  const edgesResolved = resolveEdges(dedupNodes, edgesRaw)

  // V1 cap : if > NODE_CAP_V1 nodes, keep top N by degree.
  const finalNodes = selectTopNodesByDegree(dedupNodes, edgesResolved, NODE_CAP_V1)
  if (finalNodes.length < dedupNodes.length) {
    console.log(`[carte-o] Capped from ${dedupNodes.length} to ${NODE_CAP_V1} nodes (top by degree)`)
  }

  // Drop edges that lost an endpoint to the cap.
  const finalNodeIds = new Set(finalNodes.map(n => n.id))
  const finalEdges = edgesResolved.filter(e => finalNodeIds.has(e.source) && finalNodeIds.has(e.target))

  // Family distribution stats (over the nodes actually written).
  const familyDist = {}
  for (const n of finalNodes) {
    familyDist[n.family] = (familyDist[n.family] || 0) + 1
  }

  // Ensure output dir exists.
  await fs.mkdir(path.dirname(OUTPUT), { recursive: true })
  await fs.writeFile(
    OUTPUT,
    JSON.stringify({
      meta: {
        generated: new Date().toISOString(),
        source: 'AEP/Articles',
        nodeCount: finalNodes.length,
        edgeCount: finalEdges.length,
        familyDistribution: familyDist,
        familyColors: FAMILY_COLORS,
        themeStats,
      },
      nodes: finalNodes,
      edges: finalEdges,
    }, null, 2),
    'utf-8',
  )

  console.log(`[carte-o] OK : ${finalNodes.length} nodes / ${finalEdges.length} edges`)
  console.log(`[carte-o] Families :`, familyDist)
  console.log(`[carte-o] Output : ${OUTPUT}`)
}

// Script entry point : a rejection from main() is logged, then the process exits with code 1.
function reportFailure(err) {
  console.error('[carte-o] FAIL', err)
  process.exit(1)
}

main().catch(reportFailure)