#!/usr/bin/env node
// Scrape AEP/Articles/ thematic folders -> public/data/carte-o.json
//
// Two frontmatter formats supported :
//   1. YAML standard between --- delimiters
//   2. Legacy "MOC : [[X]]\nSource : ...\nTags : ...\nDate : ...\n***" header
//
// Wikilinks [[X]] in body -> edges (resolved by label match against scraped nodes).
// Family inferred by keyword heuristics over title, theme, path, tags and the
// first 800 characters of the body (5 AEP families).
// V1 cap : when the scrape yields more than NODE_CAP_V1 nodes, only the
// NODE_CAP_V1 nodes with the highest degree are kept.

import fs from 'node:fs/promises'
import path from 'node:path'
import { fileURLToPath } from 'node:url'
import matter from 'gray-matter'
import { globby } from 'globby'

const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)
const REPO_ROOT = path.resolve(__dirname, '..')

const ARTICLES_ROOT = 'C:/Users/jules/Dropbox/ATIS - IPCJRA/2 CASQUETTES/Penseur politique/AEP/Articles'
const OUTPUT = path.join(REPO_ROOT, 'public/data/carte-o.json')
const NODE_CAP_V1 = 150

// 5 AEP families : palette refined after first scrape.
const FAMILY_COLORS = {
  penseur: '#3b82f6', // blue
  concept: '#10b981', // green
  methode: '#f59e0b', // amber
  collectif: '#ef4444', // red
  ressource: '#8b5cf6', // violet
}

/**
 * Turn an arbitrary label into a URL-safe, lowercase, dash-separated slug.
 *
 * @param {*} str - Value to slugify; falsy values are treated as ''.
 * @returns {string} Slug of at most 80 characters, or 'untitled' when nothing
 *   usable remains.
 */
function slugify(str) {
  return String(str || '')
    // Decompose accented letters, then drop the combining marks (U+0300-U+036F).
    .normalize('NFD').replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 80) || 'untitled'
}

/**
 * Classify a note into one of the 5 AEP families using keyword heuristics.
 *
 * @param {{title?: string, theme?: string, path?: string, tags?: (string[]|string), content?: string}} signals
 *   Text signals describing the note; only the first 800 characters of
 *   `content` are inspected.
 * @returns {'methode'|'penseur'|'collectif'|'concept'|'ressource'} Family key.
 */
function inferFamily(signals) {
  // signals = { title, theme, path, tags, content }
  const haystack = [
    signals.title,
    signals.theme,
    signals.path,
    Array.isArray(signals.tags) ? signals.tags.join(' ') : signals.tags,
    (signals.content || '').slice(0, 800),
  ].filter(Boolean).join(' ').toLowerCase()

  // Order matters : check most specific first. The first matching rule wins,
  // so a keyword repeated in a later rule is shadowed by the earlier one
  // ('chantier' and 'mamdani' in COLLECTIF never decide the outcome).

  // METHODE : process, tools, how-to
  if (/m[ée]thode|outil|pratique|community.organizing|alinsky\b|comment\b|process|protocole|recette|guide|how.to|chantier|d[ée]marche/.test(haystack)) {
    return 'methode'
  }

  // PENSEUR : proper names, authors, figures
  if (/penseur|auteur|figure|harari|alinsky|piven|chouard|branco|mamdani|shift|graeber|bourdieu|lordon|stiegler|d[ée]bord|illich|gorz|servigne|vidal|haupt|pisani|lalo|rosa/.test(haystack)) {
    return 'penseur'
  }

  // COLLECTIF : organisations, movements, networks
  if (/collectif|r[ée]seau|asso|union|coop|mouvement|piraterie|sociale|syndicat|comit[ée]|crise.de.la.profession|nyc|mamdani|chantier/.test(haystack)) {
    return 'collectif'
  }

  // CONCEPT : notions, theories, critiques
  if (/concept|notion|th[ée]orie|critique|fiction|kakistocratie|imp[ée]rialisme|robustesse|sycophan|d[ée]construction|paradoxe|dialectique|ontologie|capitalisme|n[ée]olib[ée]ral|d[ée]mocratie|biais|illusion/.test(haystack)) {
    return 'concept'
  }

  // RESSOURCE : default (draft articles, ideas, agendas)
  return 'ressource'
}

/**
 * Fallback parser for legacy "MOC : [[X]]\nSource : ...\nDate : ...\n***\n" headers.
 *
 * Scans at most the first 30 lines for `Key : value` pairs and stops at the
 * `***` separator line. Keys are lowercased; a `tags` value is converted to an
 * array of `#tag` tokens.
 *
 * @param {string} raw - Full file contents.
 * @returns {{data: Object, content: string}} Parsed header fields and the body
 *   after the separator; `{ data: {}, content: raw }` when no separator is found.
 */
function parseLegacyHeader(raw) {
  const lines = raw.split(/\r?\n/)
  const fm = {}
  let bodyStart = 0
  let foundHeader = false

  for (let i = 0; i < Math.min(lines.length, 30); i++) {
    const line = lines[i]
    if (/^\s*\*\*\*\s*$/.test(line)) {
      bodyStart = i + 1
      foundHeader = true
      break
    }
    const m = line.match(/^([A-Za-zÀ-ÿ ]+)\s*:\s*(.+)$/)
    if (m) {
      const key = m[1].trim().toLowerCase()
      fm[key] = m[2].trim()
    }
  }

  // Without the *** separator the key/value lines are not trusted as a header.
  if (!foundHeader) return { data: {}, content: raw }

  // Parse tags (space-separated #tag tokens).
  if (fm.tags) {
    fm.tags = fm.tags.match(/#[\w/-]+/g) || []
  }

  return {
    data: fm,
    content: lines.slice(bodyStart).join('\n'),
  }
}

/**
 * Parse frontmatter, preferring YAML and falling back to the legacy header.
 *
 * @param {string} raw - Full file contents.
 * @returns {{data: Object, content: string}} Frontmatter fields and body text.
 */
function safeParseFrontmatter(raw) {
  // Try YAML first.
  try {
    const parsed = matter(raw)
    if (Object.keys(parsed.data).length > 0) return parsed
  } catch {
    // YAML parse failed, fall through to the legacy parser (deliberate best-effort).
  }
  return parseLegacyHeader(raw)
}

/**
 * Extract a short plain-text teaser from a markdown body.
 *
 * @param {string} content - Markdown body.
 * @param {number} [maxLen=220] - Maximum length of the returned text.
 * @returns {string} First prose-like paragraph with wikilinks and emphasis
 *   markers stripped, or '' when the body has no usable paragraph.
 */
function extractFirstParagraph(content, maxLen = 220) {
  // Skip headings, code blocks, callout/quote markers.
  const cleaned = content
    .replace(/^---[\s\S]*?---\n/, '')
    .replace(/```[\s\S]*?```/g, '')
    .split(/\n\n+/)
    .map(p => p.trim())
    .filter(p => p && !p.startsWith('#') && !p.startsWith('---') && !p.startsWith('|'))

  // Prefer first paragraph that looks like prose (not a list).
  const prose = cleaned.find(p => !/^\s*[->*•\d+\.\s]/.test(p) && p.length > 30)
  const first = prose || cleaned[0] || ''

  // Strip wikilinks, bold, italics, callout markers.
  return first
    .replace(/\[\[([^\]|]+)(?:\|[^\]]+)?\]\]/g, '$1')
    .replace(/[*_>]+/g, '')
    .replace(/\s+/g, ' ')
    .trim()
    .slice(0, maxLen)
}

/**
 * Collect wikilink targets from a markdown body.
 *
 * @param {string} content - Markdown body.
 * @returns {string[]} Trimmed target names, in order of appearance, without
 *   alias or section suffix. Duplicates are kept.
 */
function extractWikilinks(content) {
  // Match [[Target]], [[Target|alias]], [[Target#section]]
  // Skip image embeds ![[...]]
  // NOTE(review): this pattern was rebuilt from the two comment lines above
  // after the original expression was truncated; confirm against the upstream copy.
  const matches = [...content.matchAll(/(?<!!)\[\[([^\]|#]+)(?:[|#][^\]]*)?\]\]/g)]
  return matches.map(m => m[1].trim()).filter(Boolean)
}

/**
 * Scrape every markdown file under ARTICLES_ROOT, build the node/edge graph
 * and write it as JSON to OUTPUT.
 *
 * @returns {Promise<void>} Resolves once the JSON file has been written.
 */
async function main() {
  console.log('[carte-o] Scraping', ARTICLES_ROOT)

  // Glob all .md recursively under Articles/.
  const mdFiles = await globby(['**/*.md'], {
    cwd: ARTICLES_ROOT,
    absolute: true,
    gitignore: false,
  })
  console.log(`[carte-o] Found ${mdFiles.length} markdown files`)

  const nodes = []
  const edgesRaw = []
  const themeStats = {}

  for (const mdFile of mdFiles) {
    let raw
    try {
      raw = await fs.readFile(mdFile, 'utf-8')
    } catch (err) {
      // Best-effort scrape : one unreadable file must not abort the run.
      console.warn(`[carte-o] Skip unreadable ${mdFile}: ${err.message}`)
      continue
    }

    const relPath = path.relative(ARTICLES_ROOT, mdFile).replace(/\\/g, '/')
    const segments = relPath.split('/')
    const baseName = path.basename(mdFile, '.md')

    // Theme = first or second segment depending on structure.
    // E.g. "AEP ARTICLES, BROUILLON/AEP IA/file.md" -> theme = "AEP IA"
    //      "AEP ARTICLES, BROUILLON/file.md" -> theme = "AEP ARTICLES, BROUILLON"
    //      "Livre - le nouveau contrat social/file.md" -> theme = "Livre - le nouveau contrat social"
    let theme = segments[0]
    if (segments.length >= 3 && segments[0].startsWith('AEP ARTICLES')) {
      theme = segments[1]
    }

    const { data: fm, content } = safeParseFrontmatter(raw)

    // YAML may parse a title as a number or a date; labels must be strings
    // because edge resolution below calls toLowerCase() on them.
    const title = String(fm.titre || fm.title || baseName.replace(/^!\s*/, '').trim())
    const slug = slugify(title)
    const family = inferFamily({
      title,
      theme,
      path: relPath,
      tags: fm.tags,
      content,
    })
    const intention = fm.intention || extractFirstParagraph(content)

    nodes.push({
      id: slug,
      label: title,
      family,
      intention,
      slug,
      theme,
      path: relPath,
    })
    themeStats[theme] = (themeStats[theme] || 0) + 1

    // Collect wikilinks.
    const wikilinks = extractWikilinks(content)
    for (const target of wikilinks) {
      edgesRaw.push({ source: slug, targetLabel: target })
    }
  }

  // Deduplicate nodes by id (collisions on same slug) : first occurrence wins.
  const nodeById = new Map()
  for (const n of nodes) {
    if (!nodeById.has(n.id)) {
      nodeById.set(n.id, n)
    }
  }
  const dedupNodes = [...nodeById.values()]

  // Resolve edges : match targetLabel against node label or id.
  const labelToId = new Map()
  for (const n of dedupNodes) {
    labelToId.set(slugify(n.label), n.id)
    labelToId.set(n.label.toLowerCase(), n.id)
  }

  const edgesResolved = []
  const edgeSet = new Set()
  for (const e of edgesRaw) {
    const candidates = [
      slugify(e.targetLabel),
      e.targetLabel.toLowerCase(),
    ]
    let targetId = null
    for (const c of candidates) {
      if (labelToId.has(c)) {
        targetId = labelToId.get(c)
        break
      }
    }
    // Drop unresolved links and self-loops.
    if (!targetId || targetId === e.source) continue

    // Edges are treated as undirected : A→B and B→A share one key.
    const key = e.source < targetId ? `${e.source}→${targetId}` : `${targetId}→${e.source}`
    if (edgeSet.has(key)) continue
    edgeSet.add(key)
    edgesResolved.push({ source: e.source, target: targetId })
  }

  // Compute degree for each node.
  const degree = new Map()
  for (const e of edgesResolved) {
    degree.set(e.source, (degree.get(e.source) || 0) + 1)
    degree.set(e.target, (degree.get(e.target) || 0) + 1)
  }

  // V1 cap : if > NODE_CAP_V1 nodes, keep top N by degree.
  let finalNodes = dedupNodes
  if (dedupNodes.length > NODE_CAP_V1) {
    finalNodes = [...dedupNodes]
      .sort((a, b) => (degree.get(b.id) || 0) - (degree.get(a.id) || 0))
      .slice(0, NODE_CAP_V1)
    console.log(`[carte-o] Capped from ${dedupNodes.length} to ${NODE_CAP_V1} nodes (top by degree)`)
  }

  // Keep only edges whose two endpoints survived the cap.
  const finalNodeIds = new Set(finalNodes.map(n => n.id))
  const finalEdges = edgesResolved.filter(e => finalNodeIds.has(e.source) && finalNodeIds.has(e.target))

  // Family distribution stats.
  const familyDist = {}
  for (const n of finalNodes) {
    familyDist[n.family] = (familyDist[n.family] || 0) + 1
  }

  // Ensure output dir exists.
  await fs.mkdir(path.dirname(OUTPUT), { recursive: true })
  await fs.writeFile(
    OUTPUT,
    JSON.stringify({
      meta: {
        generated: new Date().toISOString(),
        source: 'AEP/Articles',
        nodeCount: finalNodes.length,
        edgeCount: finalEdges.length,
        familyDistribution: familyDist,
        familyColors: FAMILY_COLORS,
        themeStats,
      },
      nodes: finalNodes,
      edges: finalEdges,
    }, null, 2),
    'utf-8',
  )

  console.log(`[carte-o] OK : ${finalNodes.length} nodes / ${finalEdges.length} edges`)
  console.log(`[carte-o] Families :`, familyDist)
  console.log(`[carte-o] Output : ${OUTPUT}`)
}

main().catch(err => {
  console.error('[carte-o] FAIL', err)
  process.exit(1)
})