// src/lib/cluster.ts
// Pure-JS TF-IDF style co-occurrence clustering — no AI, no external deps

/**
 * English function words that are ignored when tokenizing titles.
 *
 * `tokenize` replaces punctuation with spaces before looking words up here,
 * so a contraction arrives split at its apostrophe ("doesn't" -> "doesn" + "t").
 * Both halves therefore have to be listed, otherwise stems such as "doesn" or
 * "wouldn" pass the length filter and show up as cluster keywords.
 */
const STOP_WORDS = new Set<string>([
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does',
    'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'i', 'you', 'he',
    'she', 'we', 'they', 'it', 'this', 'that', 'these', 'those', 'my', 'your', 'his',
    'her', 'our', 'their', 'its', 'me', 'him', 'us', 'them', 'what', 'how', 'why', 'when',
    'where', 'who', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
    'such', 'no', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 'just',
    'about', 'after', 'before', 'between', 'up', 'down', 'out', 'off', 'over', 'under',
    'new', 'get', 'got', 'make', 'made', 'go', 'going', 'gone', 'also', 'from', 'by',
    'into', 'through', 'during', 's', 't', 're', 'm', 'll', 've', 'd', 'isn', 'aren',
    // Remaining negative-contraction stems (the part left of "'t").
    'don', 'doesn', 'didn', 'wasn', 'weren', 'hasn', 'haven', 'hadn', 'won', 'wouldn',
    'couldn', 'shouldn', 'mustn', 'needn', 'mightn', 'shan', 'ain',
]);

/**
 * Extract significant 1-2 word tokens from a title.
 *
 * The title is lower-cased, every character that is not a letter, combining
 * mark, digit or whitespace is replaced by a space, and the resulting words
 * are kept only if they are longer than 3 characters and not in STOP_WORDS.
 *
 * Letters and digits are matched by Unicode category rather than by ASCII
 * range, so accented and non-Latin words ("pokémon", "größe") survive intact
 * instead of being cut into short fragments that the length filter discards.
 * For ASCII-only titles the result is the same as with an `a-z0-9` class.
 *
 * @param title Raw title text.
 * @returns The kept words in their original order, followed by one bigram
 *          ("word1 word2") for each pair of neighbouring kept words. Bigrams
 *          are built after filtering, so they may bridge a dropped word.
 */
function tokenize(title: string): string[] {
    const words = title
        .toLowerCase()
        // \p{M} keeps combining marks so decomposed accents and scripts that
        // rely on them do not split a word in two.
        .replace(/[^\p{L}\p{M}\p{N}\s]/gu, ' ')
        .split(/\s+/)
        .filter(w => w.length > 3 && !STOP_WORDS.has(w));

    // Include bigrams of adjacent surviving words
    const bigrams: string[] = [];
    for (let i = 0; i < words.length - 1; i++) {
        bigrams.push(`${words[i]} ${words[i + 1]}`);
    }
    return [...words, ...bigrams];
}

/**
 * Build a smoothed inverse-document-frequency table for a tokenized corpus.
 *
 * A token counts at most once per document, however often it repeats there.
 * The weight is `ln((N + 1) / (df + 1)) + 1`, where `N` is the number of
 * documents and `df` the number of documents containing the token; a token
 * present in every document therefore gets weight 1.
 *
 * @param docs One token list per document.
 * @returns Map from token to IDF weight, keyed in order of first appearance.
 */
function buildIDF(docs: string[][]): Map<string, number> {
    const docCount = docs.length;

    // Document frequency: how many documents mention each token.
    const docFrequency = new Map<string, number>();
    docs.forEach(doc => {
        new Set(doc).forEach(token => {
            docFrequency.set(token, (docFrequency.get(token) ?? 0) + 1);
        });
    });

    const weights = new Map<string, number>();
    for (const [token, count] of docFrequency) {
        weights.set(token, Math.log((docCount + 1) / (count + 1)) + 1);
    }
    return weights;
}

/** One topic family as returned by `clusterTitles`. */
export interface NicheCluster {
    /** Display label: the cluster's top keyword(s); `clusterTitles` joins up to three with " · ". */
    name: string;
    /** Significant tokens shared within the cluster; `clusterTitles` keeps at most ten. */
    keywords: string[];
    /** Titles of the member items. */
    titles: string[];
    /** Channels of the member items; `clusterTitles` removes duplicates. */
    channels: string[];
    /**
     * Ranking score (session penetration or frequency). `clusterTitles`
     * computes it as distinct sessions × 20 + member count × 5.
     */
    score: number;
}

/**
 * Group {title, channel} items into topic families by shared keywords.
 *
 * Each title is tokenized, tokens are ranked by IDF weight × document
 * frequency, and the ranked tokens are then visited in order: every token
 * claims the still-unclaimed items containing it, and the claim becomes a
 * cluster when it has at least `minClusterSize` members. An item belongs to
 * at most one cluster.
 *
 * @param items Items to cluster; `sessionId` is optional and items without
 *              one are counted as a single shared session.
 * @param minClusterSize Minimum number of items a token must appear in, and
 *                       minimum number of members per cluster (default 2).
 * @returns At most 20 clusters, sorted by score descending.
 */
export function clusterTitles(
    items: { title: string; channel: string; sessionId?: string }[],
    minClusterSize = 2
): NicheCluster[] {
    // Pair every item with its position and its set of distinct tokens.
    const entries = items.map((item, idx) => ({
        item,
        idx,
        tokens: new Set(tokenize(item.title)),
    }));

    const docs = entries.map(entry => Array.from(entry.tokens));
    const idf = buildIDF(docs);

    // Number of documents each token occurs in (tokens are unique per doc).
    const docFreq = new Map<string, number>();
    docs.forEach(doc => {
        doc.forEach(token => {
            docFreq.set(token, (docFreq.get(token) || 0) + 1);
        });
    });

    // Keep tokens found in ≥minClusterSize docs but in fewer than 70% of them,
    // and attach the ranking weight (IDF × frequency) up front.
    const frequencyCeiling = docs.length * 0.7;
    const ranked: { token: string; weight: number }[] = [];
    for (const [token, freq] of docFreq) {
        if (freq >= minClusterSize && freq < frequencyCeiling) {
            ranked.push({ token, weight: (idf.get(token) || 1) * freq });
        }
    }
    ranked.sort((a, b) => b.weight - a.weight);
    const significantTokens = ranked.slice(0, 60).map(r => r.token);

    const claimed = new Set<number>(); // item indices already placed in a cluster
    const clusters: NicheCluster[] = [];

    for (const seed of significantTokens) {
        // Unclaimed items that contain the seed token.
        const members = entries.filter(
            entry => !claimed.has(entry.idx) && entry.tokens.has(seed)
        );
        if (members.length < minClusterSize) continue;

        // Significant tokens carried by at least minClusterSize members,
        // in global ranking order.
        const sharedTokens = significantTokens.filter(candidate => {
            const holders = members.filter(m => m.tokens.has(candidate)).length;
            return holders >= minClusterSize;
        });

        members.forEach(m => claimed.add(m.idx));

        const sessionIds = members.map(m => m.item.sessionId || 'x');
        const uniqueSessions = new Set(sessionIds).size;
        const channelSet = new Set(members.map(m => m.item.channel));

        clusters.push({
            name: sharedTokens.slice(0, 3).join(' · '),
            keywords: sharedTokens.slice(0, 10),
            titles: members.map(m => m.item.title),
            channels: Array.from(channelSet),
            // More distinct sessions and more items both raise the score.
            score: uniqueSessions * 20 + members.length * 5,
        });
    }

    clusters.sort((a, b) => b.score - a.score);
    return clusters.slice(0, 20);
}
