Files
flynn/src/session/indexer.ts
T
2026-02-12 22:47:22 -08:00

49 lines
1.3 KiB
TypeScript

/**
 * Keyword summary extracted from a piece of text.
 *
 * Within this module it is produced by `SessionIndexer.indexText`.
 */
export interface HistoryMetadata {
  /** Distinct tokens, ordered from most to least frequent. */
  keywords: string[];

  /** The leading entries of `keywords` (at most three). */
  topics: string[];
}
/**
 * Settings accepted by the `SessionIndexer` constructor.
 */
export interface HistoryIndexerConfig {
  /** Upper bound on how many keywords a single indexing call returns. */
  maxKeywords: number;
}
/**
 * Lowercase words that `tokenize` discards (common English function words).
 */
const STOPWORDS: ReadonlySet<string> = new Set([
  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
  'of', 'on', 'or', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'you', 'your', 'we', 'this',
  'they', 'their', 'our', 'but', 'not', 'can', 'just', 'into', 'about', 'after', 'before', 'than', 'then',
]);

/** Shortest token, measured in UTF-16 code units, that `tokenize` keeps. */
const MIN_TOKEN_LENGTH = 3;

/**
 * Matches one or more characters that are not a Unicode letter, combining
 * mark, or number. An ASCII-only class such as `[^a-z0-9]` would treat
 * accented and non-Latin letters as separators and cut words like "über"
 * into fragments ("ber").
 */
const TOKEN_SEPARATOR = /[^\p{L}\p{M}\p{N}]+/u;

/**
 * Splits `text` into lowercase word tokens.
 *
 * The text is NFC-normalised (so composed and decomposed spellings of the
 * same word yield the same token), lowercased, and split on runs of
 * non-letter/non-digit characters. Tokens shorter than three code units and
 * tokens listed in `STOPWORDS` are dropped. For ASCII-only input the result
 * is the same as splitting on `[^a-z0-9]+`.
 *
 * @param text - Arbitrary input text; may be empty.
 * @returns The surviving tokens in their original order, duplicates included.
 */
export function tokenize(text: string): string[] {
  return text
    .normalize('NFC')
    .toLowerCase()
    .split(TOKEN_SEPARATOR)
    .filter(token => token.length >= MIN_TOKEN_LENGTH && !STOPWORDS.has(token));
}
/**
 * Derives keyword and topic metadata from text by counting how often each
 * token returned by `tokenize` occurs.
 */
export class SessionIndexer {
  /** How many of the top keywords are also reported as topics. */
  private static readonly MAX_TOPICS = 3;

  /** Upper bound on the number of keywords returned by `indexText`. */
  private readonly maxKeywords: number;

  /**
   * @param config - Indexer settings; `maxKeywords` caps the keyword list.
   */
  constructor(config: HistoryIndexerConfig) {
    this.maxKeywords = config.maxKeywords;
  }

  /**
   * Ranks the tokens of `text` by frequency and returns the top ones.
   *
   * Tokens are ordered by descending count; ties are broken with
   * `String.prototype.localeCompare`. At most `maxKeywords` tokens are kept
   * as keywords, and the first three keywords are reported as topics.
   *
   * @param text - Text to index; may be empty.
   * @returns The ranked keywords and the leading topics. Both arrays are
   *   empty when the text yields no tokens.
   */
  indexText(text: string): HistoryMetadata {
    const frequencies = new Map<string, number>();
    for (const token of tokenize(text)) {
      frequencies.set(token, (frequencies.get(token) ?? 0) + 1);
    }

    // A negative `end` makes Array.prototype.slice count back from the end of
    // the array, which would return almost every keyword instead of none.
    // Clamp negatives to 0; all other values are passed through untouched.
    const limit = this.maxKeywords < 0 ? 0 : this.maxKeywords;

    const keywords = Array.from(frequencies.entries())
      .sort(
        ([tokenA, countA], [tokenB, countB]) =>
          countB - countA || tokenA.localeCompare(tokenB),
      )
      .slice(0, limit)
      .map(([token]) => token);

    return {
      keywords,
      // slice() already clamps to the array length, so no Math.min is needed.
      topics: keywords.slice(0, SessionIndexer.MAX_TOPICS),
    };
  }
}