// Session-history indexing: tokenization and keyword/topic extraction.
/**
 * Search metadata derived from a piece of text.
 *
 * This is the shape returned by `SessionIndexer.indexText`.
 */
export interface HistoryMetadata {
  /**
   * Distinct tokens of the indexed text, most frequent first. Tokens with
   * equal frequency are ordered by `String.prototype.localeCompare`.
   */
  keywords: string[];
  /** The leading entries of `keywords` (at most three). */
  topics: string[];
}

/** Construction options accepted by `SessionIndexer`. */
export interface HistoryIndexerConfig {
  /** Upper bound on how many keywords are kept for each indexed text. */
  maxKeywords: number;
}

/**
 * Common English words that `tokenize` excludes from its output.
 *
 * Entries are lowercase because `tokenize` lowercases its input before the
 * lookup. `tokenize` also discards tokens shorter than three characters
 * before consulting this set, so the one- and two-letter entries are only
 * relevant to other consumers of the set.
 */
const STOPWORDS = new Set<string>([
  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from', 'has', 'he', 'in', 'is', 'it', 'its',
  'of', 'on', 'or', 'that', 'the', 'to', 'was', 'were', 'will', 'with', 'you', 'your', 'we', 'this',
  'they', 'their', 'our', 'but', 'not', 'can', 'just', 'into', 'about', 'after', 'before', 'than', 'then',
]);

/**
 * Splits text into lowercase word tokens suitable for frequency counting.
 *
 * The text is lowercased and split on every run of characters that are not
 * Unicode letters, combining marks, or digits. Tokens shorter than three
 * UTF-16 code units and tokens listed in `STOPWORDS` are dropped.
 *
 * @param text - Raw text to tokenize.
 * @returns The surviving tokens in their original order; duplicates are kept.
 */
export function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    // Split on Unicode-aware boundaries so that accented or non-Latin words
    // stay whole instead of being cut at every non-ASCII letter ("café" must
    // not become "caf"). Combining marks (\p{M}) are kept so decomposed
    // accents remain attached to their base letter.
    .split(/[^\p{L}\p{M}\p{N}]+/u)
    // The length check also removes the empty strings that `split` yields
    // when the text starts or ends with a separator.
    .filter((token) => token.length >= 3 && !STOPWORDS.has(token));
}

/**
 * Derives keyword and topic metadata from free text by counting how often
 * each token produced by `tokenize` occurs.
 */
export class SessionIndexer {
  /** Number of leading keywords that are also reported as topics. */
  private static readonly MAX_TOPICS = 3;

  /** Non-negative cap on the number of keywords returned per text. */
  private readonly maxKeywords: number;

  /**
   * @param config - Indexer options. A negative `maxKeywords` is treated as
   *   zero and a fractional value is truncated toward zero.
   */
  constructor(config: HistoryIndexerConfig) {
    // `Array.prototype.slice` interprets a negative end index as an offset
    // from the end of the array, so an unclamped negative limit would return
    // "all but the last N" keywords instead of none.
    this.maxKeywords = Math.max(0, Math.trunc(config.maxKeywords));
  }

  /**
   * Builds metadata for a piece of text.
   *
   * @param text - Text to index.
   * @returns `keywords`: distinct tokens ordered by descending frequency,
   *   ties broken by `localeCompare`, capped at the configured maximum;
   *   `topics`: the first three of those keywords (fewer if there are fewer).
   */
  indexText(text: string): HistoryMetadata {
    const frequencies = new Map<string, number>();
    for (const token of tokenize(text)) {
      frequencies.set(token, (frequencies.get(token) ?? 0) + 1);
    }

    const keywords = [...frequencies]
      // Highest count first; `localeCompare` (called without an explicit
      // locale) decides the order among tokens with the same count.
      .sort(([tokenA, countA], [tokenB, countB]) => countB - countA || tokenA.localeCompare(tokenB))
      .slice(0, this.maxKeywords)
      .map(([token]) => token);

    return {
      keywords,
      // `slice` already clamps to the array length, so no explicit bound
      // check is needed when fewer keywords are available.
      topics: keywords.slice(0, SessionIndexer.MAX_TOPICS),
    };
  }
}