feat(utils): add token based text splitting

This commit is contained in:
ItzCrazyKns
2025-12-13 22:17:51 +05:30
parent 0688630863
commit c7c327a7bb

View File

@@ -0,0 +1,74 @@
import { getEncoding } from 'js-tiktoken';
// Zero-width split points for `String.prototype.split`. The pattern is a pure
// lookbehind, so nothing is consumed: each delimiter stays attached to the end
// of the segment that precedes the cut. A cut is made right after any of:
//   ". "  "\n"  "! "  "? "  "; "  ":" + whitespace
//   digits + "." + whitespace (e.g. "1. ")  "- "  "* "
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;
// Encoder created once at module load from js-tiktoken's `cl100k_base`
// encoding; used by getTokenCount below.
const enc = getEncoding('cl100k_base');
/**
 * Counts the tokens in `text` using the module-level encoder `enc`.
 *
 * @param text - The string to measure.
 * @returns The length of the encoded token array, or, if encoding throws,
 *   an estimate of one token per four characters (rounded up).
 */
const getTokenCount = (text: string): number => {
  let tokenCount: number;
  try {
    tokenCount = enc.encode(text).length;
  } catch {
    // Best-effort fallback so a tokenizer failure never aborts the caller.
    tokenCount = Math.ceil(text.length / 4);
  }
  return tokenCount;
};
/**
 * Splits `text` into token-bounded chunks with optional leading overlap.
 *
 * The text is first cut into segments at the boundaries defined by
 * `splitRegex` (empty segments are dropped). Consecutive segments are then
 * packed greedily into a chunk for as long as the sum of their individual
 * token counts stays within `maxTokens`. The budget is measured as the sum of
 * per-segment counts; the joined chunk is not re-tokenized.
 *
 * Every chunk after the first is prefixed with trailing whole segments of the
 * preceding text, walking backwards for as long as their summed token counts
 * stay within `overlapTokens`. The overlap is added on top of the `maxTokens`
 * budget, so a chunk may hold up to `maxTokens + overlapTokens` tokens by
 * that measure.
 *
 * A single segment that by itself exceeds `maxTokens` is emitted as its own
 * chunk (it is not split further). This also guarantees termination when
 * `maxTokens` is zero, negative or NaN: each segment then becomes one chunk.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the non-overlap part of each chunk.
 * @param overlapTokens - Token budget for the overlap prefix; `0` or a
 *   negative value disables overlap.
 * @returns The chunks in document order; `[]` when `text` yields no segments.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);
  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map((segment) => getTokenCount(segment));
  const result: string[] = [];
  let chunkStart = 0;

  while (chunkStart < segments.length) {
    // Greedily extend the chunk while the next segment still fits the budget.
    let chunkEnd = chunkStart;
    let currentTokenCount = 0;
    while (
      chunkEnd < segments.length &&
      currentTokenCount < maxTokens &&
      currentTokenCount + segmentTokenCounts[chunkEnd] <= maxTokens
    ) {
      currentTokenCount += segmentTokenCounts[chunkEnd];
      chunkEnd++;
    }

    // Nothing fit: the segment at chunkStart is larger than maxTokens on its
    // own. Emit it alone so the outer loop always advances; without this the
    // loop would spin forever on the same index and grow `result` unbounded.
    if (chunkEnd === chunkStart) {
      chunkEnd = chunkStart + 1;
    }

    // Walk backwards from the segment just before the chunk, taking whole
    // segments while they fit in the overlap budget. For the first chunk
    // there is nothing before it, so the overlap is empty.
    let overlapStartIndex = chunkStart;
    let overlapTokenCount = 0;
    while (
      overlapStartIndex > 0 &&
      overlapTokenCount < overlapTokens &&
      overlapTokenCount + segmentTokenCounts[overlapStartIndex - 1] <=
        overlapTokens
    ) {
      overlapStartIndex--;
      overlapTokenCount += segmentTokenCounts[overlapStartIndex];
    }

    // Overlap and chunk body are contiguous, so a single slice covers both.
    result.push(segments.slice(overlapStartIndex, chunkEnd).join(''));
    chunkStart = chunkEnd;
  }

  return result;
};