import { getEncoding } from 'js-tiktoken';

/**
 * Zero-width split points. The lookbehind matches the position right after
 * `". "`, a newline, `"! "`, `"? "`, `"; "`, `":"` + whitespace, a numbered
 * list marker (`"1. "`), `"- "` or `"* "`. Because the match is zero-width,
 * the delimiter stays attached to the end of the preceding segment and
 * joining the segments back together reproduces the input text.
 */
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;

/** Shared `cl100k_base` encoder, created once at module load. */
const enc = getEncoding('cl100k_base');

/**
 * Counts the tokens in `text` using the `cl100k_base` encoding.
 *
 * If the encoder throws, the count is estimated as one token per four
 * characters (rounded up) so that callers always get a number back.
 */
const getTokenCount = (text: string): number => {
  try {
    return enc.encode(text).length;
  } catch {
    // Best-effort fallback: a rough estimate beats failing the whole split.
    return Math.ceil(text.length / 4);
  }
};

/**
 * Splits `text` into chunks of whole segments (see `splitRegex`), each chunk
 * holding at most `maxTokens` tokens of new content, optionally prefixed with
 * up to `overlapTokens` tokens of whole segments taken from the end of the
 * preceding content.
 *
 * Notes on the contract:
 * - The overlap prefix is added on top of the chunk body, so a returned
 *   string can hold up to `maxTokens + overlapTokens` tokens.
 * - Segments are never cut. A single segment that alone exceeds `maxTokens`
 *   is emitted as its own (oversized) chunk.
 * - The first chunk never has an overlap prefix.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the body of each chunk.
 * @param overlapTokens - Token budget for the overlap prefix of each chunk.
 * @returns The chunks in document order; an empty array for empty input.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);
  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map(getTokenCount);
  const result: string[] = [];

  let chunkStart = 0;
  while (chunkStart < segments.length) {
    // Greedily take whole segments while they fit in the body budget.
    let chunkEnd = chunkStart;
    let currentTokenCount = 0;
    while (chunkEnd < segments.length && currentTokenCount < maxTokens) {
      const nextCount = segmentTokenCounts[chunkEnd] ?? 0;
      if (currentTokenCount + nextCount > maxTokens) {
        break;
      }
      currentTokenCount += nextCount;
      chunkEnd++;
    }

    // Guarantee forward progress. If the first candidate segment alone is
    // larger than the budget (or the budget is not positive), nothing was
    // taken above; without this the outer loop would spin forever pushing
    // chunks. Emit the oversized segment as a chunk of its own instead.
    if (chunkEnd === chunkStart) {
      chunkEnd = chunkStart + 1;
    }

    // Walk backwards from the segment just before this chunk, collecting
    // whole segments while they fit in the overlap budget.
    let overlapBeforeStart = chunkStart - 1;
    let overlapBeforeTokenCount = 0;
    while (overlapBeforeStart >= 0 && overlapBeforeTokenCount < overlapTokens) {
      const previousCount = segmentTokenCounts[overlapBeforeStart] ?? 0;
      if (overlapBeforeTokenCount + previousCount > overlapTokens) {
        break;
      }
      overlapBeforeTokenCount += previousCount;
      overlapBeforeStart--;
    }

    // `overlapBeforeStart` now points one before the first overlapping
    // segment, so the overlap spans [overlapBeforeStart + 1, chunkStart).
    const overlapBeforeContent = segments
      .slice(overlapBeforeStart + 1, chunkStart)
      .join('');
    const chunkContent = segments.slice(chunkStart, chunkEnd).join('');

    result.push(overlapBeforeContent + chunkContent);
    chunkStart = chunkEnd;
  }

  return result;
};