import { getEncoding } from 'js-tiktoken';

/**
 * Zero-width split points. The lookbehind matches the position right after:
 * ". ", a newline, "! ", "? ", "; ", a colon followed by whitespace,
 * digits + "." + whitespace (e.g. "1. "), "- " and "* ".
 *
 * Because the pattern consumes no characters, every delimiter stays attached
 * to the segment that precedes it, so joining the segments with '' yields the
 * original text.
 */
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;

/** Shared tokenizer instance, created once at module load. */
const enc = getEncoding('cl100k_base');

/**
 * Counts the tokens in `text` using the shared encoder.
 *
 * If encoding throws, falls back to a length-based estimate of
 * `ceil(text.length / 4)` so that callers always get a number.
 */
const getTokenCount = (text: string): number => {
  try {
    return enc.encode(text).length;
  } catch {
    return Math.ceil(text.length / 4);
  }
};

/**
 * Splits `text` into chunks built from whole segments (see `splitRegex`),
 * each chunk prefixed with trailing segments of the previous chunk as overlap.
 *
 * - A chunk's own segments total at most `maxTokens` tokens, measured as the
 *   sum of per-segment token counts. The overlap prefix is added on top of
 *   that budget.
 * - The overlap prefix consists of the whole segments immediately preceding
 *   the chunk, walking backwards, whose summed token count does not exceed
 *   `overlapTokens`. The first chunk has no overlap.
 * - A single segment larger than `maxTokens` cannot be subdivided here; it is
 *   emitted as a chunk of its own (exceeding the budget) so that the splitter
 *   always makes progress.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the non-overlap part of each chunk.
 * @param overlapTokens - Token budget for the overlap prefix of each chunk.
 * @returns The chunks in document order; an empty array for empty input.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);

  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map(getTokenCount);
  // Indexed access helper; indices used below are always in range, the
  // fallback only exists to keep the types honest.
  const tokensAt = (index: number): number => segmentTokenCounts[index] ?? 0;

  const result: string[] = [];

  let chunkStart = 0;

  while (chunkStart < segments.length) {
    // Greedily take segments while they fit into the token budget.
    let chunkEnd = chunkStart;
    let currentTokenCount = 0;

    while (
      chunkEnd < segments.length &&
      currentTokenCount < maxTokens &&
      currentTokenCount + tokensAt(chunkEnd) <= maxTokens
    ) {
      currentTokenCount += tokensAt(chunkEnd);
      chunkEnd++;
    }

    // Nothing fit: the segment at chunkStart alone exceeds maxTokens (or
    // maxTokens is not a positive number). Take it anyway; otherwise
    // chunkStart would never advance and this loop would run forever.
    if (chunkEnd === chunkStart) {
      chunkEnd = chunkStart + 1;
    }

    // Walk backwards from the segment just before this chunk, collecting
    // whole segments for the overlap prefix while they fit the overlap budget.
    let overlapStart = chunkStart;
    let overlapTokenCount = 0;

    while (
      overlapStart > 0 &&
      overlapTokenCount < overlapTokens &&
      overlapTokenCount + tokensAt(overlapStart - 1) <= overlapTokens
    ) {
      overlapTokenCount += tokensAt(overlapStart - 1);
      overlapStart--;
    }

    // Overlap segments are contiguous with the chunk, so one slice covers both.
    result.push(segments.slice(overlapStart, chunkEnd).join(''));

    chunkStart = chunkEnd;
  }

  return result;
};