mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-12-14 15:48:15 +00:00
feat(utils): add token based text splitting
This commit is contained in:
74
src/lib/utils/splitText.ts
Normal file
74
src/lib/utils/splitText.ts
Normal file
@@ -0,0 +1,74 @@
|
||||
import { getEncoding } from 'js-tiktoken';
|
||||
|
||||
// Zero-width split points: the lookbehind matches the position right AFTER a
// boundary (". ", newline, "! ", "? ", "; ", ":" + whitespace, a numbered list
// marker such as "1. ", "- " or "* "). Because nothing is consumed, the
// delimiter stays attached to the end of the preceding segment, so joining the
// segments with '' reproduces the original text.
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;
|
||||
|
||||
// Encoder for the 'cl100k_base' encoding, created once at module scope and
// used by getTokenCount for every call.
const enc = getEncoding('cl100k_base');
|
||||
|
||||
/**
 * Counts the tokens in `text` using the module-level encoder.
 *
 * If encoding throws for any reason, the count is estimated instead as one
 * token per four characters, rounded up.
 *
 * @param text - The string to measure.
 * @returns The encoded token count, or the length-based estimate on failure.
 */
const getTokenCount = (text: string): number => {
  let tokenTotal: number;

  try {
    const encoded = enc.encode(text);
    tokenTotal = encoded.length;
  } catch {
    // Best-effort fallback: approximate at four characters per token.
    tokenTotal = Math.ceil(text.length / 4);
  }

  return tokenTotal;
};
|
||||
|
||||
/**
 * Splits `text` into chunks whose own content stays within a token budget,
 * cutting only at the boundaries described by `splitRegex`.
 *
 * Each chunk after the first is prefixed with trailing segments of the
 * previous content (the "overlap"), so a returned string may hold up to
 * `maxTokens + overlapTokens` tokens. Overlap is taken in whole segments, from
 * the nearest one backwards, until adding another would exceed `overlapTokens`.
 *
 * A single segment that is larger than `maxTokens` cannot be subdivided here;
 * it is emitted as its own (oversized) chunk rather than dropped.
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the non-overlap part of each chunk.
 * @param overlapTokens - Token budget for the overlap prefix of each chunk.
 * @returns The chunks in document order; an empty array for empty input.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);

  if (segments.length === 0) {
    return [];
  }

  // Tokenize each segment once; both loops below reuse these counts.
  const segmentTokenCounts = segments.map(getTokenCount);

  const result: string[] = [];

  let chunkStart = 0;

  while (chunkStart < segments.length) {
    let chunkEnd = chunkStart;
    let currentTokenCount = 0;

    // Greedily take whole segments while the chunk stays within maxTokens.
    while (chunkEnd < segments.length && currentTokenCount < maxTokens) {
      if (currentTokenCount + segmentTokenCounts[chunkEnd] > maxTokens) {
        break;
      }

      currentTokenCount += segmentTokenCounts[chunkEnd];
      chunkEnd++;
    }

    // Guarantee forward progress: if not even the first segment fits (it is
    // larger than maxTokens, or maxTokens <= 0), take it anyway. Without this
    // chunkStart would never advance and the outer loop would never end.
    if (chunkEnd === chunkStart) {
      chunkEnd = chunkStart + 1;
    }

    // Walk backwards from the segment just before this chunk, collecting
    // whole segments for the overlap prefix while they fit in overlapTokens.
    let overlapStartIndex = chunkStart;
    let overlapBeforeTokenCount = 0;

    while (overlapStartIndex > 0 && overlapBeforeTokenCount < overlapTokens) {
      const previousTokenCount = segmentTokenCounts[overlapStartIndex - 1];

      if (overlapBeforeTokenCount + previousTokenCount > overlapTokens) {
        break;
      }

      overlapBeforeTokenCount += previousTokenCount;
      overlapStartIndex--;
    }

    const overlapBeforeContent = segments
      .slice(overlapStartIndex, chunkStart)
      .join('');

    const chunkContent = segments.slice(chunkStart, chunkEnd).join('');

    result.push(overlapBeforeContent + chunkContent);

    chunkStart = chunkEnd;
  }

  return result;
};
|
||||
Reference in New Issue
Block a user