mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-12-14 15:48:15 +00:00
feat(utils): add token based text splitting
This commit is contained in:
74
src/lib/utils/splitText.ts
Normal file
74
src/lib/utils/splitText.ts
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
import { getEncoding } from 'js-tiktoken';
|
||||||
|
|
||||||
|
// Zero-width split points used to cut text into segments. The lookbehind
// matches the position right after: ". ", a newline, "! ", "? ", "; ", a colon
// followed by whitespace, digits + "." + whitespace (numbered-list marker), or
// the bullet markers "- " and "* ". Because nothing is consumed, every
// delimiter stays attached to the end of the segment that precedes it.
const splitRegex = /(?<=\. |\n|! |\? |; |:\s|\d+\.\s|- |\* )/g;
|
||||||
|
|
||||||
|
// Tokenizer for the `cl100k_base` encoding, created once at module load and
// reused for every token count in this file.
const enc = getEncoding('cl100k_base');
|
||||||
|
|
||||||
|
/**
 * Counts the tokens in `text` using the module-level `enc` tokenizer.
 *
 * If encoding throws for any reason, the count is estimated instead as one
 * token per four characters, rounded up.
 *
 * @param text - The string to measure.
 * @returns The number of tokens, or the length-based estimate on failure.
 */
const getTokenCount = (text: string): number => {
  let tokenTotal: number;

  try {
    const encoded = enc.encode(text);
    tokenTotal = encoded.length;
  } catch {
    // Best-effort fallback: approximate at ~4 characters per token.
    const approxCharsPerToken = 4;
    tokenTotal = Math.ceil(text.length / approxCharsPerToken);
  }

  return tokenTotal;
};
|
||||||
|
|
||||||
|
/**
 * Splits `text` into token-bounded chunks with leading overlap.
 *
 * The text is first cut into segments at the boundaries matched by
 * `splitRegex`. Consecutive segments are then packed greedily into chunks
 * whose own token total does not exceed `maxTokens`. Every chunk after the
 * first is prefixed with the segments immediately preceding it, as many as
 * fit within `overlapTokens`, so neighbouring chunks share context.
 *
 * Notes on sizing:
 * - The overlap is added on top of the chunk body, so a returned string can
 *   hold up to `maxTokens + overlapTokens` tokens.
 * - Segments are never split. A single segment larger than `maxTokens` is
 *   emitted as its own chunk (it previously stalled the loop forever).
 *
 * @param text - The text to split.
 * @param maxTokens - Token budget for the body of each chunk (default 512).
 * @param overlapTokens - Token budget for the context repeated from the
 *   preceding segments (default 64). Zero or negative disables overlap.
 * @returns The chunks in document order; an empty array for empty input.
 */
export const splitText = (
  text: string,
  maxTokens = 512,
  overlapTokens = 64,
): string[] => {
  const segments = text.split(splitRegex).filter(Boolean);

  if (segments.length === 0) {
    return [];
  }

  const segmentTokenCounts = segments.map(getTokenCount);
  const result: string[] = [];

  let chunkStart = 0;

  while (chunkStart < segments.length) {
    let chunkEnd = chunkStart;
    let currentTokenCount = 0;

    // Greedily take segments while the next one still fits in the budget.
    while (
      chunkEnd < segments.length &&
      currentTokenCount < maxTokens &&
      currentTokenCount + segmentTokenCounts[chunkEnd] <= maxTokens
    ) {
      currentTokenCount += segmentTokenCounts[chunkEnd];
      chunkEnd++;
    }

    // Guarantee forward progress: if not even one segment fits (oversized
    // segment or a non-positive budget), emit that segment on its own rather
    // than looping forever on the same index.
    if (chunkEnd === chunkStart) {
      chunkEnd = chunkStart + 1;
    }

    // Walk backwards from the chunk start, pulling in whole preceding
    // segments for as long as they fit in the overlap budget.
    let overlapStart = chunkStart;
    let overlapTokenCount = 0;

    while (overlapStart > 0 && overlapTokenCount < overlapTokens) {
      const previousCount = segmentTokenCounts[overlapStart - 1];

      if (overlapTokenCount + previousCount > overlapTokens) {
        break;
      }

      overlapTokenCount += previousCount;
      overlapStart--;
    }

    // Overlap and body are contiguous, so a single slice covers both.
    result.push(segments.slice(overlapStart, chunkEnd).join(''));

    chunkStart = chunkEnd;
  }

  return result;
};
|
||||||
Reference in New Issue
Block a user