diff --git a/src/lib/uploads/store.ts b/src/lib/uploads/store.ts new file mode 100644 index 0000000..29afeb0 --- /dev/null +++ b/src/lib/uploads/store.ts @@ -0,0 +1,122 @@ +import BaseEmbedding from "../models/base/embedding"; +import UploadManager from "./manager"; +import computeSimilarity from "../utils/computeSimilarity"; +import { Chunk } from "../types"; +import { hashObj } from "../serverUtils"; +import fs from 'fs'; + +type UploadStoreParams = { + embeddingModel: BaseEmbedding; + fileIds: string[]; +} + +type StoreRecord = { + embedding: number[]; + content: string; + fileId: string; + metadata: Record +} + +class UploadStore { + embeddingModel: BaseEmbedding; + fileIds: string[]; + records: StoreRecord[] = []; + + constructor(private params: UploadStoreParams) { + this.embeddingModel = params.embeddingModel; + this.fileIds = params.fileIds; + this.initializeStore() + } + + initializeStore() { + this.fileIds.forEach((fileId) => { + const file = UploadManager.getFile(fileId) + + if (!file) { + throw new Error(`File with ID ${fileId} not found`); + } + + const chunks = UploadManager.getFileChunks(fileId); + + this.records.push(...chunks.map((chunk) => ({ + embedding: chunk.embedding, + content: chunk.content, + fileId: fileId, + metadata: { + fileName: file.name, + title: file.name, + url: `file_id://${file.id}`, + } + }))) + }) + } + + async query(queries: string[], topK: number): Promise { + const queryEmbeddings = await this.embeddingModel.embedText(queries) + + const results: { chunk: Chunk; score: number; }[][] = []; + const hashResults: string[][] = [] + + await Promise.all(queryEmbeddings.map(async (query) => { + const similarities = this.records.map((record, idx) => { + return { + chunk: { + content: record.content, + metadata: { + ...record.metadata, + fileId: record.fileId, + } + }, + score: computeSimilarity(query, record.embedding) + } as { chunk: Chunk; score: number; }; + }).sort((a, b) => b.score - a.score) + + results.push(similarities) + hashResults.push(similarities.map(s => hashObj(s))) + })) + + const chunkMap: Map = new Map(); + const scoreMap: Map = new Map(); + const k = 60; + + for (let i = 0; i < results.length; i++) { + for (let j = 0; j < results[i].length; j++) { + const chunkHash = hashResults[i][j] + + chunkMap.set(chunkHash, results[i][j].chunk); + scoreMap.set(chunkHash, (scoreMap.get(chunkHash) || 0) + results[i][j].score / (j + 1 + k)); + } + } + + const finalResults = Array.from(scoreMap.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([chunkHash, _score]) => { + return chunkMap.get(chunkHash)!; + }) + + return finalResults.slice(0, topK); + } + + static getFileData(fileIds: string[]): { fileName: string; initialContent: string }[] { + const filesData: { fileName: string; initialContent: string }[] = []; + + fileIds.forEach((fileId) => { + const file = UploadManager.getFile(fileId) + + if (!file) { + throw new Error(`File with ID ${fileId} not found`); + } + + const chunks = UploadManager.getFileChunks(fileId); + + filesData.push({ + fileName: file.name, + initialContent: chunks.slice(0, 3).map(c => c.content).join('\n---\n'), + }) + }) + + return filesData + } +} + +export default UploadStore \ No newline at end of file