import path from "path";
import BaseEmbedding from "../models/base/embedding";
import crypto from "crypto";
import fs from 'fs';
import { splitText } from "../utils/splitText";
import { PDFParse } from 'pdf-parse';
import officeParser from 'officeparser';
import { Chunk } from "../types";

/** MIME types that can be turned into text and embedded. */
const supportedMimeTypes = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'text/plain',
] as const;

type SupportedMimeType = typeof supportedMimeTypes[number];

/** Second and third arguments passed to `splitText` for every document. */
const CHUNK_SIZE = 512;
const CHUNK_OVERLAP = 128;

type UploadManagerParams = {
  embeddingModel: BaseEmbedding;
};

/** One entry of the `files` array persisted in `uploaded_files.json`. */
type RecordedFile = {
  id: string;
  name: string;
  filePath: string;
  contentPath: string;
  uploadedAt: string;
};

/** Summary returned to the caller for every processed upload. */
type FileRes = {
  fileName: string;
  fileExtension: string;
  fileId: string;
};

/** Shape of one element of the `chunks` array in a `*.content.json` file. */
type StoredChunk = {
  content: string;
  embedding: number[];
};

/** Narrows an arbitrary MIME string to one of the supported types. */
const isSupportedMimeType = (mimeType: string): mimeType is SupportedMimeType => {
  const allowed: readonly string[] = supportedMimeTypes;
  return allowed.includes(mimeType);
};

/**
 * Returns the extension of an uploaded file name without the leading dot, or
 * `''` when there is none. The name comes from the upload itself, so anything
 * that is not purely alphanumeric (e.g. contains a path separator) is rejected
 * rather than being spliced into a path on disk.
 */
const getSafeExtension = (fileName: string): string => {
  const extension = path.extname(path.basename(fileName)).slice(1);
  return /^[A-Za-z0-9]+$/.test(extension) ? extension : '';
};

/**
 * Stores uploaded files under `data/uploads`, extracts their text, embeds it
 * chunk by chunk and keeps a JSON record of every stored file.
 */
class UploadManager {
  private readonly embeddingModel: BaseEmbedding;

  static uploadsDir = path.join(process.cwd(), 'data', 'uploads');
  static uploadedFilesRecordPath = path.join(this.uploadsDir, 'uploaded_files.json');

  /**
   * Creates the uploads directory and an empty record file when they do not
   * exist yet.
   *
   * @param params - Carries the embedding model used for all uploads.
   */
  constructor(params: UploadManagerParams) {
    this.embeddingModel = params.embeddingModel;

    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(UploadManager.uploadsDir, { recursive: true });

    if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
      fs.writeFileSync(
        UploadManager.uploadedFilesRecordPath,
        JSON.stringify({ files: [] }, null, 2),
      );
    }
  }

  /**
   * Reads every recorded file from the record JSON.
   *
   * @returns The recorded files; an empty list when the record file has not
   *   been created yet (the static lookups can run before any instance exists).
   * @throws Error when the record file exists but has no `files` array.
   */
  private static getRecordedFiles(): RecordedFile[] {
    if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
      return [];
    }

    const raw = fs.readFileSync(UploadManager.uploadedFilesRecordPath, 'utf-8');
    const parsed: unknown = JSON.parse(raw);

    if (
      typeof parsed !== 'object' ||
      parsed === null ||
      !Array.isArray((parsed as { files?: unknown }).files)
    ) {
      throw new Error(`Malformed uploads record at ${UploadManager.uploadedFilesRecordPath}`);
    }

    return (parsed as { files: RecordedFile[] }).files;
  }

  /** Appends one entry to the record file, rewriting the whole JSON document. */
  private static addNewRecordedFile(fileRecord: RecordedFile): void {
    const recordedFiles = UploadManager.getRecordedFiles();
    recordedFiles.push(fileRecord);

    fs.writeFileSync(
      UploadManager.uploadedFilesRecordPath,
      JSON.stringify({ files: recordedFiles }, null, 2),
    );
  }

  /**
   * Looks up a recorded file by its ID.
   *
   * @returns The matching record, or `null` when no record has that ID.
   */
  static getFile(fileId: string): RecordedFile | null {
    return UploadManager.getRecordedFiles().find((file) => file.id === fileId) ?? null;
  }

  /**
   * Loads the embedded chunks stored for a file.
   *
   * @returns The stored chunks, or an empty list when the file is unknown or
   *   its content file cannot be read or parsed (the error is logged).
   */
  static getFileChunks(fileId: string): StoredChunk[] {
    try {
      const recordedFile = UploadManager.getFile(fileId);

      if (!recordedFile) {
        throw new Error(`File with ID ${fileId} not found`);
      }

      const contentData = JSON.parse(fs.readFileSync(recordedFile.contentPath, 'utf-8'));
      const chunks: unknown = contentData?.chunks;

      return Array.isArray(chunks) ? (chunks as StoredChunk[]) : [];
    } catch (err) {
      console.log('Error getting file chunks:', err);
      return [];
    }
  }

  /**
   * Extracts the plain text of a stored file according to its MIME type.
   *
   * @throws Error for a MIME type that has no extractor.
   */
  private async extractText(filePath: string, fileType: SupportedMimeType): Promise<string> {
    switch (fileType) {
      case 'text/plain':
        return fs.readFileSync(filePath, 'utf-8');

      case 'application/pdf': {
        const parser = new PDFParse({ data: fs.readFileSync(filePath) });
        try {
          const result = await parser.getText();
          return result.text;
        } finally {
          // Release the parser's resources whether or not extraction succeeded.
          await parser.destroy();
        }
      }

      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': {
        const docText = await officeParser.parseOfficeAsync(fs.readFileSync(filePath));
        return docText;
      }

      default: {
        const unsupported: never = fileType;
        throw new Error(`Unsupported file type: ${String(unsupported)}`);
      }
    }
  }

  /**
   * Extracts a file's text, splits it into chunks, embeds every chunk and
   * writes the result next to the file as `<name>.content.json`.
   *
   * @returns Path of the written content file.
   * @throws Error when the number of embeddings differs from the number of chunks.
   */
  private async extractContentAndEmbed(filePath: string, fileType: SupportedMimeType): Promise<string> {
    const text = await this.extractText(filePath, fileType);
    const textChunks = splitText(text, CHUNK_SIZE, CHUNK_OVERLAP);

    // Nothing to embed for an empty document; skip the model call entirely.
    const embeddings = textChunks.length > 0
      ? await this.embeddingModel.embedText(textChunks)
      : [];

    if (embeddings.length !== textChunks.length) {
      throw new Error('Embeddings and text chunks length mismatch');
    }

    // Strip only the file's own extension; dots elsewhere in the path are kept.
    const { dir, name } = path.parse(filePath);
    const contentPath = path.join(dir, `${name}.content.json`);

    const data = {
      chunks: textChunks.map((content, index) => ({
        content,
        embedding: embeddings[index],
      })),
    };

    fs.writeFileSync(contentPath, JSON.stringify(data, null, 2));

    return contentPath;
  }

  /**
   * Stores one upload under a random name, embeds it and records it.
   * The raw file is removed again when extraction or embedding fails.
   */
  private async processFile(file: File, fileType: SupportedMimeType): Promise<FileRes> {
    const fileId = crypto.randomBytes(16).toString('hex');
    const fileExtension = getSafeExtension(file.name);
    const storedBaseName = crypto.randomBytes(16).toString('hex');
    const storedName = fileExtension ? `${storedBaseName}.${fileExtension}` : storedBaseName;
    const filePath = path.join(UploadManager.uploadsDir, storedName);

    fs.writeFileSync(filePath, Buffer.from(await file.arrayBuffer()));

    let contentPath: string;
    try {
      contentPath = await this.extractContentAndEmbed(filePath, fileType);
    } catch (err) {
      // Do not leave an unrecorded upload behind on disk.
      fs.rmSync(filePath, { force: true });
      throw err;
    }

    UploadManager.addNewRecordedFile({
      id: fileId,
      name: file.name,
      filePath,
      contentPath,
      uploadedAt: new Date().toISOString(),
    });

    return { fileName: file.name, fileExtension, fileId };
  }

  /**
   * Stores, embeds and records a batch of uploaded files concurrently.
   *
   * @param files - Uploaded files; every `type` must be a supported MIME type.
   * @returns One summary per file, in the same order as `files`.
   * @throws Error when any file has an unsupported MIME type (checked before
   *   anything is written), or when processing any file fails.
   */
  async processFiles(files: File[]): Promise<FileRes[]> {
    // Validate the whole batch first so a bad file cannot leave siblings half-processed.
    const typedFiles = files.map((file) => {
      const fileType = file.type;
      if (!isSupportedMimeType(fileType)) {
        throw new Error(`File type ${file.type} not supported`);
      }
      return { file, fileType };
    });

    return Promise.all(
      typedFiles.map(({ file, fileType }) => this.processFile(file, fileType)),
    );
  }
}

export default UploadManager;