mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-12-14 15:48:15 +00:00
feat(uploads): add uploads manager
This commit is contained in:
217
src/lib/uploads/manager.ts
Normal file
217
src/lib/uploads/manager.ts
Normal file
@@ -0,0 +1,217 @@
|
||||
import path from "path";
|
||||
import BaseEmbedding from "../models/base/embedding"
|
||||
import crypto from "crypto"
|
||||
import fs from 'fs';
|
||||
import { splitText } from "../utils/splitText";
|
||||
import { PDFParse } from 'pdf-parse';
|
||||
import officeParser from 'officeparser'
|
||||
import { Chunk } from "../types";
|
||||
|
||||
/**
 * MIME types the upload pipeline accepts: PDF, Word (.docx) and plain text.
 * Declared `as const` so the literal union below can be derived from it.
 */
const supportedMimeTypes = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'text/plain',
] as const;

/** Union of the literal MIME type strings listed in `supportedMimeTypes`. */
type SupportedMimeType = (typeof supportedMimeTypes)[number];
|
||||
|
||||
/** Constructor options for `UploadManager`. */
type UploadManagerParams = {
  /** Embedding model whose `embedText` is called on the chunks of each uploaded document. */
  embeddingModel: BaseEmbedding<any>;
};
|
||||
|
||||
/** One entry of the `files` array persisted in `uploaded_files.json`. */
type RecordedFile = {
  /** Identifier handed back to the uploader and used for later lookups. */
  id: string;
  /** File name as supplied by the uploader. */
  name: string;
  /** Path of the stored copy of the uploaded file. */
  filePath: string;
  /** Path of the JSON file holding the extracted chunks and their embeddings. */
  contentPath: string;
  /** Upload time as an ISO-8601 string (`Date#toISOString`). */
  uploadedAt: string;
};
|
||||
|
||||
/** Per-file result returned by `UploadManager.processFiles`. */
type FileRes = {
  /** File name as supplied by the uploader. */
  fileName: string;
  /** Extension derived from the uploaded file name; may be an empty string. */
  fileExtension: string;
  /** Identifier under which the file was recorded. */
  fileId: string;
};
|
||||
|
||||
/**
 * Stores uploaded documents on disk, extracts their text, splits it into
 * chunks, embeds every chunk, and maintains a JSON index of all uploads.
 *
 * On-disk layout (all under `uploadsDir`):
 * - `<random>.<ext>`           the uploaded file as received
 * - `<random>.content.json`    `{ chunks: [{ content, embedding }] }`
 * - `uploaded_files.json`      `{ files: RecordedFile[] }`
 */
class UploadManager {
  private readonly embeddingModel: BaseEmbedding<any>;

  /** Directory holding uploads; resolved from the working directory at class-load time. */
  static uploadsDir = path.join(process.cwd(), 'data', 'uploads');

  /** JSON index of every recorded upload. */
  static uploadedFilesRecordPath = path.join(this.uploadsDir, 'uploaded_files.json');

  /**
   * Creates the uploads directory and an empty index file if they are missing.
   *
   * @param params - Supplies the embedding model used for all uploads handled by this instance.
   */
  constructor(params: UploadManagerParams) {
    this.embeddingModel = params.embeddingModel;

    // `recursive: true` makes this a no-op when the directory already exists.
    fs.mkdirSync(UploadManager.uploadsDir, { recursive: true });

    if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
      fs.writeFileSync(
        UploadManager.uploadedFilesRecordPath,
        JSON.stringify({ files: [] }, null, 2),
      );
    }
  }

  /** Type guard: is `mimeType` one of `supportedMimeTypes`? */
  private static isSupportedMimeType(mimeType: string): mimeType is SupportedMimeType {
    return (supportedMimeTypes as readonly string[]).includes(mimeType);
  }

  /**
   * Returns the extension of an uploader-supplied file name without the dot,
   * or `''` when there is none. Only ASCII letters and digits are accepted so
   * the value can never carry path separators or `..` segments into the
   * stored file name.
   */
  private static sanitizeExtension(originalName: string): string {
    const extension = path.extname(path.basename(originalName)).slice(1);
    return /^[A-Za-z0-9]+$/.test(extension) ? extension : '';
  }

  /** Maps `<dir>/<name>[.<ext>]` to `<dir>/<name>.content.json`. */
  private static contentPathFor(filePath: string): string {
    const { dir, name } = path.parse(filePath);
    return path.join(dir, `${name}.content.json`);
  }

  /**
   * Reads the upload index. A missing index file (no instance has been
   * constructed yet) or one without a `files` array yields an empty list.
   *
   * @throws If the index file exists but cannot be read or is not valid JSON.
   */
  private static getRecordedFiles(): RecordedFile[] {
    if (!fs.existsSync(UploadManager.uploadedFilesRecordPath)) {
      return [];
    }

    const parsed: unknown = JSON.parse(
      fs.readFileSync(UploadManager.uploadedFilesRecordPath, 'utf-8'),
    );
    const files = (parsed as { files?: unknown } | null)?.files;

    return Array.isArray(files) ? (files as RecordedFile[]) : [];
  }

  /**
   * Appends one record to the upload index.
   *
   * The read-modify-write is fully synchronous, so concurrent `processFiles`
   * calls inside one process cannot interleave between the read and the write.
   */
  private static addNewRecordedFile(fileRecord: RecordedFile): void {
    const files = UploadManager.getRecordedFiles();
    files.push(fileRecord);

    fs.writeFileSync(
      UploadManager.uploadedFilesRecordPath,
      JSON.stringify({ files }, null, 2),
    );
  }

  /**
   * Looks up a recorded upload by id.
   *
   * @returns The matching record, or `null` when no record has that id.
   */
  static getFile(fileId: string): RecordedFile | null {
    return UploadManager.getRecordedFiles().find((f) => f.id === fileId) ?? null;
  }

  /**
   * Loads the stored chunks (text plus embedding) of an uploaded file.
   *
   * Best-effort: any failure (unknown id, unreadable or malformed content
   * file) is logged and reported as an empty array instead of throwing.
   */
  static getFileChunks(fileId: string): { content: string; embedding: number[] }[] {
    try {
      const recordedFile = UploadManager.getFile(fileId);

      if (!recordedFile) {
        throw new Error(`File with ID ${fileId} not found`);
      }

      const contentData = JSON.parse(fs.readFileSync(recordedFile.contentPath, 'utf-8'));
      const chunks: unknown = contentData?.chunks;

      return Array.isArray(chunks) ? chunks : [];
    } catch (err) {
      console.log('Error getting file chunks:', err);
      return [];
    }
  }

  /**
   * Extracts the plain text of a stored upload according to its MIME type.
   *
   * @throws If the type is not handled or the underlying parser fails.
   */
  private async extractText(filePath: string, fileType: SupportedMimeType): Promise<string> {
    switch (fileType) {
      case 'text/plain':
        return fs.readFileSync(filePath, 'utf-8');

      case 'application/pdf': {
        const parser = new PDFParse({ data: fs.readFileSync(filePath) });
        try {
          const result = await parser.getText();
          return result.text;
        } finally {
          // Release the parser's resources whether or not extraction succeeded.
          await parser.destroy();
        }
      }

      case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
        return await officeParser.parseOfficeAsync(fs.readFileSync(filePath));

      default:
        throw new Error(`Unsupported file type: ${fileType}`);
    }
  }

  /**
   * Extracts the text of a stored upload, splits it, embeds every chunk and
   * writes `{ chunks: [{ content, embedding }] }` next to the upload.
   *
   * @param filePath - Path of the stored upload.
   * @param fileType - MIME type selecting the text extractor.
   * @returns Path of the written `.content.json` file.
   * @throws If extraction fails or the model returns a different number of
   *         embeddings than chunks.
   */
  private async extractContentAndEmbed(
    filePath: string,
    fileType: SupportedMimeType,
  ): Promise<string> {
    const text = await this.extractText(filePath, fileType);

    // NOTE(review): 256 / 64 are passed straight to splitText — presumably
    // chunk size and overlap; confirm against ../utils/splitText.
    const chunks = splitText(text, 256, 64);

    // An empty document has nothing to embed; skip the model call entirely.
    const embeddings = chunks.length > 0 ? await this.embeddingModel.embedText(chunks) : [];

    if (embeddings.length !== chunks.length) {
      throw new Error('Embeddings and text chunks length mismatch');
    }

    const contentPath = UploadManager.contentPathFor(filePath);
    const data = {
      chunks: chunks.map((content, i) => ({
        content,
        embedding: embeddings[i],
      })),
    };

    fs.writeFileSync(contentPath, JSON.stringify(data, null, 2));

    return contentPath;
  }

  /**
   * Stores one validated upload, indexes its content and records it.
   * The stored copy is removed again if extraction or embedding fails, so a
   * failed upload does not leave an unrecorded file behind.
   */
  private async storeAndIndex(file: File, mimeType: SupportedMimeType): Promise<FileRes> {
    const fileId = crypto.randomBytes(16).toString('hex');
    const fileExtension = UploadManager.sanitizeExtension(file.name);

    // The on-disk name is random and independent of the public file id.
    const storedBaseName = crypto.randomBytes(16).toString('hex');
    const storedName = fileExtension ? `${storedBaseName}.${fileExtension}` : storedBaseName;
    const filePath = path.join(UploadManager.uploadsDir, storedName);

    fs.writeFileSync(filePath, Buffer.from(await file.arrayBuffer()));

    let contentPath: string;
    try {
      contentPath = await this.extractContentAndEmbed(filePath, mimeType);
    } catch (err) {
      fs.rmSync(filePath, { force: true });
      throw err;
    }

    UploadManager.addNewRecordedFile({
      id: fileId,
      name: file.name,
      filePath,
      contentPath,
      uploadedAt: new Date().toISOString(),
    });

    return { fileName: file.name, fileExtension, fileId };
  }

  /**
   * Stores, indexes and records a batch of uploaded files.
   *
   * Every MIME type is validated before anything is written, so a batch
   * containing an unsupported file is rejected without side effects. Files
   * are then processed concurrently.
   *
   * @param files - Uploaded files; each `type` must be in `supportedMimeTypes`.
   * @returns One result per input file, in the same order as `files`.
   * @throws If any file has an unsupported type, or if storing, extracting or
   *         embedding any file fails.
   */
  async processFiles(files: File[]): Promise<FileRes[]> {
    const validated = files.map((file) => {
      const mimeType = file.type;

      if (!UploadManager.isSupportedMimeType(mimeType)) {
        throw new Error(`File type ${file.type} not supported`);
      }

      return { file, mimeType };
    });

    return Promise.all(
      validated.map(({ file, mimeType }) => this.storeAndIndex(file, mimeType)),
    );
  }
}

export default UploadManager;
|
||||
Reference in New Issue
Block a user