mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-04-30 08:12:26 +00:00
101 lines
2.6 KiB
TypeScript
101 lines
2.6 KiB
TypeScript
import axios from 'axios';
|
|
import { htmlToText } from 'html-to-text';
|
|
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
|
|
import { Document } from '@langchain/core/documents';
|
|
import pdfParse from 'pdf-parse';
|
|
import logger from './logger';
|
|
|
|
/**
 * Collapses every run of whitespace (line breaks included) into a single
 * space and trims both ends.
 */
const collapseWhitespace = (text: string): string =>
  text.replace(/\s+/g, ' ').trim();

/**
 * Reports whether a `Content-Type` header value denotes a PDF.
 *
 * The media type is compared case-insensitively and any trailing parameters
 * (e.g. `application/pdf; charset=binary`) are ignored, so servers that
 * decorate the header are still recognised.
 */
const isPdfContentType = (contentType: unknown): boolean =>
  typeof contentType === 'string' &&
  /^\s*application\/pdf\s*(;|$)/i.test(contentType);

/**
 * Extracts the text of the first `<title>` element from raw HTML.
 *
 * The match is case-insensitive, tolerates attributes on the tag and titles
 * that span several lines. Returns `undefined` when no title element is
 * found; may return an empty string for an empty title. HTML entities are
 * NOT decoded.
 */
const extractHtmlTitle = (html: string): string | undefined => {
  const rawTitle = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i)?.[1];
  return rawTitle === undefined ? undefined : collapseWhitespace(rawTitle);
};

/** Wraps each text chunk in a `Document` carrying the given title and URL. */
const chunksToDocuments = (
  chunks: string[],
  title: string,
  url: string,
): Document[] =>
  chunks.map(
    (pageContent) => new Document({ pageContent, metadata: { title, url } }),
  );

/**
 * Downloads every link, extracts its text and splits it into `Document`s.
 *
 * - Links without an `http://` or `https://` prefix get `https://` prepended.
 * - Responses whose `Content-Type` is `application/pdf` are parsed with
 *   `pdf-parse` and titled `'PDF Document'`.
 * - Any other response is decoded as UTF-8, converted with `htmlToText`
 *   (anchor hrefs are dropped) and titled with the page's `<title>`, falling
 *   back to the URL.
 * - A link that fails is logged and yields a single placeholder `Document`
 *   describing the failure instead of rejecting the whole call.
 *
 * All links are fetched concurrently; the returned documents are grouped per
 * link in the same order as `links`.
 *
 * @param links URLs (or bare host/path strings) to fetch.
 * @returns The documents produced from all links.
 */
export const getDocumentsFromLinks = async ({
  links,
}: {
  links: string[];
}): Promise<Document[]> => {
  const splitter = new RecursiveCharacterTextSplitter();

  const docsPerLink = await Promise.all(
    links.map(async (link): Promise<Document[]> => {
      const url =
        link.startsWith('http://') || link.startsWith('https://')
          ? link
          : `https://${link}`;

      try {
        // Fetch raw bytes so the same response can feed either the PDF
        // parser or the UTF-8 HTML decoder.
        const res = await axios.get(url, {
          responseType: 'arraybuffer',
        });

        if (isPdfContentType(res.headers['content-type'])) {
          const pdfText = await pdfParse(res.data);
          const chunks = await splitter.splitText(
            collapseWhitespace(pdfText.text),
          );

          return chunksToDocuments(chunks, 'PDF Document', url);
        }

        // Decode once and reuse for both text extraction and title lookup.
        const html: string = res.data.toString('utf8');

        const parsedText = collapseWhitespace(
          htmlToText(html, {
            selectors: [
              {
                selector: 'a',
                options: {
                  ignoreHref: true,
                },
              },
            ],
          }),
        );

        const chunks = await splitter.splitText(parsedText);

        // `||` is intentional: an empty <title> should also fall back to the URL.
        const title = extractHtmlTitle(html) || url;

        return chunksToDocuments(chunks, title, url);
      } catch (err) {
        console.error(
          'An error occurred while getting documents from links: ',
          err,
        );

        return [
          new Document({
            pageContent: `Failed to retrieve content from the link: ${err}`,
            metadata: {
              title: 'Failed to retrieve content',
              url,
            },
          }),
        ];
      }
    }),
  );

  // Concatenate in input order so the result does not depend on which
  // request happened to finish first.
  const docs: Document[] = [];
  for (const linkDocs of docsPerLink) {
    docs.push(...linkDocs);
  }

  return docs;
};
|