diff --git a/sample.config.toml b/sample.config.toml
index cddb19d..be86993 100644
--- a/sample.config.toml
+++ b/sample.config.toml
@@ -24,4 +24,7 @@ MODEL_NAME = ""
 API_URL = "" # Ollama API URL - http://host.docker.internal:11434
 
 [API_ENDPOINTS]
-SEARXNG = "http://localhost:32768" # SearxNG API URL
\ No newline at end of file
+SEARXNG = "http://localhost:32768" # SearxNG API URL
+
+[API_KEYS]
+JINA = "" # Jina Reader API key - leave empty to fetch pages locally
diff --git a/src/config.ts b/src/config.ts
index ab2a5db..acf6dae 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -35,6 +35,9 @@ interface Config {
   API_ENDPOINTS: {
     SEARXNG: string;
   };
+  API_KEYS: {
+    JINA: string;
+  };
 }
 
 type RecursivePartial = {
@@ -75,6 +78,16 @@ export const getCustomOpenaiApiUrl = () =>
 export const getCustomOpenaiModelName = () =>
   loadConfig().MODELS.CUSTOM_OPENAI.MODEL_NAME;
 
+/**
+ * Returns the configured Jina Reader API key.
+ *
+ * Falls back to an empty string when the loaded config has no `[API_KEYS]`
+ * table (e.g. a config file written before that section existed), so callers
+ * can treat "no key" as falsy instead of hitting a TypeError.
+ */
+export const getJinaApiKey = (): string =>
+  loadConfig().API_KEYS?.JINA ?? '';
+
 const mergeConfigs = (current: any, update: any): any => {
   if (update === null || update === undefined) {
     return current;
diff --git a/src/utils/documents.ts b/src/utils/documents.ts
index 5cd0366..934d4a2 100644
--- a/src/utils/documents.ts
+++ b/src/utils/documents.ts
@@ -1,22 +1,43 @@
 import axios from 'axios';
 import { htmlToText } from 'html-to-text';
-import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { RecursiveCharacterTextSplitter, MarkdownTextSplitter } from 'langchain/text_splitter';
 import { Document } from '@langchain/core/documents';
 import pdfParse from 'pdf-parse';
 import logger from './logger';
+import { getJinaApiKey } from '../config';
 
+/**
+ * Loads the given links and returns their content as split documents.
+ *
+ * Links without an explicit `http://` or `https://` prefix get `https://`
+ * prepended. When a Jina API key is configured the links are fetched through
+ * the Jina Reader API; otherwise they are fetched locally.
+ */
 export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
-  const splitter = new RecursiveCharacterTextSplitter();
+  // Normalize into a new array instead of reassigning the parameter.
+  const normalizedLinks = links.map((link) =>
+    link.startsWith('http://') || link.startsWith('https://')
+      ? link
+      : `https://${link}`,
+  );
+
+  if (getJinaApiKey()) {
+    return await getDocumentsFromJinaReader({ links: normalizedLinks });
+  }
+
+  return await getDocumentsFromLocal({ links: normalizedLinks });
+};
+
+/**
+ * Fetches each link directly with axios and collects the resulting documents.
+ */
+const getDocumentsFromLocal = async ({ links }: { links: string[] }) => {
+  const splitter = new RecursiveCharacterTextSplitter();
 
   let docs: Document[] = [];
 
   await Promise.all(
     links.map(async (link) => {
-      link =
-        link.startsWith('http://') || link.startsWith('https://')
-          ? link
-          : `https://${link}`;
-
       try {
         const res = await axios.get(link, {
           responseType: 'arraybuffer',
@@ -94,6 +115,78 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
       }
     }),
   );
 
   return docs;
+};
+
+/**
+ * Fetches each link through the Jina Reader API (`https://r.jina.ai/<link>`)
+ * and splits the returned content into documents.
+ *
+ * A link that fails, or for which the API reports a code other than 200,
+ * yields a single placeholder document describing the failure instead of
+ * rejecting.
+ */
+const getDocumentsFromJinaReader = async ({ links }: { links: string[] }) => {
+  const splitter = new MarkdownTextSplitter();
+  // Resolve the key once instead of calling getJinaApiKey() for every link.
+  const apiKey = getJinaApiKey();
+  const docs: Document[] = [];
+
+  await Promise.all(
+    links.map(async (link) => {
+      try {
+        const res = await axios.get(`https://r.jina.ai/${link}`, {
+          headers: {
+            Accept: 'application/json',
+            'Content-Type': 'application/json',
+            Authorization: `Bearer ${apiKey}`,
+          },
+        });
+
+        if (res.data.code !== 200) {
+          docs.push(
+            new Document({
+              pageContent: `Failed to retrieve content from the link in the Jina reader API, code: ${res.data.code}`,
+              metadata: {
+                title: 'Failed to retrieve content',
+                url: link,
+              },
+            }),
+          );
+          return;
+        }
+
+        const data = res.data.data;
+        const splittedText = await splitter.splitText(data.content);
+        const linkDocs = splittedText.map(
+          (text) =>
+            new Document({
+              pageContent: text,
+              metadata: {
+                title: data.title,
+                url: link,
+              },
+            }),
+        );
+
+        docs.push(...linkDocs);
+      } catch (err) {
+        // `err` is not guaranteed to be an Error, so narrow before reading
+        // `.message`.
+        const message = err instanceof Error ? err.message : String(err);
+        docs.push(
+          new Document({
+            pageContent: `Failed to retrieve content from the link: ${message}`,
+            metadata: {
+              title: 'Failed to retrieve content',
+              url: link,
+            },
+          }),
+        );
+      }
+    }),
+  );
+
+  return docs;
 };