From dd2f4effcaef4a92dd1d73935918a068912c341f Mon Sep 17 00:00:00 2001 From: wellCh4n Date: Wed, 19 Feb 2025 12:32:32 +0800 Subject: [PATCH 1/2] feat(app): add jina reader --- sample.config.toml | 5 ++- src/config.ts | 6 ++++ src/utils/documents.ts | 82 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/sample.config.toml b/sample.config.toml index 7b09d67..ff96ccf 100644 --- a/sample.config.toml +++ b/sample.config.toml @@ -23,4 +23,7 @@ API_URL = "" API_URL = "" # Ollama API URL - http://host.docker.internal:11434 [API_ENDPOINTS] -SEARXNG = "http://localhost:32768" # SearxNG API URL \ No newline at end of file +SEARXNG = "http://localhost:32768" # SearxNG API URL + +[API_KEYS] +JINA = "" diff --git a/src/config.ts b/src/config.ts index ab2a5db..acf6dae 100644 --- a/src/config.ts +++ b/src/config.ts @@ -35,6 +35,9 @@ interface Config { API_ENDPOINTS: { SEARXNG: string; }; + API_KEYS: { + JINA: string; + } } type RecursivePartial = { @@ -75,6 +78,9 @@ export const getCustomOpenaiApiUrl = () => export const getCustomOpenaiModelName = () => loadConfig().MODELS.CUSTOM_OPENAI.MODEL_NAME; +export const getJinaApiKey = () => + loadConfig().API_KEYS.JINA; + const mergeConfigs = (current: any, update: any): any => { if (update === null || update === undefined) { return current; diff --git a/src/utils/documents.ts b/src/utils/documents.ts index 5cd0366..dfd9fc4 100644 --- a/src/utils/documents.ts +++ b/src/utils/documents.ts @@ -1,22 +1,30 @@ import axios from 'axios'; import { htmlToText } from 'html-to-text'; -import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; +import { RecursiveCharacterTextSplitter, MarkdownTextSplitter } from 'langchain/text_splitter'; import { Document } from '@langchain/core/documents'; import pdfParse from 'pdf-parse'; import logger from './logger'; +import { getJinaApiKey } from '../config'; export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => { - const splitter = new RecursiveCharacterTextSplitter(); + links = links.map(link => link.startsWith('http://') || link.startsWith('https://') + ? link + : `https://${link}`) + + if (getJinaApiKey()) { + return await getDocumentsFromJinaReader({ links }); + } + + return await getDocumentsFromLocal({ links }); +}; + +const getDocumentsFromLocal = async ({links}: {links: string[]}) => { + const splitter = new RecursiveCharacterTextSplitter(); let docs: Document[] = []; await Promise.all( links.map(async (link) => { - link = - link.startsWith('http://') || link.startsWith('https://') - ? link - : `https://${link}`; - try { const res = await axios.get(link, { responseType: 'arraybuffer', @@ -94,6 +102,62 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => { } }), ); - return docs; -}; +} + +const getDocumentsFromJinaReader = async ({links}: { links: string[] }) => { + const splitter = new MarkdownTextSplitter(); + let docs: Document[] = []; + + await Promise.all( + links.map(async link => { + try { + const res = await axios.get(`https://r.jina.ai/${link}`, { + headers: { + 'Accept': 'application/json', + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${getJinaApiKey()}`, + } + }); + + if(res.data.code === 200) { + const data = res.data.data + const splittedText = await splitter.splitText(data.content); + const linkDocs = splittedText.map((text) => { + return new Document({ + pageContent: text, + metadata: { + title: data.title, + url: link, + }, + }); + }); + + docs.push(...linkDocs); + return; + } else { + docs.push( + new Document({ + pageContent: `Failed to retrieve content from the link in the Jina reader API, code: ${res.data.code}`, + metadata: { + title: 'Failed to retrieve content', + url: link, + }, + }), + ); + } + } catch (err) { + docs.push( + new Document({ + pageContent: `Failed to retrieve content from the link: ${err.message}`, + metadata: { + title: 'Failed to retrieve content', + url: link, + }, + }), + ); + } + }) + ); + return docs; +} From 47c7bb688fafc61eb5c77b95ee2619096958042a Mon Sep 17 00:00:00 2001 From: wellCh4n Date: Wed, 19 Feb 2025 12:33:37 +0800 Subject: [PATCH 2/2] feat(app): add jina reader --- src/utils/documents.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/documents.ts b/src/utils/documents.ts index dfd9fc4..934d4a2 100644 --- a/src/utils/documents.ts +++ b/src/utils/documents.ts @@ -103,7 +103,7 @@ const getDocumentsFromLocal = async ({links}: {links: string[]}) => { }), ); return docs; -} +}; const getDocumentsFromJinaReader = async ({links}: { links: string[] }) => { const splitter = new MarkdownTextSplitter(); @@ -160,4 +160,4 @@ const getDocumentsFromJinaReader = async ({links}: { links: string[] }) => { }) ); return docs; -} +};