Compare commits

...

3 Commits

Author SHA1 Message Date
562a6e890d Merge 47c7bb688f into 64d2a467b0 2025-03-17 23:30:41 +05:30
47c7bb688f feat(app): add jina reader 2025-02-19 12:33:37 +08:00
dd2f4effca feat(app): add jina reader 2025-02-19 12:32:32 +08:00
3 changed files with 82 additions and 9 deletions

View File

@ -24,4 +24,7 @@ MODEL_NAME = ""
API_URL = "" # Ollama API URL - http://host.docker.internal:11434
[API_ENDPOINTS]
SEARXNG = "http://localhost:32768" # SearxNG API URL
SEARXNG = "http://localhost:32768" # SearxNG API URL
[API_KEYS]
JINA = ""

View File

@ -35,6 +35,9 @@ interface Config {
API_ENDPOINTS: {
SEARXNG: string;
};
API_KEYS: {
JINA: string;
}
}
type RecursivePartial<T> = {
@ -75,6 +78,9 @@ export const getCustomOpenaiApiUrl = () =>
/**
 * Returns the `MODEL_NAME` value stored under `MODELS.CUSTOM_OPENAI` in the
 * configuration object produced by `loadConfig()`.
 */
export const getCustomOpenaiModelName = () => {
  const { MODELS } = loadConfig();
  return MODELS.CUSTOM_OPENAI.MODEL_NAME;
};
/**
 * Returns the Jina Reader API key (`API_KEYS.JINA`) from the configuration
 * object produced by `loadConfig()`.
 *
 * @returns The configured key, or an empty string when no key is available.
 *   Callers in this change treat a falsy value as "Jina Reader disabled".
 */
export const getJinaApiKey = (): string =>
  // NOTE(review): presumably config files written before the [API_KEYS] table
  // was introduced lack that table entirely; guard so that case reads as
  // "no key" instead of throwing a TypeError on `undefined.JINA`.
  loadConfig().API_KEYS?.JINA ?? '';
const mergeConfigs = (current: any, update: any): any => {
if (update === null || update === undefined) {
return current;

View File

@ -1,22 +1,30 @@
import axios from 'axios';
import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { RecursiveCharacterTextSplitter, MarkdownTextSplitter } from 'langchain/text_splitter';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
import logger from './logger';
import { getJinaApiKey } from '../config';
/**
 * Fetches the given links and returns their content as documents.
 *
 * Links that do not already start with `http://` or `https://` are prefixed
 * with `https://`. When `getJinaApiKey()` returns a truthy value the links are
 * handed to `getDocumentsFromJinaReader`; otherwise they are handed to
 * `getDocumentsFromLocal`.
 *
 * @param links - URLs (with or without a scheme) to retrieve.
 * @returns Whatever the selected fetcher resolves to for the normalized links.
 */
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  // Work on a new array instead of reassigning the destructured parameter.
  const normalizedLinks = links.map((link) =>
    link.startsWith('http://') || link.startsWith('https://')
      ? link
      : `https://${link}`,
  );

  if (getJinaApiKey()) {
    return await getDocumentsFromJinaReader({ links: normalizedLinks });
  }

  return await getDocumentsFromLocal({ links: normalizedLinks });
};
const getDocumentsFromLocal = async ({links}: {links: string[]}) => {
const splitter = new RecursiveCharacterTextSplitter();
let docs: Document[] = [];
await Promise.all(
links.map(async (link) => {
link =
link.startsWith('http://') || link.startsWith('https://')
? link
: `https://${link}`;
try {
const res = await axios.get(link, {
responseType: 'arraybuffer',
@ -94,6 +102,62 @@ export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
}
}),
);
return docs;
};
/**
 * Retrieves each link through the Jina Reader endpoint (`https://r.jina.ai/<link>`)
 * and converts the returned content into documents.
 *
 * For a response whose body has `code === 200`, `data.content` is split with a
 * `MarkdownTextSplitter` and every chunk becomes a `Document` carrying
 * `data.title` and the link as metadata. Any other `code`, or any error thrown
 * while fetching or splitting, produces a single placeholder `Document`
 * describing the failure, so one bad link never rejects the whole batch.
 *
 * Results are returned grouped in the same order as `links`, independent of
 * which request finishes first.
 *
 * @param links - Fully-qualified URLs to fetch through the reader.
 * @returns The documents for all links, in input order.
 */
const getDocumentsFromJinaReader = async ({
  links,
}: {
  links: string[];
}): Promise<Document[]> => {
  const splitter = new MarkdownTextSplitter();
  // Read the key once rather than once per link.
  const apiKey = getJinaApiKey();

  // Builds the placeholder document used for every failure path.
  const buildFailureDoc = (url: string, pageContent: string) =>
    new Document({
      pageContent,
      metadata: {
        title: 'Failed to retrieve content',
        url,
      },
    });

  const docsPerLink = await Promise.all(
    links.map(async (link): Promise<Document[]> => {
      try {
        const res = await axios.get(`https://r.jina.ai/${link}`, {
          headers: {
            Accept: 'application/json',
            'Content-Type': 'application/json',
            Authorization: `Bearer ${apiKey}`,
          },
        });

        if (res.data.code !== 200) {
          return [
            buildFailureDoc(
              link,
              `Failed to retrieve content from the link in the Jina reader API, code: ${res.data.code}`,
            ),
          ];
        }

        const data = res.data.data;
        const chunks = await splitter.splitText(data.content);
        return chunks.map(
          (text) =>
            new Document({
              pageContent: text,
              metadata: {
                title: data.title,
                url: link,
              },
            }),
        );
      } catch (err) {
        // The caught value is not guaranteed to be an Error; narrow before
        // reading `.message` so the placeholder never says "undefined".
        const reason = err instanceof Error ? err.message : String(err);
        return [
          buildFailureDoc(
            link,
            `Failed to retrieve content from the link: ${reason}`,
          ),
        ];
      }
    }),
  );

  // Flatten per-link groups while preserving the order of `links`.
  const docs: Document[] = [];
  for (const group of docsPerLink) {
    docs.push(...group);
  }
  return docs;
};