Added multiple search backends

Add support for multiple search engines/google as a search engine
This commit is contained in:
Lars Erhardt
2025-02-28 09:33:50 +01:00
committed by GitHub
11 changed files with 1338 additions and 1463 deletions

3
.gitignore vendored
View File

@ -38,3 +38,6 @@ Thumbs.db
# Db
db.sqlite
/searxng
# Dev
docker-compose-dev.yaml

View File

@ -2,6 +2,7 @@
PORT = 3001 # Port to run the server on
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded into memory. (Instead of using -1 use "-1m")
SEARCH_ENGINE_BACKEND = "google" # "google" | "searxng" | "ddg" | "bing" | "brave"
[MODELS.OPENAI]
API_KEY = ""
@ -22,5 +23,15 @@ API_URL = ""
[MODELS.OLLAMA]
API_URL = "" # Ollama API URL - http://host.docker.internal:11434
[API_ENDPOINTS]
SEARXNG = "http://localhost:32768" # SearxNG API URL
[SEARCH_ENGINES.GOOGLE]
API_KEY = ""
CSE_ID = ""
[SEARCH_ENGINES.SEARXNG]
ENDPOINT = ""
[SEARCH_ENGINES.BING]
SUBSCRIPTION_KEY = ""
[SEARCH_ENGINES.BRAVE]
API_KEY = ""

View File

@ -7,7 +7,9 @@ import { PromptTemplate } from '@langchain/core/prompts';
import formatChatHistoryAsString from '../utils/formatHistory';
import { BaseMessage } from '@langchain/core/messages';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { searchSearxng } from '../lib/searxng';
import { searchSearxng } from '../lib/searchEngines/searxng';
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
import { getSearchEngineBackend } from '../config';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
const imageSearchChainPrompt = `
@ -36,6 +38,51 @@ type ImageSearchChainInput = {
query: string;
};
async function performImageSearch(query: string) {
const searchEngine = getSearchEngineBackend();
let images = [];
switch (searchEngine) {
case 'google': {
const googleResult = await searchGooglePSE(query);
images = googleResult.results.map((result) => {
if (result.img_src && result.url && result.title) {
return {
img_src: result.img_src,
url: result.url,
title: result.title,
source: result.displayLink
};
}
})
.filter(Boolean);
break;
}
case 'searxng': {
const searxResult = await searchSearxng(query, {
engines: ['google images', 'bing images'],
pageno: 1,
});
searxResult.results.forEach((result) => {
if (result.img_src && result.url && result.title) {
images.push({
img_src: result.img_src,
url: result.url,
title: result.title,
});
}
});
break;
}
default:
throw new Error(`Unknown search engine ${searchEngine}`);
}
return images;
}
const strParser = new StringOutputParser();
const createImageSearchChain = (llm: BaseChatModel) => {
@ -52,22 +99,7 @@ const createImageSearchChain = (llm: BaseChatModel) => {
llm,
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
engines: ['bing images', 'google images'],
});
const images = [];
res.results.forEach((result) => {
if (result.img_src && result.url && result.title) {
images.push({
img_src: result.img_src,
url: result.url,
title: result.title,
});
}
});
const images = await performImageSearch(input);
return images.slice(0, 10);
}),
]);

View File

@ -7,7 +7,9 @@ import { PromptTemplate } from '@langchain/core/prompts';
import formatChatHistoryAsString from '../utils/formatHistory';
import { BaseMessage } from '@langchain/core/messages';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { searchSearxng } from '../lib/searxng';
import { searchSearxng } from '../lib/searchEngines/searxng';
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
import { getSearchEngineBackend } from '../config';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
const VideoSearchChainPrompt = `
@ -38,27 +40,33 @@ type VideoSearchChainInput = {
const strParser = new StringOutputParser();
const createVideoSearchChain = (llm: BaseChatModel) => {
return RunnableSequence.from([
RunnableMap.from({
chat_history: (input: VideoSearchChainInput) => {
return formatChatHistoryAsString(input.chat_history);
},
query: (input: VideoSearchChainInput) => {
return input.query;
},
}),
PromptTemplate.fromTemplate(VideoSearchChainPrompt),
llm,
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
async function performVideoSearch(query: string) {
const searchEngine = getSearchEngineBackend();
const youtubeQuery = `${query} site:youtube.com`;
let videos = [];
switch (searchEngine) {
case 'google': {
const googleResult = await searchGooglePSE(youtubeQuery);
googleResult.results.forEach((result) => { // Use .results instead of .originalres
if (result.img_src && result.url && result.title) {
const videoId = new URL(result.url).searchParams.get('v');
videos.push({
img_src: result.img_src,
url: result.url,
title: result.title,
iframe_src: videoId ? `https://www.youtube.com/embed/${videoId}` : null
});
}
});
break;
}
case 'searxng': {
const searxResult = await searchSearxng(query, {
engines: ['youtube'],
});
const videos = [];
res.results.forEach((result) => {
searxResult.results.forEach((result) => {
if (
result.thumbnail &&
result.url &&
@ -73,7 +81,31 @@ const createVideoSearchChain = (llm: BaseChatModel) => {
});
}
});
break;
}
default:
throw new Error(`Unknown search engine ${searchEngine}`);
}
return videos;
}
const createVideoSearchChain = (llm: BaseChatModel) => {
return RunnableSequence.from([
RunnableMap.from({
chat_history: (input: VideoSearchChainInput) => {
return formatChatHistoryAsString(input.chat_history);
},
query: (input: VideoSearchChainInput) => {
return input.query;
},
}),
PromptTemplate.fromTemplate(VideoSearchChainPrompt),
llm,
strParser,
RunnableLambda.from(async (input: string) => {
const videos = await performVideoSearch(input);
return videos.slice(0, 10);
}),
]);
@ -87,4 +119,4 @@ const handleVideoSearch = (
return VideoSearchChain.invoke(input);
};
export default handleVideoSearch;
export default handleVideoSearch;

View File

@ -9,6 +9,7 @@ interface Config {
PORT: number;
SIMILARITY_MEASURE: string;
KEEP_ALIVE: string;
SEARCH_ENGINE_BACKEND: string;
};
MODELS: {
OPENAI: {
@ -32,8 +33,20 @@ interface Config {
MODEL_NAME: string;
};
};
API_ENDPOINTS: {
SEARXNG: string;
SEARCH_ENGINES: {
GOOGLE: {
API_KEY: string;
CSE_ID: string;
};
SEARXNG: {
ENDPOINT: string;
};
BING: {
SUBSCRIPTION_KEY: string;
};
BRAVE: {
API_KEY: string;
};
};
}
@ -61,8 +74,18 @@ export const getAnthropicApiKey = () => loadConfig().MODELS.ANTHROPIC.API_KEY;
export const getGeminiApiKey = () => loadConfig().MODELS.GEMINI.API_KEY;
export const getSearchEngineBackend = () => loadConfig().GENERAL.SEARCH_ENGINE_BACKEND;
export const getGoogleApiKey = () => loadConfig().SEARCH_ENGINES.GOOGLE.API_KEY;
export const getGoogleCseId = () => loadConfig().SEARCH_ENGINES.GOOGLE.CSE_ID;
export const getBraveApiKey = () => loadConfig().SEARCH_ENGINES.BRAVE.API_KEY;
export const getBingSubscriptionKey = () => loadConfig().SEARCH_ENGINES.BING.SUBSCRIPTION_KEY;
export const getSearxngApiEndpoint = () =>
process.env.SEARXNG_API_URL || loadConfig().API_ENDPOINTS.SEARXNG;
process.env.SEARXNG_API_URL || loadConfig().SEARCH_ENGINES.SEARXNG.ENDPOINT;
export const getOllamaApiEndpoint = () => loadConfig().MODELS.OLLAMA.API_URL;

View File

@ -0,0 +1,84 @@
import axios from 'axios';
import { getGoogleApiKey, getGoogleCseId } from '../../config';
interface GooglePSESearchResult {
kind: string;
title: string;
htmlTitle: string;
link: string;
displayLink: string;
snippet?: string;
htmlSnippet?: string;
cacheId?: string;
formattedUrl: string;
htmlFormattedUrl: string;
pagemap?: {
videoobject: any;
cse_thumbnail?: Array<{
src: string;
width: string;
height: string;
}>;
metatags?: Array<{
[key: string]: string;
'author'?: string;
}>;
cse_image?: Array<{
src: string;
}>;
};
fileFormat?: string;
image?: {
contextLink: string;
thumbnailLink: string;
};
mime?: string;
labels?: Array<{
name: string;
displayName: string;
}>;
}
export const searchGooglePSE = async (query: string) => {
try {
const [googleApiKey, googleCseID] = await Promise.all([
getGoogleApiKey(),
getGoogleCseId()
]);
const url = new URL(`https://www.googleapis.com/customsearch/v1`);
url.searchParams.append('q', query);
url.searchParams.append('cx', googleCseID);
url.searchParams.append('key', googleApiKey);
const res = await axios.get(url.toString());
if (res.data.error) {
throw new Error(`Google PSE Error: ${res.data.error.message}`);
}
const originalres = res.data.items;
const results = originalres.map((item: GooglePSESearchResult) => ({
title: item.title,
url: item.link,
content: item.snippet,
img_src: item.pagemap?.cse_image?.[0]?.src
|| item.pagemap?.cse_thumbnail?.[0]?.src
|| item.image?.thumbnailLink,
...(item.pagemap?.videoobject?.[0] && {
videoData: {
duration: item.pagemap.videoobject[0].duration,
embedUrl: item.pagemap.videoobject[0].embedurl
}
})
}));
return { results, originalres };
} catch (error) {
const errorMessage = error.response?.data
? JSON.stringify(error.response.data, null, 2)
: error.message || 'Unknown error';
throw new Error(`Google PSE Error: ${errorMessage}`);
}
};

View File

@ -1,5 +1,5 @@
import axios from 'axios';
import { getSearxngApiEndpoint } from '../config';
import { getSearxngApiEndpoint } from '../../config';
interface SearxngSearchOptions {
categories?: string[];

View File

@ -1,42 +1,80 @@
import express from 'express';
import { searchSearxng } from '../lib/searxng';
import { searchSearxng } from '../lib/searchEngines/searxng';
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
import { getSearchEngineBackend } from '../config';
import logger from '../utils/logger';
const router = express.Router();
async function performSearch(query: string, site: string) {
const searchEngine = getSearchEngineBackend();
switch (searchEngine) {
case 'google': {
const googleResult = await searchGooglePSE(query);
return googleResult.originalres.map(item => {
const imageSources = [
item.pagemap?.cse_image?.[0]?.src,
item.pagemap?.cse_thumbnail?.[0]?.src,
item.pagemap?.metatags?.[0]?.['og:image'],
item.pagemap?.metatags?.[0]?.['twitter:image'],
item.pagemap?.metatags?.[0]?.['image'],
].filter(Boolean); // Remove undefined values
return {
title: item.title,
url: item.link,
content: item.snippet,
thumbnail: imageSources[0], // First available image
img_src: imageSources[0], // Same as thumbnail for consistency
iframe_src: null,
author: item.pagemap?.metatags?.[0]?.['og:site_name'] || site,
publishedDate: item.pagemap?.metatags?.[0]?.['article:published_time']
};
});
}
case 'searxng': {
const searxResult = await searchSearxng(query, {
engines: ['bing news'],
pageno: 1,
});
return searxResult.results;
}
default:
throw new Error(`Unknown search engine ${searchEngine}`);
}
}
router.get('/', async (req, res) => {
try {
const queries = [
{ site: 'businessinsider.com', topic: 'AI' },
{ site: 'www.exchangewire.com', topic: 'AI' },
{ site: 'yahoo.com', topic: 'AI' },
{ site: 'businessinsider.com', topic: 'tech' },
{ site: 'www.exchangewire.com', topic: 'tech' },
{ site: 'yahoo.com', topic: 'tech' },
];
const data = (
await Promise.all([
searchSearxng('site:businessinsider.com AI', {
engines: ['bing news'],
pageno: 1,
}),
searchSearxng('site:www.exchangewire.com AI', {
engines: ['bing news'],
pageno: 1,
}),
searchSearxng('site:yahoo.com AI', {
engines: ['bing news'],
pageno: 1,
}),
searchSearxng('site:businessinsider.com tech', {
engines: ['bing news'],
pageno: 1,
}),
searchSearxng('site:www.exchangewire.com tech', {
engines: ['bing news'],
pageno: 1,
}),
searchSearxng('site:yahoo.com tech', {
engines: ['bing news'],
pageno: 1,
}),
])
await Promise.all(
queries.map(async ({ site, topic }) => {
try {
const query = `site:${site} ${topic}`;
return await performSearch(query, site);
} catch (error) {
logger.error(`Error searching ${site}: ${error.message}`);
return [];
}
})
)
)
.map((result) => result.results)
.flat()
.sort(() => Math.random() - 0.5);
.sort(() => Math.random() - 0.5)
.filter(item => item.title && item.url && item.content);
return res.json({ blogs: data });
} catch (err: any) {

View File

@ -17,7 +17,9 @@ import LineListOutputParser from '../lib/outputParsers/listLineOutputParser';
import LineOutputParser from '../lib/outputParsers/lineOutputParser';
import { getDocumentsFromLinks } from '../utils/documents';
import { Document } from 'langchain/document';
import { searchSearxng } from '../lib/searxng';
import { searchSearxng } from '../lib/searchEngines/searxng';
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
import { getSearchEngineBackend } from '../config';
import path from 'path';
import fs from 'fs';
import computeSimilarity from '../utils/computeSimilarity';
@ -203,10 +205,27 @@ class MetaSearchAgent implements MetaSearchAgentType {
return { query: question, docs: docs };
} else {
const res = await searchSearxng(question, {
language: 'en',
engines: this.config.activeEngines,
});
const searchEngine = getSearchEngineBackend();
let res;
switch (searchEngine) {
case 'searxng':
res = await searchSearxng(question, {
language: 'en',
engines: this.config.activeEngines,
});
break;
case 'google':
res = await searchGooglePSE(question);
break;
default:
throw new Error(`Unknown search engine ${searchEngine}`);
}
if (!res?.results) {
throw new Error(`No results found for search engine: ${searchEngine}`);
}
const documents = res.results.map(
(result) =>

File diff suppressed because it is too large Load Diff

1313
yarn.lock

File diff suppressed because it is too large Load Diff