mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-05-03 01:32:29 +00:00
Added multi-search-engine support (untested, WIP)
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@ -38,3 +38,6 @@ Thumbs.db
|
|||||||
# Db
|
# Db
|
||||||
db.sqlite
|
db.sqlite
|
||||||
/searxng
|
/searxng
|
||||||
|
|
||||||
|
# Dev
|
||||||
|
docker-compose-dev.yaml
|
||||||
|
@ -8,6 +8,8 @@ import formatChatHistoryAsString from '../utils/formatHistory';
|
|||||||
import { BaseMessage } from '@langchain/core/messages';
|
import { BaseMessage } from '@langchain/core/messages';
|
||||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||||
import { searchSearxng } from '../lib/searchEngines/searxng';
|
import { searchSearxng } from '../lib/searchEngines/searxng';
|
||||||
|
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
|
||||||
|
import { getSearchEngineBackend } from '../config';
|
||||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
|
||||||
const imageSearchChainPrompt = `
|
const imageSearchChainPrompt = `
|
||||||
@ -36,6 +38,59 @@ type ImageSearchChainInput = {
|
|||||||
query: string;
|
query: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Runs an image search against the backend reported by
 * `getSearchEngineBackend()` and collects the usable hits.
 *
 * A hit is kept only when it has an image source, a page URL and a title.
 *
 * @param query - Search terms forwarded unchanged to the backend.
 * @returns Array of `{ img_src, url, title }` records; Google hits additionally
 *          carry `source` (the result's `displayLink`) and `fileFormat`.
 * @throws Error when the configured backend is neither `'google'` nor `'searxng'`.
 */
async function performImageSearch(query: string) {
  type ImageHit = {
    img_src: string;
    url: string;
    title: string;
    source?: string;
    fileFormat?: string;
  };

  const searchEngine = getSearchEngineBackend();
  const images: ImageHit[] = [];

  switch (searchEngine) {
    case 'google': {
      const googleResult = await searchGooglePSE(query);

      // Previously the hits were pushed into `images` from inside a `.map()`
      // callback whose (all-undefined) return value was then filtered and
      // assigned back to `images`, discarding every pushed hit. Iterate and
      // push directly instead.
      for (const result of googleResult.originalres ?? []) {
        // The image URL can live in several places in a PSE item; take the
        // first one that is present.
        const imageSrc =
          result.pagemap?.cse_image?.[0]?.src ||
          result.pagemap?.cse_thumbnail?.[0]?.src ||
          result.image?.thumbnailLink;

        if (imageSrc && result.link && result.title) {
          images.push({
            img_src: imageSrc,
            url: result.link,
            title: result.title,
            // Extra metadata carried along with Google hits.
            source: result.displayLink,
            fileFormat: result.fileFormat,
          });
        }
      }
      break;
    }

    case 'searxng': {
      const searxResult = await searchSearxng(query, {
        engines: ['google images', 'bing images'],
        pageno: 1,
      });

      for (const result of searxResult.results) {
        if (result.img_src && result.url && result.title) {
          images.push({
            img_src: result.img_src,
            url: result.url,
            title: result.title,
          });
        }
      }
      break;
    }

    default:
      throw new Error(`Unknown search engine ${searchEngine}`);
  }

  return images;
}
|
||||||
|
|
||||||
const strParser = new StringOutputParser();
|
const strParser = new StringOutputParser();
|
||||||
|
|
||||||
const createImageSearchChain = (llm: BaseChatModel) => {
|
const createImageSearchChain = (llm: BaseChatModel) => {
|
||||||
@ -52,22 +107,7 @@ const createImageSearchChain = (llm: BaseChatModel) => {
|
|||||||
llm,
|
llm,
|
||||||
strParser,
|
strParser,
|
||||||
RunnableLambda.from(async (input: string) => {
|
RunnableLambda.from(async (input: string) => {
|
||||||
const res = await searchSearxng(input, {
|
const images = await performImageSearch(input);
|
||||||
engines: ['bing images', 'google images'],
|
|
||||||
});
|
|
||||||
|
|
||||||
const images = [];
|
|
||||||
|
|
||||||
res.results.forEach((result) => {
|
|
||||||
if (result.img_src && result.url && result.title) {
|
|
||||||
images.push({
|
|
||||||
img_src: result.img_src,
|
|
||||||
url: result.url,
|
|
||||||
title: result.title,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return images.slice(0, 10);
|
return images.slice(0, 10);
|
||||||
}),
|
}),
|
||||||
]);
|
]);
|
||||||
|
@ -8,6 +8,8 @@ import formatChatHistoryAsString from '../utils/formatHistory';
|
|||||||
import { BaseMessage } from '@langchain/core/messages';
|
import { BaseMessage } from '@langchain/core/messages';
|
||||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||||
import { searchSearxng } from '../lib/searchEngines/searxng';
|
import { searchSearxng } from '../lib/searchEngines/searxng';
|
||||||
|
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
|
||||||
|
import { getSearchEngineBackend } from '../config';
|
||||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||||
|
|
||||||
const VideoSearchChainPrompt = `
|
const VideoSearchChainPrompt = `
|
||||||
@ -38,27 +40,45 @@ type VideoSearchChainInput = {
|
|||||||
|
|
||||||
const strParser = new StringOutputParser();
|
const strParser = new StringOutputParser();
|
||||||
|
|
||||||
const createVideoSearchChain = (llm: BaseChatModel) => {
|
function extractYouTubeVideoId(url: string): string | null {
|
||||||
return RunnableSequence.from([
|
const regex = /(?:v=|\/embed\/|\.be\/)([a-zA-Z0-9_-]{11})/;
|
||||||
RunnableMap.from({
|
const match = url.match(regex);
|
||||||
chat_history: (input: VideoSearchChainInput) => {
|
return match ? match[1] : null;
|
||||||
return formatChatHistoryAsString(input.chat_history);
|
}
|
||||||
},
|
|
||||||
query: (input: VideoSearchChainInput) => {
|
async function performVideoSearch(query: string) {
|
||||||
return input.query;
|
const searchEngine = getSearchEngineBackend();
|
||||||
},
|
const youtubeQuery = `${query} site:youtube.com`;
|
||||||
}),
|
let videos = [];
|
||||||
PromptTemplate.fromTemplate(VideoSearchChainPrompt),
|
|
||||||
llm,
|
switch (searchEngine) {
|
||||||
strParser,
|
case 'google': {
|
||||||
RunnableLambda.from(async (input: string) => {
|
const googleResult = await searchGooglePSE(youtubeQuery);
|
||||||
const res = await searchSearxng(input, {
|
googleResult.originalres.results.forEach((result) => {
|
||||||
|
// Extract video metadata from Google PSE results
|
||||||
|
const thumbnail = result.pagemap?.cse_thumbnail?.[0]?.src
|
||||||
|
|| result.pagemap?.videoobject?.[0]?.thumbnailurl;
|
||||||
|
|
||||||
|
if (thumbnail && result.link && result.title) {
|
||||||
|
videos.push({
|
||||||
|
img_src: thumbnail,
|
||||||
|
url: result.link,
|
||||||
|
title: result.title,
|
||||||
|
// Construct iframe URL from YouTube video ID
|
||||||
|
iframe_src: result.link.includes('youtube.com/watch?v=')
|
||||||
|
? `https://www.youtube.com/embed/${result.link.split('v=')[1].split('&')[0]}`
|
||||||
|
: null,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 'searxng': {
|
||||||
|
const searxResult = await searchSearxng(query, {
|
||||||
engines: ['youtube'],
|
engines: ['youtube'],
|
||||||
});
|
});
|
||||||
|
searxResult.results.forEach((result) => {
|
||||||
const videos = [];
|
|
||||||
|
|
||||||
res.results.forEach((result) => {
|
|
||||||
if (
|
if (
|
||||||
result.thumbnail &&
|
result.thumbnail &&
|
||||||
result.url &&
|
result.url &&
|
||||||
@ -73,7 +93,31 @@ const createVideoSearchChain = (llm: BaseChatModel) => {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
throw new Error(`Unknown search engine ${searchEngine}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return videos;
|
||||||
|
}
|
||||||
|
|
||||||
|
const createVideoSearchChain = (llm: BaseChatModel) => {
|
||||||
|
return RunnableSequence.from([
|
||||||
|
RunnableMap.from({
|
||||||
|
chat_history: (input: VideoSearchChainInput) => {
|
||||||
|
return formatChatHistoryAsString(input.chat_history);
|
||||||
|
},
|
||||||
|
query: (input: VideoSearchChainInput) => {
|
||||||
|
return input.query;
|
||||||
|
},
|
||||||
|
}),
|
||||||
|
PromptTemplate.fromTemplate(VideoSearchChainPrompt),
|
||||||
|
llm,
|
||||||
|
strParser,
|
||||||
|
RunnableLambda.from(async (input: string) => {
|
||||||
|
const videos = await performVideoSearch(input);
|
||||||
return videos.slice(0, 10);
|
return videos.slice(0, 10);
|
||||||
}),
|
}),
|
||||||
]);
|
]);
|
||||||
|
73
src/lib/searchEngines/google_pse.ts
Normal file
73
src/lib/searchEngines/google_pse.ts
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
import axios from 'axios';
|
||||||
|
import { getGoogleApiKey, getGoogleCseId } from '../../config';
|
||||||
|
|
||||||
|
/**
 * Shape of a single entry in the `items` array of a Google Programmable
 * Search Engine response, limited to the fields this project reads.
 *
 * Everything under `pagemap` is optional: which sub-objects are present
 * depends on the structured data found on the result page, so consumers
 * must use optional chaining.
 */
interface GooglePSESearchResult {
  kind: string;
  title: string;
  htmlTitle: string;
  link: string;
  displayLink: string;
  snippet?: string;
  htmlSnippet?: string;
  cacheId?: string;
  formattedUrl: string;
  htmlFormattedUrl: string;
  pagemap?: {
    // Optional like its siblings: a `pagemap` without video data is valid.
    videoobject?: Array<{
      thumbnailurl?: string;
      [key: string]: unknown;
    }>;
    cse_thumbnail?: Array<{
      src: string;
      width: string;
      height: string;
    }>;
    // Free-form meta tags (e.g. 'og:image', 'author'). The index signature
    // admits `undefined` so that the optional `author` key is compatible
    // with it under `strictNullChecks`.
    metatags?: Array<{
      [key: string]: string | undefined;
      author?: string;
    }>;
    cse_image?: Array<{
      src: string;
    }>;
  };
  fileFormat?: string;
  image?: {
    contextLink: string;
    thumbnailLink: string;
  };
  mime?: string;
  labels?: Array<{
    name: string;
    displayName: string;
  }>;
}
|
||||||
|
|
||||||
|
/**
 * Queries the Google Programmable Search Engine (Custom Search JSON API).
 *
 * The API key and engine id are read via `getGoogleApiKey()` and
 * `getGoogleCseId()`; the request is a GET to
 * `https://www.googleapis.com/customsearch/v1` with `q`, `cx` and `key`.
 *
 * @param query - Search terms sent as the `q` parameter.
 * @returns `results`: one `{ title, url, content, img_src }` record per item;
 *          `originalres`: the raw `items` array from the response (an empty
 *          array when the response carries no `items`).
 * @throws Error whose message starts with `Google PSE Error:` when the
 *         response body reports an error or the request/config lookup fails.
 */
export const searchGooglePSE = async (query: string) => {
  try {
    const [googleApiKey, googleCseID] = await Promise.all([
      getGoogleApiKey(),
      getGoogleCseId(),
    ]);

    const url = new URL(`https://www.googleapis.com/customsearch/v1`);
    url.searchParams.append('q', query);
    url.searchParams.append('cx', googleCseID);
    url.searchParams.append('key', googleApiKey);

    const res = await axios.get(url.toString());

    if (res.data.error) {
      throw new Error(`Google PSE Error: ${res.data.error.message}`);
    }

    // A response without `items` would otherwise crash on `.map()` below;
    // treat it as "no results".
    const originalres = res.data.items ?? [];

    const results = originalres.map((item: GooglePSESearchResult) => ({
      title: item.title,
      url: item.link,
      content: item.snippet,
      img_src: item.pagemap?.cse_image?.[0]?.src,
    }));

    return { results, originalres };
  } catch (error: unknown) {
    // Errors raised above already carry the prefix; do not wrap them twice.
    if (
      error instanceof Error &&
      error.message.startsWith('Google PSE Error:')
    ) {
      throw error;
    }

    // Prefer the HTTP response body (when the failure exposes one) over the
    // generic error message.
    const responseData = (
      error as { response?: { data?: unknown } } | null | undefined
    )?.response?.data;

    let detail: string;
    if (responseData) {
      detail =
        typeof responseData === 'string'
          ? responseData
          : JSON.stringify(responseData);
    } else if (error instanceof Error) {
      detail = error.message;
    } else {
      detail = String(error);
    }

    // The detail must be part of the message string: a second argument to
    // `Error` is not appended to the message.
    throw new Error(`Google PSE Error: ${detail}`);
  }
};
|
@ -1,42 +1,81 @@
|
|||||||
import express from 'express';
|
import express from 'express';
|
||||||
import { searchSearxng } from '../lib/searchEngines/searxng';
|
import { searchSearxng } from '../lib/searchEngines/searxng';
|
||||||
|
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
|
||||||
|
import { getSearchEngineBackend } from '../config';
|
||||||
import logger from '../utils/logger';
|
import logger from '../utils/logger';
|
||||||
|
|
||||||
const router = express.Router();
|
const router = express.Router();
|
||||||
|
|
||||||
|
const searchEngine = getSearchEngineBackend();
|
||||||
|
|
||||||
|
/**
 * Executes one news-style query on the requested backend.
 *
 * For `'google'` each raw PSE item is mapped to a record with `title`, `url`,
 * `content`, `thumbnail`, `img_src`, `iframe_src`, `author` and
 * `publishedDate`; for `'searxng'` the `results` array of a `bing news`
 * search (page 1) is returned as-is.
 *
 * @param query - Full query string to send to the backend.
 * @param site - Site name used as the `author` fallback for Google items.
 * @param searchEngine - Backend identifier: `'google'` or `'searxng'`.
 * @throws Error for any other backend identifier.
 */
async function performSearch(query: string, site: string, searchEngine: string) {
  if (searchEngine === 'google') {
    const { originalres } = await searchGooglePSE(query);

    return originalres.map((entry) => {
      const pagemap = entry.pagemap;
      const meta = pagemap?.metatags?.[0];

      // Candidate image locations in priority order; the first one that is
      // set wins.
      const firstImage = [
        pagemap?.cse_image?.[0]?.src,
        pagemap?.cse_thumbnail?.[0]?.src,
        meta?.['og:image'],
        meta?.['twitter:image'],
        meta?.['image'],
      ].find(Boolean);

      return {
        title: entry.title,
        url: entry.link,
        content: entry.snippet,
        thumbnail: firstImage,
        img_src: firstImage, // mirrors `thumbnail`
        iframe_src: null,
        author: meta?.['og:site_name'] || site,
        publishedDate: meta?.['article:published_time'],
      };
    });
  }

  if (searchEngine === 'searxng') {
    const { results } = await searchSearxng(query, {
      engines: ['bing news'],
      pageno: 1,
    });
    return results;
  }

  throw new Error(`Unknown search engine ${searchEngine}`);
}
|
||||||
|
|
||||||
|
|
||||||
router.get('/', async (req, res) => {
|
router.get('/', async (req, res) => {
|
||||||
try {
|
try {
|
||||||
|
const queries = [
|
||||||
|
{ site: 'businessinsider.com', topic: 'AI' },
|
||||||
|
{ site: 'www.exchangewire.com', topic: 'AI' },
|
||||||
|
{ site: 'yahoo.com', topic: 'AI' },
|
||||||
|
{ site: 'businessinsider.com', topic: 'tech' },
|
||||||
|
{ site: 'www.exchangewire.com', topic: 'tech' },
|
||||||
|
{ site: 'yahoo.com', topic: 'tech' },
|
||||||
|
];
|
||||||
|
|
||||||
const data = (
|
const data = (
|
||||||
await Promise.all([
|
await Promise.all(
|
||||||
searchSearxng('site:businessinsider.com AI', {
|
queries.map(async ({ site, topic }) => {
|
||||||
engines: ['bing news'],
|
try {
|
||||||
pageno: 1,
|
const query = `site:${site} ${topic}`;
|
||||||
}),
|
return await performSearch(query, site, searchEngine);
|
||||||
searchSearxng('site:www.exchangewire.com AI', {
|
} catch (error) {
|
||||||
engines: ['bing news'],
|
logger.error(`Error searching ${site}: ${error.message}`);
|
||||||
pageno: 1,
|
return [];
|
||||||
}),
|
}
|
||||||
searchSearxng('site:yahoo.com AI', {
|
})
|
||||||
engines: ['bing news'],
|
)
|
||||||
pageno: 1,
|
|
||||||
}),
|
|
||||||
searchSearxng('site:businessinsider.com tech', {
|
|
||||||
engines: ['bing news'],
|
|
||||||
pageno: 1,
|
|
||||||
}),
|
|
||||||
searchSearxng('site:www.exchangewire.com tech', {
|
|
||||||
engines: ['bing news'],
|
|
||||||
pageno: 1,
|
|
||||||
}),
|
|
||||||
searchSearxng('site:yahoo.com tech', {
|
|
||||||
engines: ['bing news'],
|
|
||||||
pageno: 1,
|
|
||||||
}),
|
|
||||||
])
|
|
||||||
)
|
)
|
||||||
.map((result) => result.results)
|
|
||||||
.flat()
|
.flat()
|
||||||
.sort(() => Math.random() - 0.5);
|
.sort(() => Math.random() - 0.5)
|
||||||
|
.filter(item => item.title && item.url && item.content);
|
||||||
|
|
||||||
return res.json({ blogs: data });
|
return res.json({ blogs: data });
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
|
@ -18,6 +18,8 @@ import LineOutputParser from '../lib/outputParsers/lineOutputParser';
|
|||||||
import { getDocumentsFromLinks } from '../utils/documents';
|
import { getDocumentsFromLinks } from '../utils/documents';
|
||||||
import { Document } from 'langchain/document';
|
import { Document } from 'langchain/document';
|
||||||
import { searchSearxng } from '../lib/searchEngines/searxng';
|
import { searchSearxng } from '../lib/searchEngines/searxng';
|
||||||
|
import { searchGooglePSE } from '../lib/searchEngines/google_pse';
|
||||||
|
import { getSearchEngineBackend } from '../config';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import computeSimilarity from '../utils/computeSimilarity';
|
import computeSimilarity from '../utils/computeSimilarity';
|
||||||
@ -203,10 +205,27 @@ class MetaSearchAgent implements MetaSearchAgentType {
|
|||||||
|
|
||||||
return { query: question, docs: docs };
|
return { query: question, docs: docs };
|
||||||
} else {
|
} else {
|
||||||
const res = await searchSearxng(question, {
|
|
||||||
language: 'en',
|
const searchEngine = getSearchEngineBackend();
|
||||||
engines: this.config.activeEngines,
|
|
||||||
});
|
let res;
|
||||||
|
switch (searchEngine) {
|
||||||
|
case 'searxng':
|
||||||
|
res = await searchSearxng(question, {
|
||||||
|
language: 'en',
|
||||||
|
engines: this.config.activeEngines,
|
||||||
|
});
|
||||||
|
break;
|
||||||
|
case 'google':
|
||||||
|
res = await searchGooglePSE(question);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new Error(`Unknown search engine ${searchEngine}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!res?.results) {
|
||||||
|
throw new Error(`No results found for search engine: ${searchEngine}`);
|
||||||
|
}
|
||||||
|
|
||||||
const documents = res.results.map(
|
const documents = res.results.map(
|
||||||
(result) =>
|
(result) =>
|
||||||
|
Reference in New Issue
Block a user