diff --git a/.gitignore b/.gitignore
index af50413..240381c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,3 +38,6 @@ Thumbs.db
 # Db
 db.sqlite
 /searxng
+
+# Dev
+docker-compose-dev.yaml
diff --git a/src/chains/imageSearchAgent.ts b/src/chains/imageSearchAgent.ts
index c37b9ca..4c73876 100644
--- a/src/chains/imageSearchAgent.ts
+++ b/src/chains/imageSearchAgent.ts
@@ -8,6 +8,8 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import { BaseMessage } from '@langchain/core/messages';
 import { StringOutputParser } from '@langchain/core/output_parsers';
 import { searchSearxng } from '../lib/searchEngines/searxng';
+import { searchGooglePSE } from '../lib/searchEngines/google_pse';
+import { getSearchEngineBackend } from '../config';
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 
 const imageSearchChainPrompt = `
@@ -36,6 +38,65 @@ type ImageSearchChainInput = {
   query: string;
 };
 
+/**
+ * Runs an image search on the configured backend and normalises the hits
+ * to `{ img_src, url, title }` objects (Google hits also carry `source` and
+ * `fileFormat`).
+ *
+ * @throws Error if the configured backend is neither 'google' nor 'searxng'.
+ */
+async function performImageSearch(query: string) {
+  const searchEngine = getSearchEngineBackend();
+  const images = [];
+
+  switch (searchEngine) {
+    case 'google': {
+      const googleResult = await searchGooglePSE(query);
+      // Push straight into `images`, exactly like the searxng branch below.
+      googleResult.originalres.forEach((result) => {
+        // Extract image URL from multiple possible locations in Google's response
+        const imageSrc = result.pagemap?.cse_image?.[0]?.src ||
+          result.pagemap?.cse_thumbnail?.[0]?.src ||
+          result.image?.thumbnailLink;
+
+        if (imageSrc && result.link && result.title) {
+          images.push({
+            img_src: imageSrc,
+            url: result.link,
+            title: result.title,
+            // Add additional metadata if needed
+            source: result.displayLink,
+            fileFormat: result.fileFormat,
+          });
+        }
+      });
+      break;
+    }
+
+    case 'searxng': {
+      const searxResult = await searchSearxng(query, {
+        engines: ['google images', 'bing images'],
+        pageno: 1,
+      });
+      searxResult.results.forEach((result) => {
+        if (result.img_src && result.url && result.title) {
+          images.push({
+            img_src: result.img_src,
+            url: result.url,
+            title: result.title,
+          });
+        }
+      });
+      break;
+    }
+
+    default:
+      throw new Error(`Unknown search engine ${searchEngine}`);
+  }
+
+  return images;
+}
+
 const strParser = new StringOutputParser();
 
 const createImageSearchChain = (llm: BaseChatModel) => {
@@ -52,22 +113,7 @@ const createImageSearchChain = (llm: BaseChatModel) => {
     llm,
     strParser,
     RunnableLambda.from(async (input: string) => {
-      const res = await searchSearxng(input, {
-        engines: ['bing images', 'google images'],
-      });
-
-      const images = [];
-
-      res.results.forEach((result) => {
-        if (result.img_src && result.url && result.title) {
-          images.push({
-            img_src: result.img_src,
-            url: result.url,
-            title: result.title,
-          });
-        }
-      });
-
+      const images = await performImageSearch(input);
       return images.slice(0, 10);
     }),
   ]);
diff --git a/src/chains/videoSearchAgent.ts b/src/chains/videoSearchAgent.ts
index b6b9756..df352fd 100644
--- a/src/chains/videoSearchAgent.ts
+++ b/src/chains/videoSearchAgent.ts
@@ -8,6 +8,8 @@ import formatChatHistoryAsString from '../utils/formatHistory';
 import { BaseMessage } from '@langchain/core/messages';
 import { StringOutputParser } from '@langchain/core/output_parsers';
 import { searchSearxng } from '../lib/searchEngines/searxng';
+import { searchGooglePSE } from '../lib/searchEngines/google_pse';
+import { getSearchEngineBackend } from '../config';
 import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
 
 const VideoSearchChainPrompt = `
@@ -38,27 +40,57 @@ type VideoSearchChainInput = {
 
 const strParser = new StringOutputParser();
 
-const createVideoSearchChain = (llm: BaseChatModel) => {
-  return RunnableSequence.from([
-    RunnableMap.from({
-      chat_history: (input: VideoSearchChainInput) => {
-        return formatChatHistoryAsString(input.chat_history);
-      },
-      query: (input: VideoSearchChainInput) => {
-        return input.query;
-      },
-    }),
-    PromptTemplate.fromTemplate(VideoSearchChainPrompt),
-    llm,
-    strParser,
-    RunnableLambda.from(async (input: string) => {
-      const res = await searchSearxng(input, {
+/**
+ * Extracts the 11-character YouTube video ID from a watch, embed or
+ * youtu.be URL; returns null when the URL contains none.
+ */
+function extractYouTubeVideoId(url: string): string | null {
+  const regex = /(?:v=|\/embed\/|\.be\/)([a-zA-Z0-9_-]{11})/;
+  const match = url.match(regex);
+  return match ? match[1] : null;
+}
+
+/**
+ * Runs a YouTube video search on the configured backend and returns
+ * `{ img_src, url, title, iframe_src }` objects.
+ *
+ * @throws Error if the configured backend is neither 'google' nor 'searxng'.
+ */
+async function performVideoSearch(query: string) {
+  const searchEngine = getSearchEngineBackend();
+  const youtubeQuery = `${query} site:youtube.com`;
+  const videos = [];
+
+  switch (searchEngine) {
+    case 'google': {
+      const googleResult = await searchGooglePSE(youtubeQuery);
+      // `originalres` is already the array of raw PSE items.
+      googleResult.originalres.forEach((result) => {
+        // Extract video metadata from Google PSE results
+        const thumbnail = result.pagemap?.cse_thumbnail?.[0]?.src
+          || result.pagemap?.videoobject?.[0]?.thumbnailurl;
+        // Build the embeddable player URL from the YouTube video ID
+        const videoId = result.link ? extractYouTubeVideoId(result.link) : null;
+
+        if (thumbnail && result.link && result.title) {
+          videos.push({
+            img_src: thumbnail,
+            url: result.link,
+            title: result.title,
+            iframe_src: videoId
+              ? `https://www.youtube.com/embed/${videoId}`
+              : null,
+          });
+        }
+      });
+      break;
+    }
+
+    case 'searxng': {
+      const searxResult = await searchSearxng(query, {
         engines: ['youtube'],
       });
-
-      const videos = [];
-
-      res.results.forEach((result) => {
+      searxResult.results.forEach((result) => {
         if (
           result.thumbnail &&
           result.url &&
@@ -73,7 +105,31 @@ const createVideoSearchChain = (llm: BaseChatModel) => {
           });
         }
       });
+      break;
+    }
 
+    default:
+      throw new Error(`Unknown search engine ${searchEngine}`);
+  }
+
+  return videos;
+}
+
+const createVideoSearchChain = (llm: BaseChatModel) => {
+  return RunnableSequence.from([
+    RunnableMap.from({
+      chat_history: (input: VideoSearchChainInput) => {
+        return formatChatHistoryAsString(input.chat_history);
+      },
+      query: (input: VideoSearchChainInput) => {
+        return input.query;
+      },
+    }),
+    PromptTemplate.fromTemplate(VideoSearchChainPrompt),
+    llm,
+    strParser,
+    RunnableLambda.from(async (input: string) => {
+      const videos = await performVideoSearch(input);
       return videos.slice(0, 10);
     }),
   ]);
diff --git a/src/lib/searchEngines/google_pse.ts b/src/lib/searchEngines/google_pse.ts
new file mode 100644
index 0000000..5232b0c
--- /dev/null
+++ b/src/lib/searchEngines/google_pse.ts
@@ -0,0 +1,84 @@
+import axios from 'axios';
+import { getGoogleApiKey, getGoogleCseId } from '../../config';
+
+interface GooglePSESearchResult {
+  kind: string;
+  title: string;
+  htmlTitle: string;
+  link: string;
+  displayLink: string;
+  snippet?: string;
+  htmlSnippet?: string;
+  cacheId?: string;
+  formattedUrl: string;
+  htmlFormattedUrl: string;
+  pagemap?: {
+    videoobject?: Array<{ thumbnailurl?: string }>;
+    cse_thumbnail?: Array<{
+      src: string;
+      width: string;
+      height: string;
+    }>;
+    metatags?: Array<{
+      [key: string]: string;
+      'author'?: string;
+    }>;
+    cse_image?: Array<{
+      src: string;
+    }>;
+  };
+  fileFormat?: string;
+  image?: {
+    contextLink: string;
+    thumbnailLink: string;
+  };
+  mime?: string;
+  labels?: Array<{
+    name: string;
+    displayName: string;
+  }>;
+}
+
+/**
+ * Queries the Google Programmable Search Engine (Custom Search JSON API).
+ *
+ * @returns `results` (items normalised to `{ title, url, content, img_src }`)
+ *   and `originalres`, the raw `items` array (empty when nothing matched).
+ * @throws Error prefixed with "Google PSE Error:" on any request or API failure.
+ */
+export const searchGooglePSE = async (query: string) => {
+  try {
+    const [googleApiKey, googleCseID] = await Promise.all([
+      getGoogleApiKey(),
+      getGoogleCseId()
+    ]);
+
+    const url = new URL(`https://www.googleapis.com/customsearch/v1`);
+    url.searchParams.append('q', query);
+    url.searchParams.append('cx', googleCseID);
+    url.searchParams.append('key', googleApiKey);
+
+    const res = await axios.get(url.toString());
+
+    if (res.data.error) {
+      throw new Error(res.data.error.message);
+    }
+
+    // The API omits `items` entirely when a query has no hits.
+    const originalres = res.data.items ?? [];
+
+    const results = originalres.map((item: GooglePSESearchResult) => ({
+      title: item.title,
+      url: item.link,
+      content: item.snippet,
+      img_src: item.pagemap?.cse_image?.[0]?.src,
+    }));
+
+    return { results, originalres };
+  } catch (error) {
+    // Error's second argument is an options bag, not a message fragment, so
+    // the detail has to be folded into the message string itself.
+    const detail = error.response?.data?.error?.message ?? error.message;
+    throw new Error(`Google PSE Error: ${detail}`);
+  }
+};
diff --git a/src/routes/discover.ts b/src/routes/discover.ts
index 675f9a1..80022fc 100644
--- a/src/routes/discover.ts
+++ b/src/routes/discover.ts
@@ -1,42 +1,88 @@
 import express from 'express';
 import { searchSearxng } from '../lib/searchEngines/searxng';
+import { searchGooglePSE } from '../lib/searchEngines/google_pse';
+import { getSearchEngineBackend } from '../config';
 import logger from '../utils/logger';
 
 const router = express.Router();
 
+/**
+ * Runs one discover query on the given backend. Google PSE items are mapped
+ * to the fields the route filters on (`title`, `url`, `content`) plus
+ * thumbnail/author/date metadata; SearXNG results are returned unchanged.
+ *
+ * @throws Error if `searchEngine` is neither 'google' nor 'searxng'.
+ */
+async function performSearch(query: string, site: string, searchEngine: string) {
+  switch (searchEngine) {
+    case 'google': {
+      const googleResult = await searchGooglePSE(query);
+
+      return googleResult.originalres.map(item => {
+        const imageSources = [
+          item.pagemap?.cse_image?.[0]?.src,
+          item.pagemap?.cse_thumbnail?.[0]?.src,
+          item.pagemap?.metatags?.[0]?.['og:image'],
+          item.pagemap?.metatags?.[0]?.['twitter:image'],
+          item.pagemap?.metatags?.[0]?.['image'],
+        ].filter(Boolean); // Remove undefined values
+
+        return {
+          title: item.title,
+          url: item.link,
+          content: item.snippet,
+          thumbnail: imageSources[0], // First available image
+          img_src: imageSources[0], // Same as thumbnail for consistency
+          iframe_src: null,
+          author: item.pagemap?.metatags?.[0]?.['og:site_name'] || site,
+          publishedDate: item.pagemap?.metatags?.[0]?.['article:published_time']
+        };
+      });
+    }
+
+    case 'searxng': {
+      const searxResult = await searchSearxng(query, {
+        engines: ['bing news'],
+        pageno: 1,
+      });
+      return searxResult.results;
+    }
+
+    default:
+      throw new Error(`Unknown search engine ${searchEngine}`);
+  }
+}
+
 router.get('/', async (req, res) => {
   try {
+    // Resolve the backend per request rather than once at module load.
+    const searchEngine = getSearchEngineBackend();
+
+    const queries = [
+      { site: 'businessinsider.com', topic: 'AI' },
+      { site: 'www.exchangewire.com', topic: 'AI' },
+      { site: 'yahoo.com', topic: 'AI' },
+      { site: 'businessinsider.com', topic: 'tech' },
+      { site: 'www.exchangewire.com', topic: 'tech' },
+      { site: 'yahoo.com', topic: 'tech' },
+    ];
+
     const data = (
-      await Promise.all([
-        searchSearxng('site:businessinsider.com AI', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-        searchSearxng('site:www.exchangewire.com AI', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-        searchSearxng('site:yahoo.com AI', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-        searchSearxng('site:businessinsider.com tech', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-        searchSearxng('site:www.exchangewire.com tech', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-        searchSearxng('site:yahoo.com tech', {
-          engines: ['bing news'],
-          pageno: 1,
-        }),
-      ])
+      await Promise.all(
+        queries.map(async ({ site, topic }) => {
+          try {
+            const query = `site:${site} ${topic}`;
+            return await performSearch(query, site, searchEngine);
+          } catch (error) {
+            logger.error(`Error searching ${site}: ${error.message}`);
+            return [];
+          }
+        })
+      )
     )
-      .map((result) => result.results)
       .flat()
-      .sort(() => Math.random() - 0.5);
+      .sort(() => Math.random() - 0.5)
+      .filter(item => item.title && item.url && item.content);
 
     return res.json({ blogs: data });
   } catch (err: any) {
diff --git a/src/search/metaSearchAgent.ts b/src/search/metaSearchAgent.ts
index 2cac9b6..1db9af6 100644
--- a/src/search/metaSearchAgent.ts
+++ b/src/search/metaSearchAgent.ts
@@ -18,6 +18,8 @@ import LineOutputParser from '../lib/outputParsers/lineOutputParser';
 import { getDocumentsFromLinks } from '../utils/documents';
 import { Document } from 'langchain/document';
 import { searchSearxng } from '../lib/searchEngines/searxng';
+import { searchGooglePSE } from '../lib/searchEngines/google_pse';
+import { getSearchEngineBackend } from '../config';
 import path from 'path';
 import fs from 'fs';
 import computeSimilarity from '../utils/computeSimilarity';
@@ -203,10 +205,27 @@ class MetaSearchAgent implements MetaSearchAgentType {
 
           return { query: question, docs: docs };
         } else {
-          const res = await searchSearxng(question, {
-            language: 'en',
-            engines: this.config.activeEngines,
-          });
+
+          const searchEngine = getSearchEngineBackend();
+
+          let res;
+          switch (searchEngine) {
+            case 'searxng':
+              res = await searchSearxng(question, {
+                language: 'en',
+                engines: this.config.activeEngines,
+              });
+              break;
+            case 'google':
+              res = await searchGooglePSE(question);
+              break;
+            default:
+              throw new Error(`Unknown search engine ${searchEngine}`);
+          }
+
+          if (!res?.results) {
+            throw new Error(`No results found for search engine: ${searchEngine}`);
+          }
 
           const documents = res.results.map(
             (result) =>