From 49fafaa096e2d94c7f0dfcb453c02ad36ac87237 Mon Sep 17 00:00:00 2001 From: ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com> Date: Sat, 19 Jul 2025 16:10:04 +0530 Subject: [PATCH] feat(metaSearchAgent): implement structured outputs --- src/lib/prompts/webSearch.ts | 70 ++++------ src/lib/search/metaSearchAgent.ts | 204 ++++++++++++++++-------------- 2 files changed, 133 insertions(+), 141 deletions(-) diff --git a/src/lib/prompts/webSearch.ts b/src/lib/prompts/webSearch.ts index 1a431ea..5952562 100644 --- a/src/lib/prompts/webSearch.ts +++ b/src/lib/prompts/webSearch.ts @@ -1,63 +1,41 @@ export const webSearchRetrieverPrompt = ` -You are an AI question rephraser. You will be given a conversation and a follow-up question, you will have to rephrase the follow up question so it is a standalone question and can be used by another LLM to search the web for information to answer it. -If it is a simple writing task or a greeting (unless the greeting contains a question after it) like Hi, Hello, How are you, etc. than a question then you need to return \`not_needed\` as the response (This is because the LLM won't need to search the web for finding information on this topic). -If the user asks some question from some URL or wants you to summarize a PDF or a webpage (via URL) you need to return the links inside the \`links\` XML block and the question inside the \`question\` XML block. If the user wants to you to summarize the webpage or the PDF you need to return \`summarize\` inside the \`question\` XML block in place of a question and the link to summarize in the \`links\` XML block. -You must always return the rephrased question inside the \`question\` XML block, if there are no links in the follow-up question then don't insert a \`links\` XML block in your response. +You are an AI question rephraser. You will be given a conversation and a follow-up question; rephrase it into a standalone question that another LLM can use to search the web. -There are several examples attached for your reference inside the below \`examples\` XML block +Return ONLY a JSON object that matches this schema: +query: string // the standalone question (or "summarize") +links: string[] // URLs extracted from the user query (empty if none) +searchRequired: boolean // true if web search is needed, false for greetings/simple writing tasks +searchMode: "" | "normal" | "news" // "" when searchRequired is false; "news" if the user asks for news/articles, otherwise "normal" - -1. Follow up question: What is the capital of France -Rephrased question:\` - -Capital of france - -\` +Rules +- Greetings / simple writing tasks → query:"", links:[], searchRequired:false, searchMode:"" +- Summarizing a URL → query:"summarize", links:[url...], searchRequired:true, searchMode:"normal" +- Asking for news/articles → searchMode:"news" + +Examples +1. Follow-up: What is the capital of France? +"query":"capital of France","links":[],"searchRequired":true,"searchMode":"normal" 2. Hi, how are you? -Rephrased question\` - -not_needed - -\` +"query":"","links":[],"searchRequired":false,"searchMode":"" -3. Follow up question: What is Docker? -Rephrased question: \` - -What is Docker - -\` +3. Follow-up: What is Docker? +"query":"what is Docker","links":[],"searchRequired":true,"searchMode":"normal" -4. Follow up question: Can you tell me what is X from https://example.com -Rephrased question: \` - -Can you tell me what is X? - +4. Follow-up: Can you tell me what is X from https://example.com? +"query":"what is X","links":["https://example.com"],"searchRequired":true,"searchMode":"normal" - -https://example.com - -\` +5. Follow-up: Summarize the content from https://example.com +"query":"summarize","links":["https://example.com"],"searchRequired":true,"searchMode":"normal" -5. Follow up question: Summarize the content from https://example.com -Rephrased question: \` - -summarize - - - -https://example.com - -\` - - -Anything below is the part of the actual conversation and you need to use conversation and the follow-up question to rephrase the follow-up question as a standalone question based on the guidelines shared above. +6. Follow-up: Latest news about AI +"query":"latest news about AI","links":[],"searchRequired":true,"searchMode":"news" {chat_history} -Follow up question: {query} +Follow-up question: {query} Rephrased question: `; diff --git a/src/lib/search/metaSearchAgent.ts b/src/lib/search/metaSearchAgent.ts index 67b7c58..c3bf389 100644 --- a/src/lib/search/metaSearchAgent.ts +++ b/src/lib/search/metaSearchAgent.ts @@ -24,6 +24,7 @@ import computeSimilarity from '../utils/computeSimilarity'; import formatChatHistoryAsString from '../utils/formatHistory'; import eventEmitter from 'events'; import { StreamEvent } from '@langchain/core/tracers/log_stream'; +import { z } from 'zod'; export interface MetaSearchAgentType { searchAndAnswer: ( @@ -52,6 +53,17 @@ type BasicChainInput = { query: string; }; +const retrieverLLMOutputSchema = z.object({ + query: z.string().describe('The query to search the web for.'), + links: z + .array(z.string()) + .describe('The links to search/summarize if present'), + searchRequired: z + .boolean() + .describe('Wether there is a need to search the web'), + searchMode: z.enum(['', 'normal', 'news']).describe('The search mode.'), +}); + class MetaSearchAgent implements MetaSearchAgentType { private config: Config; private strParser = new StringOutputParser(); @@ -62,73 +74,71 @@ class MetaSearchAgent implements MetaSearchAgentType { private async createSearchRetrieverChain(llm: BaseChatModel) { (llm as unknown as ChatOpenAI).temperature = 0; - return RunnableSequence.from([ PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt), - llm, - this.strParser, - RunnableLambda.from(async (input: string) => { - const linksOutputParser = new LineListOutputParser({ - key: 'links', - }); + Object.assign( + Object.create(Object.getPrototypeOf(llm)), + llm, + ).withStructuredOutput(retrieverLLMOutputSchema, { + ...(llm.metadata?.['model-type'] === 'groq' + ? { + method: 'json-object', + } + : {}), + }), + RunnableLambda.from( + async (input: z.infer) => { + let question = input.query; + const links = input.links; - const questionOutputParser = new LineOutputParser({ - key: 'question', - }); - - const links = await linksOutputParser.parse(input); - let question = this.config.summarizer - ? await questionOutputParser.parse(input) - : input; - - if (question === 'not_needed') { - return { query: '', docs: [] }; - } - - if (links.length > 0) { - if (question.length === 0) { - question = 'summarize'; + if (!input.searchRequired) { + return { query: '', docs: [] }; } - let docs: Document[] = []; - - const linkDocs = await getDocumentsFromLinks({ links }); - - const docGroups: Document[] = []; - - linkDocs.map((doc) => { - const URLDocExists = docGroups.find( - (d) => - d.metadata.url === doc.metadata.url && - d.metadata.totalDocs < 10, - ); - - if (!URLDocExists) { - docGroups.push({ - ...doc, - metadata: { - ...doc.metadata, - totalDocs: 1, - }, - }); + if (links.length > 0) { + if (question.length === 0) { + question = 'summarize'; } - const docIndex = docGroups.findIndex( - (d) => - d.metadata.url === doc.metadata.url && - d.metadata.totalDocs < 10, - ); + let docs: Document[] = []; - if (docIndex !== -1) { - docGroups[docIndex].pageContent = - docGroups[docIndex].pageContent + `\n\n` + doc.pageContent; - docGroups[docIndex].metadata.totalDocs += 1; - } - }); + const linkDocs = await getDocumentsFromLinks({ links }); - await Promise.all( - docGroups.map(async (doc) => { - const res = await llm.invoke(` + const docGroups: Document[] = []; + + linkDocs.map((doc) => { + const URLDocExists = docGroups.find( + (d) => + d.metadata.url === doc.metadata.url && + d.metadata.totalDocs < 10, + ); + + if (!URLDocExists) { + docGroups.push({ + ...doc, + metadata: { + ...doc.metadata, + totalDocs: 1, + }, + }); + } + + const docIndex = docGroups.findIndex( + (d) => + d.metadata.url === doc.metadata.url && + d.metadata.totalDocs < 10, + ); + + if (docIndex !== -1) { + docGroups[docIndex].pageContent = + docGroups[docIndex].pageContent + `\n\n` + doc.pageContent; + docGroups[docIndex].metadata.totalDocs += 1; + } + }); + + await Promise.all( + docGroups.map(async (doc) => { + const res = await llm.invoke(` You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query. If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary. @@ -189,46 +199,50 @@ class MetaSearchAgent implements MetaSearchAgentType { Make sure to answer the query in the summary. `); - const document = new Document({ - pageContent: res.content as string, - metadata: { - title: doc.metadata.title, - url: doc.metadata.url, - }, - }); + const document = new Document({ + pageContent: res.content as string, + metadata: { + title: doc.metadata.title, + url: doc.metadata.url, + }, + }); - docs.push(document); - }), - ); - - return { query: question, docs: docs }; - } else { - question = question.replace(/.*?<\/think>/g, ''); - - const res = await searchSearxng(question, { - language: 'en', - engines: this.config.activeEngines, - }); - - const documents = res.results.map( - (result) => - new Document({ - pageContent: - result.content || - (this.config.activeEngines.includes('youtube') - ? result.title - : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */, - metadata: { - title: result.title, - url: result.url, - ...(result.img_src && { img_src: result.img_src }), - }, + docs.push(document); }), - ); + ); - return { query: question, docs: documents }; - } - }), + return { query: question, docs: docs }; + } else { + question = question.replace(/.*?<\/think>/g, ''); + + const res = await searchSearxng(question, { + language: 'en', + engines: + input.searchMode === 'normal' + ? this.config.activeEngines + : ['bing news'], + }); + + const documents = res.results.map( + (result) => + new Document({ + pageContent: + result.content || + (this.config.activeEngines.includes('youtube') + ? result.title + : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */, + metadata: { + title: result.title, + url: result.url, + ...(result.img_src && { img_src: result.img_src }), + }, + }), + ); + + return { query: question, docs: documents }; + } + }, + ), ]); }