From 49fafaa096e2d94c7f0dfcb453c02ad36ac87237 Mon Sep 17 00:00:00 2001
From: ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com>
Date: Sat, 19 Jul 2025 16:10:04 +0530
Subject: [PATCH] feat(metaSearchAgent): implement structured outputs
---
src/lib/prompts/webSearch.ts | 70 ++++------
src/lib/search/metaSearchAgent.ts | 204 ++++++++++++++++--------------
2 files changed, 133 insertions(+), 141 deletions(-)
diff --git a/src/lib/prompts/webSearch.ts b/src/lib/prompts/webSearch.ts
index 1a431ea..5952562 100644
--- a/src/lib/prompts/webSearch.ts
+++ b/src/lib/prompts/webSearch.ts
@@ -1,63 +1,41 @@
export const webSearchRetrieverPrompt = `
-You are an AI question rephraser. You will be given a conversation and a follow-up question, you will have to rephrase the follow up question so it is a standalone question and can be used by another LLM to search the web for information to answer it.
-If it is a simple writing task or a greeting (unless the greeting contains a question after it) like Hi, Hello, How are you, etc. than a question then you need to return \`not_needed\` as the response (This is because the LLM won't need to search the web for finding information on this topic).
-If the user asks some question from some URL or wants you to summarize a PDF or a webpage (via URL) you need to return the links inside the \`links\` XML block and the question inside the \`question\` XML block. If the user wants to you to summarize the webpage or the PDF you need to return \`summarize\` inside the \`question\` XML block in place of a question and the link to summarize in the \`links\` XML block.
-You must always return the rephrased question inside the \`question\` XML block, if there are no links in the follow-up question then don't insert a \`links\` XML block in your response.
+You are an AI question rephraser. You will be given a conversation and a follow-up question; rephrase it into a standalone question that another LLM can use to search the web.
-There are several examples attached for your reference inside the below \`examples\` XML block
+Return ONLY a JSON object that matches this schema:
+query: string // the standalone question (or "summarize")
+links: string[] // URLs extracted from the user query (empty if none)
+searchRequired: boolean // true if web search is needed, false for greetings/simple writing tasks
+searchMode: "" | "normal" | "news" // "" when searchRequired is false; "news" if the user asks for news/articles, otherwise "normal"
-
-1. Follow up question: What is the capital of France
-Rephrased question:\`
-
-Capital of france
-
-\`
+Rules
+- Greetings / simple writing tasks → query:"", links:[], searchRequired:false, searchMode:""
+- Summarizing a URL → query:"summarize", links:[url...], searchRequired:true, searchMode:"normal"
+- Asking for news/articles → searchMode:"news"
+
+Examples
+1. Follow-up: What is the capital of France?
+"query":"capital of France","links":[],"searchRequired":true,"searchMode":"normal"
2. Hi, how are you?
-Rephrased question\`
-
-not_needed
-
-\`
+"query":"","links":[],"searchRequired":false,"searchMode":""
-3. Follow up question: What is Docker?
-Rephrased question: \`
-
-What is Docker
-
-\`
+3. Follow-up: What is Docker?
+"query":"what is Docker","links":[],"searchRequired":true,"searchMode":"normal"
-4. Follow up question: Can you tell me what is X from https://example.com
-Rephrased question: \`
-
-Can you tell me what is X?
-
+4. Follow-up: Can you tell me what is X from https://example.com?
+"query":"what is X","links":["https://example.com"],"searchRequired":true,"searchMode":"normal"
-
-https://example.com
-
-\`
+5. Follow-up: Summarize the content from https://example.com
+"query":"summarize","links":["https://example.com"],"searchRequired":true,"searchMode":"normal"
-5. Follow up question: Summarize the content from https://example.com
-Rephrased question: \`
-
-summarize
-
-
-
-https://example.com
-
-\`
-
-
-Anything below is the part of the actual conversation and you need to use conversation and the follow-up question to rephrase the follow-up question as a standalone question based on the guidelines shared above.
+6. Follow-up: Latest news about AI
+"query":"latest news about AI","links":[],"searchRequired":true,"searchMode":"news"
{chat_history}
-Follow up question: {query}
+Follow-up question: {query}
Rephrased question:
`;
diff --git a/src/lib/search/metaSearchAgent.ts b/src/lib/search/metaSearchAgent.ts
index 67b7c58..c3bf389 100644
--- a/src/lib/search/metaSearchAgent.ts
+++ b/src/lib/search/metaSearchAgent.ts
@@ -24,6 +24,7 @@ import computeSimilarity from '../utils/computeSimilarity';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import { StreamEvent } from '@langchain/core/tracers/log_stream';
+import { z } from 'zod';
export interface MetaSearchAgentType {
searchAndAnswer: (
@@ -52,6 +53,17 @@ type BasicChainInput = {
query: string;
};
+const retrieverLLMOutputSchema = z.object({
+ query: z.string().describe('The query to search the web for.'),
+ links: z
+ .array(z.string())
+ .describe('The links to search/summarize if present'),
+ searchRequired: z
+ .boolean()
+ .describe('Whether there is a need to search the web'),
+ searchMode: z.enum(['', 'normal', 'news']).describe('The search mode.'),
+});
+
class MetaSearchAgent implements MetaSearchAgentType {
private config: Config;
private strParser = new StringOutputParser();
@@ -62,73 +74,71 @@ class MetaSearchAgent implements MetaSearchAgentType {
private async createSearchRetrieverChain(llm: BaseChatModel) {
(llm as unknown as ChatOpenAI).temperature = 0;
-
return RunnableSequence.from([
PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
- llm,
- this.strParser,
- RunnableLambda.from(async (input: string) => {
- const linksOutputParser = new LineListOutputParser({
- key: 'links',
- });
+ Object.assign(
+ Object.create(Object.getPrototypeOf(llm)),
+ llm,
+ ).withStructuredOutput(retrieverLLMOutputSchema, {
+ ...(llm.metadata?.['model-type'] === 'groq'
+ ? {
+ method: 'json-object',
+ }
+ : {}),
+ }),
+ RunnableLambda.from(
+ async (input: z.infer<typeof retrieverLLMOutputSchema>) => {
+ let question = input.query;
+ const links = input.links;
- const questionOutputParser = new LineOutputParser({
- key: 'question',
- });
-
- const links = await linksOutputParser.parse(input);
- let question = this.config.summarizer
- ? await questionOutputParser.parse(input)
- : input;
-
- if (question === 'not_needed') {
- return { query: '', docs: [] };
- }
-
- if (links.length > 0) {
- if (question.length === 0) {
- question = 'summarize';
+ if (!input.searchRequired) {
+ return { query: '', docs: [] };
}
- let docs: Document[] = [];
-
- const linkDocs = await getDocumentsFromLinks({ links });
-
- const docGroups: Document[] = [];
-
- linkDocs.map((doc) => {
- const URLDocExists = docGroups.find(
- (d) =>
- d.metadata.url === doc.metadata.url &&
- d.metadata.totalDocs < 10,
- );
-
- if (!URLDocExists) {
- docGroups.push({
- ...doc,
- metadata: {
- ...doc.metadata,
- totalDocs: 1,
- },
- });
+ if (links.length > 0) {
+ if (question.length === 0) {
+ question = 'summarize';
}
- const docIndex = docGroups.findIndex(
- (d) =>
- d.metadata.url === doc.metadata.url &&
- d.metadata.totalDocs < 10,
- );
+ let docs: Document[] = [];
- if (docIndex !== -1) {
- docGroups[docIndex].pageContent =
- docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
- docGroups[docIndex].metadata.totalDocs += 1;
- }
- });
+ const linkDocs = await getDocumentsFromLinks({ links });
- await Promise.all(
- docGroups.map(async (doc) => {
- const res = await llm.invoke(`
+ const docGroups: Document[] = [];
+
+ linkDocs.map((doc) => {
+ const URLDocExists = docGroups.find(
+ (d) =>
+ d.metadata.url === doc.metadata.url &&
+ d.metadata.totalDocs < 10,
+ );
+
+ if (!URLDocExists) {
+ docGroups.push({
+ ...doc,
+ metadata: {
+ ...doc.metadata,
+ totalDocs: 1,
+ },
+ });
+ }
+
+ const docIndex = docGroups.findIndex(
+ (d) =>
+ d.metadata.url === doc.metadata.url &&
+ d.metadata.totalDocs < 10,
+ );
+
+ if (docIndex !== -1) {
+ docGroups[docIndex].pageContent =
+ docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
+ docGroups[docIndex].metadata.totalDocs += 1;
+ }
+ });
+
+ await Promise.all(
+ docGroups.map(async (doc) => {
+ const res = await llm.invoke(`
You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
@@ -189,46 +199,50 @@ class MetaSearchAgent implements MetaSearchAgentType {
Make sure to answer the query in the summary.
`);
- const document = new Document({
- pageContent: res.content as string,
- metadata: {
- title: doc.metadata.title,
- url: doc.metadata.url,
- },
- });
+ const document = new Document({
+ pageContent: res.content as string,
+ metadata: {
+ title: doc.metadata.title,
+ url: doc.metadata.url,
+ },
+ });
- docs.push(document);
- }),
- );
-
- return { query: question, docs: docs };
- } else {
- question = question.replace(/<think>.*?<\/think>/g, '');
-
- const res = await searchSearxng(question, {
- language: 'en',
- engines: this.config.activeEngines,
- });
-
- const documents = res.results.map(
- (result) =>
- new Document({
- pageContent:
- result.content ||
- (this.config.activeEngines.includes('youtube')
- ? result.title
- : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
- metadata: {
- title: result.title,
- url: result.url,
- ...(result.img_src && { img_src: result.img_src }),
- },
+ docs.push(document);
}),
- );
+ );
- return { query: question, docs: documents };
- }
- }),
+ return { query: question, docs: docs };
+ } else {
+ question = question.replace(/<think>.*?<\/think>/g, '');
+
+ const res = await searchSearxng(question, {
+ language: 'en',
+ engines:
+ input.searchMode === 'normal'
+ ? this.config.activeEngines
+ : ['bing news'],
+ });
+
+ const documents = res.results.map(
+ (result) =>
+ new Document({
+ pageContent:
+ result.content ||
+ (this.config.activeEngines.includes('youtube')
+ ? result.title
+ : '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
+ metadata: {
+ title: result.title,
+ url: result.url,
+ ...(result.img_src && { img_src: result.img_src }),
+ },
+ }),
+ );
+
+ return { query: question, docs: documents };
+ }
+ },
+ ),
]);
}