feat(metaSearchAgent): implement structured outputs

This commit is contained in:
ItzCrazyKns
2025-07-19 16:10:04 +05:30
parent ca9b32a23b
commit 49fafaa096
2 changed files with 133 additions and 141 deletions

View File

@ -1,63 +1,41 @@
export const webSearchRetrieverPrompt = `
You are an AI question rephraser. You will be given a conversation and a follow-up question, you will have to rephrase the follow up question so it is a standalone question and can be used by another LLM to search the web for information to answer it.
If it is a simple writing task or a greeting (unless the greeting contains a question after it) like Hi, Hello, How are you, etc. than a question then you need to return \`not_needed\` as the response (This is because the LLM won't need to search the web for finding information on this topic).
If the user asks some question from some URL or wants you to summarize a PDF or a webpage (via URL) you need to return the links inside the \`links\` XML block and the question inside the \`question\` XML block. If the user wants to you to summarize the webpage or the PDF you need to return \`summarize\` inside the \`question\` XML block in place of a question and the link to summarize in the \`links\` XML block.
You must always return the rephrased question inside the \`question\` XML block, if there are no links in the follow-up question then don't insert a \`links\` XML block in your response.
You are an AI question rephraser. You will be given a conversation and a follow-up question; rephrase it into a standalone question that another LLM can use to search the web.
There are several examples attached for your reference inside the below \`examples\` XML block
Return ONLY a JSON object that matches this schema:
query: string // the standalone question (or "summarize")
links: string[] // URLs extracted from the user query (empty if none)
searchRequired: boolean // true if web search is needed, false for greetings/simple writing tasks
searchMode: "" | "normal" | "news" // "" when searchRequired is false; "news" if the user asks for news/articles, otherwise "normal"
<examples>
1. Follow up question: What is the capital of France
Rephrased question:\`
<question>
Capital of france
</question>
\`
Rules
- Greetings / simple writing tasks → query:"", links:[], searchRequired:false, searchMode:""
- Summarizing a URL → query:"summarize", links:[url...], searchRequired:true, searchMode:"normal"
- Asking for news/articles → searchMode:"news"
Examples
1. Follow-up: What is the capital of France?
"query":"capital of France","links":[],"searchRequired":true,"searchMode":"normal"
2. Hi, how are you?
Rephrased question\`
<question>
not_needed
</question>
\`
"query":"","links":[],"searchRequired":false,"searchMode":""
3. Follow up question: What is Docker?
Rephrased question: \`
<question>
What is Docker
</question>
\`
3. Follow-up: What is Docker?
"query":"what is Docker","links":[],"searchRequired":true,"searchMode":"normal"
4. Follow up question: Can you tell me what is X from https://example.com
Rephrased question: \`
<question>
Can you tell me what is X?
</question>
4. Follow-up: Can you tell me what is X from https://example.com?
"query":"what is X","links":["https://example.com"],"searchRequired":true,"searchMode":"normal"
<links>
https://example.com
</links>
\`
5. Follow-up: Summarize the content from https://example.com
"query":"summarize","links":["https://example.com"],"searchRequired":true,"searchMode":"normal"
5. Follow up question: Summarize the content from https://example.com
Rephrased question: \`
<question>
summarize
</question>
<links>
https://example.com
</links>
\`
</examples>
Anything below is the part of the actual conversation and you need to use conversation and the follow-up question to rephrase the follow-up question as a standalone question based on the guidelines shared above.
6. Follow-up: Latest news about AI
"query":"latest news about AI","links":[],"searchRequired":true,"searchMode":"news"
<conversation>
{chat_history}
</conversation>
Follow up question: {query}
Follow-up question: {query}
Rephrased question:
`;

View File

@ -24,6 +24,7 @@ import computeSimilarity from '../utils/computeSimilarity';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import { StreamEvent } from '@langchain/core/tracers/log_stream';
import { z } from 'zod';
export interface MetaSearchAgentType {
searchAndAnswer: (
@ -52,6 +53,17 @@ type BasicChainInput = {
query: string;
};
const retrieverLLMOutputSchema = z.object({
query: z.string().describe('The query to search the web for.'),
links: z
.array(z.string())
.describe('The links to search/summarize if present'),
searchRequired: z
.boolean()
.describe('Wether there is a need to search the web'),
searchMode: z.enum(['', 'normal', 'news']).describe('The search mode.'),
});
class MetaSearchAgent implements MetaSearchAgentType {
private config: Config;
private strParser = new StringOutputParser();
@ -62,73 +74,71 @@ class MetaSearchAgent implements MetaSearchAgentType {
private async createSearchRetrieverChain(llm: BaseChatModel) {
(llm as unknown as ChatOpenAI).temperature = 0;
return RunnableSequence.from([
PromptTemplate.fromTemplate(this.config.queryGeneratorPrompt),
llm,
this.strParser,
RunnableLambda.from(async (input: string) => {
const linksOutputParser = new LineListOutputParser({
key: 'links',
});
Object.assign(
Object.create(Object.getPrototypeOf(llm)),
llm,
).withStructuredOutput(retrieverLLMOutputSchema, {
...(llm.metadata?.['model-type'] === 'groq'
? {
method: 'json-object',
}
: {}),
}),
RunnableLambda.from(
async (input: z.infer<typeof retrieverLLMOutputSchema>) => {
let question = input.query;
const links = input.links;
const questionOutputParser = new LineOutputParser({
key: 'question',
});
const links = await linksOutputParser.parse(input);
let question = this.config.summarizer
? await questionOutputParser.parse(input)
: input;
if (question === 'not_needed') {
return { query: '', docs: [] };
}
if (links.length > 0) {
if (question.length === 0) {
question = 'summarize';
if (!input.searchRequired) {
return { query: '', docs: [] };
}
let docs: Document[] = [];
const linkDocs = await getDocumentsFromLinks({ links });
const docGroups: Document[] = [];
linkDocs.map((doc) => {
const URLDocExists = docGroups.find(
(d) =>
d.metadata.url === doc.metadata.url &&
d.metadata.totalDocs < 10,
);
if (!URLDocExists) {
docGroups.push({
...doc,
metadata: {
...doc.metadata,
totalDocs: 1,
},
});
if (links.length > 0) {
if (question.length === 0) {
question = 'summarize';
}
const docIndex = docGroups.findIndex(
(d) =>
d.metadata.url === doc.metadata.url &&
d.metadata.totalDocs < 10,
);
let docs: Document[] = [];
if (docIndex !== -1) {
docGroups[docIndex].pageContent =
docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
docGroups[docIndex].metadata.totalDocs += 1;
}
});
const linkDocs = await getDocumentsFromLinks({ links });
await Promise.all(
docGroups.map(async (doc) => {
const res = await llm.invoke(`
const docGroups: Document[] = [];
linkDocs.map((doc) => {
const URLDocExists = docGroups.find(
(d) =>
d.metadata.url === doc.metadata.url &&
d.metadata.totalDocs < 10,
);
if (!URLDocExists) {
docGroups.push({
...doc,
metadata: {
...doc.metadata,
totalDocs: 1,
},
});
}
const docIndex = docGroups.findIndex(
(d) =>
d.metadata.url === doc.metadata.url &&
d.metadata.totalDocs < 10,
);
if (docIndex !== -1) {
docGroups[docIndex].pageContent =
docGroups[docIndex].pageContent + `\n\n` + doc.pageContent;
docGroups[docIndex].metadata.totalDocs += 1;
}
});
await Promise.all(
docGroups.map(async (doc) => {
const res = await llm.invoke(`
You are a web search summarizer, tasked with summarizing a piece of text retrieved from a web search. Your job is to summarize the
text into a detailed, 2-4 paragraph explanation that captures the main ideas and provides a comprehensive answer to the query.
If the query is \"summarize\", you should provide a detailed summary of the text. If the query is a specific question, you should answer it in the summary.
@ -189,46 +199,50 @@ class MetaSearchAgent implements MetaSearchAgentType {
Make sure to answer the query in the summary.
`);
const document = new Document({
pageContent: res.content as string,
metadata: {
title: doc.metadata.title,
url: doc.metadata.url,
},
});
const document = new Document({
pageContent: res.content as string,
metadata: {
title: doc.metadata.title,
url: doc.metadata.url,
},
});
docs.push(document);
}),
);
return { query: question, docs: docs };
} else {
question = question.replace(/<think>.*?<\/think>/g, '');
const res = await searchSearxng(question, {
language: 'en',
engines: this.config.activeEngines,
});
const documents = res.results.map(
(result) =>
new Document({
pageContent:
result.content ||
(this.config.activeEngines.includes('youtube')
? result.title
: '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
metadata: {
title: result.title,
url: result.url,
...(result.img_src && { img_src: result.img_src }),
},
docs.push(document);
}),
);
);
return { query: question, docs: documents };
}
}),
return { query: question, docs: docs };
} else {
question = question.replace(/<think>.*?<\/think>/g, '');
const res = await searchSearxng(question, {
language: 'en',
engines:
input.searchMode === 'normal'
? this.config.activeEngines
: ['bing news'],
});
const documents = res.results.map(
(result) =>
new Document({
pageContent:
result.content ||
(this.config.activeEngines.includes('youtube')
? result.title
: '') /* Todo: Implement transcript grabbing using Youtubei (source: https://www.npmjs.com/package/youtubei) */,
metadata: {
title: result.title,
url: result.url,
...(result.img_src && { img_src: result.img_src }),
},
}),
);
return { query: question, docs: documents };
}
},
),
]);
}