feat(scrape-url): prevent context overflow

2026-07-14 23:23:28 +00:00 · 2026-04-09 19:21:25 +05:30
parent ebcf5384f3
commit 40a7cdeb4b
1 changed files with 82 additions and 1 deletions
--- a/src/lib/agents/search/researcher/actions/scrapeURL.ts
+++ b/src/lib/agents/search/researcher/actions/scrapeURL.ts
@@ -2,6 +2,49 @@ import z from 'zod';
 import { ResearchAction } from '../../types';
 import { Chunk, ReadingResearchBlock } from '@/lib/types';
 import Scraper from '@/lib/scraper';
+import { splitText } from '@/lib/utils/splitText';
+
+const extractorPrompt = `
+                  Assistant is an AI information extractor. Assistant will be shared with scraped information from a website along with the queries used to retrieve that information. Assistant's task is to extract relevant facts from the scraped data to answer the queries.
+            
+                  ## Things to taken into consideration when extracting information:
+                  1. Relevance to the query: The extracted information must dynamically adjust based on the query's intent. If the query asks "What is [X]", you must extract the definition/identity. If the query asks for "[X] specs" or "features", you must provide deep, granular technical details.
+                     - Example: For "What is [Product]", extract the core definition. For "[Product] capabilities", extract every technical function mentioned.
+                  2. Concentrate on extracting factual information that can help in answering the question rather than opinions or commentary. Ignore marketing fluff like "best-in-class" or "seamless."
+                  3. Noise to signal ratio: If the scraped data is noisy (headers, footers, UI text), ignore it and extract only the high-value information. 
+                     - Example: Discard "Click for more" or "Subscribe now" messages.
+                  4. Avoid using filler sentences or words; extract concise, telegram-style information.
+                     - Example: Change "The device features a weight of only 1.2kg" to "Weight: 1.2kg."
+                  5. Duplicate information: If a fact appears multiple times (e.g., in a paragraph and a technical table), merge the details into a single, high-density bullet point to avoid redundancy.
+                  6. Numerical Data Integrity: NEVER summarize or generalize numbers, benchmarks, or table data. Extract raw values exactly as they appear.
+                     - Example: Do not say "Improved coding scores." Say "LiveCodeBench v6: 80.0%."
+            
+                  ## Example
+                  For example, if the query is "What are the health benefits of green tea?" and the scraped data contains various pieces of information about green tea, Assistant should focus on extracting factual information related to the health benefits of green tea such as "Green tea contains antioxidants which can help in reducing inflammation" and ignore irrelevant information such as "Green tea is a popular beverage worldwide".
+                  
+                  It can also remove filler words to reduce the sentence to "Contains antioxidants; reduces inflammation." 
+                  
+                  For tables/numerical data extraction, Assistant should extract the raw numerical data or the content of the table without trying to summarize it to avoid losing important details. For example, if a table lists specific battery life hours for different modes, Assistant should list every mode and its corresponding hour count rather than giving a general average.
+                  
+                  Make sure the extracted facts are in bullet points format to make it easier to read and understand.
+            
+                  ## Output format
+                  Assistant should reply with a JSON object containing a key "extracted_facts" which is a string of the bulleted facts. Return only raw JSON without markdown formatting (no \`\`\`json blocks).
+            
+                  <example_output>
+                  {
+                    "extracted_facts": "- Fact 1\\n- Fact 2\\n- Fact 3"
+                  }
+                  </example_output>
+                  `; // System prompt for fact extraction; "\\n" above renders as a literal \n so the example stays valid JSON.
+
+const extractorSchema = z.object({
+  extracted_facts: z
+    .string()
+    .describe(
+      'The extracted facts that are relevant to the query and can help in answering the question should be listed here in a concise manner.',
+    ),
+}); // Structured-output schema passed to llm.generateObject: a single string field holding the extracted facts.

 const schema = z.object({
  urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
@@ -101,8 +144,46 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
            );
          }

+          // Condense long pages into LLM-extracted facts to bound context size.
+          const chunks = splitText(scraped.content, 4000, 500);
+
+          let accumulatedContent = '';
+
+          if (chunks.length > 1) {
+            try {
+              // Promise.all resolves in input order, so joining its result keeps
+              // the facts in page order no matter which LLM call finishes first.
+              const facts = await Promise.all(
+                chunks.map(async (chunk) => {
+                  const extracted = await additionalConfig.llm.generateObject<
+                    typeof extractorSchema
+                  >({
+                    messages: [
+                      { role: 'system', content: extractorPrompt },
+                      {
+                        role: 'user',
+                        content: `<queries>Summarize</queries>\n<scraped_data>${chunk}</scraped_data>`,
+                      },
+                    ],
+                    schema: extractorSchema,
+                  });
+                  return extracted.extracted_facts + '\n';
+                }),
+              );
+              accumulatedContent = facts.join('');
+            } catch (err) {
+              // Best effort: any failed chunk discards all extractions and keeps only the first raw chunk.
+              console.log(
+                'Error during extraction, falling back to raw content',
+                err,
+              );
+              accumulatedContent = chunks[0];
+            }
+          } else {
+            accumulatedContent = scraped.content;
+          }
+
          results.push({
-            content: scraped.content,
+            content: accumulatedContent,
            metadata: {
              url,
              title: scraped.title,