feat(scrape-action): use scraper

This commit is contained in:
ItzCrazyKns
2026-04-08 23:23:03 +05:30
parent 7a6fad95ef
commit 5991416142

View File

@@ -1,10 +1,7 @@
import z from 'zod'; import z from 'zod';
import { ResearchAction } from '../../types'; import { ResearchAction } from '../../types';
import { Chunk, ReadingResearchBlock } from '@/lib/types'; import { Chunk, ReadingResearchBlock } from '@/lib/types';
import TurnDown from 'turndown'; import Scraper from '@/lib/scraper';
import path from 'path';
const turndownService = new TurnDown();
const schema = z.object({ const schema = z.object({
urls: z.array(z.string()).describe('A list of URLs to scrape content from.'), urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
@@ -39,11 +36,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
await Promise.all( await Promise.all(
params.urls.map(async (url) => { params.urls.map(async (url) => {
try { try {
const res = await fetch(url); const scraped = await Scraper.scrape(url);
const text = await res.text();
const title =
text.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`;
if ( if (
!readingEmitted && !readingEmitted &&
@@ -59,7 +52,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
content: '', content: '',
metadata: { metadata: {
url, url,
title: title, title: scraped.title,
}, },
}, },
], ],
@@ -92,7 +85,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
content: '', content: '',
metadata: { metadata: {
url, url,
title: title, title: scraped.title,
}, },
}); });
@@ -108,13 +101,11 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
); );
} }
const markdown = turndownService.turndown(text);
results.push({ results.push({
content: markdown, content: scraped.content,
metadata: { metadata: {
url, url,
title: title, title: scraped.title,
}, },
}); });
} catch (error) { } catch (error) {
@@ -122,7 +113,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
content: `Failed to fetch content from ${url}: ${error}`, content: `Failed to fetch content from ${url}: ${error}`,
metadata: { metadata: {
url, url,
title: `Error fetching ${url}`, title: `Error scraping ${url}`,
}, },
}); });
} }