feat(actions): add scrape URL action

This commit is contained in:
ItzCrazyKns
2025-12-06 18:54:37 +05:30
parent a14f3e9464
commit d0124b9f06
4 changed files with 78 additions and 36 deletions

View File

@@ -52,6 +52,7 @@
"rfc6902": "^5.1.2", "rfc6902": "^5.1.2",
"sonner": "^1.4.41", "sonner": "^1.4.41",
"tailwind-merge": "^2.2.2", "tailwind-merge": "^2.2.2",
"turndown": "^7.2.2",
"winston": "^3.17.0", "winston": "^3.17.0",
"yahoo-finance2": "^3.10.2", "yahoo-finance2": "^3.10.2",
"yet-another-react-lightbox": "^3.17.2", "yet-another-react-lightbox": "^3.17.2",
@@ -65,6 +66,7 @@
"@types/pdf-parse": "^1.1.4", "@types/pdf-parse": "^1.1.4",
"@types/react": "^18", "@types/react": "^18",
"@types/react-dom": "^18", "@types/react-dom": "^18",
"@types/turndown": "^5.0.6",
"autoprefixer": "^10.0.1", "autoprefixer": "^10.0.1",
"drizzle-kit": "^0.30.5", "drizzle-kit": "^0.30.5",
"eslint": "^8", "eslint": "^8",

View File

@@ -1,10 +1,12 @@
import doneAction from './done'; import doneAction from './done';
import planAction from './plan'; import planAction from './plan';
import ActionRegistry from './registry'; import ActionRegistry from './registry';
import scrapeURLAction from './scrapeURL';
import webSearchAction from './webSearch'; import webSearchAction from './webSearch';
ActionRegistry.register(webSearchAction); ActionRegistry.register(webSearchAction);
ActionRegistry.register(doneAction); ActionRegistry.register(doneAction);
ActionRegistry.register(planAction); ActionRegistry.register(planAction);
ActionRegistry.register(scrapeURLAction);
export { ActionRegistry }; export { ActionRegistry };

View File

@@ -0,0 +1,57 @@
import z from 'zod';
import { ResearchAction } from '../../types';
import { Chunk } from '@/lib/types';
import TurnDown from 'turndown';
// Module-level converter instance, built with turndown's default options and
// reused for every page the action converts to Markdown.
const turndownService = new TurnDown();
// Input parameters for the scrape action: a single `urls` array of strings.
// The strings are not validated as URLs here.
const schema = z.object({
urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
});
/**
 * Extracts the document title from raw HTML.
 *
 * Tolerates attributes on the `<title>` tag and titles that span several
 * lines; runs of whitespace are collapsed to single spaces and the result is
 * trimmed.
 *
 * @param html - Raw HTML source of the page.
 * @param fallback - Value returned when no non-empty title is found.
 * @returns The cleaned-up title, or `fallback`.
 */
const extractPageTitle = (html: string, fallback: string): string => {
  const rawTitle = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i)?.[1];
  const title = rawTitle?.replace(/\s+/g, ' ').trim();
  // `||` rather than `??` on purpose: an empty <title></title> must fall back too.
  return title || fallback;
};

/**
 * Research action that fetches each requested URL, converts the response body
 * to Markdown and returns one chunk per URL.
 *
 * - All URLs are fetched concurrently.
 * - The returned `results` array is in the same order as `params.urls`.
 * - A failure for one URL (network error, non-2xx status, conversion error)
 *   never rejects the action; it yields an error chunk for that URL instead.
 */
const scrapeURLAction: ResearchAction<typeof schema> = {
  name: 'scrape_url',
  description:
    'Use after __plan to scrape and extract content from the provided URLs. This is useful when you need detailed information from specific web pages or if the user asks you to summarize or analyze content from certain links.',
  schema: schema,
  enabled: (_) => true,
  execute: async (params, additionalConfig) => {
    // Collect the mapped promises' values instead of pushing into a shared
    // array, so the output order follows the input order rather than the
    // order in which the requests happen to complete.
    const results = await Promise.all(
      params.urls.map(async (url): Promise<Chunk> => {
        try {
          const res = await fetch(url);

          // fetch() only rejects on network-level failures; surface HTTP
          // errors explicitly so an error page is not reported as content.
          if (!res.ok) {
            throw new Error(`HTTP ${res.status} ${res.statusText}`);
          }

          const text = await res.text();

          return {
            content: turndownService.turndown(text),
            metadata: {
              url,
              title: extractPageTitle(text, `Content from ${url}`),
            },
          };
        } catch (error) {
          // Best effort: report the failure as a chunk so the remaining URLs
          // are still returned to the caller.
          return {
            content: `Failed to fetch content from ${url}: ${error}`,
            metadata: {
              url,
              title: `Error fetching ${url}`,
            },
          };
        }
      }),
    );

    return {
      type: 'search_results',
      results,
    };
  },
};
export default scrapeURLAction;

View File

@@ -888,6 +888,11 @@
dependencies: dependencies:
js-tiktoken "^1.0.12" js-tiktoken "^1.0.12"
"@mixmark-io/domino@^2.2.0":
version "2.2.0"
resolved "https://registry.yarnpkg.com/@mixmark-io/domino/-/domino-2.2.0.tgz#4e8ec69bf1afeb7a14f0628b7e2c0f35bdb336c3"
integrity sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==
"@next/env@15.2.2": "@next/env@15.2.2":
version "15.2.2" version "15.2.2"
resolved "https://registry.yarnpkg.com/@next/env/-/env-15.2.2.tgz#6345352365a811c523cecf284874ff489b675e59" resolved "https://registry.yarnpkg.com/@next/env/-/env-15.2.2.tgz#6345352365a811c523cecf284874ff489b675e59"
@@ -1227,6 +1232,11 @@
resolved "https://registry.yarnpkg.com/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11" resolved "https://registry.yarnpkg.com/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11"
integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw== integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==
"@types/turndown@^5.0.6":
version "5.0.6"
resolved "https://registry.yarnpkg.com/@types/turndown/-/turndown-5.0.6.tgz#42a27397298a312d6088f29c0ff4819c518c1ecb"
integrity sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg==
"@types/uuid@^10.0.0": "@types/uuid@^10.0.0":
version "10.0.0" version "10.0.0"
resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-10.0.0.tgz#e9c07fe50da0f53dc24970cca94d619ff03f6f6d" resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-10.0.0.tgz#e9c07fe50da0f53dc24970cca94d619ff03f6f6d"
@@ -1840,32 +1850,6 @@ complex.js@^2.2.5:
resolved "https://registry.yarnpkg.com/complex.js/-/complex.js-2.4.3.tgz#72ee9c303a9b89ebcfeca0d39f74927d38721fce" resolved "https://registry.yarnpkg.com/complex.js/-/complex.js-2.4.3.tgz#72ee9c303a9b89ebcfeca0d39f74927d38721fce"
integrity sha512-UrQVSUur14tNX6tiP4y8T4w4FeJAX3bi2cIv0pu/DTLFNxoq7z2Yh83Vfzztj6Px3X/lubqQ9IrPp7Bpn6p4MQ== integrity sha512-UrQVSUur14tNX6tiP4y8T4w4FeJAX3bi2cIv0pu/DTLFNxoq7z2Yh83Vfzztj6Px3X/lubqQ9IrPp7Bpn6p4MQ==
compute-cosine-similarity@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/compute-cosine-similarity/-/compute-cosine-similarity-1.1.0.tgz#0086a06b0239deb90f231f0da894efdc48884609"
integrity sha512-FXhNx0ILLjGi9Z9+lglLzM12+0uoTnYkHm7GiadXDAr0HGVLm25OivUS1B/LPkbzzvlcXz/1EvWg9ZYyJSdhTw==
dependencies:
compute-dot "^1.1.0"
compute-l2norm "^1.1.0"
validate.io-array "^1.0.5"
validate.io-function "^1.0.2"
compute-dot@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/compute-dot/-/compute-dot-1.1.0.tgz#01a5ba2c7af73b99002acb258459c9576a8232dc"
integrity sha512-L5Ocet4DdMrXboss13K59OK23GXjiSia7+7Ukc7q4Bl+RVpIXK2W9IHMbWDZkh+JUEvJAwOKRaJDiFUa1LTnJg==
dependencies:
validate.io-array "^1.0.3"
validate.io-function "^1.0.2"
compute-l2norm@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/compute-l2norm/-/compute-l2norm-1.1.0.tgz#bd09131c6b36c8d70c68334e176009a4e0a989ac"
integrity sha512-6EHh1Elj90eU28SXi+h2PLnTQvZmkkHWySpoFz+WOlVNLz3DQoC4ISUHSV9n5jMxPHtKGJ01F4uu2PsXBB8sSg==
dependencies:
validate.io-array "^1.0.3"
validate.io-function "^1.0.2"
concat-map@0.0.1: concat-map@0.0.1:
version "0.0.1" version "0.0.1"
resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b"
@@ -5368,6 +5352,13 @@ tunnel-agent@^0.6.0:
dependencies: dependencies:
safe-buffer "^5.0.1" safe-buffer "^5.0.1"
turndown@^7.2.2:
version "7.2.2"
resolved "https://registry.yarnpkg.com/turndown/-/turndown-7.2.2.tgz#9557642b54046c5912b3d433f34dd588de455a43"
integrity sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==
dependencies:
"@mixmark-io/domino" "^2.2.0"
type-check@^0.4.0, type-check@~0.4.0: type-check@^0.4.0, type-check@~0.4.0:
version "0.4.0" version "0.4.0"
resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1"
@@ -5546,16 +5537,6 @@ uuid@^9.0.0:
resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30"
integrity sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA== integrity sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==
validate.io-array@^1.0.3, validate.io-array@^1.0.5:
version "1.0.6"
resolved "https://registry.yarnpkg.com/validate.io-array/-/validate.io-array-1.0.6.tgz#5b5a2cafd8f8b85abb2f886ba153f2d93a27774d"
integrity sha512-DeOy7CnPEziggrOO5CZhVKJw6S3Yi7e9e65R1Nl/RTN1vTQKnzjfvks0/8kQ40FP/dsjRAOd4hxmJ7uLa6vxkg==
validate.io-function@^1.0.2:
version "1.0.2"
resolved "https://registry.yarnpkg.com/validate.io-function/-/validate.io-function-1.0.2.tgz#343a19802ed3b1968269c780e558e93411c0bad7"
integrity sha512-LlFybRJEriSuBnUhQyG5bwglhh50EpTL2ul23MPIuR1odjO7XaMLFV8vHGwp7AZciFxtYOeiSCT5st+XSPONiQ==
web-streams-polyfill@4.0.0-beta.3: web-streams-polyfill@4.0.0-beta.3:
version "4.0.0-beta.3" version "4.0.0-beta.3"
resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz#2898486b74f5156095e473efe989dcf185047a38" resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz#2898486b74f5156095e473efe989dcf185047a38"