From d0124b9f06a084d1ec4d849c3347b7595969f089 Mon Sep 17 00:00:00 2001 From: ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com> Date: Sat, 6 Dec 2025 18:54:37 +0530 Subject: [PATCH] feat(actions): add scrape URL action --- package.json | 2 + .../agents/search/researcher/actions/index.ts | 2 + .../search/researcher/actions/scrapeURL.ts | 88 +++++++++++++++++++ yarn.lock | 53 ++++++----------- 4 files changed, 109 insertions(+), 36 deletions(-) create mode 100644 src/lib/agents/search/researcher/actions/scrapeURL.ts diff --git a/package.json b/package.json index b742559..18c5ad8 100644 --- a/package.json +++ b/package.json @@ -52,6 +52,7 @@ "rfc6902": "^5.1.2", "sonner": "^1.4.41", "tailwind-merge": "^2.2.2", + "turndown": "^7.2.2", "winston": "^3.17.0", "yahoo-finance2": "^3.10.2", "yet-another-react-lightbox": "^3.17.2", @@ -65,6 +66,7 @@ "@types/pdf-parse": "^1.1.4", "@types/react": "^18", "@types/react-dom": "^18", + "@types/turndown": "^5.0.6", "autoprefixer": "^10.0.1", "drizzle-kit": "^0.30.5", "eslint": "^8", diff --git a/src/lib/agents/search/researcher/actions/index.ts b/src/lib/agents/search/researcher/actions/index.ts index c3790a5..15c5734 100644 --- a/src/lib/agents/search/researcher/actions/index.ts +++ b/src/lib/agents/search/researcher/actions/index.ts @@ -1,10 +1,12 @@ import doneAction from './done'; import planAction from './plan'; import ActionRegistry from './registry'; +import scrapeURLAction from './scrapeURL'; import webSearchAction from './webSearch'; ActionRegistry.register(webSearchAction); ActionRegistry.register(doneAction); ActionRegistry.register(planAction); +ActionRegistry.register(scrapeURLAction); export { ActionRegistry }; diff --git a/src/lib/agents/search/researcher/actions/scrapeURL.ts b/src/lib/agents/search/researcher/actions/scrapeURL.ts new file mode 100644 index 0000000..0fd3f7b --- /dev/null +++ b/src/lib/agents/search/researcher/actions/scrapeURL.ts @@ -0,0 +1,88 @@
+import z from 'zod';
+import { ResearchAction } from '../../types';
+import { Chunk } from '@/lib/types';
+import TurnDown from 'turndown';
+
+// Shared HTML-to-Markdown converter. <script> and <style> elements are removed
+// so that inline JavaScript/CSS does not leak into the scraped text.
+const turndownService = new TurnDown().remove(['script', 'style']);
+
+const schema = z.object({
+  urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
+});
+
+/**
+ * Returns the text of the first <title> element in `html`, with whitespace
+ * collapsed, or undefined when there is no title or it is blank.
+ */
+const extractTitle = (html: string): string | undefined => {
+  // The opening tag anchors the capture so markup before <title> is excluded;
+  // [\s\S] lets the title span several lines.
+  const raw = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1];
+  return raw?.replace(/\s+/g, ' ').trim() || undefined;
+};
+
+/**
+ * Downloads one page and converts it to a Markdown chunk. Never rejects: any
+ * failure is returned as a chunk whose content describes the error.
+ */
+const scrapePage = async (url: string): Promise<Chunk> => {
+  try {
+    // NOTE(review): only the scheme is checked; requests to internal hosts
+    // (SSRF) are not blocked here. Add host filtering if URLs are untrusted.
+    const { protocol } = new URL(url);
+    if (protocol !== 'http:' && protocol !== 'https:') {
+      throw new Error(`Unsupported protocol "${protocol}"`);
+    }
+
+    const res = await fetch(url);
+    if (!res.ok) {
+      // Do not present an error page as if it were the requested content.
+      throw new Error(`HTTP ${res.status} ${res.statusText}`);
+    }
+
+    const html = await res.text();
+
+    return {
+      content: turndownService.turndown(html),
+      metadata: {
+        url,
+        title: extractTitle(html) ?? `Content from ${url}`,
+      },
+    };
+  } catch (error) {
+    return {
+      content: `Failed to fetch content from ${url}: ${error}`,
+      metadata: {
+        url,
+        title: `Error fetching ${url}`,
+      },
+    };
+  }
+};
+
+/**
+ * Research action that fetches every requested URL and returns the pages as
+ * Markdown, one result per URL in the order the URLs were given.
+ */
+const scrapeURLAction: ResearchAction = {
+  name: 'scrape_url',
+  description:
+    'Use after __plan to scrape and extract content from the provided URLs. This is useful when you need detailed information from specific web pages or if the user asks you to summarize or analyze content from certain links.',
+  schema: schema,
+  enabled: (_) => true,
+  execute: async (params) => {
+    // scrapePage never rejects, so Promise.all always resolves and the results
+    // stay in the same order as params.urls.
+    const results: Chunk[] = await Promise.all(
+      params.urls.map((url) => scrapePage(url)),
+    );
+
+    return {
+      type: 'search_results',
+      results,
+    };
+  },
+};
+
+export default scrapeURLAction;
diff --git a/yarn.lock b/yarn.lock index e20ba09..da3e2df 100644 --- a/yarn.lock +++ b/yarn.lock @@ -888,6 +888,11 @@ dependencies: js-tiktoken "^1.0.12" +"@mixmark-io/domino@^2.2.0": + version "2.2.0" + resolved "https://registry.yarnpkg.com/@mixmark-io/domino/-/domino-2.2.0.tgz#4e8ec69bf1afeb7a14f0628b7e2c0f35bdb336c3" + integrity sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw== + "@next/env@15.2.2": version "15.2.2" resolved "https://registry.yarnpkg.com/@next/env/-/env-15.2.2.tgz#6345352365a811c523cecf284874ff489b675e59" @@ -1227,6 +1232,11 @@ resolved
"https://registry.yarnpkg.com/@types/trusted-types/-/trusted-types-2.0.7.tgz#baccb07a970b91707df3a3e8ba6896c57ead2d11" integrity sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw== +"@types/turndown@^5.0.6": + version "5.0.6" + resolved "https://registry.yarnpkg.com/@types/turndown/-/turndown-5.0.6.tgz#42a27397298a312d6088f29c0ff4819c518c1ecb" + integrity sha512-ru00MoyeeouE5BX4gRL+6m/BsDfbRayOskWqUvh7CLGW+UXxHQItqALa38kKnOiZPqJrtzJUgAC2+F0rL1S4Pg== + "@types/uuid@^10.0.0": version "10.0.0" resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-10.0.0.tgz#e9c07fe50da0f53dc24970cca94d619ff03f6f6d" @@ -1840,32 +1850,6 @@ complex.js@^2.2.5: resolved "https://registry.yarnpkg.com/complex.js/-/complex.js-2.4.3.tgz#72ee9c303a9b89ebcfeca0d39f74927d38721fce" integrity sha512-UrQVSUur14tNX6tiP4y8T4w4FeJAX3bi2cIv0pu/DTLFNxoq7z2Yh83Vfzztj6Px3X/lubqQ9IrPp7Bpn6p4MQ== -compute-cosine-similarity@^1.1.0: - version "1.1.0" - resolved "https://registry.yarnpkg.com/compute-cosine-similarity/-/compute-cosine-similarity-1.1.0.tgz#0086a06b0239deb90f231f0da894efdc48884609" - integrity sha512-FXhNx0ILLjGi9Z9+lglLzM12+0uoTnYkHm7GiadXDAr0HGVLm25OivUS1B/LPkbzzvlcXz/1EvWg9ZYyJSdhTw== - dependencies: - compute-dot "^1.1.0" - compute-l2norm "^1.1.0" - validate.io-array "^1.0.5" - validate.io-function "^1.0.2" - -compute-dot@^1.1.0: - version "1.1.0" - resolved "https://registry.yarnpkg.com/compute-dot/-/compute-dot-1.1.0.tgz#01a5ba2c7af73b99002acb258459c9576a8232dc" - integrity sha512-L5Ocet4DdMrXboss13K59OK23GXjiSia7+7Ukc7q4Bl+RVpIXK2W9IHMbWDZkh+JUEvJAwOKRaJDiFUa1LTnJg== - dependencies: - validate.io-array "^1.0.3" - validate.io-function "^1.0.2" - -compute-l2norm@^1.1.0: - version "1.1.0" - resolved "https://registry.yarnpkg.com/compute-l2norm/-/compute-l2norm-1.1.0.tgz#bd09131c6b36c8d70c68334e176009a4e0a989ac" - integrity sha512-6EHh1Elj90eU28SXi+h2PLnTQvZmkkHWySpoFz+WOlVNLz3DQoC4ISUHSV9n5jMxPHtKGJ01F4uu2PsXBB8sSg== - dependencies: - 
validate.io-array "^1.0.3" - validate.io-function "^1.0.2" - concat-map@0.0.1: version "0.0.1" resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b" @@ -5368,6 +5352,13 @@ tunnel-agent@^0.6.0: dependencies: safe-buffer "^5.0.1" +turndown@^7.2.2: + version "7.2.2" + resolved "https://registry.yarnpkg.com/turndown/-/turndown-7.2.2.tgz#9557642b54046c5912b3d433f34dd588de455a43" + integrity sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ== + dependencies: + "@mixmark-io/domino" "^2.2.0" + type-check@^0.4.0, type-check@~0.4.0: version "0.4.0" resolved "https://registry.yarnpkg.com/type-check/-/type-check-0.4.0.tgz#07b8203bfa7056c0657050e3ccd2c37730bab8f1" @@ -5546,16 +5537,6 @@ uuid@^9.0.0: resolved "https://registry.yarnpkg.com/uuid/-/uuid-9.0.1.tgz#e188d4c8853cc722220392c424cd637f32293f30" integrity sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA== -validate.io-array@^1.0.3, validate.io-array@^1.0.5: - version "1.0.6" - resolved "https://registry.yarnpkg.com/validate.io-array/-/validate.io-array-1.0.6.tgz#5b5a2cafd8f8b85abb2f886ba153f2d93a27774d" - integrity sha512-DeOy7CnPEziggrOO5CZhVKJw6S3Yi7e9e65R1Nl/RTN1vTQKnzjfvks0/8kQ40FP/dsjRAOd4hxmJ7uLa6vxkg== - -validate.io-function@^1.0.2: - version "1.0.2" - resolved "https://registry.yarnpkg.com/validate.io-function/-/validate.io-function-1.0.2.tgz#343a19802ed3b1968269c780e558e93411c0bad7" - integrity sha512-LlFybRJEriSuBnUhQyG5bwglhh50EpTL2ul23MPIuR1odjO7XaMLFV8vHGwp7AZciFxtYOeiSCT5st+XSPONiQ== - web-streams-polyfill@4.0.0-beta.3: version "4.0.0-beta.3" resolved "https://registry.yarnpkg.com/web-streams-polyfill/-/web-streams-polyfill-4.0.0-beta.3.tgz#2898486b74f5156095e473efe989dcf185047a38"