mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2026-04-10 13:54:28 +00:00
feat(scrape-action): use scraper
This commit is contained in:
@@ -1,10 +1,7 @@
|
|||||||
import z from 'zod';
|
import z from 'zod';
|
||||||
import { ResearchAction } from '../../types';
|
import { ResearchAction } from '../../types';
|
||||||
import { Chunk, ReadingResearchBlock } from '@/lib/types';
|
import { Chunk, ReadingResearchBlock } from '@/lib/types';
|
||||||
import TurnDown from 'turndown';
|
import Scraper from '@/lib/scraper';
|
||||||
import path from 'path';
|
|
||||||
|
|
||||||
const turndownService = new TurnDown();
|
|
||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
|
urls: z.array(z.string()).describe('A list of URLs to scrape content from.'),
|
||||||
@@ -39,11 +36,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
|
|||||||
await Promise.all(
|
await Promise.all(
|
||||||
params.urls.map(async (url) => {
|
params.urls.map(async (url) => {
|
||||||
try {
|
try {
|
||||||
const res = await fetch(url);
|
const scraped = await Scraper.scrape(url);
|
||||||
const text = await res.text();
|
|
||||||
|
|
||||||
const title =
|
|
||||||
text.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`;
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
!readingEmitted &&
|
!readingEmitted &&
|
||||||
@@ -59,7 +52,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
|
|||||||
content: '',
|
content: '',
|
||||||
metadata: {
|
metadata: {
|
||||||
url,
|
url,
|
||||||
title: title,
|
title: scraped.title,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
@@ -92,7 +85,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
|
|||||||
content: '',
|
content: '',
|
||||||
metadata: {
|
metadata: {
|
||||||
url,
|
url,
|
||||||
title: title,
|
title: scraped.title,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -108,13 +101,11 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const markdown = turndownService.turndown(text);
|
|
||||||
|
|
||||||
results.push({
|
results.push({
|
||||||
content: markdown,
|
content: scraped.content,
|
||||||
metadata: {
|
metadata: {
|
||||||
url,
|
url,
|
||||||
title: title,
|
title: scraped.title,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -122,7 +113,7 @@ const scrapeURLAction: ResearchAction<typeof schema> = {
|
|||||||
content: `Failed to fetch content from ${url}: ${error}`,
|
content: `Failed to fetch content from ${url}: ${error}`,
|
||||||
metadata: {
|
metadata: {
|
||||||
url,
|
url,
|
||||||
title: `Error fetching ${url}`,
|
title: `Error scraping ${url}`,
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user