From cbe538cc360a4de0bf9d18422f045ebbc2fceca2 Mon Sep 17 00:00:00 2001 From: ItzCrazyKns <95534749+ItzCrazyKns@users.noreply.github.com> Date: Wed, 8 Apr 2026 23:20:15 +0530 Subject: [PATCH] feat(scraper): add scraper --- src/lib/scraper.ts | 116 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/lib/scraper.ts diff --git a/src/lib/scraper.ts b/src/lib/scraper.ts new file mode 100644 index 00000000..767367dd --- /dev/null +++ b/src/lib/scraper.ts @@ -0,0 +1,116 @@ +import { JSDOM } from 'jsdom'; +import { Readability } from '@mozilla/readability'; +import { Mutex } from 'async-mutex'; + +class Scraper { + private static browser: any | undefined; + private static IDLE_KILL_TIMEOUT = 30000; + private static NAVIGATION_TIMEOUT = 20000; + private static idleTimeout: NodeJS.Timeout | undefined; + private static browserMutex = new Mutex(); + private static userCount = 0; + + private static async initBrowser() { + await this.browserMutex.runExclusive(async () => { + if (!this.browser) { + const { chromium } = await import('playwright'); + this.browser = await chromium.launch({ + headless: true, + channel: 'chromium-headless-shell', + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--disable-blink-features=AutomationControlled', + ], + }); + } + + if (this.idleTimeout) clearTimeout(this.idleTimeout); + }); + } + + private static scheduleIdleKill() { + if (this.idleTimeout) clearTimeout(this.idleTimeout); + + this.idleTimeout = setTimeout(async () => { + await this.browserMutex.runExclusive(async () => { + if (this.browser && this.userCount === 0) { + { + await this.browser.close(); + this.browser = undefined; + } + } + }); + }, this.IDLE_KILL_TIMEOUT); + } + + static async scrape( + url: string, + ): Promise<{ content: string; title: string }> { + await this.initBrowser(); + + if (!this.browser) throw new Error('Browser not initialized'); + + const context = await this.browser.newContext({ + userAgent: + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', + }); + + await context.addInitScript(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => undefined }); + }); + + const page = await context.newPage(); + + this.userCount++; + + try { + await page.goto(url, { + waitUntil: 'domcontentloaded', + timeout: this.NAVIGATION_TIMEOUT, + }); + + await page + .waitForLoadState('load', { timeout: 5000 }) + .catch(() => undefined); + await page.waitForTimeout(500); + + const html = await page.content(); + + const dom = new JSDOM(html, { + url, + }); + + const content = new Readability(dom.window.document).parse(); + + const title = await page.title(); + + return { + content: ` + # ${title ?? 'No title'} - ${url} + ${content?.textContent?.trim() ?? 'No content available'} + `, + title, + }; + } catch (err) { + console.log(`Error scraping ${url}:`, err); + + return { + title: 'Failed to scrape', + content: `# ${url}\n\nError scraping content.`, + }; + } finally { + this.userCount--; + + await context.close().catch(() => undefined); + + if (this.userCount === 0) { + this.scheduleIdleKill(); + } + } + } +} + +export default Scraper;