feat(scraper): add scraper

2026-07-12 14:13:28 +00:00 · 2026-04-08 23:20:15 +05:30
parent f6a47fd3e1
commit cbe538cc36
1 changed files with 116 additions and 0 deletions
--- a/src/lib/scraper.ts
+++ b/src/lib/scraper.ts
@@ -0,0 +1,116 @@
+import { JSDOM } from 'jsdom';
+import { Readability } from '@mozilla/readability';
+import { Mutex } from 'async-mutex';
+
+class Scraper {
+  private static browser: any | undefined;
+  private static IDLE_KILL_TIMEOUT = 30000;
+  private static NAVIGATION_TIMEOUT = 20000;
+  private static idleTimeout: NodeJS.Timeout | undefined;
+  private static browserMutex = new Mutex();
+  private static userCount = 0;
+
+  private static async initBrowser() {
+    await this.browserMutex.runExclusive(async () => {
+      if (!this.browser) {
+        const { chromium } = await import('playwright');
+        this.browser = await chromium.launch({
+          headless: true,
+          channel: 'chromium-headless-shell',
+          args: [
+            '--no-sandbox',
+            '--disable-setuid-sandbox',
+            '--disable-dev-shm-usage',
+            '--disable-gpu',
+            '--disable-blink-features=AutomationControlled',
+          ],
+        });
+      }
+
+      if (this.idleTimeout) clearTimeout(this.idleTimeout);
+    });
+  }
+
+  private static scheduleIdleKill() {
+    if (this.idleTimeout) clearTimeout(this.idleTimeout);
+
+    this.idleTimeout = setTimeout(async () => {
+      await this.browserMutex.runExclusive(async () => {
+        if (this.browser && this.userCount === 0) {
+          {
+            await this.browser.close();
+            this.browser = undefined;
+          }
+        }
+      });
+    }, this.IDLE_KILL_TIMEOUT);
+  }
+
+  static async scrape(
+    url: string,
+  ): Promise<{ content: string; title: string }> {
+    await this.initBrowser();
+
+    if (!this.browser) throw new Error('Browser not initialized');
+
+    const context = await this.browser.newContext({
+      userAgent:
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
+    });
+
+    await context.addInitScript(() => {
+      Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
+    });
+
+    const page = await context.newPage();
+
+    this.userCount++;
+
+    try {
+      await page.goto(url, {
+        waitUntil: 'domcontentloaded',
+        timeout: this.NAVIGATION_TIMEOUT,
+      });
+
+      await page
+        .waitForLoadState('load', { timeout: 5000 })
+        .catch(() => undefined);
+      await page.waitForTimeout(500);
+
+      const html = await page.content();
+
+      const dom = new JSDOM(html, {
+        url,
+      });
+
+      const content = new Readability(dom.window.document).parse();
+
+      const title = await page.title();
+
+      return {
+        content: `
+        # ${title ?? 'No title'} - ${url}
+        ${content?.textContent?.trim() ?? 'No content available'}
+        `,
+        title,
+      };
+    } catch (err) {
+      console.log(`Error scraping ${url}:`, err);
+
+      return {
+        title: 'Failed to scrape',
+        content: `# ${url}\n\nError scraping content.`,
+      };
+    } finally {
+      this.userCount--;
+
+      await context.close().catch(() => undefined);
+
+      if (this.userCount === 0) {
+        this.scheduleIdleKill();
+      }
+    }
+  }
+}
+
+export default Scraper;