feat(scraper): add scraper

This commit is contained in:
ItzCrazyKns
2026-04-08 23:20:15 +05:30
parent f6a47fd3e1
commit cbe538cc36

116
src/lib/scraper.ts Normal file
View File

@@ -0,0 +1,116 @@
import { JSDOM } from 'jsdom';
import { Readability } from '@mozilla/readability';
import { Mutex } from 'async-mutex';
class Scraper {
private static browser: any | undefined;
private static IDLE_KILL_TIMEOUT = 30000;
private static NAVIGATION_TIMEOUT = 20000;
private static idleTimeout: NodeJS.Timeout | undefined;
private static browserMutex = new Mutex();
private static userCount = 0;
private static async initBrowser() {
await this.browserMutex.runExclusive(async () => {
if (!this.browser) {
const { chromium } = await import('playwright');
this.browser = await chromium.launch({
headless: true,
channel: 'chromium-headless-shell',
args: [
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--disable-blink-features=AutomationControlled',
],
});
}
if (this.idleTimeout) clearTimeout(this.idleTimeout);
});
}
private static scheduleIdleKill() {
if (this.idleTimeout) clearTimeout(this.idleTimeout);
this.idleTimeout = setTimeout(async () => {
await this.browserMutex.runExclusive(async () => {
if (this.browser && this.userCount === 0) {
{
await this.browser.close();
this.browser = undefined;
}
}
});
}, this.IDLE_KILL_TIMEOUT);
}
static async scrape(
url: string,
): Promise<{ content: string; title: string }> {
await this.initBrowser();
if (!this.browser) throw new Error('Browser not initialized');
const context = await this.browser.newContext({
userAgent:
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
});
await context.addInitScript(() => {
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
});
const page = await context.newPage();
this.userCount++;
try {
await page.goto(url, {
waitUntil: 'domcontentloaded',
timeout: this.NAVIGATION_TIMEOUT,
});
await page
.waitForLoadState('load', { timeout: 5000 })
.catch(() => undefined);
await page.waitForTimeout(500);
const html = await page.content();
const dom = new JSDOM(html, {
url,
});
const content = new Readability(dom.window.document).parse();
const title = await page.title();
return {
content: `
# ${title ?? 'No title'} - ${url}
${content?.textContent?.trim() ?? 'No content available'}
`,
title,
};
} catch (err) {
console.log(`Error scraping ${url}:`, err);
return {
title: 'Failed to scrape',
content: `# ${url}\n\nError scraping content.`,
};
} finally {
this.userCount--;
await context.close().catch(() => undefined);
if (this.userCount === 0) {
this.scheduleIdleKill();
}
}
}
}
export default Scraper;