mirror of
https://github.com/ItzCrazyKns/Perplexica.git
synced 2025-07-11 19:18:40 +00:00
test: add CI/CD workflow
This commit is contained in:
168
src/lib/utils/scraper.ts
Normal file
168
src/lib/utils/scraper.ts
Normal file
@ -0,0 +1,168 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { OllamaService } from '../services/ollamaService';
|
||||
import { sleep } from './helpers';
|
||||
|
||||
// Minimum spacing, in milliseconds, that rateLimitedRequest keeps between outgoing requests.
const RATE_LIMIT_MS = 1000; // 1 second between requests
|
||||
// Date.now()-based timestamp (ms) of the most recent request slot taken by
// rateLimitedRequest; stays 0 until the first request is made.
let lastRequestTime = 0;
|
||||
|
||||
/**
 * Issues an HTTP GET for `url`, keeping successive requests at least
 * `RATE_LIMIT_MS` milliseconds apart.
 *
 * The request slot is reserved synchronously, before any `await`, so that
 * callers running concurrently queue up one interval apart instead of all
 * reading the same stale timestamp and firing at the same moment.
 *
 * @param url - Absolute URL of the page to fetch.
 * @returns The promise returned by `axios.get` for the request.
 */
async function rateLimitedRequest(url: string) {
  const now = Date.now();

  // Earliest moment this request may go out: right now, or one interval
  // after the previously reserved slot, whichever is later.
  const scheduledAt = Math.max(now, lastRequestTime + RATE_LIMIT_MS);
  lastRequestTime = scheduledAt;

  if (scheduledAt > now) {
    // `sleep` comes from ./helpers; presumably resolves after the given ms.
    await sleep(scheduledAt - now);
  }

  return axios.get(url, {
    timeout: 5000,
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; BusinessFinder/1.0; +http://example.com/bot)',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5'
    }
  });
}
|
||||
|
||||
/**
 * Contact details gathered for a business from a web page.
 *
 * Every field is optional: a page may expose only some of them, and a failed
 * extraction yields an empty object.
 */
export interface ContactInfo {
  /** Phone number as it appeared on the page. */
  phone?: string;

  /** Contact e-mail address. */
  email?: string;

  /** Street address text. */
  address?: string;

  /** Free-text description of the business. */
  description?: string;

  /** Opening-hours entries, one string per entry. */
  openingHours?: string[];
}
|
||||
|
||||
/** Loosely-typed JSON-LD object node; properties are narrowed before use. */
type JsonLdNode = Record<string, unknown>;

/** schema.org `@type` values that are treated as describing a business. */
const BUSINESS_SCHEMA_TYPES = ['LocalBusiness', 'Restaurant'];

/** Flattens a parsed JSON-LD document (object, array, or `@graph` wrapper) into its object nodes. */
function flattenJsonLd(value: unknown): JsonLdNode[] {
  if (Array.isArray(value)) {
    const nodes: JsonLdNode[] = [];
    for (const item of value) {
      nodes.push(...flattenJsonLd(item));
    }
    return nodes;
  }
  if (typeof value !== 'object' || value === null) {
    return [];
  }
  const node = value as JsonLdNode;
  // Many sites nest their entities under "@graph"; include those as well.
  return [node, ...flattenJsonLd(node['@graph'])];
}

/** True when the node's `@type` (a string or an array of strings) names a business type. */
function isBusinessNode(node: JsonLdNode): boolean {
  const declared = node['@type'];
  const types: unknown[] = Array.isArray(declared) ? declared : [declared];
  return types.some(
    (type) => typeof type === 'string' && BUSINESS_SCHEMA_TYPES.includes(type)
  );
}

/** Returns the value if it is a string, or the first string of an array; otherwise undefined. */
function firstString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (Array.isArray(value)) {
    return value.find((item): item is string => typeof item === 'string');
  }
  return undefined;
}

/** Reads a street address from a JSON-LD `address`, given as plain text or as an object with `streetAddress`. */
function streetAddressOf(value: unknown): string | undefined {
  const asText = firstString(value);
  if (asText !== undefined) {
    return asText;
  }
  const candidate: unknown = Array.isArray(value) ? value[0] : value;
  if (typeof candidate === 'object' && candidate !== null) {
    return firstString((candidate as JsonLdNode).streetAddress);
  }
  return undefined;
}

/** Normalizes a JSON-LD `openingHours` value (single string or array) to a string array. */
function openingHoursOf(value: unknown): string[] | undefined {
  if (typeof value === 'string') {
    return [value];
  }
  if (Array.isArray(value)) {
    return value.filter((item): item is string => typeof item === 'string');
  }
  return undefined;
}

/**
 * Fetches `url` and extracts contact details from the returned HTML.
 *
 * JSON-LD structured data is preferred: the first node typed `LocalBusiness`
 * or `Restaurant` (including nodes nested under `@graph`) supplies all fields.
 * When no such node exists, the page text and common selectors are used.
 *
 * @param url - Page to fetch and inspect.
 * @returns The extracted details; an empty object if fetching or parsing fails
 *          (the error is logged with `console.warn`, never thrown).
 */
export async function extractContactFromHtml(url: string): Promise<ContactInfo> {
  try {
    const response = await rateLimitedRequest(url);
    const $ = cheerio.load(response.data);

    // Extract structured data if available
    const nodes: JsonLdNode[] = [];
    $('script[type="application/ld+json"]').each((_, el) => {
      try {
        nodes.push(...flattenJsonLd(JSON.parse($(el).html() || '')));
      } catch {
        // Malformed or empty JSON-LD block: ignore it and keep scanning.
      }
    });

    // Look for LocalBusiness or Restaurant schema
    const business = nodes.find(isBusinessNode);
    if (business) {
      return {
        phone: firstString(business.telephone),
        email: firstString(business.email),
        address: streetAddressOf(business.address),
        description: firstString(business.description),
        openingHours: openingHoursOf(business.openingHours)
      };
    }

    // Fallback to regular HTML parsing
    return {
      phone: findPhone($),
      email: findEmail($),
      address: findAddress($),
      description: $('meta[name="description"]').attr('content'),
      openingHours: findOpeningHours($)
    };
  } catch (error) {
    console.warn(`Error extracting contact info from ${url}:`, error);
    return {};
  }
}
|
||||
|
||||
/**
 * Asks the Ollama-backed completion service to pull a business address out of
 * free-form text.
 *
 * @param text - Raw text that may contain an address.
 * @param location - Area the business is expected to be in or near; included
 *                   in the prompt as a hint.
 * @returns The trimmed model response, or an empty string when `text` is
 *          blank or the completion call fails (failures are logged with
 *          `console.warn`, never thrown).
 */
export async function extractCleanAddress(text: string, location: string): Promise<string> {
  // Nothing to extract from blank input, so skip the model round trip.
  if (!text.trim()) {
    return '';
  }

  try {
    // NOTE(review): this instance is never used; the call below goes through
    // the class itself. Kept in case the constructor has side effects; confirm
    // against OllamaService and drop it if it does not.
    const ollama = new OllamaService();

    const prompt = [
      '',
      `Extract a business address from this text. The business should be in or near ${location}.`,
      'Only return the address, nothing else. If no valid address is found, return an empty string.',
      '',
      `Text: ${text}`,
      ''
    ].join('\n');

    const response = await OllamaService.complete(prompt);
    return response.trim();
  } catch (error) {
    console.warn('Error extracting address:', error);
    return '';
  }
}
|
||||
|
||||
// Helper functions
|
||||
/**
 * Searches the page's text for a phone number.
 *
 * Two patterns are tried in order:
 *  1. a bare 3-3-4 digit number, with the area code optionally wrapped in
 *     parentheses, e.g. "(555) 123-4567" or "555.123.4567";
 *  2. a number introduced by a "Phone:", "Tel:" or "Contact:" label, in which
 *     case only the number itself (not the label) is returned.
 *
 * @param $ - Loaded cheerio document.
 * @returns The first matching phone number, or `undefined` if none is found.
 */
function findPhone($: cheerio.CheerioAPI): string | undefined {
  // Serialize the document text once instead of once per pattern.
  const pageText = $.text();

  // A "(" is not a word character, so it cannot follow `\b`; the parenthesized
  // area code is therefore matched as its own alternative.
  const bareNumber = pageText.match(/(?:\(\d{3}\)|\b\d{3})[-. ]?\d{3}[-. ]?\d{4}\b/);
  if (bareNumber) {
    return bareNumber[0];
  }

  const labelled = pageText.match(/\b(?:Phone|Tel|Contact):\s*([0-9-().+ ]{10,})\b/i);
  // Group 1 is the number without its label; it may carry trailing spaces.
  return labelled?.[1]?.trim();
}
|
||||
|
||||
/**
 * Searches the page's text for the first e-mail address.
 *
 * @param $ - Loaded cheerio document.
 * @returns The first e-mail address found, or `undefined` if there is none.
 */
function findEmail($: cheerio.CheerioAPI): string | undefined {
  // The top-level domain is letters only; a "|" inside the character class
  // would be a literal pipe, not alternation, and would let the match run on
  // past a "|" separator.
  const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
  const match = $.text().match(emailPattern);
  return match ? match[0] : undefined;
}
|
||||
|
||||
/**
 * Looks for an address in elements that commonly hold one.
 *
 * Selectors are tried from most to least specific. For each, the first
 * matching element is inspected; an element whose text is empty is skipped so
 * that a later selector still gets a chance.
 *
 * @param $ - Loaded cheerio document.
 * @returns Trimmed text of the first non-empty candidate, or `undefined`.
 */
function findAddress($: cheerio.CheerioAPI): string | undefined {
  // Look for address in common elements
  const addressSelectors = [
    'address',
    '[itemtype="http://schema.org/PostalAddress"]',
    '.address',
    '#address',
    '[class*="address"]',
    '[id*="address"]'
  ];

  for (const selector of addressSelectors) {
    const element = $(selector).first();
    if (!element.length) {
      continue;
    }

    const text = element.text().trim();
    if (text) {
      return text;
    }
  }

  return undefined;
}
|
||||
|
||||
/**
 * Collects opening-hours text from elements that commonly hold it.
 *
 * For every selector, the first matching element is used. If it has
 * descendant elements, the trimmed text of each descendant is collected; if it
 * has none (hours written as plain text), the element's own text is collected.
 * Empty strings and duplicates are dropped, and first-seen order is kept.
 *
 * @param $ - Loaded cheerio document.
 * @returns Distinct, non-empty text entries; an empty array if nothing matched.
 */
function findOpeningHours($: cheerio.CheerioAPI): string[] {
  const hoursSelectors = [
    '[itemtype="http://schema.org/OpeningHoursSpecification"]',
    '.hours',
    '#hours',
    '[class*="hours"]',
    '[id*="hours"]'
  ];

  // A Set keeps insertion order and removes duplicates, which occur when one
  // element is matched by more than one selector.
  const seen = new Set<string>();
  const record = (raw: string): void => {
    const entry = raw.trim();
    if (entry) {
      seen.add(entry);
    }
  };

  for (const selector of hoursSelectors) {
    const container = $(selector).first();
    if (!container.length) {
      continue;
    }

    const descendants = container.find('*');
    if (descendants.length === 0) {
      // Plain-text container: there are no child elements to walk.
      record(container.text());
      continue;
    }

    descendants.each((_, el) => {
      record($(el).text());
    });
  }

  return Array.from(seen);
}
|
Reference in New Issue
Block a user