Compare commits
4 Commits
56cd278ed4
...
feat/ollam
Author | SHA1 | Date | |
---|---|---|---|
|
2f20d845c8 | ||
|
7a7eafb8e7 | ||
|
043f66b767 | ||
|
811822c03b |
Before Width: | Height: | Size: 641 KiB After Width: | Height: | Size: 151 KiB |
16
.env.example
@@ -1,11 +1,5 @@
|
||||
PORT=3000
|
||||
NODE_ENV=development
|
||||
SUPABASE_URL=your_supabase_url
|
||||
SUPABASE_KEY=your_supabase_key
|
||||
OLLAMA_URL=http://localhost:11434
|
||||
OLLAMA_MODEL=llama2
|
||||
SEARXNG_URL=http://localhost:4000
|
||||
SEARXNG_INSTANCES=["http://localhost:4000"]
|
||||
MAX_RESULTS_PER_QUERY=50
|
||||
CACHE_DURATION_HOURS=24
|
||||
CACHE_DURATION_DAYS=7
|
||||
PORT=3001
|
||||
OLLAMA_URL=http://localhost:11434 # url of the ollama server
|
||||
SIMILARITY_MEASURE=cosine # cosine or dot
|
||||
SEARXNG_API_URL= # no need to fill this if using docker
|
||||
MODEL_NAME=llama2
|
2
.github/ISSUE_TEMPLATE/bug_report.md
vendored
@@ -4,6 +4,7 @@ about: Create an issue to help us fix bugs
|
||||
title: ''
|
||||
labels: bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Describe the bug**
|
||||
@@ -11,7 +12,6 @@ A clear and concise description of what the bug is.
|
||||
|
||||
**To Reproduce**
|
||||
Steps to reproduce the behavior:
|
||||
|
||||
1. Go to '...'
|
||||
2. Click on '....'
|
||||
3. Scroll down to '....'
|
||||
|
3
.github/ISSUE_TEMPLATE/custom.md
vendored
@@ -4,4 +4,7 @@ about: Describe this issue template's purpose here.
|
||||
title: ''
|
||||
labels: ''
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
|
||||
|
1
.github/ISSUE_TEMPLATE/feature_request.md
vendored
@@ -4,6 +4,7 @@ about: Suggest an idea for this project
|
||||
title: ''
|
||||
labels: enhancement
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
**Is your feature request related to a problem? Please describe.**
|
||||
|
29
.github/workflows/ci.yml
vendored
@@ -1,29 +0,0 @@
|
||||
---
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v2
|
||||
with:
|
||||
node-version: '18'
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
|
||||
- name: Run tests
|
||||
run: npm test
|
||||
|
||||
- name: Run type check
|
||||
run: npm run build
|
73
.github/workflows/docker-build.yaml
vendored
@@ -1,73 +0,0 @@
|
||||
name: Build & Push Docker Images
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
build-and-push:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
service: [backend, app]
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v2
|
||||
with:
|
||||
install: true
|
||||
|
||||
- name: Log in to DockerHub
|
||||
uses: docker/login-action@v2
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
|
||||
- name: Extract version from release tag
|
||||
if: github.event_name == 'release'
|
||||
id: version
|
||||
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
|
||||
|
||||
- name: Build and push Docker image for ${{ matrix.service }}
|
||||
if: github.ref == 'refs/heads/master' && github.event_name == 'push'
|
||||
run: |
|
||||
docker buildx create --use
|
||||
if [[ "${{ matrix.service }}" == "backend" ]]; then \
|
||||
DOCKERFILE=backend.dockerfile; \
|
||||
IMAGE_NAME=perplexica-backend; \
|
||||
else \
|
||||
DOCKERFILE=app.dockerfile; \
|
||||
IMAGE_NAME=perplexica-frontend; \
|
||||
fi
|
||||
docker buildx build --platform linux/amd64,linux/arm64 \
|
||||
--cache-from=type=registry,ref=itzcrazykns1337/${IMAGE_NAME}:main \
|
||||
--cache-to=type=inline \
|
||||
-f $DOCKERFILE \
|
||||
-t itzcrazykns1337/${IMAGE_NAME}:main \
|
||||
--push .
|
||||
|
||||
- name: Build and push release Docker image for ${{ matrix.service }}
|
||||
if: github.event_name == 'release'
|
||||
run: |
|
||||
docker buildx create --use
|
||||
if [[ "${{ matrix.service }}" == "backend" ]]; then \
|
||||
DOCKERFILE=backend.dockerfile; \
|
||||
IMAGE_NAME=perplexica-backend; \
|
||||
else \
|
||||
DOCKERFILE=app.dockerfile; \
|
||||
IMAGE_NAME=perplexica-frontend; \
|
||||
fi
|
||||
docker buildx build --platform linux/amd64,linux/arm64 \
|
||||
--cache-from=type=registry,ref=itzcrazykns1337/${IMAGE_NAME}:${{ env.RELEASE_VERSION }} \
|
||||
--cache-to=type=inline \
|
||||
-f $DOCKERFILE \
|
||||
-t itzcrazykns1337/${IMAGE_NAME}:${{ env.RELEASE_VERSION }} \
|
||||
--push .
|
41
.gitignore
vendored
@@ -1,32 +1,31 @@
|
||||
# Environment variables
|
||||
.env
|
||||
.env.*
|
||||
!.env.example
|
||||
|
||||
# Dependencies
|
||||
# Node.js
|
||||
node_modules/
|
||||
yarn-error.log
|
||||
npm-debug.log
|
||||
yarn-error.log
|
||||
|
||||
# Build outputs
|
||||
dist/
|
||||
build/
|
||||
.next/
|
||||
# Build output
|
||||
/.next/
|
||||
/out/
|
||||
|
||||
# IDE/Editor
|
||||
# IDE/Editor specific
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*.iml
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
# Environment variables
|
||||
.env
|
||||
.env.local
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
|
||||
# Logs
|
||||
# Log files
|
||||
logs/
|
||||
*.log
|
||||
|
||||
# Cache
|
||||
.cache/
|
||||
.npm/
|
||||
# Testing
|
||||
/coverage/
|
||||
|
||||
# Miscellaneous
|
||||
.DS_Store
|
||||
Thumbs.db
|
@@ -1,41 +0,0 @@
|
||||
# Ignore all files in the node_modules directory
|
||||
node_modules
|
||||
|
||||
# Ignore all files in the .next directory (Next.js build output)
|
||||
.next
|
||||
|
||||
# Ignore all files in the .out directory (TypeScript build output)
|
||||
.out
|
||||
|
||||
# Ignore all files in the .cache directory (Prettier cache)
|
||||
.cache
|
||||
|
||||
# Ignore all files in the .vscode directory (Visual Studio Code settings)
|
||||
.vscode
|
||||
|
||||
# Ignore all files in the .idea directory (IntelliJ IDEA settings)
|
||||
.idea
|
||||
|
||||
# Ignore all files in the dist directory (build output)
|
||||
dist
|
||||
|
||||
# Ignore all files in the build directory (build output)
|
||||
build
|
||||
|
||||
# Ignore all files in the coverage directory (test coverage reports)
|
||||
coverage
|
||||
|
||||
# Ignore all files with the .log extension
|
||||
*.log
|
||||
|
||||
# Ignore all files with the .tmp extension
|
||||
*.tmp
|
||||
|
||||
# Ignore all files with the .swp extension
|
||||
*.swp
|
||||
|
||||
# Ignore all files with the .DS_Store extension (macOS specific)
|
||||
.DS_Store
|
||||
|
||||
# Ignore all files in uploads directory
|
||||
uploads
|
@@ -8,7 +8,8 @@ Perplexica's design consists of two main domains:
|
||||
|
||||
- **Frontend (`ui` directory)**: This is a Next.js application holding all user interface components. It's a self-contained environment that manages everything the user interacts with.
|
||||
- **Backend (root and `src` directory)**: The backend logic is situated in the `src` folder, but the root directory holds the main `package.json` for backend dependency management.
|
||||
- All of the focus modes are created using the Meta Search Agent class present in `src/search/metaSearchAgent.ts`. The main logic behind Perplexica lies there.
|
||||
|
||||
Both the root directory (for backend configurations outside `src`) and the `ui` folder come with an `.env.example` file. These are templates for environment variables that you need to set up manually for the application to run correctly.
|
||||
|
||||
## Setting Up Your Environment
|
||||
|
||||
@@ -16,11 +17,10 @@ Before diving into coding, setting up your local environment is key. Here's what
|
||||
|
||||
### Backend
|
||||
|
||||
1. In the root directory, locate the `sample.config.toml` file.
|
||||
2. Rename it to `config.toml` and fill in the necessary configuration fields specific to the backend.
|
||||
1. In the root directory, locate the `.env.example` file.
|
||||
2. Rename it to `.env` and fill in the necessary environment variables specific to the backend.
|
||||
3. Run `npm install` to install dependencies.
|
||||
4. Run `npm run db:push` to set up the local sqlite.
|
||||
5. Use `npm run dev` to start the backend in development mode.
|
||||
4. Use `npm run dev` to start the backend in development mode.
|
||||
|
||||
### Frontend
|
||||
|
||||
|
176
README.md
@@ -1,120 +1,104 @@
|
||||
# BizSearch
|
||||
# 🚀 Perplexica - An AI-powered search engine 🔎 <!-- omit in toc -->
|
||||
|
||||
A tool for finding and analyzing local businesses using AI-powered data extraction.
|
||||

|
||||
|
||||
## Prerequisites
|
||||
## Table of Contents <!-- omit in toc -->
|
||||
|
||||
- Node.js 16+
|
||||
- Ollama (for local LLM)
|
||||
- SearxNG instance
|
||||
- [Overview](#overview)
|
||||
- [Preview](#preview)
|
||||
- [Features](#features)
|
||||
- [Installation](#installation)
|
||||
- [Getting Started with Docker (Recommended)](#getting-started-with-docker-recommended)
|
||||
- [Non-Docker Installation](#non-docker-installation)
|
||||
- [Upcoming Features](#upcoming-features)
|
||||
- [Support Us](#support-us)
|
||||
- [Contribution](#contribution)
|
||||
- [Acknowledgements](#acknowledgements)
|
||||
|
||||
## Installation
|
||||
## Overview
|
||||
|
||||
1. Install Ollama:
|
||||
```bash
|
||||
# On macOS
|
||||
brew install ollama
|
||||
```
|
||||
Perplexica is an open-source AI-powered searching tool or an AI-powered search engine that goes deep into the internet to find answers. Inspired by Perplexity AI, it's an open-source option that not just searches the web but understands your questions. It uses advanced machine learning algorithms like similarity searching and embeddings to refine results and provides clear answers with sources cited.
|
||||
|
||||
2. Start Ollama:
|
||||
```bash
|
||||
# Start and enable on login
|
||||
brew services start ollama
|
||||
## Preview
|
||||
|
||||
# Or run without auto-start
|
||||
/usr/local/opt/ollama/bin/ollama serve
|
||||
```
|
||||
|
||||
3. Pull the required model:
|
||||
```bash
|
||||
ollama pull mistral
|
||||
```
|
||||
|
||||
4. Clone and set up the project:
|
||||
```bash
|
||||
git clone https://github.com/yourusername/bizsearch.git
|
||||
cd bizsearch
|
||||
npm install
|
||||
```
|
||||
|
||||
5. Configure environment:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
# Edit .env with your settings
|
||||
```
|
||||
|
||||
6. Start the application:
|
||||
```bash
|
||||
npm run dev
|
||||
```
|
||||
|
||||
7. Open http://localhost:3000 in your browser
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If Ollama fails to start:
|
||||
```bash
|
||||
# Stop any existing instance
|
||||
brew services stop ollama
|
||||
# Wait a few seconds
|
||||
sleep 5
|
||||
# Start again
|
||||
brew services start ollama
|
||||
```
|
||||
|
||||
To verify Ollama is running:
|
||||
```bash
|
||||
curl http://localhost:11434/api/version
|
||||
```
|
||||

|
||||
|
||||
## Features
|
||||
|
||||
- Business search with location filtering
|
||||
- Contact information extraction
|
||||
- AI-powered data validation
|
||||
- Clean, user-friendly interface
|
||||
- Service health monitoring
|
||||
- **Two Main Modes:**
|
||||
- **Copilot Mode:** (In development) Boosts search by generating different queries to find more relevant internet sources. Like normal search instead of just using the context by SearxNG, it visits the top matches and tries to find relevant sources to the user's query directly from the page.
|
||||
- **Normal Mode:** Processes your query and performs a web search.
|
||||
- **Focus Modes:** Special modes to better answer specific types of questions. Perplexica currently has 6 focus modes:
|
||||
|
||||
## Configuration
|
||||
1. **All Mode:** Searches the entire web to find the best results.
|
||||
2. **Writing Assistant Mode:** Helpful for writing tasks that does not require searching the web.
|
||||
3. **Academic Search Mode:** Finds articles and papers, ideal for academic research.
|
||||
4. **YouTube Search Mode:** Finds YouTube videos based on the search query.
|
||||
5. **Wolfram Alpha Search Mode:** Answers queries that need calculations or data analysis using Wolfram Alpha.
|
||||
6. **Reddit Search Mode:** Searches Reddit for discussions and opinions related to the query.
|
||||
|
||||
Key environment variables:
|
||||
- `SEARXNG_URL`: Your SearxNG instance URL
|
||||
- `OLLAMA_URL`: Ollama API endpoint (default: http://localhost:11434)
|
||||
- `SUPABASE_URL`: Your Supabase project URL
|
||||
- `SUPABASE_ANON_KEY`: Your Supabase anonymous key
|
||||
- `CACHE_DURATION_DAYS`: How long to cache results (default: 7)
|
||||
- **Current Information:** Some search tools might give you outdated info because they use data from crawling bots and convert them into embeddings and store them in a index (its like converting the web into embeddings which is quite expensive.). Unlike them, Perplexica uses SearxNG, a metasearch engine to get the results and rerank and get the most relevent source out of it, ensuring you always get the latest information without the overhead of daily data updates.
|
||||
|
||||
## Supabase Setup
|
||||
It has many more features like image and video search. Some of the planned features are mentioned in [upcoming features](#upcoming-features).
|
||||
|
||||
1. Create a new Supabase project
|
||||
2. Run the SQL commands in `db/init.sql` to create the cache table
|
||||
3. Copy your project URL and anon key to `.env`
|
||||
## Installation
|
||||
|
||||
## License
|
||||
There are mainly 2 ways of installing Perplexica - With Docker, Without Docker. Using Docker is highly recommended.
|
||||
|
||||
MIT
|
||||
### Getting Started with Docker (Recommended)
|
||||
|
||||
## Cache Management
|
||||
1. Ensure Docker is installed and running on your system.
|
||||
2. Clone the Perplexica repository:
|
||||
|
||||
The application uses Supabase for caching search results. Cache entries expire after 7 days.
|
||||
|
||||
### Manual Cache Cleanup
|
||||
|
||||
If automatic cleanup is not available, you can manually clean up expired entries:
|
||||
|
||||
1. Using the API:
|
||||
```bash
|
||||
curl -X POST http://localhost:3000/api/cleanup
|
||||
git clone -b feat/ollama-support https://github.com/ItzCrazyKns/Perplexica.git
|
||||
```
|
||||
|
||||
2. Using SQL:
|
||||
```sql
|
||||
select manual_cleanup();
|
||||
3. After cloning, navigate to the directory containing the project files.
|
||||
|
||||
4. Rename the `.env.example` file to `.env`. For Docker setups, you need only fill in the following fields:
|
||||
|
||||
- `OLLAMA_URL` (It should be the URL where Ollama is running; it is also filled by default but you need to replace it if your Ollama URL is different.)
|
||||
- `MODEL_NAME` (This is filled by default; you can change it if you want to use a different model.)
|
||||
- `SIMILARITY_MEASURE` (This is filled by default; you can leave it as is if you are unsure about it.)
|
||||
|
||||
5. Ensure you are in the directory containing the `docker-compose.yaml` file and execute:
|
||||
|
||||
```bash
|
||||
docker compose up
|
||||
```
|
||||
|
||||
### Cache Statistics
|
||||
6. Wait a few minutes for the setup to complete. You can access Perplexica at http://localhost:3000 in your web browser.
|
||||
|
||||
View cache statistics using:
|
||||
```sql
|
||||
select * from cache_stats;
|
||||
```
|
||||
**Note**: Once the terminal is stopped, Perplexica will also stop. To restart it, you will need to open Docker Desktop and run Perplexica again.
|
||||
|
||||
### Non-Docker Installation
|
||||
|
||||
For setups without Docker:
|
||||
|
||||
1. Follow the initial steps to clone the repository and rename the `.env.example` file to `.env` in the root directory. You will need to fill in all the fields in this file.
|
||||
2. Additionally, rename the `.env.example` file to `.env` in the `ui` folder and complete all fields.
|
||||
3. The non-Docker setup requires manual configuration of both the backend and frontend.
|
||||
|
||||
**Note**: Using Docker is recommended as it simplifies the setup process, especially for managing environment variables and dependencies.
|
||||
|
||||
## Upcoming Features
|
||||
|
||||
- [ ] Finalizing Copilot Mode
|
||||
- [ ] Adding support for multiple local LLMs and LLM providers such as Anthropic, Google, etc.
|
||||
- [ ] Adding Discover and History Saving features
|
||||
- [x] Introducing various Focus Modes
|
||||
|
||||
## Support Us
|
||||
|
||||
If you find Perplexica useful, consider giving us a star on GitHub. This helps more people discover Perplexica and supports the development of new features. Your support is appreciated.
|
||||
|
||||
## Contribution
|
||||
|
||||
Perplexica is built on the idea that AI and large language models should be easy for everyone to use. If you find bugs or have ideas, please share them in via GitHub Issues. For more information on contributing to Perplexica you can read the [CONTRIBUTING.md](CONTRIBUTING.md) file to learn more about Perplexica and how you can contribute to it.
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
Inspired by Perplexity AI, Perplexica aims to provide a similar service but always up-to-date and fully open source, thanks to SearxNG.
|
||||
|
||||
If you have any queries you can reach me via my Discord - `itzcrazykns`. Thanks for checking out Perplexica.
|
||||
|
@@ -1,7 +1,7 @@
|
||||
FROM node:20.18.0-alpine
|
||||
FROM node:alpine
|
||||
|
||||
ARG NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
|
||||
ARG NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
|
||||
ARG NEXT_PUBLIC_WS_URL
|
||||
ARG NEXT_PUBLIC_API_URL
|
||||
ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
|
||||
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
|
||||
|
||||
@@ -9,7 +9,7 @@ WORKDIR /home/perplexica
|
||||
|
||||
COPY ui /home/perplexica/
|
||||
|
||||
RUN yarn install --frozen-lockfile
|
||||
RUN yarn install
|
||||
RUN yarn build
|
||||
|
||||
CMD ["yarn", "start"]
|
@@ -1,17 +1,17 @@
|
||||
FROM node:18-slim
|
||||
FROM node:alpine
|
||||
|
||||
ARG SEARXNG_API_URL
|
||||
ENV SEARXNG_API_URL=${SEARXNG_API_URL}
|
||||
|
||||
WORKDIR /home/perplexica
|
||||
|
||||
COPY src /home/perplexica/src
|
||||
COPY tsconfig.json /home/perplexica/
|
||||
COPY drizzle.config.ts /home/perplexica/
|
||||
COPY .env /home/perplexica/
|
||||
COPY package.json /home/perplexica/
|
||||
COPY yarn.lock /home/perplexica/
|
||||
|
||||
RUN mkdir /home/perplexica/data
|
||||
RUN mkdir /home/perplexica/uploads
|
||||
|
||||
RUN yarn install --frozen-lockfile --network-timeout 600000
|
||||
RUN yarn install
|
||||
RUN yarn build
|
||||
|
||||
CMD ["yarn", "start"]
|
14
config.toml
@@ -1,14 +0,0 @@
|
||||
[GENERAL]
|
||||
PORT = 3001 # Port to run the server on
|
||||
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
|
||||
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded into memory. (Instead of using -1 use "-1m")
|
||||
|
||||
[API_KEYS]
|
||||
OPENAI = "" # OpenAI API key - sk-1234567890abcdef1234567890abcdef
|
||||
GROQ = "" # Groq API key - gsk_1234567890abcdef1234567890abcdef
|
||||
ANTHROPIC = "" # Anthropic API key - sk-ant-1234567890abcdef1234567890abcdef
|
||||
GEMINI = "" # Gemini API key - sk-1234567890abcdef1234567890abcdef
|
||||
|
||||
[API_ENDPOINTS]
|
||||
SEARXNG = "http://localhost:32768" # SearxNG API URL
|
||||
OLLAMA = "" # Ollama API URL - http://host.docker.internal:11434
|
2
data/.gitignore
vendored
@@ -1,2 +0,0 @@
|
||||
*
|
||||
!.gitignore
|
189
db/init.sql
@@ -1,189 +0,0 @@
|
||||
-- Enable required extensions
|
||||
create extension if not exists "uuid-ossp"; -- For UUID generation
|
||||
create extension if not exists pg_cron; -- For scheduled jobs
|
||||
|
||||
-- Create the search_cache table
|
||||
create table public.search_cache (
|
||||
id uuid default uuid_generate_v4() primary key,
|
||||
query text not null,
|
||||
results jsonb not null,
|
||||
location text not null,
|
||||
category text not null,
|
||||
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
|
||||
updated_at timestamp with time zone default timezone('utc'::text, now()) not null,
|
||||
expires_at timestamp with time zone default timezone('utc'::text, now() + interval '7 days') not null
|
||||
);
|
||||
|
||||
-- Create indexes
|
||||
create index search_cache_query_idx on public.search_cache (query);
|
||||
create index search_cache_location_category_idx on public.search_cache (location, category);
|
||||
create index search_cache_expires_at_idx on public.search_cache (expires_at);
|
||||
|
||||
-- Enable RLS
|
||||
alter table public.search_cache enable row level security;
|
||||
|
||||
-- Create policies
|
||||
create policy "Allow public read access"
|
||||
on public.search_cache for select
|
||||
using (true);
|
||||
|
||||
create policy "Allow service write access"
|
||||
on public.search_cache for insert
|
||||
with check (true);
|
||||
|
||||
create policy "Allow service update access"
|
||||
on public.search_cache for update
|
||||
using (true)
|
||||
with check (true);
|
||||
|
||||
create policy "Allow delete expired records"
|
||||
on public.search_cache for delete
|
||||
using (expires_at < now());
|
||||
|
||||
-- Create function to clean up expired records
|
||||
create or replace function cleanup_expired_cache()
|
||||
returns void
|
||||
language plpgsql
|
||||
security definer
|
||||
as $$
|
||||
begin
|
||||
delete from public.search_cache
|
||||
where expires_at < now();
|
||||
end;
|
||||
$$;
|
||||
|
||||
-- Create a manual cleanup function since pg_cron might not be available
|
||||
create or replace function manual_cleanup()
|
||||
returns void
|
||||
language plpgsql
|
||||
security definer
|
||||
as $$
|
||||
begin
|
||||
delete from public.search_cache
|
||||
where expires_at < now();
|
||||
end;
|
||||
$$;
|
||||
|
||||
-- Create a view for cache statistics
|
||||
create or replace view cache_stats as
|
||||
select
|
||||
count(*) as total_entries,
|
||||
count(*) filter (where expires_at < now()) as expired_entries,
|
||||
count(*) filter (where expires_at >= now()) as valid_entries,
|
||||
min(created_at) as oldest_entry,
|
||||
max(created_at) as newest_entry,
|
||||
count(distinct category) as unique_categories,
|
||||
count(distinct location) as unique_locations
|
||||
from public.search_cache;
|
||||
|
||||
-- Grant permissions to access the view
|
||||
grant select on cache_stats to postgres;
|
||||
|
||||
-- Create table if not exists businesses
|
||||
create table if not exists businesses (
|
||||
id text primary key,
|
||||
name text not null,
|
||||
phone text,
|
||||
email text,
|
||||
address text,
|
||||
rating numeric,
|
||||
website text,
|
||||
logo text,
|
||||
source text,
|
||||
description text,
|
||||
latitude numeric,
|
||||
longitude numeric,
|
||||
last_updated timestamp with time zone default timezone('utc'::text, now()),
|
||||
search_count integer default 1,
|
||||
created_at timestamp with time zone default timezone('utc'::text, now())
|
||||
);
|
||||
|
||||
-- Create indexes for common queries
|
||||
create index if not exists businesses_name_idx on businesses (name);
|
||||
create index if not exists businesses_rating_idx on businesses (rating desc);
|
||||
create index if not exists businesses_search_count_idx on businesses (search_count desc);
|
||||
create index if not exists businesses_last_updated_idx on businesses (last_updated desc);
|
||||
|
||||
-- Create tables if they don't exist
|
||||
CREATE TABLE IF NOT EXISTS businesses (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
phone TEXT,
|
||||
email TEXT,
|
||||
address TEXT,
|
||||
rating INTEGER,
|
||||
website TEXT,
|
||||
logo TEXT,
|
||||
source TEXT,
|
||||
description TEXT,
|
||||
location JSONB,
|
||||
place_id TEXT,
|
||||
photos TEXT[],
|
||||
opening_hours TEXT[],
|
||||
distance JSONB,
|
||||
last_updated TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
search_count INTEGER DEFAULT 0
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS searches (
|
||||
id SERIAL PRIMARY KEY,
|
||||
query TEXT NOT NULL,
|
||||
location TEXT NOT NULL,
|
||||
timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
results_count INTEGER
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cache (
|
||||
key TEXT PRIMARY KEY,
|
||||
value JSONB NOT NULL,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
|
||||
expires_at TIMESTAMP WITH TIME ZONE NOT NULL
|
||||
);
|
||||
|
||||
-- Create indexes
|
||||
CREATE INDEX IF NOT EXISTS idx_businesses_location ON businesses USING GIN (location);
|
||||
CREATE INDEX IF NOT EXISTS idx_businesses_search ON businesses USING GIN (to_tsvector('english', name || ' ' || COALESCE(description, '')));
|
||||
CREATE INDEX IF NOT EXISTS idx_cache_expires ON cache (expires_at);
|
||||
|
||||
-- Set up RLS (Row Level Security)
|
||||
ALTER TABLE businesses ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE searches ENABLE ROW LEVEL SECURITY;
|
||||
ALTER TABLE cache ENABLE ROW LEVEL SECURITY;
|
||||
|
||||
-- Create policies
|
||||
CREATE POLICY "Allow anonymous select" ON businesses FOR SELECT USING (true);
|
||||
CREATE POLICY "Allow service role insert" ON businesses FOR INSERT WITH CHECK (true);
|
||||
CREATE POLICY "Allow service role update" ON businesses FOR UPDATE USING (true);
|
||||
|
||||
CREATE POLICY "Allow anonymous select" ON searches FOR SELECT USING (true);
|
||||
CREATE POLICY "Allow service role insert" ON searches FOR INSERT WITH CHECK (true);
|
||||
|
||||
CREATE POLICY "Allow anonymous select" ON cache FOR SELECT USING (true);
|
||||
CREATE POLICY "Allow service role all" ON cache USING (true);
|
||||
|
||||
-- Add place_id column to businesses table if it doesn't exist
|
||||
ALTER TABLE businesses ADD COLUMN IF NOT EXISTS place_id TEXT;
|
||||
CREATE INDEX IF NOT EXISTS idx_businesses_place_id ON businesses(place_id);
|
||||
|
||||
-- Create a unique constraint on place_id (excluding nulls)
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_businesses_place_id_unique
|
||||
ON businesses(place_id)
|
||||
WHERE place_id IS NOT NULL;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS businesses (
|
||||
id TEXT PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
address TEXT NOT NULL,
|
||||
phone TEXT NOT NULL,
|
||||
description TEXT NOT NULL,
|
||||
website TEXT,
|
||||
source TEXT NOT NULL,
|
||||
rating REAL,
|
||||
lat REAL,
|
||||
lng REAL,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_businesses_source ON businesses(source);
|
||||
CREATE INDEX IF NOT EXISTS idx_businesses_rating ON businesses(rating);
|
@@ -1,44 +0,0 @@
|
||||
-- Create the businesses table
|
||||
create table businesses (
|
||||
id uuid primary key,
|
||||
name text not null,
|
||||
phone text,
|
||||
address text,
|
||||
city text,
|
||||
state text,
|
||||
zip text,
|
||||
category text[],
|
||||
rating numeric,
|
||||
review_count integer,
|
||||
license text,
|
||||
services text[],
|
||||
hours jsonb,
|
||||
website text,
|
||||
email text,
|
||||
verified boolean default false,
|
||||
last_updated timestamp with time zone,
|
||||
search_query text,
|
||||
search_location text,
|
||||
search_timestamp timestamp with time zone,
|
||||
reliability_score integer,
|
||||
|
||||
-- Create a composite index for deduplication
|
||||
constraint unique_business unique (phone, address)
|
||||
);
|
||||
|
||||
-- Create indexes for common queries
|
||||
create index idx_business_location on businesses (city, state);
|
||||
create index idx_business_category on businesses using gin (category);
|
||||
create index idx_search_query on businesses using gin (search_query gin_trgm_ops);
|
||||
create index idx_search_location on businesses using gin (search_location gin_trgm_ops);
|
||||
create index idx_reliability on businesses (reliability_score);
|
||||
|
||||
-- Enable full text search
|
||||
alter table businesses add column search_vector tsvector
|
||||
generated always as (
|
||||
setweight(to_tsvector('english', coalesce(name, '')), 'A') ||
|
||||
setweight(to_tsvector('english', coalesce(search_query, '')), 'B') ||
|
||||
setweight(to_tsvector('english', coalesce(search_location, '')), 'C')
|
||||
) stored;
|
||||
|
||||
create index idx_business_search on businesses using gin(search_vector);
|
@@ -1,15 +0,0 @@
|
||||
-- Check if table exists
|
||||
SELECT EXISTS (
|
||||
SELECT FROM information_schema.tables
|
||||
WHERE table_schema = 'public'
|
||||
AND table_name = 'businesses'
|
||||
);
|
||||
|
||||
-- Check table structure
|
||||
SELECT column_name, data_type, is_nullable
|
||||
FROM information_schema.columns
|
||||
WHERE table_schema = 'public'
|
||||
AND table_name = 'businesses';
|
||||
|
||||
-- Check row count
|
||||
SELECT COUNT(*) as count FROM businesses;
|
@@ -1,34 +1,28 @@
|
||||
services:
|
||||
searxng:
|
||||
image: docker.io/searxng/searxng:latest
|
||||
volumes:
|
||||
- ./searxng:/etc/searxng:rw
|
||||
build:
|
||||
context: .
|
||||
dockerfile: searxng.dockerfile
|
||||
expose:
|
||||
- 4000
|
||||
ports:
|
||||
- 4000:8080
|
||||
networks:
|
||||
- perplexica-network
|
||||
restart: unless-stopped
|
||||
|
||||
perplexica-backend:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: backend.dockerfile
|
||||
image: itzcrazykns1337/perplexica-backend:main
|
||||
environment:
|
||||
args:
|
||||
- SEARXNG_API_URL=http://searxng:8080
|
||||
depends_on:
|
||||
- searxng
|
||||
expose:
|
||||
- 3001
|
||||
ports:
|
||||
- 3001:3001
|
||||
volumes:
|
||||
- backend-dbstore:/home/perplexica/data
|
||||
- uploads:/home/perplexica/uploads
|
||||
- ./config.toml:/home/perplexica/config.toml
|
||||
extra_hosts:
|
||||
- 'host.docker.internal:host-gateway'
|
||||
networks:
|
||||
- perplexica-network
|
||||
restart: unless-stopped
|
||||
|
||||
perplexica-frontend:
|
||||
build:
|
||||
@@ -37,18 +31,14 @@ services:
|
||||
args:
|
||||
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
|
||||
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
|
||||
image: itzcrazykns1337/perplexica-frontend:main
|
||||
depends_on:
|
||||
- perplexica-backend
|
||||
expose:
|
||||
- 3000
|
||||
ports:
|
||||
- 3000:3000
|
||||
networks:
|
||||
- perplexica-network
|
||||
restart: unless-stopped
|
||||
|
||||
networks:
|
||||
perplexica-network:
|
||||
|
||||
volumes:
|
||||
backend-dbstore:
|
||||
uploads:
|
||||
|
@@ -1,26 +0,0 @@
|
||||
version: '3'
|
||||
services:
|
||||
searxng:
|
||||
image: searxng/searxng
|
||||
ports:
|
||||
- "4000:8080"
|
||||
volumes:
|
||||
- ./searxng:/etc/searxng
|
||||
environment:
|
||||
- INSTANCE_NAME=perplexica-searxng
|
||||
- BASE_URL=http://localhost:4000/
|
||||
- SEARXNG_SECRET=your_secret_key_here
|
||||
restart: unless-stopped
|
||||
|
||||
app:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: backend.dockerfile
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- SEARXNG_URL=http://searxng:8080
|
||||
volumes:
|
||||
- ./config.toml:/home/perplexica/config.toml
|
||||
depends_on:
|
||||
- searxng
|
@@ -1,117 +0,0 @@
|
||||
# Perplexica Search API Documentation
|
||||
|
||||
## Overview
|
||||
|
||||
Perplexica’s Search API makes it easy to use our AI-powered search engine. You can run different types of searches, pick the models you want to use, and get the most recent info. Follow the following headings to learn more about Perplexica's search API.
|
||||
|
||||
## Endpoint
|
||||
|
||||
### **POST** `http://localhost:3001/api/search`
|
||||
|
||||
**Note**: Replace `3001` with any other port if you've changed the default PORT
|
||||
|
||||
### Request
|
||||
|
||||
The API accepts a JSON object in the request body, where you define the focus mode, chat models, embedding models, and your query.
|
||||
|
||||
#### Request Body Structure
|
||||
|
||||
```json
|
||||
{
|
||||
"chatModel": {
|
||||
"provider": "openai",
|
||||
"model": "gpt-4o-mini"
|
||||
},
|
||||
"embeddingModel": {
|
||||
"provider": "openai",
|
||||
"model": "text-embedding-3-large"
|
||||
},
|
||||
"optimizationMode": "speed",
|
||||
"focusMode": "webSearch",
|
||||
"query": "What is Perplexica",
|
||||
"history": [
|
||||
["human", "Hi, how are you?"],
|
||||
["assistant", "I am doing well, how can I help you today?"]
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Request Parameters
|
||||
|
||||
- **`chatModel`** (object, optional): Defines the chat model to be used for the query. For model details you can send a GET request at `http://localhost:3001/api/models`. Make sure to use the key value (For example "gpt-4o-mini" instead of the display name "GPT 4 omni mini").
|
||||
|
||||
- `provider`: Specifies the provider for the chat model (e.g., `openai`, `ollama`).
|
||||
- `model`: The specific model from the chosen provider (e.g., `gpt-4o-mini`).
|
||||
- Optional fields for custom OpenAI configuration:
|
||||
- `customOpenAIBaseURL`: If you’re using a custom OpenAI instance, provide the base URL.
|
||||
- `customOpenAIKey`: The API key for a custom OpenAI instance.
|
||||
|
||||
- **`embeddingModel`** (object, optional): Defines the embedding model for similarity-based searching. For model details you can send a GET request at `http://localhost:3001/api/models`. Make sure to use the key value (For example "text-embedding-3-large" instead of the display name "Text Embedding 3 Large").
|
||||
|
||||
- `provider`: The provider for the embedding model (e.g., `openai`).
|
||||
- `model`: The specific embedding model (e.g., `text-embedding-3-large`).
|
||||
|
||||
- **`focusMode`** (string, required): Specifies which focus mode to use. Available modes:
|
||||
|
||||
- `webSearch`, `academicSearch`, `writingAssistant`, `wolframAlphaSearch`, `youtubeSearch`, `redditSearch`.
|
||||
|
||||
- **`optimizationMode`** (string, optional): Specifies the optimization mode to control the balance between performance and quality. Available modes:
|
||||
|
||||
- `speed`: Prioritize speed and return the fastest answer.
|
||||
- `balanced`: Provide a balanced answer with good speed and reasonable quality.
|
||||
|
||||
- **`query`** (string, required): The search query or question.
|
||||
|
||||
- **`history`** (array, optional): An array of message pairs representing the conversation history. Each pair consists of a role (either 'human' or 'assistant') and the message content. This allows the system to use the context of the conversation to refine results. Example:
|
||||
|
||||
```json
|
||||
[
|
||||
["human", "What is Perplexica?"],
|
||||
["assistant", "Perplexica is an AI-powered search engine..."]
|
||||
]
|
||||
```
|
||||
|
||||
### Response
|
||||
|
||||
The response from the API includes both the final message and the sources used to generate that message.
|
||||
|
||||
#### Example Response
|
||||
|
||||
```json
|
||||
{
|
||||
"message": "Perplexica is an innovative, open-source AI-powered search engine designed to enhance the way users search for information online. Here are some key features and characteristics of Perplexica:\n\n- **AI-Powered Technology**: It utilizes advanced machine learning algorithms to not only retrieve information but also to understand the context and intent behind user queries, providing more relevant results [1][5].\n\n- **Open-Source**: Being open-source, Perplexica offers flexibility and transparency, allowing users to explore its functionalities without the constraints of proprietary software [3][10].",
|
||||
"sources": [
|
||||
{
|
||||
"pageContent": "Perplexica is an innovative, open-source AI-powered search engine designed to enhance the way users search for information online.",
|
||||
"metadata": {
|
||||
"title": "What is Perplexica, and how does it function as an AI-powered search ...",
|
||||
"url": "https://askai.glarity.app/search/What-is-Perplexica--and-how-does-it-function-as-an-AI-powered-search-engine"
|
||||
}
|
||||
},
|
||||
{
|
||||
"pageContent": "Perplexica is an open-source AI-powered search tool that dives deep into the internet to find precise answers.",
|
||||
"metadata": {
|
||||
"title": "Sahar Mor's Post",
|
||||
"url": "https://www.linkedin.com/posts/sahar-mor_a-new-open-source-project-called-perplexica-activity-7204489745668694016-ncja"
|
||||
}
|
||||
}
|
||||
....
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Fields in the Response
|
||||
|
||||
- **`message`** (string): The search result, generated based on the query and focus mode.
|
||||
- **`sources`** (array): A list of sources that were used to generate the search result. Each source includes:
|
||||
- `pageContent`: A snippet of the relevant content from the source.
|
||||
- `metadata`: Metadata about the source, including:
|
||||
- `title`: The title of the webpage.
|
||||
- `url`: The URL of the webpage.
|
||||
|
||||
### Error Handling
|
||||
|
||||
If an error occurs during the search process, the API will return an appropriate error message with an HTTP status code.
|
||||
|
||||
- **400**: If the request is malformed or missing required fields (e.g., no focus mode or query).
|
||||
- **500**: If an internal server error occurs during the search.
|
@@ -1,108 +0,0 @@
|
||||
# Ethical Web Scraping Guidelines
|
||||
|
||||
## Core Principles
|
||||
|
||||
1. **Respect Robots.txt**
|
||||
- Always check and honor robots.txt directives
|
||||
- Cache robots.txt to reduce server load
|
||||
- Default to conservative behavior when uncertain
|
||||
|
||||
2. **Proper Identification**
|
||||
- Use clear, identifiable User-Agent strings
|
||||
- Provide contact information
|
||||
- Be transparent about your purpose
|
||||
|
||||
3. **Rate Limiting**
|
||||
- Implement conservative rate limits
|
||||
- Use exponential backoff for errors
|
||||
- Distribute requests over time
|
||||
|
||||
4. **Data Usage**
|
||||
- Only collect publicly available business information
|
||||
- Respect privacy and data protection laws
|
||||
- Provide clear opt-out mechanisms
|
||||
- Keep data accurate and up-to-date
|
||||
|
||||
5. **Technical Considerations**
|
||||
- Cache results to minimize requests
|
||||
- Handle errors gracefully
|
||||
- Monitor and log access patterns
|
||||
- Use structured data when available
|
||||
|
||||
## Implementation
|
||||
|
||||
1. **Request Headers**
|
||||
```typescript
|
||||
const headers = {
|
||||
'User-Agent': 'BizSearch/1.0 (+https://bizsearch.com/about)',
|
||||
'Accept': 'text/html,application/xhtml+xml',
|
||||
'From': 'contact@bizsearch.com'
|
||||
};
|
||||
```
|
||||
|
||||
2. **Rate Limiting**
|
||||
```typescript
|
||||
const rateLimits = {
|
||||
requestsPerMinute: 10,
|
||||
requestsPerHour: 100,
|
||||
requestsPerDomain: 20
|
||||
};
|
||||
```
|
||||
|
||||
3. **Caching**
|
||||
```typescript
|
||||
const cacheSettings = {
|
||||
ttl: 24 * 60 * 60, // 24 hours
|
||||
maxSize: 1000 // entries
|
||||
};
|
||||
```
|
||||
|
||||
## Opt-Out Process
|
||||
|
||||
1. Business owners can opt-out by:
|
||||
- Submitting a form on our website
|
||||
- Emailing opt-out@bizsearch.com
|
||||
- Adding a meta tag: `<meta name="bizsearch" content="noindex">`
|
||||
|
||||
2. We honor opt-outs within:
|
||||
- 24 hours for direct requests
|
||||
- 72 hours for cached data
|
||||
|
||||
## Legal Compliance
|
||||
|
||||
1. **Data Protection**
|
||||
- GDPR compliance for EU businesses
|
||||
- CCPA compliance for California businesses
|
||||
- Regular data audits and cleanup
|
||||
|
||||
2. **Attribution**
|
||||
- Clear source attribution
|
||||
- Last-updated timestamps
|
||||
- Data accuracy disclaimers
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Before Scraping**
|
||||
- Check robots.txt
|
||||
- Verify site status
|
||||
- Review terms of service
|
||||
- Look for API alternatives
|
||||
|
||||
2. **During Scraping**
|
||||
- Monitor response codes
|
||||
- Respect server hints
|
||||
- Implement backoff strategies
|
||||
- Log access patterns
|
||||
|
||||
3. **After Scraping**
|
||||
- Verify data accuracy
|
||||
- Update cache entries
|
||||
- Clean up old data
|
||||
- Monitor opt-out requests
|
||||
|
||||
## Contact
|
||||
|
||||
For questions or concerns about our scraping practices:
|
||||
- Email: ethics@bizsearch.com
|
||||
- Phone: (555) 123-4567
|
||||
- Web: https://bizsearch.com/ethics
|
@@ -1,11 +0,0 @@
|
||||
# Perplexica's Architecture
|
||||
|
||||
Perplexica's architecture consists of the following key components:
|
||||
|
||||
1. **User Interface**: A web-based interface that allows users to interact with Perplexica for searching images, videos, and much more.
|
||||
2. **Agent/Chains**: These components predict Perplexica's next actions, understand user queries, and decide whether a web search is necessary.
|
||||
3. **SearXNG**: A metadata search engine used by Perplexica to search the web for sources.
|
||||
4. **LLMs (Large Language Models)**: Utilized by agents and chains for tasks like understanding content, writing responses, and citing sources. Examples include Claude, GPTs, etc.
|
||||
5. **Embedding Models**: To improve the accuracy of search results, embedding models re-rank the results using similarity search algorithms such as cosine similarity and dot product distance.
|
||||
|
||||
For a more detailed explanation of how these components work together, see [WORKING.md](https://github.com/ItzCrazyKns/Perplexica/tree/master/docs/architecture/WORKING.md).
|
@@ -1,19 +0,0 @@
|
||||
# How does Perplexica work?
|
||||
|
||||
Curious about how Perplexica works? Don't worry, we'll cover it here. Before we begin, make sure you've read about the architecture of Perplexica to ensure you understand what it's made up of. Haven't read it? You can read it [here](https://github.com/ItzCrazyKns/Perplexica/tree/master/docs/architecture/README.md).
|
||||
|
||||
We'll understand how Perplexica works by taking an example of a scenario where a user asks: "How does an A.C. work?". We'll break down the process into steps to make it easier to understand. The steps are as follows:
|
||||
|
||||
1. The message is sent via WS to the backend server where it invokes the chain. The chain will depend on your focus mode. For this example, let's assume we use the "webSearch" focus mode.
|
||||
2. The chain is now invoked; first, the message is passed to another chain where it first predicts (using the chat history and the question) whether there is a need for sources and searching the web. If there is, it will generate a query (in accordance with the chat history) for searching the web that we'll take up later. If not, the chain will end there, and then the answer generator chain, also known as the response generator, will be started.
|
||||
3. The query returned by the first chain is passed to SearXNG to search the web for information.
|
||||
4. After the information is retrieved, it is based on keyword-based search. We then convert the information into embeddings and the query as well, then we perform a similarity search to find the most relevant sources to answer the query.
|
||||
5. After all this is done, the sources are passed to the response generator. This chain takes all the chat history, the query, and the sources. It generates a response that is streamed to the UI.
|
||||
|
||||
## How are the answers cited?
|
||||
|
||||
The LLMs are prompted to do so. We've prompted them so well that they cite the answers themselves, and using some UI magic, we display it to the user.
|
||||
|
||||
## Image and Video Search
|
||||
|
||||
Image and video searches are conducted in a similar manner. A query is always generated first, then we search the web for images and videos that match the query. These results are then returned to the user.
|
@@ -1,109 +0,0 @@
|
||||
# Expose Perplexica to a network
|
||||
|
||||
This guide will show you how to make Perplexica available over a network. Follow these steps to allow computers on the same network to interact with Perplexica. Choose the instructions that match the operating system you are using.
|
||||
|
||||
## Windows
|
||||
|
||||
1. Open PowerShell as Administrator
|
||||
|
||||
2. Navigate to the directory containing the `docker-compose.yaml` file
|
||||
|
||||
3. Stop and remove the existing Perplexica containers and images:
|
||||
|
||||
```bash
|
||||
docker compose down --rmi all
|
||||
```
|
||||
|
||||
4. Open the `docker-compose.yaml` file in a text editor like Notepad++
|
||||
|
||||
5. Replace `127.0.0.1` with the IP address of the server Perplexica is running on in these two lines:
|
||||
|
||||
```bash
|
||||
args:
|
||||
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
|
||||
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
|
||||
```
|
||||
|
||||
6. Save and close the `docker-compose.yaml` file
|
||||
|
||||
7. Rebuild and restart the Perplexica container:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
## macOS
|
||||
|
||||
1. Open the Terminal application
|
||||
|
||||
2. Navigate to the directory with the `docker-compose.yaml` file:
|
||||
|
||||
```bash
|
||||
cd /path/to/docker-compose.yaml
|
||||
```
|
||||
|
||||
3. Stop and remove existing containers and images:
|
||||
|
||||
```bash
|
||||
docker compose down --rmi all
|
||||
```
|
||||
|
||||
4. Open `docker-compose.yaml` in a text editor like Sublime Text:
|
||||
|
||||
```bash
|
||||
nano docker-compose.yaml
|
||||
```
|
||||
|
||||
5. Replace `127.0.0.1` with the server IP in these lines:
|
||||
|
||||
```bash
|
||||
args:
|
||||
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
|
||||
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
|
||||
```
|
||||
|
||||
6. Save and exit the editor
|
||||
|
||||
7. Rebuild and restart Perplexica:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
||||
|
||||
## Linux
|
||||
|
||||
1. Open the terminal
|
||||
|
||||
2. Navigate to the `docker-compose.yaml` directory:
|
||||
|
||||
```bash
|
||||
cd /path/to/docker-compose.yaml
|
||||
```
|
||||
|
||||
3. Stop and remove containers and images:
|
||||
|
||||
```bash
|
||||
docker compose down --rmi all
|
||||
```
|
||||
|
||||
4. Edit `docker-compose.yaml`:
|
||||
|
||||
```bash
|
||||
nano docker-compose.yaml
|
||||
```
|
||||
|
||||
5. Replace `127.0.0.1` with the server IP:
|
||||
|
||||
```bash
|
||||
args:
|
||||
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
|
||||
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
|
||||
```
|
||||
|
||||
6. Save and exit the editor
|
||||
|
||||
7. Rebuild and restart Perplexica:
|
||||
|
||||
```bash
|
||||
docker compose up -d --build
|
||||
```
|
@@ -1,40 +0,0 @@
|
||||
# Update Perplexica to the latest version
|
||||
|
||||
To update Perplexica to the latest version, follow these steps:
|
||||
|
||||
## For Docker users
|
||||
|
||||
1. Clone the latest version of Perplexica from GitHub:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ItzCrazyKns/Perplexica.git
|
||||
```
|
||||
|
||||
2. Navigate to the Project Directory.
|
||||
|
||||
3. Pull latest images from registry.
|
||||
|
||||
```bash
|
||||
docker compose pull
|
||||
```
|
||||
|
||||
4. Update and Recreate containers.
|
||||
|
||||
```bash
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
5. Once the command completes running go to http://localhost:3000 and verify the latest changes.
|
||||
|
||||
## For non Docker users
|
||||
|
||||
1. Clone the latest version of Perplexica from GitHub:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ItzCrazyKns/Perplexica.git
|
||||
```
|
||||
|
||||
2. Navigate to the Project Directory
|
||||
3. Execute `npm i` in both the `ui` folder and the root directory.
|
||||
4. Once packages are updated, execute `npm run build` in both the `ui` folder and the root directory.
|
||||
5. Finally, start both the frontend and the backend by running `npm run start` in both the `ui` folder and the root directory.
|
@@ -1,10 +0,0 @@
|
||||
import { defineConfig } from 'drizzle-kit';
|
||||
|
||||
export default defineConfig({
|
||||
dialect: 'sqlite',
|
||||
schema: './src/db/schema.ts',
|
||||
out: './drizzle',
|
||||
dbCredentials: {
|
||||
url: './data/db.sqlite',
|
||||
},
|
||||
});
|
41
frontend/.gitignore
vendored
@@ -1,41 +0,0 @@
|
||||
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
||||
|
||||
# dependencies
|
||||
/node_modules
|
||||
/.pnp
|
||||
.pnp.*
|
||||
.yarn/*
|
||||
!.yarn/patches
|
||||
!.yarn/plugins
|
||||
!.yarn/releases
|
||||
!.yarn/versions
|
||||
|
||||
# testing
|
||||
/coverage
|
||||
|
||||
# next.js
|
||||
/.next/
|
||||
/out/
|
||||
|
||||
# production
|
||||
/build
|
||||
|
||||
# misc
|
||||
.DS_Store
|
||||
*.pem
|
||||
|
||||
# debug
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# env files (can opt-in for committing if needed)
|
||||
.env*
|
||||
|
||||
# vercel
|
||||
.vercel
|
||||
|
||||
# typescript
|
||||
*.tsbuildinfo
|
||||
next-env.d.ts
|
@@ -1,36 +0,0 @@
|
||||
This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
|
||||
|
||||
## Getting Started
|
||||
|
||||
First, run the development server:
|
||||
|
||||
```bash
|
||||
npm run dev
|
||||
# or
|
||||
yarn dev
|
||||
# or
|
||||
pnpm dev
|
||||
# or
|
||||
bun dev
|
||||
```
|
||||
|
||||
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
|
||||
|
||||
You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
|
||||
|
||||
This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
|
||||
|
||||
## Learn More
|
||||
|
||||
To learn more about Next.js, take a look at the following resources:
|
||||
|
||||
- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
|
||||
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
|
||||
|
||||
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
|
||||
|
||||
## Deploy on Vercel
|
||||
|
||||
The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
|
||||
|
||||
Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
|
@@ -1,16 +0,0 @@
|
||||
import { dirname } from "path";
|
||||
import { fileURLToPath } from "url";
|
||||
import { FlatCompat } from "@eslint/eslintrc";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = dirname(__filename);
|
||||
|
||||
const compat = new FlatCompat({
|
||||
baseDirectory: __dirname,
|
||||
});
|
||||
|
||||
const eslintConfig = [
|
||||
...compat.extends("next/core-web-vitals", "next/typescript"),
|
||||
];
|
||||
|
||||
export default eslintConfig;
|
@@ -1,13 +0,0 @@
|
||||
/** @type {import('next').NextConfig} */
|
||||
const nextConfig = {
|
||||
async rewrites() {
|
||||
return [
|
||||
{
|
||||
source: '/api/:path*',
|
||||
destination: 'http://localhost:3000/api/:path*',
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = nextConfig
|
@@ -1,7 +0,0 @@
|
||||
import type { NextConfig } from "next";
|
||||
|
||||
const nextConfig: NextConfig = {
|
||||
/* config options here */
|
||||
};
|
||||
|
||||
export default nextConfig;
|
5848
frontend/package-lock.json
generated
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"name": "frontend",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"scripts": {
|
||||
"dev": "next dev",
|
||||
"build": "next build",
|
||||
"start": "next start",
|
||||
"lint": "next lint"
|
||||
},
|
||||
"dependencies": {
|
||||
"@radix-ui/react-icons": "^1.3.2",
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"lucide-react": "^0.469.0",
|
||||
"next": "15.1.3",
|
||||
"react": "^19.0.0",
|
||||
"react-dom": "^19.0.0",
|
||||
"tailwind-merge": "^2.6.0",
|
||||
"tailwindcss-animate": "^1.0.7"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@eslint/eslintrc": "^3",
|
||||
"@types/node": "^20",
|
||||
"@types/react": "^19",
|
||||
"@types/react-dom": "^19",
|
||||
"eslint": "^9",
|
||||
"eslint-config-next": "15.1.3",
|
||||
"postcss": "^8",
|
||||
"tailwindcss": "^3.4.1",
|
||||
"typescript": "^5"
|
||||
}
|
||||
}
|
@@ -1,8 +0,0 @@
|
||||
/** @type {import('postcss-load-config').Config} */
|
||||
const config = {
|
||||
plugins: {
|
||||
tailwindcss: {},
|
||||
},
|
||||
};
|
||||
|
||||
export default config;
|
@@ -1 +0,0 @@
|
||||
<svg fill="none" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M14.5 13.5V5.41a1 1 0 0 0-.3-.7L9.8.29A1 1 0 0 0 9.08 0H1.5v13.5A2.5 2.5 0 0 0 4 16h8a2.5 2.5 0 0 0 2.5-2.5m-1.5 0v-7H8v-5H3v12a1 1 0 0 0 1 1h8a1 1 0 0 0 1-1M9.5 5V2.12L12.38 5zM5.13 5h-.62v1.25h2.12V5zm-.62 3h7.12v1.25H4.5zm.62 3h-.62v1.25h7.12V11z" clip-rule="evenodd" fill="#666" fill-rule="evenodd"/></svg>
|
Before Width: | Height: | Size: 391 B |
@@ -1 +0,0 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><g clip-path="url(#a)"><path fill-rule="evenodd" clip-rule="evenodd" d="M10.27 14.1a6.5 6.5 0 0 0 3.67-3.45q-1.24.21-2.7.34-.31 1.83-.97 3.1M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16m.48-1.52a7 7 0 0 1-.96 0H7.5a4 4 0 0 1-.84-1.32q-.38-.89-.63-2.08a40 40 0 0 0 3.92 0q-.25 1.2-.63 2.08a4 4 0 0 1-.84 1.31zm2.94-4.76q1.66-.15 2.95-.43a7 7 0 0 0 0-2.58q-1.3-.27-2.95-.43a18 18 0 0 1 0 3.44m-1.27-3.54a17 17 0 0 1 0 3.64 39 39 0 0 1-4.3 0 17 17 0 0 1 0-3.64 39 39 0 0 1 4.3 0m1.1-1.17q1.45.13 2.69.34a6.5 6.5 0 0 0-3.67-3.44q.65 1.26.98 3.1M8.48 1.5l.01.02q.41.37.84 1.31.38.89.63 2.08a40 40 0 0 0-3.92 0q.25-1.2.63-2.08a4 4 0 0 1 .85-1.32 7 7 0 0 1 .96 0m-2.75.4a6.5 6.5 0 0 0-3.67 3.44 29 29 0 0 1 2.7-.34q.31-1.83.97-3.1M4.58 6.28q-1.66.16-2.95.43a7 7 0 0 0 0 2.58q1.3.27 2.95.43a18 18 0 0 1 0-3.44m.17 4.71q-1.45-.12-2.69-.34a6.5 6.5 0 0 0 3.67 3.44q-.65-1.27-.98-3.1" fill="#666"/></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h16v16H0z"/></clipPath></defs></svg>
|
Before Width: | Height: | Size: 1.0 KiB |
@@ -1 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>
|
Before Width: | Height: | Size: 1.3 KiB |
@@ -1 +0,0 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1155 1000"><path d="m577.3 0 577.4 1000H0z" fill="#fff"/></svg>
|
Before Width: | Height: | Size: 128 B |
@@ -1 +0,0 @@
|
||||
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.5 2.5h13v10a1 1 0 0 1-1 1h-11a1 1 0 0 1-1-1zM0 1h16v11.5a2.5 2.5 0 0 1-2.5 2.5h-11A2.5 2.5 0 0 1 0 12.5zm3.75 4.5a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5M7 4.75a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0m1.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5" fill="#666"/></svg>
|
Before Width: | Height: | Size: 385 B |
Before Width: | Height: | Size: 25 KiB |
@@ -1,76 +0,0 @@
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
@layer base {
|
||||
:root {
|
||||
--background: 0 0% 100%;
|
||||
--foreground: 222.2 84% 4.9%;
|
||||
|
||||
--card: 0 0% 100%;
|
||||
--card-foreground: 222.2 84% 4.9%;
|
||||
|
||||
--popover: 0 0% 100%;
|
||||
--popover-foreground: 222.2 84% 4.9%;
|
||||
|
||||
--primary: 222.2 47.4% 11.2%;
|
||||
--primary-foreground: 210 40% 98%;
|
||||
|
||||
--secondary: 210 40% 96.1%;
|
||||
--secondary-foreground: 222.2 47.4% 11.2%;
|
||||
|
||||
--muted: 210 40% 96.1%;
|
||||
--muted-foreground: 215.4 16.3% 46.9%;
|
||||
|
||||
--accent: 210 40% 96.1%;
|
||||
--accent-foreground: 222.2 47.4% 11.2%;
|
||||
|
||||
--destructive: 0 84.2% 60.2%;
|
||||
--destructive-foreground: 210 40% 98%;
|
||||
|
||||
--border: 214.3 31.8% 91.4%;
|
||||
--input: 214.3 31.8% 91.4%;
|
||||
--ring: 222.2 84% 4.9%;
|
||||
|
||||
--radius: 0.5rem;
|
||||
}
|
||||
|
||||
.dark {
|
||||
--background: 222.2 84% 4.9%;
|
||||
--foreground: 210 40% 98%;
|
||||
|
||||
--card: 222.2 84% 4.9%;
|
||||
--card-foreground: 210 40% 98%;
|
||||
|
||||
--popover: 222.2 84% 4.9%;
|
||||
--popover-foreground: 210 40% 98%;
|
||||
|
||||
--primary: 210 40% 98%;
|
||||
--primary-foreground: 222.2 47.4% 11.2%;
|
||||
|
||||
--secondary: 217.2 32.6% 17.5%;
|
||||
--secondary-foreground: 210 40% 98%;
|
||||
|
||||
--muted: 217.2 32.6% 17.5%;
|
||||
--muted-foreground: 215 20.2% 65.1%;
|
||||
|
||||
--accent: 217.2 32.6% 17.5%;
|
||||
--accent-foreground: 210 40% 98%;
|
||||
|
||||
--destructive: 0 62.8% 30.6%;
|
||||
--destructive-foreground: 210 40% 98%;
|
||||
|
||||
--border: 217.2 32.6% 17.5%;
|
||||
--input: 217.2 32.6% 17.5%;
|
||||
--ring: 212.7 26.8% 83.9%;
|
||||
}
|
||||
}
|
||||
|
||||
@layer base {
|
||||
* {
|
||||
@apply border-border;
|
||||
}
|
||||
body {
|
||||
@apply bg-background text-foreground;
|
||||
}
|
||||
}
|
@@ -1,34 +0,0 @@
|
||||
import type { Metadata } from "next";
|
||||
import { Geist, Geist_Mono } from "next/font/google";
|
||||
import "./globals.css";
|
||||
|
||||
const geistSans = Geist({
|
||||
variable: "--font-geist-sans",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
const geistMono = Geist_Mono({
|
||||
variable: "--font-geist-mono",
|
||||
subsets: ["latin"],
|
||||
});
|
||||
|
||||
export const metadata: Metadata = {
|
||||
title: "Create Next App",
|
||||
description: "Generated by create next app",
|
||||
};
|
||||
|
||||
export default function RootLayout({
|
||||
children,
|
||||
}: Readonly<{
|
||||
children: React.ReactNode;
|
||||
}>) {
|
||||
return (
|
||||
<html lang="en">
|
||||
<body
|
||||
className={`${geistSans.variable} ${geistMono.variable} antialiased`}
|
||||
>
|
||||
{children}
|
||||
</body>
|
||||
</html>
|
||||
);
|
||||
}
|
@@ -1,26 +0,0 @@
|
||||
'use client'
|
||||
|
||||
import { ServerStatus } from "@/components/server-status"
|
||||
import { SearchForm } from "@/components/search-form"
|
||||
import { SearchResults } from "@/components/search-results"
|
||||
import { useState } from "react"
|
||||
|
||||
export default function Home() {
|
||||
const [searchResults, setSearchResults] = useState([])
|
||||
const [isSearching, setIsSearching] = useState(false)
|
||||
|
||||
const services = [
|
||||
{ name: "Ollama", status: "running" as const },
|
||||
{ name: "SearxNG", status: "running" as const },
|
||||
{ name: "Supabase", status: "running" as const }
|
||||
]
|
||||
|
||||
return (
|
||||
<main className="container mx-auto p-4">
|
||||
<h1 className="text-4xl font-bold text-center mb-8">Business Search</h1>
|
||||
<SearchForm onSearch={setSearchResults} onSearchingChange={setIsSearching} />
|
||||
<SearchResults results={searchResults} isLoading={isSearching} />
|
||||
<ServerStatus services={services} />
|
||||
</main>
|
||||
)
|
||||
}
|
@@ -1,79 +0,0 @@
|
||||
import { Search } from "lucide-react"
|
||||
import { useState } from "react"
|
||||
|
||||
interface SearchFormProps {
|
||||
onSearch: (results: any[]) => void;
|
||||
onSearchingChange: (isSearching: boolean) => void;
|
||||
}
|
||||
|
||||
export function SearchForm({ onSearch, onSearchingChange }: SearchFormProps) {
|
||||
const [query, setQuery] = useState("")
|
||||
const [error, setError] = useState<string | null>(null)
|
||||
|
||||
const handleSearch = async (e: React.FormEvent) => {
|
||||
e.preventDefault()
|
||||
if (!query.trim()) return
|
||||
|
||||
setError(null)
|
||||
onSearchingChange(true)
|
||||
try {
|
||||
const response = await fetch("/api/search", {
|
||||
method: "POST",
|
||||
headers: {
|
||||
"Content-Type": "application/json",
|
||||
},
|
||||
body: JSON.stringify({ query: query.trim() }),
|
||||
})
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error("Search failed")
|
||||
}
|
||||
|
||||
const data = await response.json()
|
||||
onSearch(data.results || [])
|
||||
|
||||
} catch (error) {
|
||||
console.error("Search error:", error)
|
||||
onSearch([])
|
||||
setError("Failed to perform search. Please try again.")
|
||||
} finally {
|
||||
onSearchingChange(false)
|
||||
}
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="w-full max-w-2xl mx-auto mt-8 mb-12">
|
||||
<div className="flex flex-col gap-4">
|
||||
<div className="flex flex-col gap-2">
|
||||
<label htmlFor="search" className="text-lg font-medium text-center">
|
||||
Find local businesses
|
||||
</label>
|
||||
<form onSubmit={handleSearch} className="relative">
|
||||
<input
|
||||
id="search"
|
||||
type="text"
|
||||
value={query}
|
||||
onChange={(e) => setQuery(e.target.value)}
|
||||
placeholder="e.g. plumbers in Denver, CO"
|
||||
className="w-full px-4 py-3 text-lg rounded-lg border border-border bg-background focus:outline-none focus:ring-2 focus:ring-primary"
|
||||
/>
|
||||
<button
|
||||
type="submit"
|
||||
disabled={!query.trim()}
|
||||
className="absolute right-2 top-1/2 -translate-y-1/2 p-3 rounded-md bg-primary text-primary-foreground hover:bg-primary/90 transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
|
||||
aria-label="Search"
|
||||
>
|
||||
<Search className="h-5 w-5" />
|
||||
</button>
|
||||
</form>
|
||||
{error && (
|
||||
<p className="text-sm text-destructive text-center">{error}</p>
|
||||
)}
|
||||
<p className="text-sm text-muted-foreground text-center mt-2">
|
||||
Try searching for: restaurants, dentists, electricians, etc.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
@@ -1,76 +0,0 @@
|
||||
interface Business {
|
||||
id: string;
|
||||
name: string;
|
||||
address: string;
|
||||
phone: string;
|
||||
website?: string;
|
||||
email?: string;
|
||||
description?: string;
|
||||
rating?: number;
|
||||
}
|
||||
|
||||
interface SearchResultsProps {
|
||||
results: Business[];
|
||||
isLoading: boolean;
|
||||
}
|
||||
|
||||
export function SearchResults({ results, isLoading }: SearchResultsProps) {
|
||||
if (isLoading) {
|
||||
return (
|
||||
<div className="w-full max-w-4xl mx-auto mt-8">
|
||||
<div className="animate-pulse space-y-4">
|
||||
{[...Array(3)].map((_, i) => (
|
||||
<div key={i} className="bg-muted rounded-lg p-6">
|
||||
<div className="h-4 bg-muted-foreground/20 rounded w-3/4 mb-4"></div>
|
||||
<div className="h-3 bg-muted-foreground/20 rounded w-1/2"></div>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
if (!results.length) {
|
||||
return null
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="w-full max-w-4xl mx-auto mt-8">
|
||||
<div className="space-y-4">
|
||||
{results.map((business) => (
|
||||
<div key={business.id} className="bg-card rounded-lg p-6 shadow-sm">
|
||||
<h3 className="text-xl font-semibold mb-2">{business.name}</h3>
|
||||
{business.address && (
|
||||
<p className="text-muted-foreground mb-2">{business.address}</p>
|
||||
)}
|
||||
<div className="flex flex-wrap gap-4 text-sm">
|
||||
{business.phone && (
|
||||
<a
|
||||
href={`tel:${business.phone}`}
|
||||
className="text-primary hover:underline"
|
||||
>
|
||||
{business.phone}
|
||||
</a>
|
||||
)}
|
||||
{business.website && (
|
||||
<a
|
||||
href={business.website}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-primary hover:underline"
|
||||
>
|
||||
Visit Website
|
||||
</a>
|
||||
)}
|
||||
</div>
|
||||
{business.description && (
|
||||
<p className="mt-4 text-sm text-muted-foreground">
|
||||
{business.description}
|
||||
</p>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
@@ -1,59 +0,0 @@
|
||||
import { CheckCircle2, XCircle, AlertCircle } from "lucide-react"
|
||||
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"
|
||||
|
||||
interface ServiceStatus {
|
||||
name: string
|
||||
status: "running" | "error" | "warning"
|
||||
}
|
||||
|
||||
interface ServerStatusProps {
|
||||
services: ServiceStatus[]
|
||||
error?: string
|
||||
}
|
||||
|
||||
export function ServerStatus({ services, error }: ServerStatusProps) {
|
||||
if (error) {
|
||||
return (
|
||||
<Alert variant="destructive" className="max-w-md mx-auto mt-4">
|
||||
<XCircle className="h-4 w-4" />
|
||||
<AlertTitle>Server Error</AlertTitle>
|
||||
<AlertDescription>{error}</AlertDescription>
|
||||
</Alert>
|
||||
)
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="space-y-4 max-w-md mx-auto mt-4">
|
||||
<h2 className="text-xl font-semibold text-center mb-6">Service Status</h2>
|
||||
<div className="space-y-3">
|
||||
{services.map((service) => (
|
||||
<Alert
|
||||
key={service.name}
|
||||
variant={service.status === "error" ? "destructive" : "default"}
|
||||
className="flex items-center justify-between hover:bg-accent/50 transition-colors"
|
||||
>
|
||||
<div className="flex items-center gap-3">
|
||||
{service.status === "running" && (
|
||||
<CheckCircle2 className="h-5 w-5 text-green-500 shrink-0" />
|
||||
)}
|
||||
{service.status === "error" && (
|
||||
<XCircle className="h-5 w-5 text-red-500 shrink-0" />
|
||||
)}
|
||||
{service.status === "warning" && (
|
||||
<AlertCircle className="h-5 w-5 text-yellow-500 shrink-0" />
|
||||
)}
|
||||
<AlertTitle className="font-medium">{service.name}</AlertTitle>
|
||||
</div>
|
||||
<span className={`text-sm ${
|
||||
service.status === "running" ? "text-green-600" :
|
||||
service.status === "error" ? "text-red-600" :
|
||||
"text-yellow-600"
|
||||
}`}>
|
||||
{service.status.charAt(0).toUpperCase() + service.status.slice(1)}
|
||||
</span>
|
||||
</Alert>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
@@ -1,58 +0,0 @@
|
||||
import * as React from "react"
|
||||
import { cva, type VariantProps } from "class-variance-authority"
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const alertVariants = cva(
|
||||
"relative w-full rounded-lg border p-4 [&>svg~*]:pl-7 [&>svg+div]:translate-y-[-3px] [&>svg]:absolute [&>svg]:left-4 [&>svg]:top-4 [&>svg]:text-foreground",
|
||||
{
|
||||
variants: {
|
||||
variant: {
|
||||
default: "bg-background text-foreground",
|
||||
destructive:
|
||||
"border-destructive/50 text-destructive dark:border-destructive [&>svg]:text-destructive",
|
||||
},
|
||||
},
|
||||
defaultVariants: {
|
||||
variant: "default",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
const Alert = React.forwardRef<
|
||||
HTMLDivElement,
|
||||
React.HTMLAttributes<HTMLDivElement> & VariantProps<typeof alertVariants>
|
||||
>(({ className, variant, ...props }, ref) => (
|
||||
<div
|
||||
ref={ref}
|
||||
role="alert"
|
||||
className={cn(alertVariants({ variant }), className)}
|
||||
{...props}
|
||||
/>
|
||||
))
|
||||
Alert.displayName = "Alert"
|
||||
|
||||
const AlertTitle = React.forwardRef<
|
||||
HTMLParagraphElement,
|
||||
React.HTMLAttributes<HTMLHeadingElement>
|
||||
>(({ className, ...props }, ref) => (
|
||||
<h5
|
||||
ref={ref}
|
||||
className={cn("mb-1 font-medium leading-none tracking-tight", className)}
|
||||
{...props}
|
||||
/>
|
||||
))
|
||||
AlertTitle.displayName = "AlertTitle"
|
||||
|
||||
const AlertDescription = React.forwardRef<
|
||||
HTMLParagraphElement,
|
||||
React.HTMLAttributes<HTMLParagraphElement>
|
||||
>(({ className, ...props }, ref) => (
|
||||
<div
|
||||
ref={ref}
|
||||
className={cn("text-sm [&_p]:leading-relaxed", className)}
|
||||
{...props}
|
||||
/>
|
||||
))
|
||||
AlertDescription.displayName = "AlertDescription"
|
||||
|
||||
export { Alert, AlertTitle, AlertDescription }
|
@@ -1,6 +0,0 @@
|
||||
import { type ClassValue, clsx } from "clsx"
|
||||
import { twMerge } from "tailwind-merge"
|
||||
|
||||
export function cn(...inputs: ClassValue[]) {
|
||||
return twMerge(clsx(inputs))
|
||||
}
|
@@ -1,79 +0,0 @@
|
||||
import type { Config } from "tailwindcss";
|
||||
|
||||
const config: Config = {
|
||||
darkMode: ["class"],
|
||||
content: [
|
||||
'./pages/**/*.{ts,tsx}',
|
||||
'./components/**/*.{ts,tsx}',
|
||||
'./app/**/*.{ts,tsx}',
|
||||
'./src/**/*.{ts,tsx}',
|
||||
],
|
||||
theme: {
|
||||
container: {
|
||||
center: true,
|
||||
padding: "2rem",
|
||||
screens: {
|
||||
"2xl": "1400px",
|
||||
},
|
||||
},
|
||||
extend: {
|
||||
colors: {
|
||||
border: "hsl(var(--border))",
|
||||
input: "hsl(var(--input))",
|
||||
ring: "hsl(var(--ring))",
|
||||
background: "hsl(var(--background))",
|
||||
foreground: "hsl(var(--foreground))",
|
||||
primary: {
|
||||
DEFAULT: "hsl(var(--primary))",
|
||||
foreground: "hsl(var(--primary-foreground))",
|
||||
},
|
||||
secondary: {
|
||||
DEFAULT: "hsl(var(--secondary))",
|
||||
foreground: "hsl(var(--secondary-foreground))",
|
||||
},
|
||||
destructive: {
|
||||
DEFAULT: "hsl(var(--destructive))",
|
||||
foreground: "hsl(var(--destructive-foreground))",
|
||||
},
|
||||
muted: {
|
||||
DEFAULT: "hsl(var(--muted))",
|
||||
foreground: "hsl(var(--muted-foreground))",
|
||||
},
|
||||
accent: {
|
||||
DEFAULT: "hsl(var(--accent))",
|
||||
foreground: "hsl(var(--accent-foreground))",
|
||||
},
|
||||
popover: {
|
||||
DEFAULT: "hsl(var(--popover))",
|
||||
foreground: "hsl(var(--popover-foreground))",
|
||||
},
|
||||
card: {
|
||||
DEFAULT: "hsl(var(--card))",
|
||||
foreground: "hsl(var(--card-foreground))",
|
||||
},
|
||||
},
|
||||
borderRadius: {
|
||||
lg: "var(--radius)",
|
||||
md: "calc(var(--radius) - 2px)",
|
||||
sm: "calc(var(--radius) - 4px)",
|
||||
},
|
||||
keyframes: {
|
||||
"accordion-down": {
|
||||
from: { height: "0" },
|
||||
to: { height: "var(--radix-accordion-content-height)" },
|
||||
},
|
||||
"accordion-up": {
|
||||
from: { height: "var(--radix-accordion-content-height)" },
|
||||
to: { height: "0" },
|
||||
},
|
||||
},
|
||||
animation: {
|
||||
"accordion-down": "accordion-down 0.2s ease-out",
|
||||
"accordion-up": "accordion-up 0.2s ease-out",
|
||||
},
|
||||
},
|
||||
},
|
||||
plugins: [require("tailwindcss-animate")],
|
||||
}
|
||||
|
||||
export default config;
|
@@ -1,27 +0,0 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2017",
|
||||
"lib": ["dom", "dom.iterable", "esnext"],
|
||||
"allowJs": true,
|
||||
"skipLibCheck": true,
|
||||
"strict": true,
|
||||
"noEmit": true,
|
||||
"esModuleInterop": true,
|
||||
"module": "esnext",
|
||||
"moduleResolution": "bundler",
|
||||
"resolveJsonModule": true,
|
||||
"isolatedModules": true,
|
||||
"jsx": "preserve",
|
||||
"incremental": true,
|
||||
"plugins": [
|
||||
{
|
||||
"name": "next"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"@/*": ["./src/*"]
|
||||
}
|
||||
},
|
||||
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
|
||||
"exclude": ["node_modules"]
|
||||
}
|
@@ -1,17 +0,0 @@
|
||||
module.exports = {
|
||||
preset: 'ts-jest',
|
||||
testEnvironment: 'node',
|
||||
roots: ['<rootDir>/src'],
|
||||
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
|
||||
transform: {
|
||||
'^.+\\.ts$': 'ts-jest',
|
||||
},
|
||||
moduleFileExtensions: ['ts', 'js', 'json', 'node'],
|
||||
collectCoverageFrom: [
|
||||
'src/**/*.{ts,js}',
|
||||
'!src/tests/**',
|
||||
'!**/node_modules/**',
|
||||
],
|
||||
coverageDirectory: 'coverage',
|
||||
setupFilesAfterEnv: ['<rootDir>/src/tests/setup.ts'],
|
||||
};
|
14318
package-lock.json
generated
61
package.json
@@ -1,80 +1,33 @@
|
||||
{
|
||||
"name": "perplexica-backend",
|
||||
"version": "1.10.0-rc2",
|
||||
"version": "1.0.0",
|
||||
"license": "MIT",
|
||||
"author": "ItzCrazyKns",
|
||||
"scripts": {
|
||||
"start": "ts-node src/index.ts",
|
||||
"start": "node --env-file=.env dist/app.js",
|
||||
"build": "tsc",
|
||||
"dev": "nodemon src/index.ts",
|
||||
"db:push": "drizzle-kit push sqlite",
|
||||
"dev": "nodemon -r dotenv/config src/app.ts",
|
||||
"format": "prettier . --check",
|
||||
"format:write": "prettier . --write",
|
||||
"test:search": "ts-node src/tests/testSearch.ts",
|
||||
"test:supabase": "ts-node src/tests/supabaseTest.ts",
|
||||
"test:deepseek": "ts-node src/tests/testDeepseek.ts",
|
||||
"test:ollama": "ts-node src/tests/testOllama.ts",
|
||||
"test": "jest",
|
||||
"test:watch": "jest --watch",
|
||||
"test:coverage": "jest --coverage",
|
||||
"build:css": "tailwindcss -i ./src/styles/input.css -o ./public/styles/output.css",
|
||||
"watch:css": "tailwindcss -i ./src/styles/input.css -o ./public/styles/output.css --watch"
|
||||
"format:write": "prettier . --write"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@testing-library/jest-dom": "^6.1.5",
|
||||
"@types/better-sqlite3": "^7.6.10",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/express": "^4.17.21",
|
||||
"@types/html-to-text": "^9.0.4",
|
||||
"@types/jest": "^29.5.11",
|
||||
"@types/multer": "^1.4.12",
|
||||
"@types/node-fetch": "^2.6.12",
|
||||
"@types/pdf-parse": "^1.1.4",
|
||||
"@types/readable-stream": "^4.0.11",
|
||||
"@types/supertest": "^6.0.2",
|
||||
"@types/ws": "^8.5.12",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"drizzle-kit": "^0.22.7",
|
||||
"jest": "^29.7.0",
|
||||
"nodemon": "^3.1.0",
|
||||
"postcss": "^8.4.49",
|
||||
"prettier": "^3.2.5",
|
||||
"supertest": "^7.0.0",
|
||||
"tailwindcss": "^3.4.17",
|
||||
"ts-jest": "^29.1.1",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.3"
|
||||
},
|
||||
"dependencies": {
|
||||
"@huggingface/transformers": "latest",
|
||||
"@iarna/toml": "^2.2.5",
|
||||
"@langchain/anthropic": "^0.2.3",
|
||||
"@langchain/community": "^0.2.16",
|
||||
"@langchain/google-genai": "^0.0.23",
|
||||
"@langchain/openai": "^0.0.25",
|
||||
"@shadcn/ui": "^0.0.4",
|
||||
"@supabase/supabase-js": "^2.47.10",
|
||||
"@xenova/transformers": "^2.17.1",
|
||||
"axios": "^1.6.8",
|
||||
"better-sqlite3": "^11.7.0",
|
||||
"cheerio": "^1.0.0",
|
||||
"compute-cosine-similarity": "^1.1.0",
|
||||
"compute-dot": "^1.1.0",
|
||||
"cors": "^2.8.5",
|
||||
"dotenv": "^16.4.7",
|
||||
"drizzle-orm": "^0.31.2",
|
||||
"dotenv": "^16.4.5",
|
||||
"express": "^4.19.2",
|
||||
"html-to-text": "^9.0.5",
|
||||
"langchain": "^0.1.30",
|
||||
"mammoth": "^1.8.0",
|
||||
"multer": "^1.4.5-lts.1",
|
||||
"node-fetch": "^2.7.0",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"robots-parser": "^3.0.1",
|
||||
"tesseract.js": "^4.1.4",
|
||||
"torch": "latest",
|
||||
"winston": "^3.13.0",
|
||||
"ws": "^8.17.1",
|
||||
"zod": "^3.24.1"
|
||||
"ws": "^8.16.0",
|
||||
"zod": "^3.22.4"
|
||||
}
|
||||
}
|
||||
|
@@ -1,6 +0,0 @@
|
||||
module.exports = {
|
||||
plugins: {
|
||||
tailwindcss: {},
|
||||
autoprefixer: {},
|
||||
},
|
||||
}
|
@@ -1,214 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en" class="h-full bg-gray-50">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>OffMarket Pro - Business Search</title>
|
||||
<link href="/styles/output.css" rel="stylesheet">
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
|
||||
</head>
|
||||
<body class="min-h-full">
|
||||
<div class="bg-white">
|
||||
<!-- Navigation -->
|
||||
<nav class="bg-white shadow-sm">
|
||||
<div class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8">
|
||||
<div class="flex h-16 justify-between items-center">
|
||||
<div class="flex-shrink-0 flex items-center">
|
||||
<h1 class="text-xl font-bold text-gray-900">OffMarket Pro</h1>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
|
||||
<!-- Main Content -->
|
||||
<main class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8 py-8">
|
||||
<!-- Search Form -->
|
||||
<div class="mb-8">
|
||||
<h2 class="text-2xl font-bold text-gray-900 mb-6">Find Off-Market Property Services</h2>
|
||||
<div class="grid grid-cols-1 gap-4 sm:grid-cols-2">
|
||||
<div>
|
||||
<label for="searchQuery" class="block text-sm font-medium text-gray-700">Service Type</label>
|
||||
<input type="text" id="searchQuery" class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-primary focus:ring-primary sm:text-sm" placeholder="e.g. plumber, electrician">
|
||||
</div>
|
||||
<div>
|
||||
<label for="searchLocation" class="block text-sm font-medium text-gray-700">Location</label>
|
||||
<input type="text" id="searchLocation" class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-primary focus:ring-primary sm:text-sm" placeholder="e.g. Denver, CO">
|
||||
</div>
|
||||
</div>
|
||||
<div class="mt-4">
|
||||
<button onclick="performSearch()" class="inline-flex items-center px-4 py-2 border border-transparent text-sm font-medium rounded-md shadow-sm text-white bg-primary hover:bg-primary-hover focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-primary">
|
||||
Search
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Progress Indicator -->
|
||||
<div id="searchProgress" class="hidden mb-8">
|
||||
<div class="bg-white shadow sm:rounded-lg">
|
||||
<div class="px-4 py-5 sm:p-6">
|
||||
<h3 class="text-lg font-medium leading-6 text-gray-900">Search Progress</h3>
|
||||
<div class="mt-4">
|
||||
<div class="relative pt-1">
|
||||
<div class="overflow-hidden h-2 mb-4 text-xs flex rounded bg-gray-200">
|
||||
<div id="progressBar" class="shadow-none flex flex-col text-center whitespace-nowrap text-white justify-center bg-primary transition-all duration-500" style="width: 0%"></div>
|
||||
</div>
|
||||
<div id="progressText" class="text-sm text-gray-600"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Error Display -->
|
||||
<div id="errorDisplay" class="hidden mb-8">
|
||||
<div class="rounded-md bg-red-50 p-4">
|
||||
<div class="flex">
|
||||
<div class="flex-shrink-0">
|
||||
<svg class="h-5 w-5 text-red-400" viewBox="0 0 20 20" fill="currentColor">
|
||||
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clip-rule="evenodd"/>
|
||||
</svg>
|
||||
</div>
|
||||
<div class="ml-3">
|
||||
<h3 class="text-sm font-medium text-red-800">Error</h3>
|
||||
<div class="mt-2 text-sm text-red-700">
|
||||
<p id="errorMessage"></p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Results Table -->
|
||||
<div id="resultsContainer" class="hidden">
|
||||
<div class="bg-white shadow overflow-hidden sm:rounded-lg">
|
||||
<div class="px-4 py-5 sm:px-6">
|
||||
<h3 class="text-lg leading-6 font-medium text-gray-900">Search Results</h3>
|
||||
</div>
|
||||
<div class="border-t border-gray-200">
|
||||
<div class="overflow-x-auto">
|
||||
<table class="min-w-full divide-y divide-gray-200">
|
||||
<thead class="bg-gray-50">
|
||||
<tr>
|
||||
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Business</th>
|
||||
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Contact</th>
|
||||
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="resultsBody" class="bg-white divide-y divide-gray-200">
|
||||
<!-- Results will be inserted here -->
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</main>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
class SearchProgress {
|
||||
constructor() {
|
||||
this.progressBar = document.getElementById('progressBar');
|
||||
this.progressText = document.getElementById('progressText');
|
||||
this.container = document.getElementById('searchProgress');
|
||||
}
|
||||
|
||||
show() {
|
||||
this.container.classList.remove('hidden');
|
||||
this.setProgress(0, 'Starting search...');
|
||||
}
|
||||
|
||||
hide() {
|
||||
this.container.classList.add('hidden');
|
||||
}
|
||||
|
||||
setProgress(percent, message) {
|
||||
this.progressBar.style.width = `${percent}%`;
|
||||
this.progressText.textContent = message;
|
||||
}
|
||||
|
||||
showError(message) {
|
||||
this.setProgress(100, `Error: ${message}`);
|
||||
this.progressBar.classList.remove('bg-primary');
|
||||
this.progressBar.classList.add('bg-red-500');
|
||||
}
|
||||
}
|
||||
|
||||
async function performSearch() {
|
||||
const query = document.getElementById('searchQuery').value;
|
||||
const location = document.getElementById('searchLocation').value;
|
||||
|
||||
if (!query || !location) {
|
||||
showError('Please enter both search query and location');
|
||||
return;
|
||||
}
|
||||
|
||||
const progress = new SearchProgress();
|
||||
progress.show();
|
||||
|
||||
try {
|
||||
document.getElementById('errorDisplay').classList.add('hidden');
|
||||
document.getElementById('resultsContainer').classList.add('hidden');
|
||||
|
||||
const response = await fetch('/api/search', {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({ query, location })
|
||||
});
|
||||
|
||||
const data = await response.json();
|
||||
|
||||
if (!data.success) {
|
||||
throw new Error(data.error || 'Search failed');
|
||||
}
|
||||
|
||||
displayResults(data.results);
|
||||
progress.hide();
|
||||
|
||||
} catch (error) {
|
||||
console.error('Search error:', error);
|
||||
progress.showError(error.message);
|
||||
showError(error.message);
|
||||
}
|
||||
}
|
||||
|
||||
function showError(message) {
|
||||
const errorDisplay = document.getElementById('errorDisplay');
|
||||
const errorMessage = document.getElementById('errorMessage');
|
||||
errorMessage.textContent = message;
|
||||
errorDisplay.classList.remove('hidden');
|
||||
}
|
||||
|
||||
function displayResults(results) {
|
||||
const container = document.getElementById('resultsContainer');
|
||||
const tbody = document.getElementById('resultsBody');
|
||||
|
||||
tbody.innerHTML = results.map(business => `
|
||||
<tr>
|
||||
<td class="px-6 py-4">
|
||||
<div class="text-sm font-medium text-gray-900">${business.name}</div>
|
||||
<div class="text-sm text-gray-500">${business.description}</div>
|
||||
</td>
|
||||
<td class="px-6 py-4">
|
||||
<div class="text-sm text-gray-900">${business.address}</div>
|
||||
<div class="text-sm text-gray-500">${business.phone}</div>
|
||||
</td>
|
||||
<td class="px-6 py-4">
|
||||
${business.website ?
|
||||
`<a href="${business.website}" target="_blank"
|
||||
class="inline-flex items-center px-3 py-2 border border-transparent text-sm leading-4 font-medium rounded-md text-white bg-primary hover:bg-primary-hover focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-primary">
|
||||
Visit Website
|
||||
</a>` :
|
||||
'<span class="text-sm text-gray-500">No website available</span>'
|
||||
}
|
||||
</td>
|
||||
</tr>
|
||||
`).join('');
|
||||
|
||||
container.classList.remove('hidden');
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
@@ -1,14 +0,0 @@
|
||||
[GENERAL]
|
||||
PORT = 3001 # Port to run the server on
|
||||
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
|
||||
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded into memory. (Instead of using -1 use "-1m")
|
||||
|
||||
[API_KEYS]
|
||||
OPENAI = "" # OpenAI API key - sk-1234567890abcdef1234567890abcdef
|
||||
GROQ = "" # Groq API key - gsk_1234567890abcdef1234567890abcdef
|
||||
ANTHROPIC = "" # Anthropic API key - sk-ant-1234567890abcdef1234567890abcdef
|
||||
GEMINI = "" # Gemini API key - sk-1234567890abcdef1234567890abcdef
|
||||
|
||||
[API_ENDPOINTS]
|
||||
SEARXNG = "http://localhost:32768" # SearxNG API URL
|
||||
OLLAMA = "" # Ollama API URL - http://host.docker.internal:11434
|
2380
searxng-settings.yml
Normal file
3
searxng.dockerfile
Normal file
@@ -0,0 +1,3 @@
|
||||
FROM searxng/searxng
|
||||
|
||||
COPY searxng-settings.yml /etc/searxng/settings.yml
|
@@ -1,3 +0,0 @@
|
||||
[botdetection.ip_limit]
|
||||
# activate link_token method in the ip_limit method
|
||||
link_token = true
|
@@ -1,59 +0,0 @@
|
||||
use_default_settings: true
|
||||
|
||||
general:
|
||||
instance_name: 'searxng'
|
||||
|
||||
search:
|
||||
autocomplete: 'google'
|
||||
formats:
|
||||
- html
|
||||
- json
|
||||
|
||||
server:
|
||||
secret_key: 'a2fb23f1b02e6ee83875b09826990de0f6bd908b6638e8c10277d415f6ab852b' # Is overwritten by ${SEARXNG_SECRET}
|
||||
port: 8080
|
||||
bind_address: "0.0.0.0"
|
||||
base_url: http://localhost:8080/
|
||||
|
||||
engines:
|
||||
- name: wolframalpha
|
||||
disabled: false
|
||||
|
||||
- name: google
|
||||
engine: google
|
||||
shortcut: g
|
||||
disabled: false
|
||||
|
||||
- name: bing
|
||||
engine: bing
|
||||
shortcut: b
|
||||
disabled: false
|
||||
|
||||
- name: duckduckgo
|
||||
engine: duckduckgo
|
||||
shortcut: d
|
||||
disabled: false
|
||||
|
||||
- name: yelp
|
||||
engine: yelp
|
||||
shortcut: y
|
||||
disabled: false
|
||||
|
||||
ui:
|
||||
static_path: ""
|
||||
templates_path: ""
|
||||
default_theme: simple
|
||||
default_locale: en
|
||||
results_on_new_tab: false
|
||||
|
||||
outgoing:
|
||||
request_timeout: 6.0
|
||||
max_request_timeout: 10.0
|
||||
pool_connections: 100
|
||||
pool_maxsize: 10
|
||||
enable_http2: true
|
||||
|
||||
server:
|
||||
limiter: false
|
||||
image_proxy: false
|
||||
http_protocol_version: "1.0"
|
@@ -1,50 +0,0 @@
|
||||
[uwsgi]
|
||||
# Who will run the code
|
||||
uid = searxng
|
||||
gid = searxng
|
||||
|
||||
# Number of workers (usually CPU count)
|
||||
# default value: %k (= number of CPU core, see Dockerfile)
|
||||
workers = %k
|
||||
|
||||
# Number of threads per worker
|
||||
# default value: 4 (see Dockerfile)
|
||||
threads = 4
|
||||
|
||||
# The right granted on the created socket
|
||||
chmod-socket = 666
|
||||
|
||||
# Plugin to use and interpreter config
|
||||
single-interpreter = true
|
||||
master = true
|
||||
plugin = python3
|
||||
lazy-apps = true
|
||||
enable-threads = 4
|
||||
|
||||
# Module to import
|
||||
module = searx.webapp
|
||||
|
||||
# Virtualenv and python path
|
||||
pythonpath = /usr/local/searxng/
|
||||
chdir = /usr/local/searxng/searx/
|
||||
|
||||
# automatically set processes name to something meaningful
|
||||
auto-procname = true
|
||||
|
||||
# Disable request logging for privacy
|
||||
disable-logging = true
|
||||
log-5xx = true
|
||||
|
||||
# Set the max size of a request (request-body excluded)
|
||||
buffer-size = 8192
|
||||
|
||||
# No keep alive
|
||||
# See https://github.com/searx/searx-docker/issues/24
|
||||
add-header = Connection: close
|
||||
|
||||
# uwsgi serves the static files
|
||||
static-map = /static=/usr/local/searxng/searx/static
|
||||
# expires set to one day
|
||||
static-expires = /* 86400
|
||||
static-gzip-all = True
|
||||
offload-threads = 4
|
260
src/agents/academicSearchAgent.ts
Normal file
@@ -0,0 +1,260 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
PromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { Document } from '@langchain/core/documents';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import eventEmitter from 'events';
|
||||
import computeSimilarity from '../utils/computeSimilarity';
|
||||
|
||||
const chatLLM = new ChatOllama({
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
model: process.env.MODEL_NAME,
|
||||
temperature: 0.7,
|
||||
});
|
||||
|
||||
const llm = new Ollama({
|
||||
temperature: 0,
|
||||
model: process.env.MODEL_NAME,
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
});
|
||||
|
||||
const embeddings = new OllamaEmbeddings({
|
||||
model: process.env.MODEL_NAME,
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
});
|
||||
|
||||
const basicAcademicSearchRetrieverPrompt = `
|
||||
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
|
||||
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
|
||||
|
||||
Example:
|
||||
1. Follow up question: How does stable diffusion work?
|
||||
Rephrased: Stable diffusion working
|
||||
|
||||
2. Follow up question: What is linear algebra?
|
||||
Rephrased: Linear algebra
|
||||
|
||||
3. Follow up question: What is the third law of thermodynamics?
|
||||
Rephrased: Third law of thermodynamics
|
||||
|
||||
Conversation:
|
||||
{chat_history}
|
||||
|
||||
Follow up question: {query}
|
||||
Rephrased question:
|
||||
`;
|
||||
|
||||
const basicAcademicSearchResponsePrompt = `
|
||||
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Acadedemic', this means you will be searching for academic papers and articles on the web.
|
||||
|
||||
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
|
||||
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
|
||||
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
|
||||
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
|
||||
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
|
||||
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
|
||||
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
|
||||
|
||||
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
|
||||
talk about the context in your response.
|
||||
|
||||
<context>
|
||||
{context}
|
||||
</context>
|
||||
|
||||
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
|
||||
Anything between the \`context\` is retrieved from a search engine and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
|
||||
`;
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const handleStream = async (
|
||||
stream: AsyncGenerator<StreamEvent, any, unknown>,
|
||||
emitter: eventEmitter,
|
||||
) => {
|
||||
for await (const event of stream) {
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalSourceRetriever'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'sources', data: event.data.output }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_stream' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'response', data: event.data.chunk }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit('end');
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const processDocs = async (docs: Document[]) => {
|
||||
return docs
|
||||
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
|
||||
.join('\n');
|
||||
};
|
||||
|
||||
const rerankDocs = async ({
|
||||
query,
|
||||
docs,
|
||||
}: {
|
||||
query: string;
|
||||
docs: Document[];
|
||||
}) => {
|
||||
if (docs.length === 0) {
|
||||
return docs;
|
||||
}
|
||||
|
||||
const docsWithContent = docs.filter(
|
||||
(doc) => doc.pageContent && doc.pageContent.length > 0,
|
||||
);
|
||||
|
||||
const docEmbeddings = await embeddings.embedDocuments(
|
||||
docsWithContent.map((doc) => doc.pageContent),
|
||||
);
|
||||
|
||||
const queryEmbedding = await embeddings.embedQuery(query);
|
||||
|
||||
const similarity = docEmbeddings.map((docEmbedding, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, docEmbedding);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
};
|
||||
});
|
||||
|
||||
const sortedDocs = similarity
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, 15)
|
||||
.map((sim) => docsWithContent[sim.index]);
|
||||
|
||||
return sortedDocs;
|
||||
};
|
||||
|
||||
// Input shape shared by the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

/**
 * Retriever chain: rephrases the follow-up question into a standalone query
 * (or the sentinel `not_needed`), then searches academic engines via SearxNG
 * and wraps each result in a Document.
 */
const basicAcademicSearchRetrieverChain = RunnableSequence.from([
  PromptTemplate.fromTemplate(basicAcademicSearchRetrieverPrompt),
  llm,
  strParser,
  RunnableLambda.from(async (input: string) => {
    // The retriever prompt instructs the LLM to emit `not_needed` for
    // greetings / writing tasks — skip the search entirely in that case.
    if (input === 'not_needed') {
      return { query: '', docs: [] };
    }

    const res = await searchSearxng(input, {
      language: 'en',
      engines: [
        'arxiv',
        'google_scholar',
        'internet_archive_scholar',
        'pubmed',
      ],
    });

    const documents = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content,
          metadata: {
            title: result.title,
            url: result.url,
            // Only attach img_src when the engine returned one.
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );

    return { query: input, docs: documents };
  }),
]);
|
||||
|
||||
/**
 * Full answering pipeline:
 *  1. RunnableMap fans the input out into `query`, `chat_history` and
 *     `context` (retrieved, reranked, numbered documents).
 *  2. The chat prompt injects those values into system/user messages.
 *  3. chatLLM streams the final answer, parsed to a string.
 * The run names `FinalSourceRetriever` / `FinalResponseGenerator` are what
 * handleStream keys on when forwarding events.
 */
const basicAcademicSearchAnsweringChain = RunnableSequence.from([
  RunnableMap.from({
    query: (input: BasicChainInput) => input.query,
    chat_history: (input: BasicChainInput) => input.chat_history,
    context: RunnableSequence.from([
      (input) => ({
        query: input.query,
        chat_history: formatChatHistoryAsString(input.chat_history),
      }),
      basicAcademicSearchRetrieverChain
        .pipe(rerankDocs)
        .withConfig({
          runName: 'FinalSourceRetriever',
        })
        .pipe(processDocs),
    ]),
  }),
  ChatPromptTemplate.fromMessages([
    ['system', basicAcademicSearchResponsePrompt],
    new MessagesPlaceholder('chat_history'),
    ['user', '{query}'],
  ]),
  chatLLM,
  strParser,
]).withConfig({
  runName: 'FinalResponseGenerator',
});
|
||||
|
||||
const basicAcademicSearch = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = basicAcademicSearchAnsweringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
const handleAcademicSearch = (message: string, history: BaseMessage[]) => {
|
||||
const emitter = basicAcademicSearch(message, history);
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleAcademicSearch;
|
81
src/agents/imageSearchAgent.ts
Normal file
@@ -0,0 +1,81 @@
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { PromptTemplate } from '@langchain/core/prompts';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
|
||||
// Deterministic completion model used to rephrase the follow-up question.
// NOTE(review): assumes MODEL_NAME / OLLAMA_URL are set in the environment;
// behavior when they are undefined depends on Ollama client defaults — confirm.
const llm = new Ollama({
  temperature: 0,
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Prompt that rewrites a follow-up question into a short standalone
// image-search query ({chat_history} and {query} are filled by the chain).
const imageSearchChainPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question so it is a standalone question that can be used by the LLM to search the web for images.
You need to make sure the rephrased question agrees with the conversation and is relevant to the conversation.

Example:
1. Follow up question: What is a cat?
Rephrased: A cat

2. Follow up question: What is a car? How does it works?
Rephrased: Car working

3. Follow up question: How does an AC work?
Rephrased: AC working

Conversation:
{chat_history}

Follow up question: {query}
Rephrased question:
`;

// Input shape accepted by imageSearchChain.
type ImageSearchChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

// Parses raw LLM output into a plain string.
const strParser = new StringOutputParser();
|
||||
|
||||
const imageSearchChain = RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
chat_history: (input: ImageSearchChainInput) => {
|
||||
return formatChatHistoryAsString(input.chat_history);
|
||||
},
|
||||
query: (input: ImageSearchChainInput) => {
|
||||
return input.query;
|
||||
},
|
||||
}),
|
||||
PromptTemplate.fromTemplate(imageSearchChainPrompt),
|
||||
llm,
|
||||
strParser,
|
||||
RunnableLambda.from(async (input: string) => {
|
||||
const res = await searchSearxng(input, {
|
||||
categories: ['images'],
|
||||
engines: ['bing_images', 'google_images'],
|
||||
});
|
||||
|
||||
const images = [];
|
||||
|
||||
res.results.forEach((result) => {
|
||||
if (result.img_src && result.url && result.title) {
|
||||
images.push({
|
||||
img_src: result.img_src,
|
||||
url: result.url,
|
||||
title: result.title,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return images.slice(0, 10);
|
||||
}),
|
||||
]);
|
||||
|
||||
export default imageSearchChain;
|
256
src/agents/redditSearchAgent.ts
Normal file
@@ -0,0 +1,256 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
PromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { Document } from '@langchain/core/documents';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import eventEmitter from 'events';
|
||||
import computeSimilarity from '../utils/computeSimilarity';
|
||||
|
||||
// Chat model used for the final, streamed user-facing answer.
// NOTE(review): assumes OLLAMA_URL / MODEL_NAME are set; confirm fallback
// behavior when they are undefined.
const chatLLM = new ChatOllama({
  baseUrl: process.env.OLLAMA_URL,
  model: process.env.MODEL_NAME,
  temperature: 0.7,
});

// Deterministic completion model used for query rephrasing.
const llm = new Ollama({
  temperature: 0,
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Embedding model used by rerankDocs for similarity scoring.
const embeddings = new OllamaEmbeddings({
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Prompt that turns a follow-up question into a standalone search query, or
// the sentinel `not_needed` for greetings / writing tasks.
const basicRedditSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.

Example:
1. Follow up question: Which company is most likely to create an AGI
Rephrased: Which company is most likely to create an AGI

2. Follow up question: Is Earth flat?
Rephrased: Is Earth flat?

3. Follow up question: Is there life on Mars?
Rephrased: Is there life on Mars?

Conversation:
{chat_history}

Follow up question: {query}
Rephrased question:
`;

// System prompt for the answer generator; {context} is filled with the
// numbered document list from processDocs. NOTE(review): the prompt text
// contains typos ("consits", "containg", "unbaised", "Aything") — left
// byte-identical here because the string is sent to the model at runtime.
const basicRedditSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Reddit', this means you will be searching for information, opinions and discussions on the web using Reddit.

Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.

Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Reddit and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.

<context>
{context}
</context>

If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from Reddit and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const handleStream = async (
|
||||
stream: AsyncGenerator<StreamEvent, any, unknown>,
|
||||
emitter: eventEmitter,
|
||||
) => {
|
||||
for await (const event of stream) {
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalSourceRetriever'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'sources', data: event.data.output }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_stream' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'response', data: event.data.chunk }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit('end');
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const processDocs = async (docs: Document[]) => {
|
||||
return docs
|
||||
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
|
||||
.join('\n');
|
||||
};
|
||||
|
||||
const rerankDocs = async ({
|
||||
query,
|
||||
docs,
|
||||
}: {
|
||||
query: string;
|
||||
docs: Document[];
|
||||
}) => {
|
||||
if (docs.length === 0) {
|
||||
return docs;
|
||||
}
|
||||
|
||||
const docsWithContent = docs.filter(
|
||||
(doc) => doc.pageContent && doc.pageContent.length > 0,
|
||||
);
|
||||
|
||||
const docEmbeddings = await embeddings.embedDocuments(
|
||||
docsWithContent.map((doc) => doc.pageContent),
|
||||
);
|
||||
|
||||
const queryEmbedding = await embeddings.embedQuery(query);
|
||||
|
||||
const similarity = docEmbeddings.map((docEmbedding, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, docEmbedding);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
};
|
||||
});
|
||||
|
||||
const sortedDocs = similarity
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, 15)
|
||||
.filter((sim) => sim.similarity > 0.3)
|
||||
.map((sim) => docsWithContent[sim.index]);
|
||||
|
||||
return sortedDocs;
|
||||
};
|
||||
|
||||
// Input shape shared by the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

/**
 * Retriever chain: rephrases the follow-up question into a standalone query
 * (or the sentinel `not_needed`), then searches the 'reddit' SearxNG engine
 * and wraps each result in a Document.
 */
const basicRedditSearchRetrieverChain = RunnableSequence.from([
  PromptTemplate.fromTemplate(basicRedditSearchRetrieverPrompt),
  llm,
  strParser,
  RunnableLambda.from(async (input: string) => {
    // `not_needed` is the sentinel the retriever prompt emits for greetings
    // and writing tasks — skip the search entirely.
    if (input === 'not_needed') {
      return { query: '', docs: [] };
    }

    const res = await searchSearxng(input, {
      language: 'en',
      engines: ['reddit'],
    });

    const documents = res.results.map(
      (result) =>
        new Document({
          // Reddit results may have no snippet; fall back to the post title
          // so the document is never empty.
          pageContent: result.content ? result.content : result.title,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );

    return { query: input, docs: documents };
  }),
]);
|
||||
|
||||
/**
 * Full answering pipeline: fan input out into query / chat_history / context
 * (retrieved, reranked, numbered documents), format the chat prompt, stream
 * the answer from chatLLM, parse to string. The run names are what
 * handleStream keys on when forwarding events.
 */
const basicRedditSearchAnsweringChain = RunnableSequence.from([
  RunnableMap.from({
    query: (input: BasicChainInput) => input.query,
    chat_history: (input: BasicChainInput) => input.chat_history,
    context: RunnableSequence.from([
      (input) => ({
        query: input.query,
        chat_history: formatChatHistoryAsString(input.chat_history),
      }),
      basicRedditSearchRetrieverChain
        .pipe(rerankDocs)
        .withConfig({
          runName: 'FinalSourceRetriever',
        })
        .pipe(processDocs),
    ]),
  }),
  ChatPromptTemplate.fromMessages([
    ['system', basicRedditSearchResponsePrompt],
    new MessagesPlaceholder('chat_history'),
    ['user', '{query}'],
  ]),
  chatLLM,
  strParser,
]).withConfig({
  runName: 'FinalResponseGenerator',
});
|
||||
|
||||
const basicRedditSearch = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = basicRedditSearchAnsweringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
const handleRedditSearch = (message: string, history: BaseMessage[]) => {
|
||||
const emitter = basicRedditSearch(message, history);
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleRedditSearch;
|
255
src/agents/webSearchAgent.ts
Normal file
@@ -0,0 +1,255 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
PromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { Document } from '@langchain/core/documents';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import eventEmitter from 'events';
|
||||
import computeSimilarity from '../utils/computeSimilarity';
|
||||
|
||||
// Chat model used for the final, streamed user-facing answer.
// NOTE(review): assumes OLLAMA_URL / MODEL_NAME are set; confirm fallback
// behavior when they are undefined.
const chatLLM = new ChatOllama({
  baseUrl: process.env.OLLAMA_URL,
  model: process.env.MODEL_NAME,
  temperature: 0.7,
});

// Deterministic completion model used for query rephrasing.
const llm = new Ollama({
  temperature: 0,
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Embedding model used by rerankDocs for similarity scoring.
const embeddings = new OllamaEmbeddings({
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Prompt that turns a follow-up question into a standalone search query, or
// the sentinel `not_needed` for greetings / writing tasks.
const basicSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.

Example:
1. Follow up question: What is the capital of France?
Rephrased: Capital of france

2. Follow up question: What is the population of New York City?
Rephrased: Population of New York City

3. Follow up question: What is Docker?
Rephrased: What is Docker

Conversation:
{chat_history}

Follow up question: {query}
Rephrased question:
`;

// System prompt for the answer generator; {context} is filled with the
// numbered document list from processDocs. NOTE(review): the prompt text
// contains typos ("consits", "containg", "unbaised", "Aything") — left
// byte-identical here because the string is sent to the model at runtime.
const basicWebSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries.

Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.

Aything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.

<context>
{context}
</context>

If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from a search engine and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
|
||||
|
||||
// Parses raw LLM output into a plain string.
const strParser = new StringOutputParser();

/**
 * Forwards LangChain stream events to the client-facing emitter.
 *
 * Emits:
 *  - 'data' with { type: 'sources' }  when the FinalSourceRetriever run ends.
 *  - 'data' with { type: 'response' } for each FinalResponseGenerator chunk.
 *  - 'end'                            when the FinalResponseGenerator run ends.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  for await (const event of stream) {
    if (
      event.event === 'on_chain_end' &&
      event.name === 'FinalSourceRetriever'
    ) {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'sources', data: event.data.output }),
      );
    }
    if (
      event.event === 'on_chain_stream' &&
      event.name === 'FinalResponseGenerator'
    ) {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'response', data: event.data.chunk }),
      );
    }
    if (
      event.event === 'on_chain_end' &&
      event.name === 'FinalResponseGenerator'
    ) {
      emitter.emit('end');
    }
  }
};
|
||||
|
||||
const processDocs = async (docs: Document[]) => {
|
||||
return docs
|
||||
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
|
||||
.join('\n');
|
||||
};
|
||||
|
||||
const rerankDocs = async ({
|
||||
query,
|
||||
docs,
|
||||
}: {
|
||||
query: string;
|
||||
docs: Document[];
|
||||
}) => {
|
||||
if (docs.length === 0) {
|
||||
return docs;
|
||||
}
|
||||
|
||||
const docsWithContent = docs.filter(
|
||||
(doc) => doc.pageContent && doc.pageContent.length > 0,
|
||||
);
|
||||
|
||||
const docEmbeddings = await embeddings.embedDocuments(
|
||||
docsWithContent.map((doc) => doc.pageContent),
|
||||
);
|
||||
|
||||
const queryEmbedding = await embeddings.embedQuery(query);
|
||||
|
||||
const similarity = docEmbeddings.map((docEmbedding, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, docEmbedding);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
};
|
||||
});
|
||||
|
||||
const sortedDocs = similarity
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.filter((sim) => sim.similarity > 0.5)
|
||||
.slice(0, 15)
|
||||
.map((sim) => docsWithContent[sim.index]);
|
||||
|
||||
return sortedDocs;
|
||||
};
|
||||
|
||||
// Input shape shared by the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

/**
 * Retriever chain: rephrases the follow-up question into a standalone query
 * (or the sentinel `not_needed`), runs a general SearxNG web search and wraps
 * each result in a Document.
 */
const basicWebSearchRetrieverChain = RunnableSequence.from([
  PromptTemplate.fromTemplate(basicSearchRetrieverPrompt),
  llm,
  strParser,
  RunnableLambda.from(async (input: string) => {
    // `not_needed` is the sentinel the retriever prompt emits for greetings
    // and writing tasks — skip the search entirely.
    if (input === 'not_needed') {
      return { query: '', docs: [] };
    }

    // No engine list: use the SearxNG instance's default engines.
    const res = await searchSearxng(input, {
      language: 'en',
    });

    const documents = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );

    return { query: input, docs: documents };
  }),
]);
|
||||
|
||||
/**
 * Full answering pipeline: fan input out into query / chat_history / context
 * (retrieved, reranked, numbered documents), format the chat prompt, stream
 * the answer from chatLLM, parse to string. The run names are what
 * handleStream keys on when forwarding events.
 */
const basicWebSearchAnsweringChain = RunnableSequence.from([
  RunnableMap.from({
    query: (input: BasicChainInput) => input.query,
    chat_history: (input: BasicChainInput) => input.chat_history,
    context: RunnableSequence.from([
      (input) => ({
        query: input.query,
        chat_history: formatChatHistoryAsString(input.chat_history),
      }),
      basicWebSearchRetrieverChain
        .pipe(rerankDocs)
        .withConfig({
          runName: 'FinalSourceRetriever',
        })
        .pipe(processDocs),
    ]),
  }),
  ChatPromptTemplate.fromMessages([
    ['system', basicWebSearchResponsePrompt],
    new MessagesPlaceholder('chat_history'),
    ['user', '{query}'],
  ]),
  chatLLM,
  strParser,
]).withConfig({
  runName: 'FinalResponseGenerator',
});
|
||||
|
||||
const basicWebSearch = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = basicWebSearchAnsweringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
const handleWebSearch = (message: string, history: BaseMessage[]) => {
|
||||
const emitter = basicWebSearch(message, history);
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleWebSearch;
|
212
src/agents/wolframAlphaSearchAgent.ts
Normal file
@@ -0,0 +1,212 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
PromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { Document } from '@langchain/core/documents';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import eventEmitter from 'events';
|
||||
|
||||
// Chat model used for the final, streamed user-facing answer.
// NOTE(review): assumes OLLAMA_URL / MODEL_NAME are set; confirm fallback
// behavior when they are undefined.
const chatLLM = new ChatOllama({
  baseUrl: process.env.OLLAMA_URL,
  model: process.env.MODEL_NAME,
  temperature: 0.7,
});

// Deterministic completion model used for query rephrasing. No embedding
// model here: this agent passes retrieved docs through without reranking.
const llm = new Ollama({
  temperature: 0,
  model: process.env.MODEL_NAME,
  baseUrl: process.env.OLLAMA_URL,
});

// Prompt that turns a follow-up question into a standalone search query, or
// the sentinel `not_needed` for greetings / writing tasks.
const basicWolframAlphaSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.

Example:
1. Follow up question: What is the atomic radius of S?
Rephrased: Atomic radius of S

2. Follow up question: What is linear algebra?
Rephrased: Linear algebra

3. Follow up question: What is the third law of thermodynamics?
Rephrased: Third law of thermodynamics

Conversation:
{chat_history}

Follow up question: {query}
Rephrased question:
`;

// System prompt for the answer generator; {context} is filled with the
// numbered document list from processDocs. NOTE(review): the prompt text
// contains typos ("consits", "containg", "unbaised", "Aything") — left
// byte-identical here because the string is sent to the model at runtime.
const basicWolframAlphaSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Wolfram Alpha', this means you will be searching for information on the web using Wolfram Alpha. It is a computational knowledge engine that can answer factual queries and perform computations.

Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.

Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.

<context>
{context}
</context>

If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from Wolfram Alpha and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
|
||||
|
||||
// Parses raw LLM output into a plain string.
const strParser = new StringOutputParser();

/**
 * Forwards LangChain stream events to the client-facing emitter.
 *
 * Emits:
 *  - 'data' with { type: 'sources' }  when the FinalSourceRetriever run ends.
 *  - 'data' with { type: 'response' } for each FinalResponseGenerator chunk.
 *  - 'end'                            when the FinalResponseGenerator run ends.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  for await (const event of stream) {
    if (
      event.event === 'on_chain_end' &&
      event.name === 'FinalSourceRetriever'
    ) {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'sources', data: event.data.output }),
      );
    }
    if (
      event.event === 'on_chain_stream' &&
      event.name === 'FinalResponseGenerator'
    ) {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'response', data: event.data.chunk }),
      );
    }
    if (
      event.event === 'on_chain_end' &&
      event.name === 'FinalResponseGenerator'
    ) {
      emitter.emit('end');
    }
  }
};
|
||||
|
||||
const processDocs = async (docs: Document[]) => {
|
||||
return docs
|
||||
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
|
||||
.join('\n');
|
||||
};
|
||||
|
||||
// Input shape shared by the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};

/**
 * Retriever chain: rephrases the follow-up question into a standalone query
 * (or the sentinel `not_needed`), then queries the 'wolframalpha' SearxNG
 * engine and wraps each result in a Document.
 */
const basicWolframAlphaSearchRetrieverChain = RunnableSequence.from([
  PromptTemplate.fromTemplate(basicWolframAlphaSearchRetrieverPrompt),
  llm,
  strParser,
  RunnableLambda.from(async (input: string) => {
    // `not_needed` is the sentinel the retriever prompt emits for greetings
    // and writing tasks — skip the search entirely.
    if (input === 'not_needed') {
      return { query: '', docs: [] };
    }

    const res = await searchSearxng(input, {
      language: 'en',
      engines: ['wolframalpha'],
    });

    const documents = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );

    return { query: input, docs: documents };
  }),
]);
|
||||
|
||||
const basicWolframAlphaSearchAnsweringChain = RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
query: (input: BasicChainInput) => input.query,
|
||||
chat_history: (input: BasicChainInput) => input.chat_history,
|
||||
context: RunnableSequence.from([
|
||||
(input) => ({
|
||||
query: input.query,
|
||||
chat_history: formatChatHistoryAsString(input.chat_history),
|
||||
}),
|
||||
basicWolframAlphaSearchRetrieverChain
|
||||
.pipe(({ query, docs }) => {
|
||||
return docs;
|
||||
})
|
||||
.withConfig({
|
||||
runName: 'FinalSourceRetriever',
|
||||
})
|
||||
.pipe(processDocs),
|
||||
]),
|
||||
}),
|
||||
ChatPromptTemplate.fromMessages([
|
||||
['system', basicWolframAlphaSearchResponsePrompt],
|
||||
new MessagesPlaceholder('chat_history'),
|
||||
['user', '{query}'],
|
||||
]),
|
||||
chatLLM,
|
||||
strParser,
|
||||
]).withConfig({
|
||||
runName: 'FinalResponseGenerator',
|
||||
});
|
||||
|
||||
const basicWolframAlphaSearch = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = basicWolframAlphaSearchAnsweringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
const handleWolframAlphaSearch = (message: string, history: BaseMessage[]) => {
|
||||
const emitter = basicWolframAlphaSearch(message, history);
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleWolframAlphaSearch;
|
86
src/agents/writingAssistant.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import { RunnableSequence } from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import eventEmitter from 'events';
|
||||
|
||||
const chatLLM = new ChatOllama({
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
model: process.env.MODEL_NAME,
|
||||
temperature: 0.7,
|
||||
});
|
||||
|
||||
const writingAssistantPrompt = `
|
||||
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are currently set on focus mode 'Writing Assistant', this means you will be helping the user write a response to a given query.
|
||||
Since you are a writing assistant, you would not perform web searches. If you think you lack information to answer the query, you can ask the user for more information or suggest them to switch to a different focus mode.
|
||||
`;
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const handleStream = async (
|
||||
stream: AsyncGenerator<StreamEvent, any, unknown>,
|
||||
emitter: eventEmitter,
|
||||
) => {
|
||||
for await (const event of stream) {
|
||||
if (
|
||||
event.event === 'on_chain_stream' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'response', data: event.data.chunk }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit('end');
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const writingAssistantChain = RunnableSequence.from([
|
||||
ChatPromptTemplate.fromMessages([
|
||||
['system', writingAssistantPrompt],
|
||||
new MessagesPlaceholder('chat_history'),
|
||||
['user', '{query}'],
|
||||
]),
|
||||
chatLLM,
|
||||
strParser,
|
||||
]).withConfig({
|
||||
runName: 'FinalResponseGenerator',
|
||||
});
|
||||
|
||||
const handleWritingAssistant = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = writingAssistantChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleWritingAssistant;
|
256
src/agents/youtubeSearchAgent.ts
Normal file
@@ -0,0 +1,256 @@
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import {
|
||||
PromptTemplate,
|
||||
ChatPromptTemplate,
|
||||
MessagesPlaceholder,
|
||||
} from '@langchain/core/prompts';
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import { Ollama } from '@langchain/community/llms/ollama';
|
||||
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { Document } from '@langchain/core/documents';
|
||||
import { searchSearxng } from '../core/searxng';
|
||||
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import eventEmitter from 'events';
|
||||
import computeSimilarity from '../utils/computeSimilarity';
|
||||
|
||||
const chatLLM = new ChatOllama({
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
model: process.env.MODEL_NAME,
|
||||
temperature: 0.7,
|
||||
});
|
||||
|
||||
const llm = new Ollama({
|
||||
temperature: 0,
|
||||
model: process.env.MODEL_NAME,
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
});
|
||||
|
||||
const embeddings = new OllamaEmbeddings({
|
||||
model: process.env.MODEL_NAME,
|
||||
baseUrl: process.env.OLLAMA_URL,
|
||||
});
|
||||
|
||||
const basicYoutubeSearchRetrieverPrompt = `
|
||||
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
|
||||
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
|
||||
|
||||
Example:
|
||||
1. Follow up question: How does an A.C work?
|
||||
Rephrased: A.C working
|
||||
|
||||
2. Follow up question: Linear algebra explanation video
|
||||
Rephrased: What is linear algebra?
|
||||
|
||||
3. Follow up question: What is theory of relativity?
|
||||
Rephrased: What is theory of relativity?
|
||||
|
||||
Conversation:
|
||||
{chat_history}
|
||||
|
||||
Follow up question: {query}
|
||||
Rephrased question:
|
||||
`;
|
||||
|
||||
const basicYoutubeSearchResponsePrompt = `
|
||||
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Youtube', this means you will be searching for videos on the web using Youtube and providing information based on the video's transcript.
|
||||
|
||||
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
|
||||
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
|
||||
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
|
||||
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
|
||||
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
|
||||
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
|
||||
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
|
||||
|
||||
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Youtube and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
|
||||
talk about the context in your response.
|
||||
|
||||
<context>
|
||||
{context}
|
||||
</context>
|
||||
|
||||
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
|
||||
Anything between the \`context\` is retrieved from Youtube and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
|
||||
`;
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const handleStream = async (
|
||||
stream: AsyncGenerator<StreamEvent, any, unknown>,
|
||||
emitter: eventEmitter,
|
||||
) => {
|
||||
for await (const event of stream) {
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalSourceRetriever'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'sources', data: event.data.output }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_stream' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit(
|
||||
'data',
|
||||
JSON.stringify({ type: 'response', data: event.data.chunk }),
|
||||
);
|
||||
}
|
||||
if (
|
||||
event.event === 'on_chain_end' &&
|
||||
event.name === 'FinalResponseGenerator'
|
||||
) {
|
||||
emitter.emit('end');
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const processDocs = async (docs: Document[]) => {
|
||||
return docs
|
||||
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
|
||||
.join('\n');
|
||||
};
|
||||
|
||||
const rerankDocs = async ({
|
||||
query,
|
||||
docs,
|
||||
}: {
|
||||
query: string;
|
||||
docs: Document[];
|
||||
}) => {
|
||||
if (docs.length === 0) {
|
||||
return docs;
|
||||
}
|
||||
|
||||
const docsWithContent = docs.filter(
|
||||
(doc) => doc.pageContent && doc.pageContent.length > 0,
|
||||
);
|
||||
|
||||
const docEmbeddings = await embeddings.embedDocuments(
|
||||
docsWithContent.map((doc) => doc.pageContent),
|
||||
);
|
||||
|
||||
const queryEmbedding = await embeddings.embedQuery(query);
|
||||
|
||||
const similarity = docEmbeddings.map((docEmbedding, i) => {
|
||||
const sim = computeSimilarity(queryEmbedding, docEmbedding);
|
||||
|
||||
return {
|
||||
index: i,
|
||||
similarity: sim,
|
||||
};
|
||||
});
|
||||
|
||||
const sortedDocs = similarity
|
||||
.sort((a, b) => b.similarity - a.similarity)
|
||||
.slice(0, 15)
|
||||
.filter((sim) => sim.similarity > 0.3)
|
||||
.map((sim) => docsWithContent[sim.index]);
|
||||
|
||||
return sortedDocs;
|
||||
};
|
||||
|
||||
type BasicChainInput = {
|
||||
chat_history: BaseMessage[];
|
||||
query: string;
|
||||
};
|
||||
|
||||
const basicYoutubeSearchRetrieverChain = RunnableSequence.from([
|
||||
PromptTemplate.fromTemplate(basicYoutubeSearchRetrieverPrompt),
|
||||
llm,
|
||||
strParser,
|
||||
RunnableLambda.from(async (input: string) => {
|
||||
if (input === 'not_needed') {
|
||||
return { query: '', docs: [] };
|
||||
}
|
||||
|
||||
const res = await searchSearxng(input, {
|
||||
language: 'en',
|
||||
engines: ['youtube'],
|
||||
});
|
||||
|
||||
const documents = res.results.map(
|
||||
(result) =>
|
||||
new Document({
|
||||
pageContent: result.content ? result.content : result.title,
|
||||
metadata: {
|
||||
title: result.title,
|
||||
url: result.url,
|
||||
...(result.img_src && { img_src: result.img_src }),
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
return { query: input, docs: documents };
|
||||
}),
|
||||
]);
|
||||
|
||||
const basicYoutubeSearchAnsweringChain = RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
query: (input: BasicChainInput) => input.query,
|
||||
chat_history: (input: BasicChainInput) => input.chat_history,
|
||||
context: RunnableSequence.from([
|
||||
(input) => ({
|
||||
query: input.query,
|
||||
chat_history: formatChatHistoryAsString(input.chat_history),
|
||||
}),
|
||||
basicYoutubeSearchRetrieverChain
|
||||
.pipe(rerankDocs)
|
||||
.withConfig({
|
||||
runName: 'FinalSourceRetriever',
|
||||
})
|
||||
.pipe(processDocs),
|
||||
]),
|
||||
}),
|
||||
ChatPromptTemplate.fromMessages([
|
||||
['system', basicYoutubeSearchResponsePrompt],
|
||||
new MessagesPlaceholder('chat_history'),
|
||||
['user', '{query}'],
|
||||
]),
|
||||
chatLLM,
|
||||
strParser,
|
||||
]).withConfig({
|
||||
runName: 'FinalResponseGenerator',
|
||||
});
|
||||
|
||||
const basicYoutubeSearch = (query: string, history: BaseMessage[]) => {
|
||||
const emitter = new eventEmitter();
|
||||
|
||||
try {
|
||||
const stream = basicYoutubeSearchAnsweringChain.streamEvents(
|
||||
{
|
||||
chat_history: history,
|
||||
query: query,
|
||||
},
|
||||
{
|
||||
version: 'v1',
|
||||
},
|
||||
);
|
||||
|
||||
handleStream(stream, emitter);
|
||||
} catch (err) {
|
||||
emitter.emit(
|
||||
'error',
|
||||
JSON.stringify({ data: 'An error has occurred please try again later' }),
|
||||
);
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
return emitter;
|
||||
};
|
||||
|
||||
const handleYoutubeSearch = (message: string, history: BaseMessage[]) => {
|
||||
const emitter = basicYoutubeSearch(message, history);
|
||||
return emitter;
|
||||
};
|
||||
|
||||
export default handleYoutubeSearch;
|
26
src/app.ts
@@ -1,16 +1,26 @@
|
||||
import { startWebSocketServer } from './websocket';
|
||||
import express from 'express';
|
||||
import cors from 'cors';
|
||||
import searchRoutes from './routes/search';
|
||||
import businessRoutes from './routes/business';
|
||||
import http from 'http';
|
||||
import routes from './routes';
|
||||
|
||||
const app = express();
|
||||
const server = http.createServer(app);
|
||||
|
||||
// Middleware
|
||||
app.use(cors());
|
||||
const corsOptions = {
|
||||
origin: '*',
|
||||
};
|
||||
|
||||
app.use(cors(corsOptions));
|
||||
app.use(express.json());
|
||||
|
||||
// Routes
|
||||
app.use('/api/search', searchRoutes);
|
||||
app.use('/api/business', businessRoutes);
|
||||
app.use('/api', routes);
|
||||
app.get('/api', (_, res) => {
|
||||
res.status(200).json({ status: 'ok' });
|
||||
});
|
||||
|
||||
export default app;
|
||||
server.listen(process.env.PORT!, () => {
|
||||
console.log(`API server started on port ${process.env.PORT}`);
|
||||
});
|
||||
|
||||
startWebSocketServer(server);
|
||||
|
@@ -1,84 +0,0 @@
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { PromptTemplate } from '@langchain/core/prompts';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { searchSearxng } from '../lib/searxng';
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
const imageSearchChainPrompt = `
|
||||
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question so it is a standalone question that can be used by the LLM to search the web for images.
|
||||
You need to make sure the rephrased question agrees with the conversation and is relevant to the conversation.
|
||||
|
||||
Example:
|
||||
1. Follow up question: What is a cat?
|
||||
Rephrased: A cat
|
||||
|
||||
2. Follow up question: What is a car? How does it works?
|
||||
Rephrased: Car working
|
||||
|
||||
3. Follow up question: How does an AC work?
|
||||
Rephrased: AC working
|
||||
|
||||
Conversation:
|
||||
{chat_history}
|
||||
|
||||
Follow up question: {query}
|
||||
Rephrased question:
|
||||
`;
|
||||
|
||||
type ImageSearchChainInput = {
|
||||
chat_history: BaseMessage[];
|
||||
query: string;
|
||||
};
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const createImageSearchChain = (llm: BaseChatModel) => {
|
||||
return RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
chat_history: (input: ImageSearchChainInput) => {
|
||||
return formatChatHistoryAsString(input.chat_history);
|
||||
},
|
||||
query: (input: ImageSearchChainInput) => {
|
||||
return input.query;
|
||||
},
|
||||
}),
|
||||
PromptTemplate.fromTemplate(imageSearchChainPrompt),
|
||||
llm,
|
||||
strParser,
|
||||
RunnableLambda.from(async (input: string) => {
|
||||
const res = await searchSearxng(input, {
|
||||
engines: ['bing images', 'google images'],
|
||||
});
|
||||
|
||||
const images = [];
|
||||
|
||||
res.results.forEach((result) => {
|
||||
if (result.img_src && result.url && result.title) {
|
||||
images.push({
|
||||
img_src: result.img_src,
|
||||
url: result.url,
|
||||
title: result.title,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return images.slice(0, 10);
|
||||
}),
|
||||
]);
|
||||
};
|
||||
|
||||
const handleImageSearch = (
|
||||
input: ImageSearchChainInput,
|
||||
llm: BaseChatModel,
|
||||
) => {
|
||||
const imageSearchChain = createImageSearchChain(llm);
|
||||
return imageSearchChain.invoke(input);
|
||||
};
|
||||
|
||||
export default handleImageSearch;
|
@@ -1,55 +0,0 @@
|
||||
import { RunnableSequence, RunnableMap } from '@langchain/core/runnables';
|
||||
import ListLineOutputParser from '../lib/outputParsers/listLineOutputParser';
|
||||
import { PromptTemplate } from '@langchain/core/prompts';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
import { ChatOpenAI } from '@langchain/openai';
|
||||
|
||||
const suggestionGeneratorPrompt = `
|
||||
You are an AI suggestion generator for an AI powered search engine. You will be given a conversation below. You need to generate 4-5 suggestions based on the conversation. The suggestion should be relevant to the conversation that can be used by the user to ask the chat model for more information.
|
||||
You need to make sure the suggestions are relevant to the conversation and are helpful to the user. Keep a note that the user might use these suggestions to ask a chat model for more information.
|
||||
Make sure the suggestions are medium in length and are informative and relevant to the conversation.
|
||||
|
||||
Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
|
||||
|
||||
<suggestions>
|
||||
Tell me more about SpaceX and their recent projects
|
||||
What is the latest news on SpaceX?
|
||||
Who is the CEO of SpaceX?
|
||||
</suggestions>
|
||||
|
||||
Conversation:
|
||||
{chat_history}
|
||||
`;
|
||||
|
||||
type SuggestionGeneratorInput = {
|
||||
chat_history: BaseMessage[];
|
||||
};
|
||||
|
||||
const outputParser = new ListLineOutputParser({
|
||||
key: 'suggestions',
|
||||
});
|
||||
|
||||
const createSuggestionGeneratorChain = (llm: BaseChatModel) => {
|
||||
return RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
chat_history: (input: SuggestionGeneratorInput) =>
|
||||
formatChatHistoryAsString(input.chat_history),
|
||||
}),
|
||||
PromptTemplate.fromTemplate(suggestionGeneratorPrompt),
|
||||
llm,
|
||||
outputParser,
|
||||
]);
|
||||
};
|
||||
|
||||
const generateSuggestions = (
|
||||
input: SuggestionGeneratorInput,
|
||||
llm: BaseChatModel,
|
||||
) => {
|
||||
(llm as unknown as ChatOpenAI).temperature = 0;
|
||||
const suggestionGeneratorChain = createSuggestionGeneratorChain(llm);
|
||||
return suggestionGeneratorChain.invoke(input);
|
||||
};
|
||||
|
||||
export default generateSuggestions;
|
@@ -1,90 +0,0 @@
|
||||
import {
|
||||
RunnableSequence,
|
||||
RunnableMap,
|
||||
RunnableLambda,
|
||||
} from '@langchain/core/runnables';
|
||||
import { PromptTemplate } from '@langchain/core/prompts';
|
||||
import formatChatHistoryAsString from '../utils/formatHistory';
|
||||
import { BaseMessage } from '@langchain/core/messages';
|
||||
import { StringOutputParser } from '@langchain/core/output_parsers';
|
||||
import { searchSearxng } from '../lib/searxng';
|
||||
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
|
||||
|
||||
const VideoSearchChainPrompt = `
|
||||
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question so it is a standalone question that can be used by the LLM to search Youtube for videos.
|
||||
You need to make sure the rephrased question agrees with the conversation and is relevant to the conversation.
|
||||
|
||||
Example:
|
||||
1. Follow up question: How does a car work?
|
||||
Rephrased: How does a car work?
|
||||
|
||||
2. Follow up question: What is the theory of relativity?
|
||||
Rephrased: What is theory of relativity
|
||||
|
||||
3. Follow up question: How does an AC work?
|
||||
Rephrased: How does an AC work
|
||||
|
||||
Conversation:
|
||||
{chat_history}
|
||||
|
||||
Follow up question: {query}
|
||||
Rephrased question:
|
||||
`;
|
||||
|
||||
type VideoSearchChainInput = {
|
||||
chat_history: BaseMessage[];
|
||||
query: string;
|
||||
};
|
||||
|
||||
const strParser = new StringOutputParser();
|
||||
|
||||
const createVideoSearchChain = (llm: BaseChatModel) => {
|
||||
return RunnableSequence.from([
|
||||
RunnableMap.from({
|
||||
chat_history: (input: VideoSearchChainInput) => {
|
||||
return formatChatHistoryAsString(input.chat_history);
|
||||
},
|
||||
query: (input: VideoSearchChainInput) => {
|
||||
return input.query;
|
||||
},
|
||||
}),
|
||||
PromptTemplate.fromTemplate(VideoSearchChainPrompt),
|
||||
llm,
|
||||
strParser,
|
||||
RunnableLambda.from(async (input: string) => {
|
||||
const res = await searchSearxng(input, {
|
||||
engines: ['youtube'],
|
||||
});
|
||||
|
||||
const videos = [];
|
||||
|
||||
res.results.forEach((result) => {
|
||||
if (
|
||||
result.thumbnail &&
|
||||
result.url &&
|
||||
result.title &&
|
||||
result.iframe_src
|
||||
) {
|
||||
videos.push({
|
||||
img_src: result.thumbnail,
|
||||
url: result.url,
|
||||
title: result.title,
|
||||
iframe_src: result.iframe_src,
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return videos.slice(0, 10);
|
||||
}),
|
||||
]);
|
||||
};
|
||||
|
||||
const handleVideoSearch = (
|
||||
input: VideoSearchChainInput,
|
||||
llm: BaseChatModel,
|
||||
) => {
|
||||
const VideoSearchChain = createVideoSearchChain(llm);
|
||||
return VideoSearchChain.invoke(input);
|
||||
};
|
||||
|
||||
export default handleVideoSearch;
|
@@ -1,92 +0,0 @@
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import toml from '@iarna/toml';
|
||||
|
||||
const configFileName = 'config.toml';
|
||||
|
||||
interface Config {
|
||||
GENERAL: {
|
||||
PORT: number;
|
||||
SIMILARITY_MEASURE: string;
|
||||
KEEP_ALIVE: string;
|
||||
};
|
||||
API_KEYS: {
|
||||
OPENAI: string;
|
||||
GROQ: string;
|
||||
ANTHROPIC: string;
|
||||
GEMINI: string;
|
||||
};
|
||||
API_ENDPOINTS: {
|
||||
SEARXNG: string;
|
||||
OLLAMA: string;
|
||||
};
|
||||
}
|
||||
|
||||
type RecursivePartial<T> = {
|
||||
[P in keyof T]?: RecursivePartial<T[P]>;
|
||||
};
|
||||
|
||||
const loadConfig = () =>
|
||||
toml.parse(
|
||||
fs.readFileSync(path.join(__dirname, `../${configFileName}`), 'utf-8'),
|
||||
) as any as Config;
|
||||
|
||||
export const getPort = () => loadConfig().GENERAL.PORT;
|
||||
|
||||
export const getSimilarityMeasure = () =>
|
||||
loadConfig().GENERAL.SIMILARITY_MEASURE;
|
||||
|
||||
export const getKeepAlive = () => loadConfig().GENERAL.KEEP_ALIVE;
|
||||
|
||||
export const getOpenaiApiKey = () => loadConfig().API_KEYS.OPENAI;
|
||||
|
||||
export const getGroqApiKey = () => loadConfig().API_KEYS.GROQ;
|
||||
|
||||
export const getAnthropicApiKey = () => loadConfig().API_KEYS.ANTHROPIC;
|
||||
|
||||
export const getGeminiApiKey = () => loadConfig().API_KEYS.GEMINI;
|
||||
|
||||
export const getSearxngApiEndpoint = () =>
|
||||
process.env.SEARXNG_API_URL || loadConfig().API_ENDPOINTS.SEARXNG;
|
||||
|
||||
export const getOllamaApiEndpoint = () => loadConfig().API_ENDPOINTS.OLLAMA;
|
||||
|
||||
export const updateConfig = (config: RecursivePartial<Config>) => {
|
||||
const currentConfig = loadConfig();
|
||||
|
||||
for (const key in currentConfig) {
|
||||
if (!config[key]) config[key] = {};
|
||||
|
||||
if (typeof currentConfig[key] === 'object' && currentConfig[key] !== null) {
|
||||
for (const nestedKey in currentConfig[key]) {
|
||||
if (
|
||||
!config[key][nestedKey] &&
|
||||
currentConfig[key][nestedKey] &&
|
||||
config[key][nestedKey] !== ''
|
||||
) {
|
||||
config[key][nestedKey] = currentConfig[key][nestedKey];
|
||||
}
|
||||
}
|
||||
} else if (currentConfig[key] && config[key] !== '') {
|
||||
config[key] = currentConfig[key];
|
||||
}
|
||||
}
|
||||
|
||||
fs.writeFileSync(
|
||||
path.join(__dirname, `../${configFileName}`),
|
||||
toml.stringify(config),
|
||||
);
|
||||
};
|
||||
|
||||
export const config = {
|
||||
ollama: {
|
||||
url: process.env.OLLAMA_URL || 'http://localhost:11434',
|
||||
model: process.env.OLLAMA_MODEL || 'mistral',
|
||||
options: {
|
||||
temperature: 0.1,
|
||||
top_p: 0.9,
|
||||
timeout: 30000 // 30 seconds timeout
|
||||
}
|
||||
},
|
||||
// ... other config
|
||||
};
|
@@ -1,40 +0,0 @@
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Load environment variables
|
||||
dotenv.config();
|
||||
|
||||
// Environment configuration
|
||||
const env = {
|
||||
// Supabase Configuration
|
||||
SUPABASE_URL: process.env.SUPABASE_URL || '',
|
||||
SUPABASE_KEY: process.env.SUPABASE_KEY || '',
|
||||
|
||||
// Server Configuration
|
||||
PORT: parseInt(process.env.PORT || '3001', 10),
|
||||
NODE_ENV: process.env.NODE_ENV || 'development',
|
||||
|
||||
// Search Configuration
|
||||
MAX_RESULTS_PER_QUERY: parseInt(process.env.MAX_RESULTS_PER_QUERY || '50', 10),
|
||||
CACHE_DURATION_HOURS: parseInt(process.env.CACHE_DURATION_HOURS || '24', 10),
|
||||
CACHE_DURATION_DAYS: parseInt(process.env.CACHE_DURATION_DAYS || '7', 10),
|
||||
|
||||
// SearxNG Configuration
|
||||
SEARXNG_URL: process.env.SEARXNG_URL || 'http://localhost:4000',
|
||||
|
||||
// Ollama Configuration
|
||||
OLLAMA_URL: process.env.OLLAMA_URL || 'http://localhost:11434',
|
||||
OLLAMA_MODEL: process.env.OLLAMA_MODEL || 'deepseek-coder:6.7b',
|
||||
|
||||
// Hugging Face Configuration
|
||||
HUGGING_FACE_API_KEY: process.env.HUGGING_FACE_API_KEY || ''
|
||||
};
|
||||
|
||||
// Validate required environment variables
|
||||
const requiredEnvVars = ['SUPABASE_URL', 'SUPABASE_KEY', 'SEARXNG_URL'];
|
||||
for (const envVar of requiredEnvVars) {
|
||||
if (!env[envVar as keyof typeof env]) {
|
||||
throw new Error(`Missing required environment variable: ${envVar}`);
|
||||
}
|
||||
}
|
||||
|
||||
export { env };
|
@@ -1,77 +0,0 @@
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'path';
|
||||
|
||||
// Load .env file
|
||||
dotenv.config({ path: path.resolve(__dirname, '../../.env') });
|
||||
|
||||
export interface Config {
|
||||
supabase: {
|
||||
url: string;
|
||||
anonKey: string;
|
||||
};
|
||||
server: {
|
||||
port: number;
|
||||
nodeEnv: string;
|
||||
};
|
||||
search: {
|
||||
maxResultsPerQuery: number;
|
||||
cacheDurationHours: number;
|
||||
searxngUrl?: string;
|
||||
};
|
||||
rateLimit: {
|
||||
windowMs: number;
|
||||
maxRequests: number;
|
||||
};
|
||||
security: {
|
||||
corsOrigin: string;
|
||||
jwtSecret: string;
|
||||
};
|
||||
proxy?: {
|
||||
http?: string;
|
||||
https?: string;
|
||||
};
|
||||
logging: {
|
||||
level: string;
|
||||
};
|
||||
}
|
||||
|
||||
const config: Config = {
|
||||
supabase: {
|
||||
url: process.env.SUPABASE_URL || '',
|
||||
anonKey: process.env.SUPABASE_ANON_KEY || '',
|
||||
},
|
||||
server: {
|
||||
port: parseInt(process.env.PORT || '3000', 10),
|
||||
nodeEnv: process.env.NODE_ENV || 'development',
|
||||
},
|
||||
search: {
|
||||
maxResultsPerQuery: parseInt(process.env.MAX_RESULTS_PER_QUERY || '20', 10),
|
||||
cacheDurationHours: parseInt(process.env.CACHE_DURATION_HOURS || '24', 10),
|
||||
searxngUrl: process.env.SEARXNG_URL
|
||||
},
|
||||
rateLimit: {
|
||||
windowMs: parseInt(process.env.RATE_LIMIT_WINDOW_MS || '900000', 10),
|
||||
maxRequests: parseInt(process.env.RATE_LIMIT_MAX_REQUESTS || '100', 10),
|
||||
},
|
||||
security: {
|
||||
corsOrigin: process.env.CORS_ORIGIN || 'http://localhost:3000',
|
||||
jwtSecret: process.env.JWT_SECRET || 'your_jwt_secret_key',
|
||||
},
|
||||
logging: {
|
||||
level: process.env.LOG_LEVEL || 'info',
|
||||
},
|
||||
};
|
||||
|
||||
// Validate required configuration
|
||||
const validateConfig = () => {
|
||||
if (!config.supabase.url) {
|
||||
throw new Error('SUPABASE_URL is required');
|
||||
}
|
||||
if (!config.supabase.anonKey) {
|
||||
throw new Error('SUPABASE_ANON_KEY is required');
|
||||
}
|
||||
};
|
||||
|
||||
validateConfig();
|
||||
|
||||
export { config };
|
69
src/core/agentPicker.ts
Normal file
@@ -0,0 +1,69 @@
|
||||
import { z } from 'zod';
|
||||
import { OpenAI } from '@langchain/openai';
|
||||
import { RunnableSequence } from '@langchain/core/runnables';
|
||||
import { StructuredOutputParser } from 'langchain/output_parsers';
|
||||
import { PromptTemplate } from '@langchain/core/prompts';
|
||||
|
||||
const availableAgents = [
|
||||
{
|
||||
name: 'webSearch',
|
||||
description:
|
||||
'It is expert is searching the web for information and answer user queries',
|
||||
},
|
||||
/* {
|
||||
name: 'academicSearch',
|
||||
description:
|
||||
'It is expert is searching the academic databases for information and answer user queries. It is particularly good at finding research papers and articles on topics like science, engineering, and technology. Use this instead of wolframAlphaSearch if the user query is not mathematical or scientific in nature',
|
||||
},
|
||||
{
|
||||
name: 'youtubeSearch',
|
||||
description:
|
||||
'This model is expert at finding videos on youtube based on user queries',
|
||||
},
|
||||
{
|
||||
name: 'wolframAlphaSearch',
|
||||
description:
|
||||
'This model is expert at finding answers to mathematical and scientific questions based on user queries.',
|
||||
},
|
||||
{
|
||||
name: 'redditSearch',
|
||||
description:
|
||||
'This model is expert at finding posts and discussions on reddit based on user queries',
|
||||
},
|
||||
{
|
||||
name: 'writingAssistant',
|
||||
description:
|
||||
'If there is no need for searching, this model is expert at generating text based on user queries',
|
||||
}, */
|
||||
];
|
||||
|
||||
const parser = StructuredOutputParser.fromZodSchema(
|
||||
z.object({
|
||||
agent: z.string().describe('The name of the selected agent'),
|
||||
}),
|
||||
);
|
||||
|
||||
const prompt = `
|
||||
You are an AI model who is expert at finding suitable agents for user queries. The available agents are:
|
||||
${availableAgents.map((agent) => `- ${agent.name}: ${agent.description}`).join('\n')}
|
||||
|
||||
Your task is to find the most suitable agent for the following query: {query}
|
||||
|
||||
{format_instructions}
|
||||
`;
|
||||
|
||||
const chain = RunnableSequence.from([
|
||||
PromptTemplate.fromTemplate(prompt),
|
||||
new OpenAI({ temperature: 0 }),
|
||||
parser,
|
||||
]);
|
||||
|
||||
const pickSuitableAgent = async (query: string) => {
|
||||
const res = await chain.invoke({
|
||||
query,
|
||||
format_instructions: parser.getFormatInstructions(),
|
||||
});
|
||||
return res.agent;
|
||||
};
|
||||
|
||||
export default pickSuitableAgent;
|
42
src/core/searxng.ts
Normal file
@@ -0,0 +1,42 @@
|
||||
import axios from 'axios';
|
||||
|
||||
interface SearxngSearchOptions {
|
||||
categories?: string[];
|
||||
engines?: string[];
|
||||
language?: string;
|
||||
pageno?: number;
|
||||
}
|
||||
|
||||
interface SearxngSearchResult {
|
||||
title: string;
|
||||
url: string;
|
||||
img_src?: string;
|
||||
thumbnail_src?: string;
|
||||
content?: string;
|
||||
author?: string;
|
||||
}
|
||||
|
||||
export const searchSearxng = async (
|
||||
query: string,
|
||||
opts?: SearxngSearchOptions,
|
||||
) => {
|
||||
const url = new URL(`${process.env.SEARXNG_API_URL}/search?format=json`);
|
||||
url.searchParams.append('q', query);
|
||||
|
||||
if (opts) {
|
||||
Object.keys(opts).forEach((key) => {
|
||||
if (Array.isArray(opts[key])) {
|
||||
url.searchParams.append(key, opts[key].join(','));
|
||||
return;
|
||||
}
|
||||
url.searchParams.append(key, opts[key]);
|
||||
});
|
||||
}
|
||||
|
||||
const res = await axios.get(url.toString());
|
||||
|
||||
const results: SearxngSearchResult[] = res.data.results;
|
||||
const suggestions: string[] = res.data.suggestions;
|
||||
|
||||
return { results, suggestions };
|
||||
};
|
@@ -1,10 +0,0 @@
|
||||
import { drizzle } from 'drizzle-orm/better-sqlite3';
|
||||
import Database from 'better-sqlite3';
|
||||
import * as schema from './schema';
|
||||
|
||||
const sqlite = new Database('data/db.sqlite');
|
||||
const db = drizzle(sqlite, {
|
||||
schema: schema,
|
||||
});
|
||||
|
||||
export default db;
|
@@ -1,28 +0,0 @@
|
||||
import { sql } from 'drizzle-orm';
import { text, integer, sqliteTable } from 'drizzle-orm/sqlite-core';

// One chat message. `chatId` links a message to its parent chat row;
// `messageId` is a separate per-message identifier distinct from the
// auto-increment `id`.
export const messages = sqliteTable('messages', {
  id: integer('id').primaryKey(),
  content: text('content').notNull(),
  chatId: text('chatId').notNull(),
  messageId: text('messageId').notNull(),
  // Stored in the DB under the column name 'type' but exposed as `role`.
  role: text('type', { enum: ['assistant', 'user'] }),
  // Arbitrary JSON payload attached to a message.
  metadata: text('metadata', {
    mode: 'json',
  }),
});

// File attached to a chat; persisted inside the chats.files JSON column.
interface File {
  name: string;
  fileId: string;
}

// One chat session. `createdAt` is stored as text — presumably an ISO
// timestamp, TODO confirm against the code that writes it.
export const chats = sqliteTable('chats', {
  id: text('id').primaryKey(),
  title: text('title').notNull(),
  createdAt: text('createdAt').notNull(),
  focusMode: text('focusMode').notNull(),
  // JSON array of attached files; defaults to an empty list.
  files: text('files', { mode: 'json' })
    .$type<File[]>()
    .default(sql`'[]'`),
});
|
24
src/index.ts
@@ -1,24 +0,0 @@
|
||||
import './config/env'; // Load environment variables first
|
||||
import { startServer } from './server';
|
||||
import { isPortAvailable } from './utils/portCheck';
|
||||
import { testConnection } from './lib/supabase';
|
||||
|
||||
const PORT = process.env.PORT || 3001;
|
||||
|
||||
const init = async () => {
|
||||
if (!await isPortAvailable(PORT)) {
|
||||
console.error(`Port ${PORT} is in use. Please try a different port or free up the current one.`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Test Supabase connection
|
||||
const isConnected = await testConnection();
|
||||
if (!isConnected) {
|
||||
console.error('Failed to connect to Supabase. Please check your configuration.');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
startServer();
|
||||
};
|
||||
|
||||
init().catch(console.error);
|
@@ -1,116 +0,0 @@
|
||||
// Top-level business category shown in the UI; `icon` is an emoji glyph.
export interface Category {
  id: string;
  name: string;
  icon: string;
  subcategories: SubCategory[];
}

// A selectable refinement within a Category.
export interface SubCategory {
  id: string;
  name: string;
}

// Static catalogue of searchable business categories. IDs are stable
// slugs used as lookup keys; names are display strings.
export const categories: Category[] = [
  {
    id: 'real-estate-pros',
    name: 'Real Estate Professionals',
    icon: '🏢',
    subcategories: [
      { id: 'wholesalers', name: 'Real Estate Wholesalers' },
      { id: 'agents', name: 'Real Estate Agents' },
      { id: 'attorneys', name: 'Real Estate Attorneys' },
      { id: 'scouts', name: 'Property Scouts' },
      { id: 'brokers', name: 'Real Estate Brokers' },
      { id: 'consultants', name: 'Real Estate Consultants' }
    ]
  },
  {
    id: 'legal-title',
    name: 'Legal & Title Services',
    icon: '⚖️',
    subcategories: [
      { id: 'title-companies', name: 'Title Companies' },
      { id: 'closing-attorneys', name: 'Closing Attorneys' },
      { id: 'zoning-consultants', name: 'Zoning Consultants' },
      { id: 'probate-specialists', name: 'Probate Specialists' },
      { id: 'eviction-specialists', name: 'Eviction Specialists' }
    ]
  },
  {
    id: 'financial',
    name: 'Financial Services',
    icon: '💰',
    subcategories: [
      { id: 'hard-money', name: 'Hard Money Lenders' },
      { id: 'private-equity', name: 'Private Equity Investors' },
      { id: 'mortgage-brokers', name: 'Mortgage Brokers' },
      { id: 'tax-advisors', name: 'Tax Advisors' },
      { id: 'appraisers', name: 'Appraisers' }
    ]
  },
  {
    id: 'contractors',
    name: 'Specialist Contractors',
    icon: '🔨',
    subcategories: [
      { id: 'general', name: 'General Contractors' },
      { id: 'plumbers', name: 'Plumbers' },
      { id: 'electricians', name: 'Electricians' },
      { id: 'hvac', name: 'HVAC Technicians' },
      { id: 'roofers', name: 'Roofers' },
      { id: 'foundation', name: 'Foundation Specialists' },
      { id: 'asbestos', name: 'Asbestos Removal' },
      { id: 'mold', name: 'Mold Remediation' }
    ]
  },
  {
    id: 'property-services',
    name: 'Property Services',
    icon: '🏠',
    subcategories: [
      { id: 'surveyors', name: 'Surveyors' },
      { id: 'inspectors', name: 'Inspectors' },
      { id: 'property-managers', name: 'Property Managers' },
      { id: 'environmental', name: 'Environmental Consultants' },
      { id: 'junk-removal', name: 'Junk Removal Services' },
      { id: 'cleaning', name: 'Property Cleaning' }
    ]
  },
  {
    id: 'marketing',
    name: 'Marketing & Lead Gen',
    icon: '📢',
    subcategories: [
      { id: 'direct-mail', name: 'Direct Mail Services' },
      { id: 'social-media', name: 'Social Media Marketing' },
      { id: 'seo', name: 'SEO Specialists' },
      { id: 'ppc', name: 'PPC Advertising' },
      { id: 'lead-gen', name: 'Lead Generation' },
      { id: 'skip-tracing', name: 'Skip Tracing Services' }
    ]
  },
  {
    id: 'data-tech',
    name: 'Data & Technology',
    icon: '💻',
    subcategories: [
      { id: 'data-providers', name: 'Property Data Providers' },
      { id: 'crm', name: 'CRM Systems' },
      { id: 'valuation', name: 'Valuation Tools' },
      { id: 'virtual-tours', name: 'Virtual Tour Services' },
      { id: 'automation', name: 'Automation Tools' }
    ]
  },
  {
    id: 'specialty',
    name: 'Specialty Services',
    icon: '🎯',
    subcategories: [
      { id: 'auction', name: 'Auction Companies' },
      { id: 'relocation', name: 'Relocation Services' },
      { id: 'staging', name: 'Home Staging' },
      { id: 'photography', name: 'Real Estate Photography' },
      { id: 'virtual-assistant', name: 'Virtual Assistants' }
    ]
  }
];
|
@@ -1,51 +0,0 @@
|
||||
import { Database } from 'better-sqlite3';
|
||||
import path from 'path';
|
||||
|
||||
interface OptOutEntry {
|
||||
domain: string;
|
||||
email: string;
|
||||
reason?: string;
|
||||
timestamp: Date;
|
||||
}
|
||||
|
||||
export class OptOutDatabase {
|
||||
private db: Database;
|
||||
|
||||
constructor() {
|
||||
this.db = new Database(path.join(__dirname, '../../../data/optout.db'));
|
||||
this.initializeDatabase();
|
||||
}
|
||||
|
||||
private initializeDatabase() {
|
||||
this.db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS opt_outs (
|
||||
domain TEXT PRIMARY KEY,
|
||||
email TEXT NOT NULL,
|
||||
reason TEXT,
|
||||
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_domain ON opt_outs(domain);
|
||||
`);
|
||||
}
|
||||
|
||||
async addOptOut(entry: OptOutEntry): Promise<void> {
|
||||
const stmt = this.db.prepare(
|
||||
'INSERT OR REPLACE INTO opt_outs (domain, email, reason, timestamp) VALUES (?, ?, ?, ?)'
|
||||
);
|
||||
stmt.run(entry.domain, entry.email, entry.reason, entry.timestamp.toISOString());
|
||||
}
|
||||
|
||||
isOptedOut(domain: string): boolean {
|
||||
const stmt = this.db.prepare('SELECT 1 FROM opt_outs WHERE domain = ?');
|
||||
return stmt.get(domain) !== undefined;
|
||||
}
|
||||
|
||||
removeOptOut(domain: string): void {
|
||||
const stmt = this.db.prepare('DELETE FROM opt_outs WHERE domain = ?');
|
||||
stmt.run(domain);
|
||||
}
|
||||
|
||||
getOptOutList(): OptOutEntry[] {
|
||||
return this.db.prepare('SELECT * FROM opt_outs').all();
|
||||
}
|
||||
}
|
@@ -1,74 +0,0 @@
|
||||
import { createClient } from '@supabase/supabase-js';
|
||||
import { BusinessData } from '../searxng';
|
||||
import { env } from '../../config/env';
|
||||
|
||||
// Create the Supabase client with validated environment variables
|
||||
export const supabase = createClient(
|
||||
env.supabase.url,
|
||||
env.supabase.anonKey,
|
||||
{
|
||||
auth: {
|
||||
persistSession: false // Since this is a server environment
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
// Define the cache record type
|
||||
export interface CacheRecord {
|
||||
id: string;
|
||||
query: string;
|
||||
results: BusinessData[];
|
||||
location: string;
|
||||
category: string;
|
||||
created_at: string;
|
||||
updated_at: string;
|
||||
expires_at: string;
|
||||
}
|
||||
|
||||
// Export database helper functions
|
||||
export async function getCacheEntry(
|
||||
category: string,
|
||||
location: string
|
||||
): Promise<CacheRecord | null> {
|
||||
const { data, error } = await supabase
|
||||
.from('search_cache')
|
||||
.select('*')
|
||||
.eq('category', category.toLowerCase())
|
||||
.eq('location', location.toLowerCase())
|
||||
.gt('expires_at', new Date().toISOString())
|
||||
.order('created_at', { ascending: false })
|
||||
.limit(1)
|
||||
.single();
|
||||
|
||||
if (error) {
|
||||
console.error('Cache lookup failed:', error);
|
||||
return null;
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
export async function saveCacheEntry(
|
||||
category: string,
|
||||
location: string,
|
||||
results: BusinessData[],
|
||||
expiresInDays: number = 7
|
||||
): Promise<void> {
|
||||
const expiresAt = new Date();
|
||||
expiresAt.setDate(expiresAt.getDate() + expiresInDays);
|
||||
|
||||
const { error } = await supabase
|
||||
.from('search_cache')
|
||||
.insert({
|
||||
query: `${category} in ${location}`,
|
||||
category: category.toLowerCase(),
|
||||
location: location.toLowerCase(),
|
||||
results,
|
||||
expires_at: expiresAt.toISOString()
|
||||
});
|
||||
|
||||
if (error) {
|
||||
console.error('Failed to save cache entry:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
@@ -1,195 +0,0 @@
|
||||
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Cache } from './utils/cache';
import { RateLimiter } from './utils/rateLimiter';
import robotsParser from 'robots-parser';

// Aggregated contact details extracted from one page, plus provenance.
interface ScrapingResult {
  emails: string[];
  phones: string[];
  addresses: string[];
  socialLinks: string[];
  source: string;      // URL the data came from
  timestamp: Date;     // when the scrape ran
  attribution: string; // provenance note, or why extraction was skipped
}

/**
 * Polite contact-information scraper: consults robots.txt, respects
 * noindex, rate-limits outgoing requests, and caches results per URL.
 */
export class EmailScraper {
  private cache: Cache<ScrapingResult>;
  private rateLimiter: RateLimiter;
  // One robots.txt parser per host so each host's file is fetched once.
  private robotsCache = new Map<string, any>();

  // Options double as defaults via a default parameter value; cacheTTL
  // units are whatever Cache expects — presumably minutes, TODO confirm.
  constructor(private options = {
    timeout: 5000,
    cacheTTL: 60,
    rateLimit: { windowMs: 60000, maxRequests: 10 }, // More conservative rate limiting
    userAgent: 'BizSearch/1.0 (+https://your-domain.com/about) - Business Directory Service'
  }) {
    this.cache = new Cache<ScrapingResult>(options.cacheTTL);
    this.rateLimiter = new RateLimiter(options.rateLimit.windowMs, options.rateLimit.maxRequests);
  }

  // True when our user agent may fetch `url` per the host's robots.txt.
  // Fails open (returns true) if robots.txt cannot be fetched or parsed.
  private async checkRobotsPermission(url: string): Promise<boolean> {
    try {
      const { protocol, host } = new URL(url);
      const robotsUrl = `${protocol}//${host}/robots.txt`;

      let parser = this.robotsCache.get(host);
      if (!parser) {
        const response = await axios.get(robotsUrl);
        parser = robotsParser(robotsUrl, response.data);
        this.robotsCache.set(host, parser);
      }

      return parser.isAllowed(url, this.options.userAgent);
    } catch (error) {
      console.warn(`Could not check robots.txt for ${url}:`, error);
      return true; // Assume allowed if robots.txt is unavailable
    }
  }

  /**
   * Scrapes contact details from `url`.
   *
   * Order of operations: cache hit -> robots.txt check -> rate-limit
   * wait -> fetch -> noindex check -> extraction. Robots/noindex
   * restrictions and fetch errors all return an empty result whose
   * `attribution` explains why; they never throw.
   */
  async scrapeEmails(url: string): Promise<ScrapingResult> {
    // Check cache first
    const cached = this.cache.get(url);
    if (cached) return cached;

    // Check robots.txt
    const allowed = await this.checkRobotsPermission(url);
    if (!allowed) {
      console.log(`Respecting robots.txt disallow for ${url}`);
      return {
        emails: [],
        phones: [],
        addresses: [],
        socialLinks: [],
        source: url,
        timestamp: new Date(),
        attribution: 'Restricted by robots.txt'
      };
    }

    // Wait for rate limiting slot
    await this.rateLimiter.waitForSlot();

    try {
      const response = await axios.get(url, {
        timeout: this.options.timeout,
        headers: {
          'User-Agent': this.options.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
      });

      // Check for noindex meta tag
      const $ = cheerio.load(response.data);
      if ($('meta[name="robots"][content*="noindex"]').length > 0) {
        return {
          emails: [],
          phones: [],
          addresses: [],
          socialLinks: [],
          source: url,
          timestamp: new Date(),
          attribution: 'Respecting noindex directive'
        };
      }

      // Only extract contact information from public contact pages or structured data
      const isContactPage = /contact|about/i.test(url) ||
        $('h1, h2').text().toLowerCase().includes('contact');

      // Sets deduplicate during collection; converted to arrays below.
      const result = {
        emails: new Set<string>(),
        phones: new Set<string>(),
        addresses: new Set<string>(),
        socialLinks: new Set<string>(),
        source: url,
        timestamp: new Date(),
        attribution: `Data from public business listing at ${new URL(url).hostname}`
      };

      // Extract from structured data (Schema.org)
      $('script[type="application/ld+json"]').each((_, element) => {
        try {
          const data = JSON.parse($(element).html() || '{}');
          if (data['@type'] === 'LocalBusiness' || data['@type'] === 'Organization') {
            if (data.email) result.emails.add(data.email.toLowerCase());
            if (data.telephone) result.phones.add(this.formatPhoneNumber(data.telephone));
            if (data.address) {
              const fullAddress = this.formatAddress(data.address);
              if (fullAddress) result.addresses.add(fullAddress);
            }
          }
        } catch (e) {
          console.error('Error parsing JSON-LD:', e);
        }
      });

      // Only scrape additional info if it's a contact page
      if (isContactPage) {
        // Extract clearly marked contact information
        $('[itemprop="email"], .contact-email, .email').each((_, element) => {
          const email = $(element).text().trim();
          if (this.isValidEmail(email)) {
            result.emails.add(email.toLowerCase());
          }
        });

        $('[itemprop="telephone"], .phone, .contact-phone').each((_, element) => {
          const phone = $(element).text().trim();
          const formatted = this.formatPhoneNumber(phone);
          if (formatted) result.phones.add(formatted);
        });
      }

      // Materialise the Sets into the array-based public result shape.
      const finalResult = {
        ...result,
        emails: Array.from(result.emails),
        phones: Array.from(result.phones),
        addresses: Array.from(result.addresses),
        socialLinks: Array.from(result.socialLinks)
      };

      this.cache.set(url, finalResult);
      return finalResult;

    } catch (error) {
      console.error(`Failed to scrape ${url}:`, error);
      return {
        emails: [],
        phones: [],
        addresses: [],
        socialLinks: [],
        source: url,
        timestamp: new Date(),
        attribution: 'Error accessing page'
      };
    }
  }

  // Basic single-address email pattern; rejects whitespace and multi-@.
  private isValidEmail(email: string): boolean {
    return /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/.test(email);
  }

  // Normalises 10-digit numbers to "(nnn) nnn-nnnn"; anything else is
  // returned unchanged.
  private formatPhoneNumber(phone: string): string {
    const digits = phone.replace(/\D/g, '');
    if (digits.length === 10) {
      return `(${digits.slice(0,3)}) ${digits.slice(3,6)}-${digits.slice(6)}`;
    }
    return phone;
  }

  // Accepts either a plain string or a Schema.org PostalAddress object;
  // returns a comma-joined address or null when nothing usable exists.
  private formatAddress(address: any): string | null {
    if (typeof address === 'string') return address;
    if (typeof address === 'object') {
      const parts = [
        address.streetAddress,
        address.addressLocality,
        address.addressRegion,
        address.postalCode
      ].filter(Boolean);
      if (parts.length > 0) return parts.join(', ');
    }
    return null;
  }
}
|
@@ -1,82 +0,0 @@
|
||||
import { Embeddings, type EmbeddingsParams } from '@langchain/core/embeddings';
import { chunkArray } from '@langchain/core/utils/chunk_array';

/**
 * Configuration for local (in-process) HuggingFace embeddings served via
 * @xenova/transformers. `model` and `modelName` are aliases; either may
 * be supplied and both are kept in sync.
 */
export interface HuggingFaceTransformersEmbeddingsParams
  extends EmbeddingsParams {
  modelName: string;

  model: string;

  timeout?: number;

  batchSize?: number;

  stripNewLines?: boolean;
}

/**
 * Embeddings implementation that runs a transformers.js feature-extraction
 * pipeline locally. The pipeline is created lazily on first use and then
 * shared across calls.
 */
export class HuggingFaceTransformersEmbeddings
  extends Embeddings
  implements HuggingFaceTransformersEmbeddingsParams
{
  modelName = 'Xenova/all-MiniLM-L6-v2';

  model = 'Xenova/all-MiniLM-L6-v2';

  batchSize = 512;

  stripNewLines = true;

  timeout?: number;

  // Memoised pipeline; assigned on first runEmbedding call via `??=`.
  private pipelinePromise: Promise<any>;

  constructor(fields?: Partial<HuggingFaceTransformersEmbeddingsParams>) {
    super(fields ?? {});

    // `model` wins over `modelName` when both are given; both fields end
    // up holding the same value afterwards.
    this.modelName = fields?.model ?? fields?.modelName ?? this.model;
    this.model = this.modelName;
    this.stripNewLines = fields?.stripNewLines ?? this.stripNewLines;
    this.timeout = fields?.timeout;
  }

  /**
   * Embeds many texts, batched by `batchSize`; batches run concurrently
   * and results are flattened back into input order.
   */
  async embedDocuments(texts: string[]): Promise<number[][]> {
    const batches = chunkArray(
      this.stripNewLines ? texts.map((t) => t.replace(/\n/g, ' ')) : texts,
      this.batchSize,
    );

    const batchRequests = batches.map((batch) => this.runEmbedding(batch));
    const batchResponses = await Promise.all(batchRequests);
    const embeddings: number[][] = [];

    for (let i = 0; i < batchResponses.length; i += 1) {
      const batchResponse = batchResponses[i];
      for (let j = 0; j < batchResponse.length; j += 1) {
        embeddings.push(batchResponse[j]);
      }
    }

    return embeddings;
  }

  /** Embeds a single query string and returns its vector. */
  async embedQuery(text: string): Promise<number[]> {
    const data = await this.runEmbedding([
      this.stripNewLines ? text.replace(/\n/g, ' ') : text,
    ]);
    return data[0];
  }

  // Runs the (lazily created) pipeline over `texts` with mean pooling and
  // L2 normalization; `this.caller` applies the base class's retry logic.
  private async runEmbedding(texts: string[]) {
    const { pipeline } = await import('@xenova/transformers');

    const pipe = await (this.pipelinePromise ??= pipeline(
      'feature-extraction',
      this.model,
    ));

    return this.caller.call(async () => {
      const output = await pipe(texts, { pooling: 'mean', normalize: true });
      return output.tolist();
    });
  }
}
|
@@ -1,48 +0,0 @@
|
||||
import { BaseOutputParser } from '@langchain/core/output_parsers';
|
||||
|
||||
interface LineOutputParserArgs {
|
||||
key?: string;
|
||||
}
|
||||
|
||||
class LineOutputParser extends BaseOutputParser<string> {
|
||||
private key = 'questions';
|
||||
|
||||
constructor(args?: LineOutputParserArgs) {
|
||||
super();
|
||||
this.key = args.key ?? this.key;
|
||||
}
|
||||
|
||||
static lc_name() {
|
||||
return 'LineOutputParser';
|
||||
}
|
||||
|
||||
lc_namespace = ['langchain', 'output_parsers', 'line_output_parser'];
|
||||
|
||||
async parse(text: string): Promise<string> {
|
||||
text = text.trim() || '';
|
||||
|
||||
const regex = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;
|
||||
const startKeyIndex = text.indexOf(`<${this.key}>`);
|
||||
const endKeyIndex = text.indexOf(`</${this.key}>`);
|
||||
|
||||
if (startKeyIndex === -1 || endKeyIndex === -1) {
|
||||
return '';
|
||||
}
|
||||
|
||||
const questionsStartIndex =
|
||||
startKeyIndex === -1 ? 0 : startKeyIndex + `<${this.key}>`.length;
|
||||
const questionsEndIndex = endKeyIndex === -1 ? text.length : endKeyIndex;
|
||||
const line = text
|
||||
.slice(questionsStartIndex, questionsEndIndex)
|
||||
.trim()
|
||||
.replace(regex, '');
|
||||
|
||||
return line;
|
||||
}
|
||||
|
||||
getFormatInstructions(): string {
|
||||
throw new Error('Not implemented.');
|
||||
}
|
||||
}
|
||||
|
||||
export default LineOutputParser;
|
@@ -1,50 +0,0 @@
|
||||
import { BaseOutputParser } from '@langchain/core/output_parsers';
|
||||
|
||||
interface LineListOutputParserArgs {
|
||||
key?: string;
|
||||
}
|
||||
|
||||
class LineListOutputParser extends BaseOutputParser<string[]> {
|
||||
private key = 'questions';
|
||||
|
||||
constructor(args?: LineListOutputParserArgs) {
|
||||
super();
|
||||
this.key = args.key ?? this.key;
|
||||
}
|
||||
|
||||
static lc_name() {
|
||||
return 'LineListOutputParser';
|
||||
}
|
||||
|
||||
lc_namespace = ['langchain', 'output_parsers', 'line_list_output_parser'];
|
||||
|
||||
async parse(text: string): Promise<string[]> {
|
||||
text = text.trim() || '';
|
||||
|
||||
const regex = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;
|
||||
const startKeyIndex = text.indexOf(`<${this.key}>`);
|
||||
const endKeyIndex = text.indexOf(`</${this.key}>`);
|
||||
|
||||
if (startKeyIndex === -1 || endKeyIndex === -1) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const questionsStartIndex =
|
||||
startKeyIndex === -1 ? 0 : startKeyIndex + `<${this.key}>`.length;
|
||||
const questionsEndIndex = endKeyIndex === -1 ? text.length : endKeyIndex;
|
||||
const lines = text
|
||||
.slice(questionsStartIndex, questionsEndIndex)
|
||||
.trim()
|
||||
.split('\n')
|
||||
.filter((line) => line.trim() !== '')
|
||||
.map((line) => line.replace(regex, ''));
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
getFormatInstructions(): string {
|
||||
throw new Error('Not implemented.');
|
||||
}
|
||||
}
|
||||
|
||||
export default LineListOutputParser;
|
@@ -1,59 +0,0 @@
|
||||
import { ChatAnthropic } from '@langchain/anthropic';
|
||||
import { getAnthropicApiKey } from '../../config';
|
||||
import logger from '../../utils/logger';
|
||||
|
||||
export const loadAnthropicChatModels = async () => {
|
||||
const anthropicApiKey = getAnthropicApiKey();
|
||||
|
||||
if (!anthropicApiKey) return {};
|
||||
|
||||
try {
|
||||
const chatModels = {
|
||||
'claude-3-5-sonnet-20241022': {
|
||||
displayName: 'Claude 3.5 Sonnet',
|
||||
model: new ChatAnthropic({
|
||||
temperature: 0.7,
|
||||
anthropicApiKey: anthropicApiKey,
|
||||
model: 'claude-3-5-sonnet-20241022',
|
||||
}),
|
||||
},
|
||||
'claude-3-5-haiku-20241022': {
|
||||
displayName: 'Claude 3.5 Haiku',
|
||||
model: new ChatAnthropic({
|
||||
temperature: 0.7,
|
||||
anthropicApiKey: anthropicApiKey,
|
||||
model: 'claude-3-5-haiku-20241022',
|
||||
}),
|
||||
},
|
||||
'claude-3-opus-20240229': {
|
||||
displayName: 'Claude 3 Opus',
|
||||
model: new ChatAnthropic({
|
||||
temperature: 0.7,
|
||||
anthropicApiKey: anthropicApiKey,
|
||||
model: 'claude-3-opus-20240229',
|
||||
}),
|
||||
},
|
||||
'claude-3-sonnet-20240229': {
|
||||
displayName: 'Claude 3 Sonnet',
|
||||
model: new ChatAnthropic({
|
||||
temperature: 0.7,
|
||||
anthropicApiKey: anthropicApiKey,
|
||||
model: 'claude-3-sonnet-20240229',
|
||||
}),
|
||||
},
|
||||
'claude-3-haiku-20240307': {
|
||||
displayName: 'Claude 3 Haiku',
|
||||
model: new ChatAnthropic({
|
||||
temperature: 0.7,
|
||||
anthropicApiKey: anthropicApiKey,
|
||||
model: 'claude-3-haiku-20240307',
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return chatModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Anthropic models: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,19 +0,0 @@
|
||||
import { Business, SearchParams } from '../../../types/business';
|
||||
import { WebScraperProvider } from './webScraper';
|
||||
|
||||
export class BusinessProvider {
|
||||
private scraper: WebScraperProvider;
|
||||
|
||||
constructor() {
|
||||
this.scraper = new WebScraperProvider();
|
||||
}
|
||||
|
||||
async search(params: SearchParams): Promise<Business[]> {
|
||||
return this.scraper.search(params);
|
||||
}
|
||||
|
||||
async getDetails(businessId: string): Promise<Business | null> {
|
||||
// Implement detailed business lookup using stored data or additional scraping
|
||||
return null;
|
||||
}
|
||||
}
|
@@ -1,111 +0,0 @@
|
||||
import { Business, SearchParams } from '../../../types/business';
|
||||
import { searchWeb } from '../search'; // This is Perplexica's existing search function
|
||||
import { parseHTML } from '../utils/parser';
|
||||
|
||||
export class WebScraperProvider {
|
||||
async search(params: SearchParams): Promise<Business[]> {
|
||||
const searchQueries = this.generateQueries(params);
|
||||
const businesses: Business[] = [];
|
||||
|
||||
for (const query of searchQueries) {
|
||||
// Use Perplexica's existing search functionality
|
||||
const results = await searchWeb(query, {
|
||||
maxResults: 20,
|
||||
type: 'general' // or 'news' depending on what we want
|
||||
});
|
||||
|
||||
for (const result of results) {
|
||||
try {
|
||||
const html = await fetch(result.url).then(res => res.text());
|
||||
const businessData = await this.extractBusinessData(html, result.url);
|
||||
if (businessData) {
|
||||
businesses.push(businessData);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Failed to extract data from ${result.url}:`, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return this.deduplicateBusinesses(businesses);
|
||||
}
|
||||
|
||||
private generateQueries(params: SearchParams): string[] {
|
||||
const { location, category } = params;
|
||||
return [
|
||||
`${category} in ${location}`,
|
||||
`${category} business ${location}`,
|
||||
`best ${category} near ${location}`,
|
||||
`${category} services ${location} reviews`
|
||||
];
|
||||
}
|
||||
|
||||
private async extractBusinessData(html: string, sourceUrl: string): Promise<Business | null> {
|
||||
const $ = parseHTML(html);
|
||||
|
||||
// Different extraction logic based on source
|
||||
if (sourceUrl.includes('yelp.com')) {
|
||||
return this.extractYelpData($);
|
||||
} else if (sourceUrl.includes('yellowpages.com')) {
|
||||
return this.extractYellowPagesData($);
|
||||
}
|
||||
// ... other source-specific extractors
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private extractYelpData($: any): Business | null {
|
||||
try {
|
||||
return {
|
||||
id: crypto.randomUUID(),
|
||||
name: $('.business-name').text().trim(),
|
||||
phone: $('.phone-number').text().trim(),
|
||||
address: $('.address').text().trim(),
|
||||
city: $('.city').text().trim(),
|
||||
state: $('.state').text().trim(),
|
||||
zip: $('.zip').text().trim(),
|
||||
category: $('.category-str-list').text().split(',').map(s => s.trim()),
|
||||
rating: parseFloat($('.rating').text()),
|
||||
reviewCount: parseInt($('.review-count').text()),
|
||||
services: $('.services-list').text().split(',').map(s => s.trim()),
|
||||
hours: this.extractHours($),
|
||||
website: $('.website-link').attr('href'),
|
||||
verified: false,
|
||||
lastUpdated: new Date()
|
||||
};
|
||||
} catch (error) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private deduplicateBusinesses(businesses: Business[]): Business[] {
|
||||
// Group by phone number and address to identify duplicates
|
||||
const uniqueBusinesses = new Map<string, Business>();
|
||||
|
||||
for (const business of businesses) {
|
||||
const key = `${business.phone}-${business.address}`.toLowerCase();
|
||||
if (!uniqueBusinesses.has(key)) {
|
||||
uniqueBusinesses.set(key, business);
|
||||
} else {
|
||||
// Merge data if we have additional information
|
||||
const existing = uniqueBusinesses.get(key)!;
|
||||
uniqueBusinesses.set(key, this.mergeBusinessData(existing, business));
|
||||
}
|
||||
}
|
||||
|
||||
return Array.from(uniqueBusinesses.values());
|
||||
}
|
||||
|
||||
private mergeBusinessData(existing: Business, newData: Business): Business {
|
||||
return {
|
||||
...existing,
|
||||
services: [...new Set([...existing.services, ...newData.services])],
|
||||
rating: (existing.rating + newData.rating) / 2,
|
||||
reviewCount: existing.reviewCount + newData.reviewCount,
|
||||
// Keep the most complete data for other fields
|
||||
website: existing.website || newData.website,
|
||||
email: existing.email || newData.email,
|
||||
hours: existing.hours || newData.hours
|
||||
};
|
||||
}
|
||||
}
|
@@ -1,69 +0,0 @@
|
||||
import {
|
||||
ChatGoogleGenerativeAI,
|
||||
GoogleGenerativeAIEmbeddings,
|
||||
} from '@langchain/google-genai';
|
||||
import { getGeminiApiKey } from '../../config';
|
||||
import logger from '../../utils/logger';
|
||||
|
||||
export const loadGeminiChatModels = async () => {
|
||||
const geminiApiKey = getGeminiApiKey();
|
||||
|
||||
if (!geminiApiKey) return {};
|
||||
|
||||
try {
|
||||
const chatModels = {
|
||||
'gemini-1.5-flash': {
|
||||
displayName: 'Gemini 1.5 Flash',
|
||||
model: new ChatGoogleGenerativeAI({
|
||||
modelName: 'gemini-1.5-flash',
|
||||
temperature: 0.7,
|
||||
apiKey: geminiApiKey,
|
||||
}),
|
||||
},
|
||||
'gemini-1.5-flash-8b': {
|
||||
displayName: 'Gemini 1.5 Flash 8B',
|
||||
model: new ChatGoogleGenerativeAI({
|
||||
modelName: 'gemini-1.5-flash-8b',
|
||||
temperature: 0.7,
|
||||
apiKey: geminiApiKey,
|
||||
}),
|
||||
},
|
||||
'gemini-1.5-pro': {
|
||||
displayName: 'Gemini 1.5 Pro',
|
||||
model: new ChatGoogleGenerativeAI({
|
||||
modelName: 'gemini-1.5-pro',
|
||||
temperature: 0.7,
|
||||
apiKey: geminiApiKey,
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return chatModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Gemini models: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
export const loadGeminiEmbeddingsModels = async () => {
|
||||
const geminiApiKey = getGeminiApiKey();
|
||||
|
||||
if (!geminiApiKey) return {};
|
||||
|
||||
try {
|
||||
const embeddingModels = {
|
||||
'text-embedding-004': {
|
||||
displayName: 'Text Embedding',
|
||||
model: new GoogleGenerativeAIEmbeddings({
|
||||
apiKey: geminiApiKey,
|
||||
modelName: 'text-embedding-004',
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return embeddingModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Gemini embeddings model: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,136 +0,0 @@
|
||||
import { ChatOpenAI } from '@langchain/openai';
|
||||
import { getGroqApiKey } from '../../config';
|
||||
import logger from '../../utils/logger';
|
||||
|
||||
export const loadGroqChatModels = async () => {
|
||||
const groqApiKey = getGroqApiKey();
|
||||
|
||||
if (!groqApiKey) return {};
|
||||
|
||||
try {
|
||||
const chatModels = {
|
||||
'llama-3.3-70b-versatile': {
|
||||
displayName: 'Llama 3.3 70B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama-3.3-70b-versatile',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama-3.2-3b-preview': {
|
||||
displayName: 'Llama 3.2 3B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama-3.2-3b-preview',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama-3.2-11b-vision-preview': {
|
||||
displayName: 'Llama 3.2 11B Vision',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama-3.2-11b-vision-preview',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama-3.2-90b-vision-preview': {
|
||||
displayName: 'Llama 3.2 90B Vision',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama-3.2-90b-vision-preview',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama-3.1-8b-instant': {
|
||||
displayName: 'Llama 3.1 8B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama-3.1-8b-instant',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama3-8b-8192': {
|
||||
displayName: 'LLaMA3 8B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama3-8b-8192',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'llama3-70b-8192': {
|
||||
displayName: 'LLaMA3 70B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'llama3-70b-8192',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'mixtral-8x7b-32768': {
|
||||
displayName: 'Mixtral 8x7B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'mixtral-8x7b-32768',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
'gemma2-9b-it': {
|
||||
displayName: 'Gemma2 9B',
|
||||
model: new ChatOpenAI(
|
||||
{
|
||||
openAIApiKey: groqApiKey,
|
||||
modelName: 'gemma2-9b-it',
|
||||
temperature: 0.7,
|
||||
},
|
||||
{
|
||||
baseURL: 'https://api.groq.com/openai/v1',
|
||||
},
|
||||
),
|
||||
},
|
||||
};
|
||||
|
||||
return chatModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Groq models: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,49 +0,0 @@
|
||||
import { loadGroqChatModels } from './groq';
|
||||
import { loadOllamaChatModels, loadOllamaEmbeddingsModels } from './ollama';
|
||||
import { loadOpenAIChatModels, loadOpenAIEmbeddingsModels } from './openai';
|
||||
import { loadAnthropicChatModels } from './anthropic';
|
||||
import { loadTransformersEmbeddingsModels } from './transformers';
|
||||
import { loadGeminiChatModels, loadGeminiEmbeddingsModels } from './gemini';
|
||||
|
||||
const chatModelProviders = {
|
||||
openai: loadOpenAIChatModels,
|
||||
groq: loadGroqChatModels,
|
||||
ollama: loadOllamaChatModels,
|
||||
anthropic: loadAnthropicChatModels,
|
||||
gemini: loadGeminiChatModels,
|
||||
};
|
||||
|
||||
const embeddingModelProviders = {
|
||||
openai: loadOpenAIEmbeddingsModels,
|
||||
local: loadTransformersEmbeddingsModels,
|
||||
ollama: loadOllamaEmbeddingsModels,
|
||||
gemini: loadGeminiEmbeddingsModels,
|
||||
};
|
||||
|
||||
export const getAvailableChatModelProviders = async () => {
|
||||
const models = {};
|
||||
|
||||
for (const provider in chatModelProviders) {
|
||||
const providerModels = await chatModelProviders[provider]();
|
||||
if (Object.keys(providerModels).length > 0) {
|
||||
models[provider] = providerModels;
|
||||
}
|
||||
}
|
||||
|
||||
models['custom_openai'] = {};
|
||||
|
||||
return models;
|
||||
};
|
||||
|
||||
export const getAvailableEmbeddingModelProviders = async () => {
|
||||
const models = {};
|
||||
|
||||
for (const provider in embeddingModelProviders) {
|
||||
const providerModels = await embeddingModelProviders[provider]();
|
||||
if (Object.keys(providerModels).length > 0) {
|
||||
models[provider] = providerModels;
|
||||
}
|
||||
}
|
||||
|
||||
return models;
|
||||
};
|
@@ -1,74 +0,0 @@
|
||||
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
|
||||
import { getKeepAlive, getOllamaApiEndpoint } from '../../config';
|
||||
import logger from '../../utils/logger';
|
||||
import { ChatOllama } from '@langchain/community/chat_models/ollama';
|
||||
import axios from 'axios';
|
||||
|
||||
export const loadOllamaChatModels = async () => {
|
||||
const ollamaEndpoint = getOllamaApiEndpoint();
|
||||
const keepAlive = getKeepAlive();
|
||||
|
||||
if (!ollamaEndpoint) return {};
|
||||
|
||||
try {
|
||||
const response = await axios.get(`${ollamaEndpoint}/api/tags`, {
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
});
|
||||
|
||||
const { models: ollamaModels } = response.data;
|
||||
|
||||
const chatModels = ollamaModels.reduce((acc, model) => {
|
||||
acc[model.model] = {
|
||||
displayName: model.name,
|
||||
model: new ChatOllama({
|
||||
baseUrl: ollamaEndpoint,
|
||||
model: model.model,
|
||||
temperature: 0.7,
|
||||
keepAlive: keepAlive,
|
||||
}),
|
||||
};
|
||||
|
||||
return acc;
|
||||
}, {});
|
||||
|
||||
return chatModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Ollama models: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
export const loadOllamaEmbeddingsModels = async () => {
|
||||
const ollamaEndpoint = getOllamaApiEndpoint();
|
||||
|
||||
if (!ollamaEndpoint) return {};
|
||||
|
||||
try {
|
||||
const response = await axios.get(`${ollamaEndpoint}/api/tags`, {
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
});
|
||||
|
||||
const { models: ollamaModels } = response.data;
|
||||
|
||||
const embeddingsModels = ollamaModels.reduce((acc, model) => {
|
||||
acc[model.model] = {
|
||||
displayName: model.name,
|
||||
model: new OllamaEmbeddings({
|
||||
baseUrl: ollamaEndpoint,
|
||||
model: model.model,
|
||||
}),
|
||||
};
|
||||
|
||||
return acc;
|
||||
}, {});
|
||||
|
||||
return embeddingsModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Ollama embeddings model: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,89 +0,0 @@
|
||||
import { ChatOpenAI, OpenAIEmbeddings } from '@langchain/openai';
|
||||
import { getOpenaiApiKey } from '../../config';
|
||||
import logger from '../../utils/logger';
|
||||
|
||||
export const loadOpenAIChatModels = async () => {
|
||||
const openAIApiKey = getOpenaiApiKey();
|
||||
|
||||
if (!openAIApiKey) return {};
|
||||
|
||||
try {
|
||||
const chatModels = {
|
||||
'gpt-3.5-turbo': {
|
||||
displayName: 'GPT-3.5 Turbo',
|
||||
model: new ChatOpenAI({
|
||||
openAIApiKey,
|
||||
modelName: 'gpt-3.5-turbo',
|
||||
temperature: 0.7,
|
||||
}),
|
||||
},
|
||||
'gpt-4': {
|
||||
displayName: 'GPT-4',
|
||||
model: new ChatOpenAI({
|
||||
openAIApiKey,
|
||||
modelName: 'gpt-4',
|
||||
temperature: 0.7,
|
||||
}),
|
||||
},
|
||||
'gpt-4-turbo': {
|
||||
displayName: 'GPT-4 turbo',
|
||||
model: new ChatOpenAI({
|
||||
openAIApiKey,
|
||||
modelName: 'gpt-4-turbo',
|
||||
temperature: 0.7,
|
||||
}),
|
||||
},
|
||||
'gpt-4o': {
|
||||
displayName: 'GPT-4 omni',
|
||||
model: new ChatOpenAI({
|
||||
openAIApiKey,
|
||||
modelName: 'gpt-4o',
|
||||
temperature: 0.7,
|
||||
}),
|
||||
},
|
||||
'gpt-4o-mini': {
|
||||
displayName: 'GPT-4 omni mini',
|
||||
model: new ChatOpenAI({
|
||||
openAIApiKey,
|
||||
modelName: 'gpt-4o-mini',
|
||||
temperature: 0.7,
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return chatModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading OpenAI models: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
export const loadOpenAIEmbeddingsModels = async () => {
|
||||
const openAIApiKey = getOpenaiApiKey();
|
||||
|
||||
if (!openAIApiKey) return {};
|
||||
|
||||
try {
|
||||
const embeddingModels = {
|
||||
'text-embedding-3-small': {
|
||||
displayName: 'Text Embedding 3 Small',
|
||||
model: new OpenAIEmbeddings({
|
||||
openAIApiKey,
|
||||
modelName: 'text-embedding-3-small',
|
||||
}),
|
||||
},
|
||||
'text-embedding-3-large': {
|
||||
displayName: 'Text Embedding 3 Large',
|
||||
model: new OpenAIEmbeddings({
|
||||
openAIApiKey,
|
||||
modelName: 'text-embedding-3-large',
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return embeddingModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading OpenAI embeddings model: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,32 +0,0 @@
|
||||
import logger from '../../utils/logger';
|
||||
import { HuggingFaceTransformersEmbeddings } from '../huggingfaceTransformer';
|
||||
|
||||
export const loadTransformersEmbeddingsModels = async () => {
|
||||
try {
|
||||
const embeddingModels = {
|
||||
'xenova-bge-small-en-v1.5': {
|
||||
displayName: 'BGE Small',
|
||||
model: new HuggingFaceTransformersEmbeddings({
|
||||
modelName: 'Xenova/bge-small-en-v1.5',
|
||||
}),
|
||||
},
|
||||
'xenova-gte-small': {
|
||||
displayName: 'GTE Small',
|
||||
model: new HuggingFaceTransformersEmbeddings({
|
||||
modelName: 'Xenova/gte-small',
|
||||
}),
|
||||
},
|
||||
'xenova-bert-base-multilingual-uncased': {
|
||||
displayName: 'Bert Multilingual',
|
||||
model: new HuggingFaceTransformersEmbeddings({
|
||||
modelName: 'Xenova/bert-base-multilingual-uncased',
|
||||
}),
|
||||
},
|
||||
};
|
||||
|
||||
return embeddingModels;
|
||||
} catch (err) {
|
||||
logger.error(`Error loading Transformers embeddings model: ${err}`);
|
||||
return {};
|
||||
}
|
||||
};
|
@@ -1,54 +0,0 @@
|
||||
import axios from 'axios';
|
||||
import { config } from '../config';
|
||||
|
||||
interface SearchOptions {
|
||||
maxResults?: number;
|
||||
type?: 'general' | 'news';
|
||||
engines?: string[];
|
||||
}
|
||||
|
||||
interface SearchResult {
|
||||
url: string;
|
||||
title: string;
|
||||
content: string;
|
||||
score?: number;
|
||||
}
|
||||
|
||||
export async function searchWeb(
|
||||
query: string,
|
||||
options: SearchOptions = {}
|
||||
): Promise<SearchResult[]> {
|
||||
const {
|
||||
maxResults = 20,
|
||||
type = 'general',
|
||||
engines = ['google', 'bing', 'duckduckgo']
|
||||
} = options;
|
||||
|
||||
try {
|
||||
const response = await axios.get(`${config.search.searxngUrl || process.env.SEARXNG_URL}/search`, {
|
||||
params: {
|
||||
q: query,
|
||||
format: 'json',
|
||||
categories: type,
|
||||
engines: engines.join(','),
|
||||
limit: maxResults
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.data || !response.data.results) {
|
||||
console.error('Invalid response from SearxNG:', response.data);
|
||||
return [];
|
||||
}
|
||||
|
||||
return response.data.results.map((result: any) => ({
|
||||
url: result.url,
|
||||
title: result.title,
|
||||
content: result.content || result.snippet || '',
|
||||
score: result.score
|
||||
}));
|
||||
|
||||
} catch (error) {
|
||||
console.error('Search failed:', error);
|
||||
throw error;
|
||||
}
|
||||
}
|
@@ -1,313 +0,0 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { createWorker } from 'tesseract.js';
|
||||
import { env } from '../config/env';
|
||||
import { OllamaService } from './services/ollamaService';
|
||||
import { BusinessData } from './types';
|
||||
import { db } from './services/databaseService';
|
||||
import { generateBusinessId } from './utils';
|
||||
import { extractContactFromHtml, extractCleanAddress } from './utils/scraper';
|
||||
import { GeocodingService } from './services/geocodingService';
|
||||
import { cleanAddress, formatPhoneNumber, cleanEmail, cleanDescription } from './utils/dataCleanup';
|
||||
import { CleanupService } from './services/cleanupService';
|
||||
|
||||
// Define interfaces used only in this file
|
||||
interface SearchResult {
|
||||
url: string;
|
||||
title: string;
|
||||
content: string;
|
||||
phone?: string;
|
||||
email?: string;
|
||||
address?: string;
|
||||
website?: string;
|
||||
rating?: number;
|
||||
coordinates?: {
|
||||
lat: number;
|
||||
lng: number;
|
||||
};
|
||||
}
|
||||
|
||||
interface ContactInfo {
|
||||
phone?: string;
|
||||
email?: string;
|
||||
address?: string;
|
||||
description?: string;
|
||||
openingHours?: string[];
|
||||
}
|
||||
|
||||
// Export the main search function
|
||||
export async function searchBusinesses(
|
||||
query: string,
|
||||
options: { onProgress?: (status: string, progress: number) => void } = {}
|
||||
): Promise<BusinessData[]> {
|
||||
try {
|
||||
console.log('Processing search query:', query);
|
||||
const [searchTerm, location] = query.split(' in ').map(s => s.trim());
|
||||
if (!searchTerm || !location) {
|
||||
throw new Error('Invalid search query format. Use: "search term in location"');
|
||||
}
|
||||
|
||||
options.onProgress?.('Checking cache', 0);
|
||||
|
||||
// Check cache first
|
||||
const cacheKey = `search:${searchTerm}:${location}`;
|
||||
let results = await db.getFromCache(cacheKey);
|
||||
|
||||
if (!results) {
|
||||
// Check database for existing businesses
|
||||
console.log('Searching database for:', searchTerm, 'in', location);
|
||||
const existingBusinesses = await db.searchBusinesses(searchTerm, location);
|
||||
|
||||
// Start search immediately
|
||||
console.log('Starting web search');
|
||||
const searchPromise = performSearch(searchTerm, location, options);
|
||||
|
||||
if (existingBusinesses.length > 0) {
|
||||
console.log(`Found ${existingBusinesses.length} existing businesses`);
|
||||
options.onProgress?.('Retrieved from database', 50);
|
||||
}
|
||||
|
||||
// Wait for new results
|
||||
const newResults = await searchPromise;
|
||||
console.log(`Got ${newResults.length} new results from search`);
|
||||
|
||||
// Merge results, removing duplicates by ID
|
||||
const allResults = [...existingBusinesses];
|
||||
for (const result of newResults) {
|
||||
if (!allResults.some(b => b.id === result.id)) {
|
||||
allResults.push(result);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`Total unique results: ${allResults.length}`);
|
||||
|
||||
// Cache combined results
|
||||
await db.saveToCache(cacheKey, allResults, env.cache.durationHours * 60 * 60 * 1000);
|
||||
|
||||
console.log(`Returning ${allResults.length} total results (${existingBusinesses.length} existing + ${newResults.length} new)`);
|
||||
results = allResults;
|
||||
}
|
||||
|
||||
// Clean all results using LLM
|
||||
options.onProgress?.('Cleaning data', 75);
|
||||
const cleanedResults = await CleanupService.cleanBusinessRecords(results);
|
||||
|
||||
options.onProgress?.('Search complete', 100);
|
||||
return cleanedResults;
|
||||
} catch (error) {
|
||||
console.error('Search error:', error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
async function performSearch(
|
||||
searchTerm: string,
|
||||
location: string,
|
||||
options: any
|
||||
): Promise<BusinessData[]> {
|
||||
const queries = [
|
||||
searchTerm + ' ' + location,
|
||||
searchTerm + ' business near ' + location,
|
||||
searchTerm + ' services ' + location,
|
||||
'local ' + searchTerm + ' ' + location
|
||||
];
|
||||
|
||||
options.onProgress?.('Searching multiple sources', 25);
|
||||
|
||||
let allResults: SearchResult[] = [];
|
||||
const seenUrls = new Set<string>();
|
||||
|
||||
for (const q of queries) {
|
||||
try {
|
||||
const response = await axios.get(`${env.searxng.currentUrl}/search`, {
|
||||
params: {
|
||||
q,
|
||||
format: 'json',
|
||||
engines: 'google,google_maps',
|
||||
language: 'en-US',
|
||||
time_range: '',
|
||||
safesearch: 1
|
||||
}
|
||||
});
|
||||
|
||||
if (response.data?.results) {
|
||||
// Deduplicate results
|
||||
const newResults = response.data.results.filter((result: SearchResult) => {
|
||||
if (seenUrls.has(result.url)) {
|
||||
return false;
|
||||
}
|
||||
seenUrls.add(result.url);
|
||||
return true;
|
||||
});
|
||||
|
||||
console.log(`Found ${newResults.length} unique results from ${response.data.results[0]?.engine}`);
|
||||
allResults = allResults.concat(newResults);
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Search failed for query "${q}":`, error);
|
||||
}
|
||||
}
|
||||
|
||||
options.onProgress?.('Processing results', 50);
|
||||
|
||||
const filteredResults = allResults.filter(isValidBusinessResult);
|
||||
const processedResults = await processResults(filteredResults, location);
|
||||
|
||||
// Save results to database
|
||||
for (const result of processedResults) {
|
||||
await db.saveBusiness(result).catch(console.error);
|
||||
}
|
||||
|
||||
options.onProgress?.('Search complete', 100);
|
||||
return processedResults;
|
||||
}
|
||||
|
||||
// Add other necessary functions (isValidBusinessResult, processResults, etc.)
|
||||
function isValidBusinessResult(result: SearchResult): boolean {
|
||||
// Skip listing/directory pages and search results
|
||||
const skipPatterns = [
|
||||
'tripadvisor.com',
|
||||
'yelp.com',
|
||||
'opentable.com',
|
||||
'restaurants-for-sale',
|
||||
'guide.michelin.com',
|
||||
'denver.org',
|
||||
'/blog/',
|
||||
'/maps/',
|
||||
'search?',
|
||||
'features/',
|
||||
'/lists/',
|
||||
'reddit.com',
|
||||
'eater.com'
|
||||
];
|
||||
|
||||
if (skipPatterns.some(pattern => result.url.toLowerCase().includes(pattern))) {
|
||||
console.log(`Skipping listing page: ${result.url}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Must have a title
|
||||
if (!result.title || result.title.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skip results that look like articles or lists
|
||||
const articlePatterns = [
|
||||
'Best',
|
||||
'Top',
|
||||
'Guide',
|
||||
'Where to',
|
||||
'Welcome to',
|
||||
'Updated',
|
||||
'Near',
|
||||
'Restaurants in'
|
||||
];
|
||||
|
||||
if (articlePatterns.some(pattern => result.title.includes(pattern))) {
|
||||
console.log(`Skipping article: ${result.title}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Only accept results that look like actual business pages
|
||||
const businessPatterns = [
|
||||
'menu',
|
||||
'reservation',
|
||||
'location',
|
||||
'contact',
|
||||
'about-us',
|
||||
'home'
|
||||
];
|
||||
|
||||
const hasBusinessPattern = businessPatterns.some(pattern =>
|
||||
result.url.toLowerCase().includes(pattern) ||
|
||||
result.content.toLowerCase().includes(pattern)
|
||||
);
|
||||
|
||||
if (!hasBusinessPattern) {
|
||||
console.log(`Skipping non-business page: ${result.url}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
async function processResults(results: SearchResult[], location: string): Promise<BusinessData[]> {
|
||||
const processedResults: BusinessData[] = [];
|
||||
|
||||
// Get coordinates for the location
|
||||
const locationGeo = await GeocodingService.geocode(location);
|
||||
const defaultCoords = locationGeo || { lat: 39.7392, lng: -104.9903 };
|
||||
|
||||
for (const result of results) {
|
||||
try {
|
||||
// Extract contact info from webpage
|
||||
const contactInfo = await extractContactFromHtml(result.url);
|
||||
|
||||
// Create initial business record
|
||||
const business: BusinessData = {
|
||||
id: generateBusinessId(result),
|
||||
name: cleanBusinessName(result.title),
|
||||
phone: result.phone || contactInfo.phone || '',
|
||||
email: result.email || contactInfo.email || '',
|
||||
address: result.address || contactInfo.address || '',
|
||||
rating: result.rating || 0,
|
||||
website: result.website || result.url || '',
|
||||
logo: '',
|
||||
source: 'web',
|
||||
description: result.content || contactInfo.description || '',
|
||||
location: defaultCoords,
|
||||
openingHours: contactInfo.openingHours
|
||||
};
|
||||
|
||||
// Clean up the record using LLM
|
||||
const cleanedBusiness = await CleanupService.cleanBusinessRecord(business);
|
||||
|
||||
// Get coordinates for cleaned address
|
||||
if (cleanedBusiness.address) {
|
||||
const addressGeo = await GeocodingService.geocode(cleanedBusiness.address);
|
||||
if (addressGeo) {
|
||||
cleanedBusiness.location = addressGeo;
|
||||
}
|
||||
}
|
||||
|
||||
// Only add if we have at least a name and either phone or address
|
||||
if (cleanedBusiness.name && (cleanedBusiness.phone || cleanedBusiness.address)) {
|
||||
processedResults.push(cleanedBusiness);
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
console.error(`Error processing result ${result.title}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
return processedResults;
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
function cleanBusinessName(name: string): string {
|
||||
// Remove common suffixes and prefixes
|
||||
const cleanName = name
|
||||
.replace(/^(The|A|An)\s+/i, '')
|
||||
.replace(/\s+(-|–|—|:).*$/, '')
|
||||
.replace(/\s*\([^)]*\)/g, '')
|
||||
.trim();
|
||||
|
||||
return cleanName;
|
||||
}
|
||||
|
||||
async function getLocationCoordinates(address: string): Promise<{lat: number, lng: number}> {
|
||||
// Implement geocoding here
|
||||
// For now, return default coordinates for Denver
|
||||
return { lat: 39.7392, lng: -104.9903 };
|
||||
}
|
||||
|
||||
async function searchAndUpdateInBackground(searchTerm: string, location: string) {
|
||||
try {
|
||||
const results = await performSearch(searchTerm, location, {});
|
||||
console.log(`Updated ${results.length} businesses in background`);
|
||||
} catch (error) {
|
||||
console.error('Background search error:', error);
|
||||
}
|
||||
}
|
||||
|
||||
// ... rest of the file remains the same
|
@@ -1,111 +0,0 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
import { Cache } from '../utils/cache';
|
||||
import { RateLimiter } from '../utils/rateLimiter';
|
||||
|
||||
interface CrawlResult {
|
||||
mainContent: string;
|
||||
contactInfo: string;
|
||||
aboutInfo: string;
|
||||
structuredData: any;
|
||||
}
|
||||
|
||||
export class BusinessCrawler {
|
||||
private cache: Cache<CrawlResult>;
|
||||
private rateLimiter: RateLimiter;
|
||||
|
||||
constructor() {
|
||||
this.cache = new Cache<CrawlResult>(60); // 1 hour cache
|
||||
this.rateLimiter = new RateLimiter();
|
||||
}
|
||||
|
||||
async crawlBusinessSite(url: string): Promise<CrawlResult> {
|
||||
// Check cache first
|
||||
const cached = this.cache.get(url);
|
||||
if (cached) return cached;
|
||||
|
||||
await this.rateLimiter.waitForSlot();
|
||||
|
||||
try {
|
||||
const mainPage = await this.fetchPage(url);
|
||||
const $ = cheerio.load(mainPage);
|
||||
|
||||
// Get all important URLs
|
||||
const contactUrl = this.findContactPage($, url);
|
||||
const aboutUrl = this.findAboutPage($, url);
|
||||
|
||||
// Crawl additional pages
|
||||
const [contactPage, aboutPage] = await Promise.all([
|
||||
contactUrl ? this.fetchPage(contactUrl) : '',
|
||||
aboutUrl ? this.fetchPage(aboutUrl) : ''
|
||||
]);
|
||||
|
||||
// Extract structured data
|
||||
const structuredData = this.extractStructuredData($);
|
||||
|
||||
const result = {
|
||||
mainContent: $('body').text(),
|
||||
contactInfo: contactPage,
|
||||
aboutInfo: aboutPage,
|
||||
structuredData
|
||||
};
|
||||
|
||||
this.cache.set(url, result);
|
||||
return result;
|
||||
} catch (error) {
|
||||
console.error(`Failed to crawl ${url}:`, error);
|
||||
return {
|
||||
mainContent: '',
|
||||
contactInfo: '',
|
||||
aboutInfo: '',
|
||||
structuredData: {}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
private async fetchPage(url: string): Promise<string> {
|
||||
try {
|
||||
const response = await axios.get(url, {
|
||||
timeout: 10000,
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; BizSearch/1.0; +http://localhost:3000/about)',
|
||||
}
|
||||
});
|
||||
return response.data;
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch ${url}:`, error);
|
||||
return '';
|
||||
}
|
||||
}
|
||||
|
||||
private findContactPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
|
||||
const contactLinks = $('a[href*="contact"], a:contains("Contact")');
|
||||
if (contactLinks.length > 0) {
|
||||
const href = contactLinks.first().attr('href');
|
||||
return href ? new URL(href, baseUrl).toString() : null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private findAboutPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
|
||||
const aboutLinks = $('a[href*="about"], a:contains("About")');
|
||||
if (aboutLinks.length > 0) {
|
||||
const href = aboutLinks.first().attr('href');
|
||||
return href ? new URL(href, baseUrl).toString() : null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private extractStructuredData($: cheerio.CheerioAPI): any {
|
||||
const structuredData: any[] = [];
|
||||
$('script[type="application/ld+json"]').each((_, element) => {
|
||||
try {
|
||||
const data = JSON.parse($(element).html() || '{}');
|
||||
structuredData.push(data);
|
||||
} catch (error) {
|
||||
console.error('Failed to parse structured data:', error);
|
||||
}
|
||||
});
|
||||
return structuredData;
|
||||
}
|
||||
}
|