Compare commits

..

274 Commits

Author SHA1 Message Date
Eli Grinfeld, MBA
7f287c45a6 Merge 9f4ae1baac into 46541e6c0c 2025-02-03 09:31:06 +01:00
ItzCrazyKns
46541e6c0c feat(package): update markdown-to-jsx version 2025-02-02 14:31:18 +05:30
ItzCrazyKns
f37686189e feat(output-parsers): add empty check 2025-01-31 17:51:16 +05:30
ItzCrazyKns
0737701de0 Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2025-01-11 13:11:18 +05:30
ItzCrazyKns
5c787bbb55 feat(app): lint & beautify 2025-01-11 13:10:23 +05:30
ItzCrazyKns
2dc60d06e3 feat(chat-window): show settings during error on mobile 2025-01-11 13:10:10 +05:30
ItzCrazyKns
ec90ea1686 Merge pull request #531 from hacking-racoon/feat/video-slide-stop
feat(SearchVideos): modify Lightbox to pause the prev video when sliding
2025-01-07 12:47:38 +05:30
ItzCrazyKns
01230bf1c5 Merge pull request #555 from realies/fix/ws-reconnect
fix(ws-error): add exponential reconnect mechanism
2025-01-07 12:32:06 +05:30
ItzCrazyKns
6d9d712790 feat(chat-window): correctly handle server side WS closure 2025-01-07 12:26:38 +05:30
ItzCrazyKns
99cae076a7 feat(chat-window): display toast when retried 2025-01-07 11:49:40 +05:30
ItzCrazyKns
b7f7d25f54 feat(chat-window): lint & beautify 2025-01-07 11:44:19 +05:30
ItzCrazyKns
0ec54fe6c0 feat(chat-window): remove toast 2025-01-07 11:43:54 +05:30
eligrinfeld
9f4ae1baac feat: update backend services and routes
- Add business routes and middleware\n- Update search and database services\n- Improve health check implementation\n- Update CI workflow configuration
2025-01-06 21:25:15 -07:00
eligrinfeld
79f26fce25 feat: add frontend setup with Tailwind CSS 2025-01-06 21:25:03 -07:00
eligrinfeld
7fa0e9dd9d feat: update database schema and migrations 2025-01-06 21:24:54 -07:00
eligrinfeld
765c8e549c chore: update dependencies and lock files 2025-01-06 21:24:45 -07:00
eligrinfeld
2ac1cb3943 refactor: improve server initialization and port handling
- Separate server setup from initialization\n- Add port availability check utility\n- Fix double server start issue\n- Improve error handling for port conflicts
2025-01-06 21:24:30 -07:00
eligrinfeld
ce97671da3 test: add CI/CD workflow 2025-01-05 14:16:31 -07:00
realies
5526d5f60f fix(ws-error): add exponential reconnect mechanism 2025-01-05 17:29:53 +00:00
ItzCrazyKns
0f6b3c2e69 Merge branch 'pr/538' 2025-01-05 14:15:58 +05:30
eligrinfeld
66d44c0774 feat(cleanup): Enhanced business data validation and cleaning
- Added confidence scoring system (0-1) for data quality
- Implemented strict validation for contact info
- Added batch processing and timeout protection
- Improved error handling with fallbacks
- Added smart caching based on confidence scores

Technical changes:
- Added regex validation for emails, phones, addresses
- Implemented business type detection
- Enhanced post-processing for consistent formatting
- Added JSDoc comments for maintainability

Testing:
- Verified with restaurant and plumber searches
- Confirmed improved data quality
- Validated timeout handling
2025-01-04 21:00:55 -07:00
eligrinfeld
6bcee39e63 feat(cleanup): Enhanced business data validation and cleaning
- Added confidence scoring system for data quality
- Implemented strict validation for emails, phones, and addresses
- Added batch processing to prevent LLM overload
- Improved error handling and fallback mechanisms
- Added caching based on confidence scores

Technical changes:
- Added regex validation for contact info
- Implemented scoring system (0-1 scale)
- Added timeout protection for LLM calls
- Enhanced post-processing for consistent formatting
- Added business type detection for context

Breaking changes: None
Dependencies: No new dependencies required
2025-01-04 20:59:00 -07:00
eligrinfeld
fde5b5e318 Add project files:
- Add database initialization scripts
- Add configuration files
- Add documentation
- Add public assets
- Add source code structure
- Update README
2025-01-04 17:22:46 -07:00
eligrinfeld
372943801d Refactor business search functionality:
- Add utility functions for business ID generation
- Improve database service with proper types
- Fix type safety issues in search implementation
- Add caching layer for search results
2025-01-04 17:16:22 -07:00
Sainadh Devireddy
5a648f34b8 Set pageContent correctly 2025-01-04 10:36:33 -08:00
Sainadh Devireddy
d18e88acc9 Delete msgs only belonging to the chat 2024-12-27 20:55:55 -08:00
ItzCrazyKns
409c811a42 feat(ollama): use axios instead of fetch 2024-12-26 19:02:20 +05:30
ItzCrazyKns
b5acf34ef8 feat(chat-window): fix bugs handling custom openai, closes #529 2024-12-26 18:59:57 +05:30
hacking-racoon
d30f714930 feat(SearchVideos): Modify Lightbox to pause the prev video when moving to next one, preventing interference with new video. 2024-12-25 15:19:23 +09:00
ItzCrazyKns
ee68095157 Merge pull request #523 from bart-jaskulski/groq-models
Update available models from Groq provider
2024-12-21 18:08:40 +05:30
Bart Jaskulski
960e34aa3d Add Llama 3.3 model from Groq
Signed-off-by: Bart Jaskulski <bjaskulski@protonmail.com>
2024-12-19 08:07:36 +01:00
Bart Jaskulski
4cb38148b3 Remove deprecated Groq models
Signed-off-by: Bart Jaskulski <bjaskulski@protonmail.com>
2024-12-19 08:07:14 +01:00
ItzCrazyKns
c755f98230 Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2024-12-18 19:42:28 +05:30
ItzCrazyKns
c3a231a528 feat(readme): add discord server 2024-12-16 20:59:21 +05:30
ItzCrazyKns
f30a61c4aa feat(metaSearchAgent): handle undefined content for YT. search 2024-12-16 18:24:01 +05:30
ItzCrazyKns
ea74e3013c Merge pull request #519 from yslinear/hotfix
feat(anthropic): update chat models to include Claude 3.5 Haiku and new version for Sonnet
2024-12-15 21:32:49 +05:30
Ying-Shan Lin
1c3c689039 feat(anthropic): update chat models to include Claude 3.5 Haiku and new version for Sonnet 2024-12-13 17:24:15 +08:00
ItzCrazyKns
2c5ca94b3c feat(app): lint and beautify 2024-12-05 20:19:52 +05:30
ItzCrazyKns
db7407bfac feat(messageBox): style markdown 2024-12-05 20:19:41 +05:30
ItzCrazyKns
5b3e8a3214 feat(prompts): implement new prompt 2024-12-05 20:19:22 +05:30
ItzCrazyKns
d79d854e2d Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2024-12-02 21:08:06 +05:30
ItzCrazyKns
8cb74f1964 feat(contribution): update guidelines 2024-12-02 21:07:59 +05:30
ItzCrazyKns
f88912784b Merge pull request #466 from timoa/fix/docs-markdown-lint
📚 chore(docs): fix Markdown lint issues in the docs
2024-12-01 21:05:23 +05:30
ItzCrazyKns
e08d864445 feat(focus): only icon on small devices 2024-11-30 20:58:11 +05:30
ItzCrazyKns
e4a0799503 feat(package): bump version 2024-11-29 18:37:02 +05:30
ItzCrazyKns
fdb3d09d12 Merge branch 'feat/single-search' 2024-11-29 18:07:33 +05:30
ItzCrazyKns
dc4a843d8a feat(agents): switch to MetaSearchAgent 2024-11-29 18:06:00 +05:30
ItzCrazyKns
92f66266b0 feat(agents): add a unified agent 2024-11-29 18:05:28 +05:30
ItzCrazyKns
177746235a feat(providers): add gemini 2024-11-28 20:47:18 +05:30
ItzCrazyKns
ecad065577 feat(searchAgent): handle empty fileIds 2024-11-27 15:13:46 +05:30
ItzCrazyKns
64ee19c70a feat(messageHandler): switch to webSearch mode if files 2024-11-25 12:34:37 +05:30
ItzCrazyKns
be745501aa feat(package): bump version 2024-11-25 12:23:23 +05:30
ItzCrazyKns
aa176c12f6 Merge pull request #484 from ItzCrazyKns/feat/uploads
Add file uploads
2024-11-24 20:29:46 +05:30
ItzCrazyKns
4b89008f3a feat(app): add file uploads 2024-11-23 15:04:19 +05:30
ItzCrazyKns
c650d1c3d9 feat(ollama): add keep_alive param 2024-11-20 19:11:47 +05:30
ItzCrazyKns
874505cd0e feat(package): bump version 2024-11-19 16:32:30 +05:30
ItzCrazyKns
b4a80d8ca0 feat(dockerfile): downgrade node version, closes #473 2024-11-19 14:40:24 +05:30
ItzCrazyKns
c7bab91803 feat(webSearchAgent): prevent excess results 2024-11-19 10:43:50 +05:30
ItzCrazyKns
a58adbfecc Update README.md 2024-11-17 23:01:24 +05:30
ItzCrazyKns
9e746aea5e feat(readme): remove ? from image URL 2024-11-17 23:01:02 +05:30
ItzCrazyKns
5e1331144a feat(readme): update readme cache 2024-11-17 22:59:29 +05:30
ItzCrazyKns
d789c970b1 feat(assets): update screenshot 2024-11-17 22:55:57 +05:30
ItzCrazyKns
e699cb2921 Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2024-11-17 19:49:22 +05:30
ItzCrazyKns
03eed9693b Merge branch 'pr/451' 2024-11-17 19:48:56 +05:30
ItzCrazyKns
011570dd9b Merge pull request #421 from sjiampojamarn/discover-nit
Make Discover link to a new tab
2024-11-17 19:40:05 +05:30
Damien Laureaux
f3e918c3e3 chore(docs): fix Markdown lint issues in the docs 2024-11-15 07:04:45 +01:00
ItzCrazyKns
18529391f4 Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2024-11-14 13:35:15 +05:30
ItzCrazyKns
a1a7470ca6 feat(package): update markdown-to-jsx 2024-11-14 13:35:10 +05:30
ItzCrazyKns
10c5ac1076 Merge pull request #448 from bastipnt/master
add db setup to CONTRIBUTING.md
2024-11-09 20:54:14 +05:30
Sharun
7c01d2656e fix(EmptyChatMessageInput): focus on mount 2024-11-04 22:00:08 -06:00
litc0de
afb4786ac0 add db setup to CONTRIBUTING.md 2024-11-03 10:33:01 +01:00
ItzCrazyKns
1e99fe8d69 feat(package): bump version 2024-10-31 11:08:49 +05:30
ItzCrazyKns
012dfa5a74 feat(listLineOutputParser): handle unclosed tags 2024-10-30 10:29:21 +05:30
ItzCrazyKns
65d057a05e feat(suggestions): handle custom OpenAI 2024-10-30 10:29:06 +05:30
ItzCrazyKns
3e7645614f feat(image-search): handle custom OpenAI 2024-10-30 10:28:40 +05:30
ItzCrazyKns
7c6ee2ead1 feat(video-search): handle custom OpenAI 2024-10-30 10:28:31 +05:30
ItzCrazyKns
540f38ae68 feat(empty-chat): add settings for mobile 2024-10-30 09:14:09 +05:30
ItzCrazyKns
f1c0b5435b feat(delete-chat): use window.location to refresh page 2024-10-30 09:11:48 +05:30
ItzCrazyKns
b33e5fefba feat(navbar): remove comments 2024-10-29 20:00:31 +05:30
ItzCrazyKns
03d0ff2ca4 feat(navbar): make delete & plus button work 2024-10-29 19:59:58 +05:30
sjiampojamarn
687cbb365f Discover link to new page 2024-10-20 17:23:43 -07:00
ItzCrazyKns
dfb532e4d3 feat(package): bump version 2024-10-18 18:45:23 +05:30
ItzCrazyKns
c8cd959496 feat(dockerfile): update backend image 2024-10-18 17:29:26 +05:30
ItzCrazyKns
4576d3de13 feat(dockerfile): update docker image 2024-10-18 17:26:02 +05:30
ItzCrazyKns
8057f28b20 feat(settings): handle no models 2024-10-18 17:07:09 +05:30
ItzCrazyKns
36bb265e1f feat(dockerfile): revert base image 2024-10-18 12:27:56 +05:30
ItzCrazyKns
71fc19f525 feat(dockerfile): update registry 2024-10-18 12:24:55 +05:30
ItzCrazyKns
c7c0ebe5b6 feat(dockerfile): use NPM registry 2024-10-18 12:15:04 +05:30
ItzCrazyKns
8fe1b7c5e3 feat(webSearchAgent): revert prompt 2024-10-18 12:01:56 +05:30
ItzCrazyKns
6e0d3baef6 feat(dockerfile): update docker image 2024-10-18 11:50:56 +05:30
ItzCrazyKns
54e0bb317a feat(groq): update deprecated models 2024-10-18 11:05:57 +05:30
ItzCrazyKns
3e6e57dab0 feat(chat-window): fix rewrite, use messageID 2024-10-17 18:51:11 +05:30
ItzCrazyKns
5aad2febda feat(messageHandler): fix duplicate messageIDs 2024-10-17 18:50:43 +05:30
ItzCrazyKns
24e1919c5e feat(dockerfile): update image to prevent python errors 2024-10-17 10:46:18 +05:30
ItzCrazyKns
c7abd96b05 feat(readme): add networking 2024-10-17 10:01:00 +05:30
ItzCrazyKns
3a01eebc04 feat(chat): prevent ws not open errors 2024-10-15 18:04:50 +05:30
ItzCrazyKns
7532c436db feat(package): bump version 2024-10-15 16:23:13 +05:30
ItzCrazyKns
b9509a5d41 feat(app): lint & beautify 2024-10-15 16:21:29 +05:30
ItzCrazyKns
9db847c366 feat(library): enhance UI 2024-10-15 16:21:15 +05:30
ItzCrazyKns
19bf71cefc feat(chat-window): only send init msg if ready 2024-10-15 16:21:00 +05:30
ItzCrazyKns
61c0347ef2 feat(app): add discover 2024-10-15 16:20:45 +05:30
ItzCrazyKns
0a7167eb04 feat(search-api): add optimizationMode 2024-10-11 10:54:08 +05:30
ItzCrazyKns
7cce853618 feat(providers): add optimization modes 2024-10-11 10:35:59 +05:30
ItzCrazyKns
877735b852 feat(package): update headlessui 2024-10-11 10:35:33 +05:30
ItzCrazyKns
1680a1786e feat(image-build): improve build time by caching 2024-10-03 10:41:05 +05:30
ItzCrazyKns
66f1e19ce8 feat(image-build): use Docker buildx, publish multi arch images 2024-10-03 09:37:15 +05:30
ItzCrazyKns
ae3fc5f802 feat(docs): modify updating docs 2024-10-02 22:54:16 +05:30
ItzCrazyKns
9f88d16ef1 feat(docker-compose): use env vars from compose 2024-10-02 22:54:00 +05:30
ItzCrazyKns
c233362e70 feat(dockerfile): specify default args 2024-10-02 22:53:45 +05:30
ItzCrazyKns
1aaf172246 feat(build-workflow): update head 2024-10-02 22:01:49 +05:30
ItzCrazyKns
4bba674134 feat(build-workflow): update branch 2024-10-02 22:00:46 +05:30
ItzCrazyKns
dcfe43ebda trigger build 2024-10-02 22:00:04 +05:30
ItzCrazyKns
fc5e35b1b1 feat(docker): add prebuilt images 2024-10-02 21:59:40 +05:30
ItzCrazyKns
425a08432b feat(groq): add Llama 3.2 2024-09-26 21:37:05 +05:30
ItzCrazyKns
e3488366c1 Update SEARCH.md 2024-09-25 17:56:19 +05:30
ItzCrazyKns
8902abdcee Update SEARCH.md 2024-09-25 17:54:35 +05:30
ItzCrazyKns
15203c123d feat(docs): update search docs 2024-09-25 17:49:16 +05:30
ItzCrazyKns
a0aad69f62 feat(readme): update readme 2024-09-25 16:56:41 +05:30
ItzCrazyKns
1cfa3398a3 feat(package): bump version 2024-09-25 16:54:44 +05:30
ItzCrazyKns
ead2d98a9f feat(search): update types 2024-09-25 16:54:19 +05:30
ItzCrazyKns
c52d6ac290 feat(docs): add search API docs 2024-09-25 16:54:07 +05:30
ItzCrazyKns
2785cdd97a feat(routes): add search route 2024-09-25 15:27:48 +05:30
ItzCrazyKns
1589f16d5a feat(providers): add displayName property 2024-09-24 22:34:43 +05:30
ItzCrazyKns
40f551c426 feat(search-button): add empty check 2024-09-15 10:16:20 +05:30
ItzCrazyKns
1fcd64ad42 feat(docker-file): use SearXNG URL from env 2024-09-05 18:40:07 +05:30
ItzCrazyKns
07e5615860 feat(docker-compose): link config.toml as vol. 2024-09-04 18:54:54 +05:30
ItzCrazyKns
c4f52adb45 feat(textarea): handle "/" keys 2024-09-02 11:44:40 +05:30
ItzCrazyKns
92abbc5b98 feat(webSearchRetriever): use question instead of input 2024-08-29 16:54:37 +05:30
ItzCrazyKns
c952469f08 feat(chaWindow): lint & beautify 2024-08-29 16:51:59 +05:30
ItzCrazyKns
449684c419 feat(webSearchAgent): update retriever prompt & change temp 2024-08-29 16:51:42 +05:30
ItzCrazyKns
f620252406 feat(linkDocument): add error handling 2024-08-29 16:51:12 +05:30
ItzCrazyKns
e8ed4df31a feat(chat-window): close socket on unmount 2024-08-28 14:27:22 +05:30
ItzCrazyKns
2873093fee feat(package): bump version 2024-08-28 10:00:05 +05:30
ItzCrazyKns
806c47e705 feat(chatwindow): fix infinite loading 2024-08-28 09:53:06 +05:30
ItzCrazyKns
ff34d1043f feat(app): lint & format 2024-08-25 15:08:47 +05:30
ItzCrazyKns
c521b032a7 feat(agents): fix unresloved types 2024-08-25 15:08:30 +05:30
ItzCrazyKns
6b8f7dc32c Merge branch 'pr/309' 2024-08-25 12:03:54 +05:30
ItzCrazyKns
8bb3e4f016 feat(agents): update types 2024-08-25 12:03:32 +05:30
ItzCrazyKns
51939ff842 feat(webSearchAgent): fix typo, closes #313 2024-08-24 21:48:27 +05:30
Xie Yanbo
e4faa82362 Fix #307, update outdated searxng/settings.yml 2024-08-09 20:53:53 +08:00
ItzCrazyKns
9c1936ec2c feat(chat-window): lint & beautify 2024-08-04 18:14:46 +05:30
ItzCrazyKns
c4932c659a feat(app): lint 2024-07-31 20:17:57 +05:30
ItzCrazyKns
96f67c7028 Merge pull request #290 from ItzCrazyKns/canary 2024-07-30 10:15:52 +05:30
ItzCrazyKns
61dfeb89b4 feat(package): bump version 2024-07-30 10:10:55 +05:30
ItzCrazyKns
8e4f0c6a6d feat(web-search): add URL & PDF searching capibilities 2024-07-30 10:09:05 +05:30
ItzCrazyKns
6f50e25bf3 feat(output-parsers): add line output parser 2024-07-30 10:08:29 +05:30
ItzCrazyKns
9abb4b654d feat(app): handle unhandled exception & rejection 2024-07-30 10:07:28 +05:30
ItzCrazyKns
0a29237732 feat(listLineOutputParser): handle invalid keys 2024-07-30 10:06:52 +05:30
ItzCrazyKns
c62e7f091e feat(package): bump version 2024-07-25 20:39:43 +05:30
ItzCrazyKns
08379fcad5 feat(ws-connector): fix undefined chat model 2024-07-25 20:36:26 +05:30
ItzCrazyKns
cbce39a5dd feat(settings): fix undefined model for custom OpenAI 2024-07-25 20:34:49 +05:30
ItzCrazyKns
27f8cfd212 feat(toast): fix theme colors 2024-07-25 20:33:56 +05:30
ItzCrazyKns
8a76f92e23 feat(groq): add Llama 3.1 2024-07-23 20:49:17 +05:30
ItzCrazyKns
00a52fc3b1 Delete .github/FUNDING.yml 2024-07-23 10:46:32 +05:30
ItzCrazyKns
8143eca2c1 feat(readme): remove patreon 2024-07-23 10:45:52 +05:30
ItzCrazyKns
9bb0b64044 Merge pull request #279 from zandko/perf/filter-first
perf: Optimize document filtering and sorting for performance
2024-07-23 10:08:54 +05:30
Zan
323f3c516c perf: Optimize document filtering and sorting for performance 2024-07-23 10:06:33 +08:00
ItzCrazyKns
c0b3a409dd feat(package): bump version 2024-07-20 09:27:34 +05:30
ItzCrazyKns
9195cbcce0 feat(openai): add GPT-4 Omni mini 2024-07-20 09:26:46 +05:30
ItzCrazyKns
f02393dbe9 feat(providers): add anthropic 2024-07-15 21:20:16 +05:30
ItzCrazyKns
e1732b9bf2 feat(chat-window): fix WS connection errors 2024-07-14 12:37:36 +05:30
sjiampojamarn
fac41d3812 add gemma2-9b-it 2024-07-13 20:20:23 -07:00
ItzCrazyKns
27e6f5b9e1 feat(chat-window): unselect unavailable model 2024-07-09 16:21:45 +05:30
ItzCrazyKns
8539ce82ad feat(providers): fix loading issues 2024-07-08 15:39:27 +05:30
ItzCrazyKns
3b4b8a8b02 feat(providers): add custom_openai 2024-07-08 15:24:45 +05:30
ItzCrazyKns
3ffb20b777 feat(backend): fix type errors 2024-07-08 01:31:11 +05:30
ItzCrazyKns
f4b58c7157 feat(dockerfile): revert base image back to slim 2024-07-06 15:13:05 +05:30
ItzCrazyKns
2678c36e44 feat(agents): fix grammar in prompt, closes 239 & 203 2024-07-06 15:12:51 +05:30
ItzCrazyKns
25b5dbd63e feat(providers): separate each provider 2024-07-06 14:19:33 +05:30
ItzCrazyKns
c63c9b5c8a feat(readme): update ollama guide 2024-07-03 21:02:21 +05:30
ItzCrazyKns
80818983d8 feat(package): bump version 2024-07-03 20:49:13 +05:30
ItzCrazyKns
5217d21366 feat(dockerfile): revert to node:slim 2024-07-03 20:47:31 +05:30
ItzCrazyKns
57ede99b83 Merge branch 'master' of https://github.com/ItzCrazyKns/Perplexica 2024-07-02 10:52:02 +05:30
ItzCrazyKns
c74e16e01c feat(chats): add delete functionality 2024-07-02 10:51:47 +05:30
ItzCrazyKns
ce593daab9 Update README.md 2024-06-30 12:39:37 +05:30
ItzCrazyKns
fcf9b644af Create FUNDING.yml 2024-06-30 12:34:32 +05:30
ItzCrazyKns
6ae825999a feat(readme): update manual install 2024-06-30 10:45:35 +05:30
ItzCrazyKns
b291265944 feat(package): add @langchain/community 2024-06-30 10:42:01 +05:30
ItzCrazyKns
c62684407d feat(chat-window): handle notFound errors 2024-06-29 12:11:34 +05:30
ItzCrazyKns
f4b01a29bb feat(docs): update docs 2024-06-29 11:39:23 +05:30
ItzCrazyKns
022cf55db7 feat(docs): add update docs 2024-06-29 11:38:43 +05:30
ItzCrazyKns
aeef03fbaf feat(readme): update todo 2024-06-29 11:17:43 +05:30
ItzCrazyKns
9588eed710 feat(package): bump version 2024-06-29 11:17:29 +05:30
ItzCrazyKns
7d2344dc85 feat(chats): remove comment 2024-06-29 11:11:10 +05:30
ItzCrazyKns
799f4d6aee feat(docker-compose): implement data volume 2024-06-29 11:10:26 +05:30
ItzCrazyKns
c51ec8ff0f feat(app): implement library feature 2024-06-29 11:09:51 +05:30
ItzCrazyKns
61044715e9 feat(msg-handler): update message types 2024-06-29 11:09:31 +05:30
ItzCrazyKns
d806c7e581 feat(app): add chats route 2024-06-29 11:09:13 +05:30
ItzCrazyKns
93b90dc1c4 feat(db): create schema & config files 2024-06-29 11:08:11 +05:30
ItzCrazyKns
7879167b13 feat(package): add better-sqlite3 2024-06-29 11:07:52 +05:30
ItzCrazyKns
f7d1364f30 feat(discover): remove unadded page 2024-06-28 09:34:40 +05:30
ItzCrazyKns
91bba8eaca feat(utils): accept string in time difference 2024-06-28 09:34:03 +05:30
ItzCrazyKns
4545ff1d7d feat(chat-window): adjust color & size 2024-06-25 16:11:39 +05:30
asifrahaman13
a152e58132 🎉 wip: implemented error state for backend socket connection and othe 2024-06-25 15:43:36 +05:30
ItzCrazyKns
9d827d4cc2 feat(package): update WS module 2024-06-24 21:34:14 +05:30
ItzCrazyKns
336ceefe2b feat(readme): update connection error docs 2024-06-23 14:36:15 +05:30
ItzCrazyKns
9a96fd4788 feat(message-input): focus on / key 2024-06-23 10:46:22 +05:30
ItzCrazyKns
87cc86d406 feat(package): bump version 2024-06-23 09:55:25 +05:30
ItzCrazyKns
5fd64ef6e6 Merge pull request #168 from WanQuanXie/fix-ui-compile-type-error
fix(ui): ui compile fail
2024-06-23 09:42:07 +05:30
WanQuanXie
594106aea3 update(ui): remove useless imports 2024-06-07 16:39:14 +08:00
WanQuanXie
2ae5846b3d fix(ui): ui compile fail
remove both of them, a new feature is coming soon -  mobile device support setting navbar
2024-06-03 18:54:12 +08:00
ItzCrazyKns
476303f52b feat(package): bump version 2024-06-02 14:20:23 +05:30
ItzCrazyKns
21b315d14b Merge pull request #135 from WanQuanXie/light-mode
Adapt light mode
2024-06-02 12:23:10 +05:30
ItzCrazyKns
7c676479d4 feat(theme-switcher): move to settings menu 2024-06-02 12:19:53 +05:30
ItzCrazyKns
8e18c32e23 Merge branch 'pr/137' 2024-06-01 10:52:34 +05:30
ItzCrazyKns
5f6e61d7a0 feat(docker-compose): remove extra hosts from frontend 2024-06-01 10:51:56 +05:30
ItzCrazyKns
32cc430b1b feat(chat-window): use light theme for spinner 2024-05-31 11:08:32 +05:30
ItzCrazyKns
cf0abbb9d2 feat(message-actions): move to separate components 2024-05-31 11:02:37 +05:30
ItzCrazyKns
dcbcab3122 feat(theme-components): use default exports 2024-05-31 11:02:00 +05:30
ItzCrazyKns
90f9edea95 feat(components): use arrow function 2024-05-30 21:38:37 +05:30
ItzCrazyKns
6fb0c5b362 Merge pull request #153 from aiyogg/master
feat(docker-compose): update docker-compose.yaml with restart policy
2024-05-30 16:02:09 +05:30
Chuck
f4628ae52d feat(docker-compose): update docker-compose.yaml with restart policy 2024-05-30 18:12:22 +08:00
WanQuanXie
9e7e1d76a2 update(ui): correct SearchVideo and SearchImages plus action button hover background color 2024-05-29 14:44:25 +08:00
WanQuanXie
9a36e48de5 fix(ui): correct the dom elements' position 2024-05-29 14:31:42 +08:00
WanQuanXie
cfab91ddbf update(ui): restore both message input field dark mode background color 2024-05-29 12:22:29 +08:00
WanQuanXie
2d9ca3835e update(SettingDialog): restore SettingDialog form input and select field dark mode background color 2024-05-29 12:10:24 +08:00
WanQuanXie
f061345c74 fix(MessageBox): multi line related item text will turn the plus icon small 2024-05-28 12:48:08 +08:00
WanQuanXie
5fe08b5ec8 update(MessageBox): parsed markdown message render style fix 2024-05-28 12:45:19 +08:00
WanQuanXie
6a2f4b8ebf update(EmptyChat): EmptyChat theme switcher hide on lg screen 2024-05-28 11:29:04 +08:00
WanQuanXie
4eadc0c797 feat(EmptyChat): EmptyChat page add theme switcher 2024-05-28 11:25:31 +08:00
WanQuanXie
743b67d0e9 update(MessageSources): tune the source panel and inner block background color and border color 2024-05-28 11:11:45 +08:00
WanQuanXie
c8a16a622e update(ui): remove light-300 color level 2024-05-28 10:55:52 +08:00
WanQuanXie
cae05bcf5e update(ui): input action panel background adapt to light mode 2024-05-28 10:50:54 +08:00
WanQuanXie
710b72d053 feat(ui): theme switcher show in responsive mode 2024-05-28 10:48:58 +08:00
WanQuanXie
af9862c019 update(ui): sidebar in mobile screen adapt light mode 2024-05-28 10:26:24 +08:00
WanQuanXie
984b80b5ec fix(ui): restore some hover style in dark mode 2024-05-28 10:15:42 +08:00
WanQuanXie
cb65f67140 update(MessageInput): weaken button border color and background color in light mode 2024-05-28 08:03:49 +08:00
WanQuanXie
62c7f535db update(MessageSources): source block's mark point adapt light mode
which is before the number in bottom-right corner
2024-05-28 07:57:59 +08:00
WanQuanXie
943458440c update(MessageSources): weaken sources Dialog panel and inner block border color 2024-05-28 07:50:35 +08:00
WanQuanXie
d28cfa3319 fix(MessageBox): <code/> type message text-color adapt light mode 2024-05-28 07:47:45 +08:00
WanQuanXie
b37a6e1560 fix(MessageInputActions): focus mode action hover style align before 2024-05-28 07:36:20 +08:00
WanQuanXie
0a2934935e update(ui): change light mode color 2024-05-28 07:30:28 +08:00
WanQuanXie
a5978d544c update(ui): re-manage theme config 2024-05-27 11:49:09 +08:00
WanQuanXie
d46a844df8 update(ui): realign dark mode style with before 2024-05-27 10:42:40 +08:00
WanQuanXie
c97a434723 fix(ui): hover style class uses 2024-05-25 07:26:51 +08:00
Devin Stokes
382fa295e5 fix: add extra_hosts to docker-compose.yaml to allow connection to ollama 2024-05-24 08:19:15 -07:00
WanQuanXie
90f68ab214 update(SearchVideos): video cover label style adapt light mode 2024-05-24 22:41:06 +08:00
WanQuanXie
89c30530bc update(Navbar): update Navbar light mode background 2024-05-24 22:08:47 +08:00
WanQuanXie
776d389c1e refactor(SettingDialog): extract reduplicate code to common component
DO NOT REPEAT YOURSELF!
2024-05-24 21:58:14 +08:00
WanQuanXie
996cc1b674 feat: adaptive light mode 2024-05-24 21:18:10 +08:00
WanQuanXie
f9664d48e7 feat: setup theme context config 2024-05-24 18:20:15 +08:00
WanQuanXie
79cfd0a722 chore(ui): add next-themes 2024-05-24 17:32:14 +08:00
ItzCrazyKns
d04ba91c85 feat(routes): use coalescing operator 2024-05-22 10:45:16 +05:30
ItzCrazyKns
7853c18b6f feat(docs): update port 2024-05-19 11:35:28 +05:30
ItzCrazyKns
64ea4b4289 feat(package): bump version 2024-05-18 13:11:24 +05:30
ItzCrazyKns
c61facef13 feat(message-box): display suggestions 2024-05-18 13:11:15 +05:30
ItzCrazyKns
fcff93a594 feat(message-actions): update rewrite button 2024-05-18 13:10:54 +05:30
ItzCrazyKns
3bfaf9be28 feat(app): add suggestion generation 2024-05-18 13:10:39 +05:30
ItzCrazyKns
68b595023e feat(suggestion-generator): update prompt 2024-05-18 13:10:09 +05:30
ItzCrazyKns
180e204c2d feat(providers): add GPT-4 omni 2024-05-14 19:33:54 +05:30
ItzCrazyKns
0e2f4514b4 feat(readme): update readme 2024-05-13 20:10:44 +05:30
ItzCrazyKns
0993c5a760 feat(app): revert port & network changes 2024-05-13 19:58:17 +05:30
ItzCrazyKns
100872f2d9 feat(docker-compose): revert network changes 2024-05-12 14:04:05 +05:30
ItzCrazyKns
22aee27cda feat(env): remove port 2024-05-12 12:48:01 +05:30
ItzCrazyKns
9d30224faa feat(readme): update readme 2024-05-12 12:24:36 +05:30
ItzCrazyKns
b622df5a9f feat(docker-compose): update ports, change network type 2024-05-12 12:16:08 +05:30
ItzCrazyKns
1b18715f8f feat(docs): update PORT 2024-05-12 12:15:53 +05:30
ItzCrazyKns
9816eb1d36 feat(server): add bind address 2024-05-12 12:15:25 +05:30
ItzCrazyKns
828eeb0c77 feat(app-dockerfile): add PORT arg 2024-05-12 12:14:52 +05:30
ItzCrazyKns
c852bee8ed feat(app): add suspense boundary 2024-05-11 21:19:38 +05:30
ItzCrazyKns
954b4bf89a feat(readme): add search engine guide 2024-05-11 12:14:49 +05:30
ItzCrazyKns
3ef39c69a7 feat(chat-window): add ability to use q query param 2024-05-11 12:09:39 +05:30
ItzCrazyKns
7a28be9e1a feat(readme): add installation docs 2024-05-11 12:09:08 +05:30
ItzCrazyKns
a60145137c feat(docs): add networking 2024-05-11 10:23:05 +05:30
ItzCrazyKns
7eace1e6bd feat(searxng-container): bind mount & add limiter 2024-05-10 20:55:08 +05:30
Chuck
baef45b456 Merge branch 'ItzCrazyKns:master' into master 2024-05-10 12:00:18 +08:00
ItzCrazyKns
9a7af945b0 lint 2024-05-09 20:43:04 +05:30
ItzCrazyKns
09463999c2 feat(routes): add suggestions route 2024-05-09 20:42:03 +05:30
ItzCrazyKns
0f6986fc9b feat(agents): add suggestion generator agent 2024-05-09 20:41:43 +05:30
ItzCrazyKns
5e940914a3 feat(output-parsers): add list line output parser 2024-05-09 20:39:38 +05:30
Chuck
ac4cba32c8 fix(SettingsDialog): baseURL storage key 2024-05-09 15:53:57 +08:00
ItzCrazyKns
4f5f6be85f feat(working): fix grammatical mistake 2024-05-08 20:05:29 +05:30
ItzCrazyKns
17fbc28172 Merge pull request #86 from WanQuanXie/list-map-key-fix
fix(Chat): list map element must specify a unique key
2024-05-08 12:56:00 +05:30
WanQuanXie
0af66f8b72 fix(Chat): list map element must specify a unique key 2024-05-08 09:57:11 +08:00
200 changed files with 37113 additions and 5217 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 151 KiB

After

Width:  |  Height:  |  Size: 641 KiB

11
.env.example Normal file
View File

@@ -0,0 +1,11 @@
PORT=3000
NODE_ENV=development
SUPABASE_URL=your_supabase_url
SUPABASE_KEY=your_supabase_key
OLLAMA_URL=http://localhost:11434
OLLAMA_MODEL=llama2
SEARXNG_URL=http://localhost:4000
SEARXNG_INSTANCES=["http://localhost:4000"]
MAX_RESULTS_PER_QUERY=50
CACHE_DURATION_HOURS=24
CACHE_DURATION_DAYS=7

29
.github/workflows/ci.yml vendored Normal file
View File

@@ -0,0 +1,29 @@
---
name: CI
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Setup Node.js
uses: actions/setup-node@v2
with:
node-version: '18'
- name: Install dependencies
run: npm ci
- name: Run tests
run: npm test
- name: Run type check
run: npm run build

73
.github/workflows/docker-build.yaml vendored Normal file
View File

@@ -0,0 +1,73 @@
name: Build & Push Docker Images
on:
push:
branches:
- master
release:
types: [published]
jobs:
build-and-push:
runs-on: ubuntu-latest
strategy:
matrix:
service: [backend, app]
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
install: true
- name: Log in to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Extract version from release tag
if: github.event_name == 'release'
id: version
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
- name: Build and push Docker image for ${{ matrix.service }}
if: github.ref == 'refs/heads/master' && github.event_name == 'push'
run: |
docker buildx create --use
if [[ "${{ matrix.service }}" == "backend" ]]; then \
DOCKERFILE=backend.dockerfile; \
IMAGE_NAME=perplexica-backend; \
else \
DOCKERFILE=app.dockerfile; \
IMAGE_NAME=perplexica-frontend; \
fi
docker buildx build --platform linux/amd64,linux/arm64 \
--cache-from=type=registry,ref=itzcrazykns1337/${IMAGE_NAME}:main \
--cache-to=type=inline \
-f $DOCKERFILE \
-t itzcrazykns1337/${IMAGE_NAME}:main \
--push .
- name: Build and push release Docker image for ${{ matrix.service }}
if: github.event_name == 'release'
run: |
docker buildx create --use
if [[ "${{ matrix.service }}" == "backend" ]]; then \
DOCKERFILE=backend.dockerfile; \
IMAGE_NAME=perplexica-backend; \
else \
DOCKERFILE=app.dockerfile; \
IMAGE_NAME=perplexica-frontend; \
fi
docker buildx build --platform linux/amd64,linux/arm64 \
--cache-from=type=registry,ref=itzcrazykns1337/${IMAGE_NAME}:${{ env.RELEASE_VERSION }} \
--cache-to=type=inline \
-f $DOCKERFILE \
-t itzcrazykns1337/${IMAGE_NAME}:${{ env.RELEASE_VERSION }} \
--push .

52
.gitignore vendored
View File

@@ -1,34 +1,32 @@
# Node.js
node_modules/
npm-debug.log
yarn-error.log
# Build output
/.next/
/out/
# IDE/Editor specific
.vscode/
.idea/
*.iml
# Environment variables
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
.env.*
!.env.example
# Config files
config.toml
# Dependencies
node_modules/
yarn-error.log
npm-debug.log
# Log files
# Build outputs
dist/
build/
.next/
# IDE/Editor
.vscode/
.idea/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db
# Logs
logs/
*.log
# Testing
/coverage/
# Miscellaneous
.DS_Store
Thumbs.db
# Cache
.cache/
.npm/

View File

@@ -36,3 +36,6 @@ coverage
# Ignore all files with the .DS_Store extension (macOS specific)
.DS_Store
# Ignore all files in uploads directory
uploads

View File

@@ -8,6 +8,7 @@ Perplexica's design consists of two main domains:
- **Frontend (`ui` directory)**: This is a Next.js application holding all user interface components. It's a self-contained environment that manages everything the user interacts with.
- **Backend (root and `src` directory)**: The backend logic is situated in the `src` folder, but the root directory holds the main `package.json` for backend dependency management.
- All of the focus modes are created using the Meta Search Agent class present in `src/search/metaSearchAgent.ts`. The main logic behind Perplexica lies there.
## Setting Up Your Environment
@@ -18,7 +19,8 @@ Before diving into coding, setting up your local environment is key. Here's what
1. In the root directory, locate the `sample.config.toml` file.
2. Rename it to `config.toml` and fill in the necessary configuration fields specific to the backend.
3. Run `npm install` to install dependencies.
4. Use `npm run dev` to start the backend in development mode.
4. Run `npm run db:push` to set up the local sqlite.
5. Use `npm run dev` to start the backend in development mode.
### Frontend

189
README.md
View File

@@ -1,137 +1,120 @@
# 🚀 Perplexica - An AI-powered search engine 🔎 <!-- omit in toc -->
# BizSearch
![preview](.assets/perplexica-screenshot.png)
A tool for finding and analyzing local businesses using AI-powered data extraction.
## Table of Contents <!-- omit in toc -->
## Prerequisites
- [Overview](#overview)
- [Preview](#preview)
- [Features](#features)
- [Installation](#installation)
- [Getting Started with Docker (Recommended)](#getting-started-with-docker-recommended)
- [Non-Docker Installation](#non-docker-installation)
- [Ollama connection errors](#ollama-connection-errors)
- [One-Click Deployment](#one-click-deployment)
- [Upcoming Features](#upcoming-features)
- [Support Us](#support-us)
- [Donations](#donations)
- [Contribution](#contribution)
- [Help and Support](#help-and-support)
## Overview
Perplexica is an open-source AI-powered searching tool or an AI-powered search engine that goes deep into the internet to find answers. Inspired by Perplexity AI, it's an open-source option that not just searches the web but understands your questions. It uses advanced machine learning algorithms like similarity searching and embeddings to refine results and provides clear answers with sources cited.
Using SearxNG to stay current and fully open source, Perplexica ensures you always get the most up-to-date information without compromising your privacy.
Want to know more about its architecture and how it works? You can read it [here](https://github.com/ItzCrazyKns/Perplexica/tree/master/docs/architecture/README.md).
## Preview
![video-preview](.assets/perplexica-preview.gif)
## Features
- **Local LLMs**: You can make use of local LLMs such as Llama3 and Mixtral using Ollama.
- **Two Main Modes:**
- **Copilot Mode:** (In development) Boosts search by generating different queries to find more relevant internet sources. Like normal search instead of just using the context by SearxNG, it visits the top matches and tries to find relevant sources to the user's query directly from the page.
- **Normal Mode:** Processes your query and performs a web search.
- **Focus Modes:** Special modes to better answer specific types of questions. Perplexica currently has 6 focus modes:
- **All Mode:** Searches the entire web to find the best results.
- **Writing Assistant Mode:** Helpful for writing tasks that do not require searching the web.
- **Academic Search Mode:** Finds articles and papers, ideal for academic research.
- **YouTube Search Mode:** Finds YouTube videos based on the search query.
- **Wolfram Alpha Search Mode:** Answers queries that need calculations or data analysis using Wolfram Alpha.
- **Reddit Search Mode:** Searches Reddit for discussions and opinions related to the query.
- **Current Information:** Some search tools might give you outdated info because they use data from crawling bots and convert them into embeddings and store them in an index. Unlike them, Perplexica uses SearxNG, a metasearch engine, to get the results, rerank them, and extract the most relevant sources, ensuring you always get the latest information without the overhead of daily data updates.
It has many more features like image and video search. Some of the planned features are mentioned in [upcoming features](#upcoming-features).
- Node.js 16+
- Ollama (for local LLM)
- SearxNG instance
## Installation
There are mainly 2 ways of installing Perplexica - With Docker, Without Docker. Using Docker is highly recommended.
### Getting Started with Docker (Recommended)
1. Ensure Docker is installed and running on your system.
2. Clone the Perplexica repository:
1. Install Ollama:
```bash
git clone https://github.com/ItzCrazyKns/Perplexica.git
# On macOS
brew install ollama
```
3. After cloning, navigate to the directory containing the project files.
4. Rename the `sample.config.toml` file to `config.toml`. For Docker setups, you need only fill in the following fields:
- `OPENAI`: Your OpenAI API key. **You only need to fill this if you wish to use OpenAI's models**.
- `OLLAMA`: Your Ollama API URL. You should enter it as `http://host.docker.internal:PORT_NUMBER`. If you installed Ollama on port 11434, use `http://host.docker.internal:11434`. For other ports, adjust accordingly. **You need to fill this if you wish to use Ollama's models instead of OpenAI's**.
- `GROQ`: Your Groq API key. **You only need to fill this if you wish to use Groq's hosted models**
**Note**: You can change these after starting Perplexica from the settings dialog.
- `SIMILARITY_MEASURE`: The similarity measure to use (This is filled by default; you can leave it as is if you are unsure about it.)
5. Ensure you are in the directory containing the `docker-compose.yaml` file and execute:
2. Start Ollama:
```bash
docker compose up -d
# Start and enable on login
brew services start ollama
# Or run without auto-start
/usr/local/opt/ollama/bin/ollama serve
```
6. Wait a few minutes for the setup to complete. You can access Perplexica at http://localhost:3000 in your web browser.
3. Pull the required model:
```bash
ollama pull mistral
```
**Note**: After the containers are built, you can start Perplexica directly from Docker without having to open a terminal.
4. Clone and set up the project:
```bash
git clone https://github.com/yourusername/bizsearch.git
cd bizsearch
npm install
```
### Non-Docker Installation
5. Configure environment:
```bash
cp .env.example .env
# Edit .env with your settings
```
1. Clone the repository and rename the `sample.config.toml` file to `config.toml` in the root directory. Ensure you complete all required fields in this file.
2. Rename the `.env.example` file to `.env` in the `ui` folder and fill in all necessary fields.
3. After populating the configuration and environment files, run `npm i` in both the `ui` folder and the root directory.
4. Install the dependencies and then execute `npm run build` in both the `ui` folder and the root directory.
5. Finally, start both the frontend and the backend by running `npm run start` in both the `ui` folder and the root directory.
6. Start the application:
```bash
npm run dev
```
**Note**: Using Docker is recommended as it simplifies the setup process, especially for managing environment variables and dependencies.
7. Open http://localhost:3000 in your browser
### Ollama connection errors
## Troubleshooting
If you're facing an Ollama connection error, it is often related to the backend not being able to connect to Ollama's API. How can you fix it? You can fix it by updating your Ollama API URL in the settings menu to the following:
If Ollama fails to start:
```bash
# Stop any existing instance
brew services stop ollama
# Wait a few seconds
sleep 5
# Start again
brew services start ollama
```
On Windows: `http://host.docker.internal:11434`<br>
On Mac: `http://host.docker.internal:11434`<br>
On Linux: `http://private_ip_of_computer_hosting_ollama:11434`
To verify Ollama is running:
```bash
curl http://localhost:11434/api/version
```
You need to edit the ports accordingly.
## Features
## One-Click Deployment
- Business search with location filtering
- Contact information extraction
- AI-powered data validation
- Clean, user-friendly interface
- Service health monitoring
[![Deploy to RepoCloud](https://d16t0pc4846x52.cloudfront.net/deploylobe.svg)](https://repocloud.io/details/?app_id=267)
## Configuration
## Upcoming Features
Key environment variables:
- `SEARXNG_URL`: Your SearxNG instance URL
- `OLLAMA_URL`: Ollama API endpoint (default: http://localhost:11434)
- `SUPABASE_URL`: Your Supabase project URL
- `SUPABASE_ANON_KEY`: Your Supabase anonymous key
- `CACHE_DURATION_DAYS`: How long to cache results (default: 7)
- [ ] Finalizing Copilot Mode
- [x] Add settings page
- [x] Adding support for local LLMs
- [ ] Adding Discover and History Saving features
- [x] Introducing various Focus Modes
## Supabase Setup
## Support Us
1. Create a new Supabase project
2. Run the SQL commands in `db/init.sql` to create the cache table
3. Copy your project URL and anon key to `.env`
If you find Perplexica useful, consider giving us a star on GitHub. This helps more people discover Perplexica and supports the development of new features. Your support is greatly appreciated.
## License
### Donations
MIT
We also accept donations to help sustain our project. If you would like to contribute, you can use the following button to make a donation in cryptocurrency. Thank you for your support!
## Cache Management
<a href="https://nowpayments.io/donation?api_key=RFFKJH1-GRR4DQG-HFV1DZP-00G6MMK&source=lk_donation&medium=referral" target="_blank">
<img src="https://nowpayments.io/images/embeds/donation-button-white.svg" alt="Crypto donation button by NOWPayments">
</a>
The application uses Supabase for caching search results. Cache entries expire after 7 days.
## Contribution
### Manual Cache Cleanup
Perplexica is built on the idea that AI and large language models should be easy for everyone to use. If you find bugs or have ideas, please share them via GitHub Issues. For more information on contributing to Perplexica, you can read the [CONTRIBUTING.md](CONTRIBUTING.md) file to learn more about Perplexica and how you can contribute to it.
If automatic cleanup is not available, you can manually clean up expired entries:
## Help and Support
1. Using the API:
```bash
curl -X POST http://localhost:3000/api/cleanup
```
If you have any questions or feedback, please feel free to reach out to us. You can create an issue on GitHub or join our Discord server. There, you can connect with other users, share your experiences and reviews, and receive more personalized help. [Click here](https://discord.gg/EFwsmQDgAu) to join the Discord server. To discuss matters outside of regular support, feel free to contact me on Discord at `itzcrazykns`.
2. Using SQL:
```sql
select manual_cleanup();
```
Thank you for exploring Perplexica, the AI-powered search engine designed to enhance your search experience. We are constantly working to improve Perplexica and expand its capabilities. We value your feedback and contributions which help us make Perplexica even better. Don't forget to check back for updates and new features!
### Cache Statistics
View cache statistics using:
```sql
select * from cache_stats;
```

View File

@@ -1,7 +1,7 @@
FROM node:alpine
FROM node:20.18.0-alpine
ARG NEXT_PUBLIC_WS_URL
ARG NEXT_PUBLIC_API_URL
ARG NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
ARG NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
ENV NEXT_PUBLIC_WS_URL=${NEXT_PUBLIC_WS_URL}
ENV NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL}
@@ -9,7 +9,7 @@ WORKDIR /home/perplexica
COPY ui /home/perplexica/
RUN yarn install
RUN yarn install --frozen-lockfile
RUN yarn build
CMD ["yarn", "start"]

View File

@@ -1,18 +1,17 @@
FROM node:buster-slim
ARG SEARXNG_API_URL
FROM node:18-slim
WORKDIR /home/perplexica
COPY src /home/perplexica/src
COPY tsconfig.json /home/perplexica/
COPY config.toml /home/perplexica/
COPY drizzle.config.ts /home/perplexica/
COPY package.json /home/perplexica/
COPY yarn.lock /home/perplexica/
RUN sed -i "s|SEARXNG = \".*\"|SEARXNG = \"${SEARXNG_API_URL}\"|g" /home/perplexica/config.toml
RUN mkdir /home/perplexica/data
RUN mkdir /home/perplexica/uploads
RUN yarn install
RUN yarn install --frozen-lockfile --network-timeout 600000
RUN yarn build
CMD ["yarn", "start"]

14
config.toml Normal file
View File

@@ -0,0 +1,14 @@
# Perplexica backend configuration (derived from sample.config.toml).
[GENERAL]
PORT = 3001 # Port to run the server on
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded into memory. (Instead of using -1 use "-1m")
# NOTE(review): keep these keys empty in version control — committing real
# API keys here would leak secrets. Fill them in locally or via the settings UI.
[API_KEYS]
OPENAI = "" # OpenAI API key - sk-1234567890abcdef1234567890abcdef
GROQ = "" # Groq API key - gsk_1234567890abcdef1234567890abcdef
ANTHROPIC = "" # Anthropic API key - sk-ant-1234567890abcdef1234567890abcdef
GEMINI = "" # Gemini API key - sk-1234567890abcdef1234567890abcdef
[API_ENDPOINTS]
SEARXNG = "http://localhost:32768" # SearxNG API URL
OLLAMA = "" # Ollama API URL - http://host.docker.internal:11434

2
data/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
*
!.gitignore

189
db/init.sql Normal file
View File

@@ -0,0 +1,189 @@
-- ============================================================
-- Search-result cache schema (Supabase / PostgreSQL).
-- Rows carry a 7-day TTL via expires_at; cleanup functions follow below.
-- ============================================================
-- Enable required extensions
create extension if not exists "uuid-ossp"; -- For UUID generation
create extension if not exists pg_cron; -- For scheduled jobs
-- NOTE(review): pg_cron is unavailable on many hosted Postgres plans;
-- manual_cleanup() further down exists as the fallback for that case.
-- Create the search_cache table
create table public.search_cache (
id uuid default uuid_generate_v4() primary key,
query text not null,
results jsonb not null, -- raw search-results payload for this query
location text not null,
category text not null,
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
updated_at timestamp with time zone default timezone('utc'::text, now()) not null,
-- Row becomes eligible for deletion once this passes (7-day TTL).
expires_at timestamp with time zone default timezone('utc'::text, now() + interval '7 days') not null
);
-- Create indexes
create index search_cache_query_idx on public.search_cache (query);
create index search_cache_location_category_idx on public.search_cache (location, category);
create index search_cache_expires_at_idx on public.search_cache (expires_at);
-- Enable RLS
alter table public.search_cache enable row level security;
-- Create policies
-- Reads are public; insert/update are open (USING/WITH CHECK true) —
-- presumably writes are gated by connecting with the service role, TODO confirm.
create policy "Allow public read access"
on public.search_cache for select
using (true);
create policy "Allow service write access"
on public.search_cache for insert
with check (true);
create policy "Allow service update access"
on public.search_cache for update
using (true)
with check (true);
-- Deletion is only permitted for rows that have already expired.
create policy "Allow delete expired records"
on public.search_cache for delete
using (expires_at < now());
-- Create function to clean up expired records
-- Deletes every search_cache row whose expires_at lies in the past.
-- SECURITY DEFINER so a scheduler (e.g. pg_cron) or API caller can invoke
-- it without holding direct DELETE rights on the table.
create or replace function cleanup_expired_cache()
returns void
language plpgsql
security definer
as $$
begin
delete from public.search_cache
where expires_at < now();
end;
$$;
-- Create a manual cleanup function since pg_cron might not be available
-- Same contract as cleanup_expired_cache(): void in, void out, deletes
-- expired cache rows. Kept as a separately named entry point for operators
-- and the /api/cleanup route, but it now delegates to the canonical
-- implementation instead of duplicating the DELETE, so the two can never
-- drift apart.
create or replace function manual_cleanup()
returns void
language plpgsql
security definer
as $$
begin
perform cleanup_expired_cache();
end;
$$;
-- Create a view for cache statistics
-- Aggregated health metrics over search_cache; consumed via
-- `select * from cache_stats;` (see README "Cache Statistics").
create or replace view cache_stats as
select
count(*) as total_entries,
count(*) filter (where expires_at < now()) as expired_entries,
count(*) filter (where expires_at >= now()) as valid_entries,
min(created_at) as oldest_entry,
max(created_at) as newest_entry,
count(distinct category) as unique_categories,
count(distinct location) as unique_locations
from public.search_cache;
-- Grant permissions to access the view
grant select on cache_stats to postgres;
-- Create table if not exists businesses
-- Primary business-directory table.
-- NOTE(review): two more `CREATE TABLE IF NOT EXISTS businesses`
-- statements appear later in this file with different column sets; because
-- of IF NOT EXISTS, only whichever definition runs first against an empty
-- database ever takes effect — the schemas need to be unified.
create table if not exists businesses (
id text primary key,
name text not null,
phone text,
email text,
address text,
rating numeric,
website text,
logo text,
source text,
description text,
latitude numeric,
longitude numeric,
last_updated timestamp with time zone default timezone('utc'::text, now()),
search_count integer default 1, -- starts at 1: the row is created on its first hit
created_at timestamp with time zone default timezone('utc'::text, now())
);
-- Create indexes for common queries
create index if not exists businesses_name_idx on businesses (name);
create index if not exists businesses_rating_idx on businesses (rating desc);
create index if not exists businesses_search_count_idx on businesses (search_count desc);
create index if not exists businesses_last_updated_idx on businesses (last_updated desc);
-- Create tables if they don't exist
-- NOTE(review): this second `businesses` definition conflicts with the
-- earlier one (rating INTEGER vs numeric; JSONB location vs latitude/
-- longitude columns). With IF NOT EXISTS it is a silent no-op when the
-- table already exists, so these columns may never materialize — the
-- duplicate definitions in this file need consolidation.
CREATE TABLE IF NOT EXISTS businesses (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
phone TEXT,
email TEXT,
address TEXT,
rating INTEGER,
website TEXT,
logo TEXT,
source TEXT,
description TEXT,
location JSONB,
place_id TEXT,
photos TEXT[],
opening_hours TEXT[],
distance JSONB,
last_updated TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
search_count INTEGER DEFAULT 0
);
-- Log of executed searches (query, location, result count).
CREATE TABLE IF NOT EXISTS searches (
id SERIAL PRIMARY KEY,
query TEXT NOT NULL,
location TEXT NOT NULL,
timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
results_count INTEGER
);
-- Generic key/value cache with an explicit expiry column.
CREATE TABLE IF NOT EXISTS cache (
key TEXT PRIMARY KEY,
value JSONB NOT NULL,
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
expires_at TIMESTAMP WITH TIME ZONE NOT NULL
);
-- Create indexes
CREATE INDEX IF NOT EXISTS idx_businesses_location ON businesses USING GIN (location);
-- Full-text index over name + description for keyword search.
CREATE INDEX IF NOT EXISTS idx_businesses_search ON businesses USING GIN (to_tsvector('english', name || ' ' || COALESCE(description, '')));
CREATE INDEX IF NOT EXISTS idx_cache_expires ON cache (expires_at);
-- Set up RLS (Row Level Security)
ALTER TABLE businesses ENABLE ROW LEVEL SECURITY;
ALTER TABLE searches ENABLE ROW LEVEL SECURITY;
ALTER TABLE cache ENABLE ROW LEVEL SECURITY;
-- Create policies
-- Public reads; writes are open (WITH CHECK true) — presumably gated by
-- connecting with the service role. TODO confirm intended write scope.
CREATE POLICY "Allow anonymous select" ON businesses FOR SELECT USING (true);
CREATE POLICY "Allow service role insert" ON businesses FOR INSERT WITH CHECK (true);
CREATE POLICY "Allow service role update" ON businesses FOR UPDATE USING (true);
CREATE POLICY "Allow anonymous select" ON searches FOR SELECT USING (true);
CREATE POLICY "Allow service role insert" ON searches FOR INSERT WITH CHECK (true);
CREATE POLICY "Allow anonymous select" ON cache FOR SELECT USING (true);
CREATE POLICY "Allow service role all" ON cache USING (true);
-- Add place_id column to businesses table if it doesn't exist
-- Migration shim for databases created before place_id was introduced.
ALTER TABLE businesses ADD COLUMN IF NOT EXISTS place_id TEXT;
CREATE INDEX IF NOT EXISTS idx_businesses_place_id ON businesses(place_id);
-- Create a unique constraint on place_id (excluding nulls)
CREATE UNIQUE INDEX IF NOT EXISTS idx_businesses_place_id_unique
ON businesses(place_id)
WHERE place_id IS NOT NULL;
-- NOTE(review): third conflicting `businesses` definition — also a silent
-- no-op when the table already exists (see note at the top of this section).
CREATE TABLE IF NOT EXISTS businesses (
id TEXT PRIMARY KEY,
name TEXT NOT NULL,
address TEXT NOT NULL,
phone TEXT NOT NULL,
description TEXT NOT NULL,
website TEXT,
source TEXT NOT NULL,
rating REAL,
lat REAL,
lng REAL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_businesses_source ON businesses(source);
CREATE INDEX IF NOT EXISTS idx_businesses_rating ON businesses(rating);

44
db/schema.sql Normal file
View File

@@ -0,0 +1,44 @@
-- Business-directory schema (PostgreSQL / Supabase).
-- Requires the pg_trgm extension for the trigram (gin_trgm_ops) indexes
-- below; without it the idx_search_query / idx_search_location statements
-- fail on a fresh database, so enable it up front.
create extension if not exists pg_trgm;
-- Create the businesses table
create table businesses (
id uuid primary key,
name text not null,
phone text,
address text,
city text,
state text,
zip text,
category text[],
rating numeric,
review_count integer,
license text,
services text[],
hours jsonb,
website text,
email text,
verified boolean default false,
last_updated timestamp with time zone,
search_query text,
search_location text,
search_timestamp timestamp with time zone,
reliability_score integer,
-- Create a composite index for deduplication
constraint unique_business unique (phone, address)
);
-- Create indexes for common queries
create index idx_business_location on businesses (city, state);
create index idx_business_category on businesses using gin (category);
-- Trigram indexes to accelerate fuzzy / substring matching on search terms.
create index idx_search_query on businesses using gin (search_query gin_trgm_ops);
create index idx_search_location on businesses using gin (search_location gin_trgm_ops);
create index idx_reliability on businesses (reliability_score);
-- Enable full text search
-- Weighted tsvector: business name (A) outranks the originating search
-- query (B) and location (C).
alter table businesses add column search_vector tsvector
generated always as (
setweight(to_tsvector('english', coalesce(name, '')), 'A') ||
setweight(to_tsvector('english', coalesce(search_query, '')), 'B') ||
setweight(to_tsvector('english', coalesce(search_location, '')), 'C')
) stored;
create index idx_business_search on businesses using gin(search_vector);

15
db/verify.sql Normal file
View File

@@ -0,0 +1,15 @@
-- Post-migration sanity checks; run after applying db/schema.sql.
-- Check if table exists
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'public'
AND table_name = 'businesses'
);
-- Check table structure
SELECT column_name, data_type, is_nullable
FROM information_schema.columns
WHERE table_schema = 'public'
AND table_name = 'businesses';
-- Check row count
SELECT COUNT(*) as count FROM businesses;

View File

@@ -1,25 +1,34 @@
services:
searxng:
build:
context: .
dockerfile: searxng.dockerfile
image: docker.io/searxng/searxng:latest
volumes:
- ./searxng:/etc/searxng:rw
ports:
- 4000:8080
networks:
- perplexica-network
restart: unless-stopped
perplexica-backend:
build:
context: .
dockerfile: backend.dockerfile
args:
image: itzcrazykns1337/perplexica-backend:main
environment:
- SEARXNG_API_URL=http://searxng:8080
depends_on:
- searxng
ports:
- 3001:3001
volumes:
- backend-dbstore:/home/perplexica/data
- uploads:/home/perplexica/uploads
- ./config.toml:/home/perplexica/config.toml
extra_hosts:
- 'host.docker.internal:host-gateway'
networks:
- perplexica-network
restart: unless-stopped
perplexica-frontend:
build:
@@ -28,12 +37,18 @@ services:
args:
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
image: itzcrazykns1337/perplexica-frontend:main
depends_on:
- perplexica-backend
ports:
- 3000:3000
networks:
- perplexica-network
restart: unless-stopped
networks:
perplexica-network:
volumes:
backend-dbstore:
uploads:

26
docker-compose.yml Normal file
View File

@@ -0,0 +1,26 @@
# Minimal compose stack: a SearxNG metasearch instance plus the app backend
# that queries it over the compose-internal network.
# (Indentation reconstructed — the committed rendering had the YAML
# structure flattened, which is not valid compose syntax.)
version: '3'
services:
  searxng:
    image: searxng/searxng
    ports:
      - "4000:8080" # host 4000 -> container 8080 (SearxNG web/API)
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - INSTANCE_NAME=perplexica-searxng
      - BASE_URL=http://localhost:4000/
      # SECURITY(review): placeholder secret committed to the repo — replace
      # with a real, private value (e.g. via an env file), never ship as-is.
      - SEARXNG_SECRET=your_secret_key_here
    restart: unless-stopped
  app:
    build:
      context: .
      dockerfile: backend.dockerfile
    ports:
      - "3000:3000"
    environment:
      # Reach SearxNG via its compose service name on the internal port.
      - SEARXNG_URL=http://searxng:8080
    volumes:
      - ./config.toml:/home/perplexica/config.toml
    depends_on:
      - searxng

117
docs/API/SEARCH.md Normal file
View File

@@ -0,0 +1,117 @@
# Perplexica Search API Documentation
## Overview
Perplexica's Search API makes it easy to use our AI-powered search engine. You can run different types of searches, pick the models you want to use, and get the most recent info. Follow the following headings to learn more about Perplexica's search API.
## Endpoint
### **POST** `http://localhost:3001/api/search`
**Note**: Replace `3001` with any other port if you've changed the default PORT
### Request
The API accepts a JSON object in the request body, where you define the focus mode, chat models, embedding models, and your query.
#### Request Body Structure
```json
{
"chatModel": {
"provider": "openai",
"model": "gpt-4o-mini"
},
"embeddingModel": {
"provider": "openai",
"model": "text-embedding-3-large"
},
"optimizationMode": "speed",
"focusMode": "webSearch",
"query": "What is Perplexica",
"history": [
["human", "Hi, how are you?"],
["assistant", "I am doing well, how can I help you today?"]
]
}
```
### Request Parameters
- **`chatModel`** (object, optional): Defines the chat model to be used for the query. For model details you can send a GET request at `http://localhost:3001/api/models`. Make sure to use the key value (For example "gpt-4o-mini" instead of the display name "GPT 4 omni mini").
- `provider`: Specifies the provider for the chat model (e.g., `openai`, `ollama`).
- `model`: The specific model from the chosen provider (e.g., `gpt-4o-mini`).
- Optional fields for custom OpenAI configuration:
- `customOpenAIBaseURL`: If you're using a custom OpenAI instance, provide the base URL.
- `customOpenAIKey`: The API key for a custom OpenAI instance.
- **`embeddingModel`** (object, optional): Defines the embedding model for similarity-based searching. For model details you can send a GET request at `http://localhost:3001/api/models`. Make sure to use the key value (For example "text-embedding-3-large" instead of the display name "Text Embedding 3 Large").
- `provider`: The provider for the embedding model (e.g., `openai`).
- `model`: The specific embedding model (e.g., `text-embedding-3-large`).
- **`focusMode`** (string, required): Specifies which focus mode to use. Available modes:
- `webSearch`, `academicSearch`, `writingAssistant`, `wolframAlphaSearch`, `youtubeSearch`, `redditSearch`.
- **`optimizationMode`** (string, optional): Specifies the optimization mode to control the balance between performance and quality. Available modes:
- `speed`: Prioritize speed and return the fastest answer.
- `balanced`: Provide a balanced answer with good speed and reasonable quality.
- **`query`** (string, required): The search query or question.
- **`history`** (array, optional): An array of message pairs representing the conversation history. Each pair consists of a role (either 'human' or 'assistant') and the message content. This allows the system to use the context of the conversation to refine results. Example:
```json
[
["human", "What is Perplexica?"],
["assistant", "Perplexica is an AI-powered search engine..."]
]
```
### Response
The response from the API includes both the final message and the sources used to generate that message.
#### Example Response
```json
{
"message": "Perplexica is an innovative, open-source AI-powered search engine designed to enhance the way users search for information online. Here are some key features and characteristics of Perplexica:\n\n- **AI-Powered Technology**: It utilizes advanced machine learning algorithms to not only retrieve information but also to understand the context and intent behind user queries, providing more relevant results [1][5].\n\n- **Open-Source**: Being open-source, Perplexica offers flexibility and transparency, allowing users to explore its functionalities without the constraints of proprietary software [3][10].",
"sources": [
{
"pageContent": "Perplexica is an innovative, open-source AI-powered search engine designed to enhance the way users search for information online.",
"metadata": {
"title": "What is Perplexica, and how does it function as an AI-powered search ...",
"url": "https://askai.glarity.app/search/What-is-Perplexica--and-how-does-it-function-as-an-AI-powered-search-engine"
}
},
{
"pageContent": "Perplexica is an open-source AI-powered search tool that dives deep into the internet to find precise answers.",
"metadata": {
"title": "Sahar Mor's Post",
"url": "https://www.linkedin.com/posts/sahar-mor_a-new-open-source-project-called-perplexica-activity-7204489745668694016-ncja"
}
}
....
]
}
```
### Fields in the Response
- **`message`** (string): The search result, generated based on the query and focus mode.
- **`sources`** (array): A list of sources that were used to generate the search result. Each source includes:
- `pageContent`: A snippet of the relevant content from the source.
- `metadata`: Metadata about the source, including:
- `title`: The title of the webpage.
- `url`: The URL of the webpage.
### Error Handling
If an error occurs during the search process, the API will return an appropriate error message with an HTTP status code.
- **400**: If the request is malformed or missing required fields (e.g., no focus mode or query).
- **500**: If an internal server error occurs during the search.

108
docs/ETHICAL_SCRAPING.md Normal file
View File

@@ -0,0 +1,108 @@
# Ethical Web Scraping Guidelines
## Core Principles
1. **Respect Robots.txt**
- Always check and honor robots.txt directives
- Cache robots.txt to reduce server load
- Default to conservative behavior when uncertain
2. **Proper Identification**
- Use clear, identifiable User-Agent strings
- Provide contact information
- Be transparent about your purpose
3. **Rate Limiting**
- Implement conservative rate limits
- Use exponential backoff for errors
- Distribute requests over time
4. **Data Usage**
- Only collect publicly available business information
- Respect privacy and data protection laws
- Provide clear opt-out mechanisms
- Keep data accurate and up-to-date
5. **Technical Considerations**
- Cache results to minimize requests
- Handle errors gracefully
- Monitor and log access patterns
- Use structured data when available
## Implementation
1. **Request Headers**
```typescript
const headers = {
'User-Agent': 'BizSearch/1.0 (+https://bizsearch.com/about)',
'Accept': 'text/html,application/xhtml+xml',
'From': 'contact@bizsearch.com'
};
```
2. **Rate Limiting**
```typescript
const rateLimits = {
requestsPerMinute: 10,
requestsPerHour: 100,
requestsPerDomain: 20
};
```
3. **Caching**
```typescript
const cacheSettings = {
ttl: 24 * 60 * 60, // 24 hours
maxSize: 1000 // entries
};
```
## Opt-Out Process
1. Business owners can opt-out by:
- Submitting a form on our website
- Emailing opt-out@bizsearch.com
- Adding a meta tag: `<meta name="bizsearch" content="noindex">`
2. We honor opt-outs within:
- 24 hours for direct requests
- 72 hours for cached data
## Legal Compliance
1. **Data Protection**
- GDPR compliance for EU businesses
- CCPA compliance for California businesses
- Regular data audits and cleanup
2. **Attribution**
- Clear source attribution
- Last-updated timestamps
- Data accuracy disclaimers
## Best Practices
1. **Before Scraping**
- Check robots.txt
- Verify site status
- Review terms of service
- Look for API alternatives
2. **During Scraping**
- Monitor response codes
- Respect server hints
- Implement backoff strategies
- Log access patterns
3. **After Scraping**
- Verify data accuracy
- Update cache entries
- Clean up old data
- Monitor opt-out requests
## Contact
For questions or concerns about our scraping practices:
- Email: ethics@bizsearch.com
- Phone: (555) 123-4567
- Web: https://bizsearch.com/ethics

View File

@@ -1,4 +1,4 @@
## Perplexica's Architecture
# Perplexica's Architecture
Perplexica's architecture consists of the following key components:

View File

@@ -1,19 +1,19 @@
## How does Perplexica work?
# How does Perplexica work?
Curious about how Perplexica works? Don't worry, we'll cover it here. Before we begin, make sure you've read about the architecture of Perplexica to ensure you understand what it's made up of. Haven't read it? You can read it [here](https://github.com/ItzCrazyKns/Perplexica/tree/master/docs/architecture/README.md).
We'll understand how Perplexica works by taking an example of a scenario where a user asks: "How does an A.C. work?". We'll break down the process into steps to make it easier to understand. The steps are as follows:
1. The message is sent via WS to the backend server where it invokes the chain. The chain will depend on your focus mode. For this example, let's assume we use the "webSearch" focus mode.
2. The chain is now invoked; first, the message is passed to another chain where it first predicts (using the chat history and the question) whether there is a need for sources or searching the web. If there is, it will generate a query (in accordance with the chat history) for searching the web that we'll take up later. If not, the chain will end there, and then the answer generator chain, also known as the response generator, will be started.
2. The chain is now invoked; first, the message is passed to another chain where it first predicts (using the chat history and the question) whether there is a need for sources and searching the web. If there is, it will generate a query (in accordance with the chat history) for searching the web that we'll take up later. If not, the chain will end there, and then the answer generator chain, also known as the response generator, will be started.
3. The query returned by the first chain is passed to SearXNG to search the web for information.
4. After the information is retrieved, it is based on keyword-based search. We then convert the information into embeddings and the query as well, then we perform a similarity search to find the most relevant sources to answer the query.
5. After all this is done, the sources are passed to the response generator. This chain takes all the chat history, the query, and the sources. It generates a response that is streamed to the UI.
### How are the answers cited?
## How are the answers cited?
The LLMs are prompted to do so. We've prompted them so well that they cite the answers themselves, and using some UI magic, we display it to the user.
### Image and Video Search
## Image and Video Search
Image and video searches are conducted in a similar manner. A query is always generated first, then we search the web for images and videos that match the query. These results are then returned to the user.

View File

@@ -0,0 +1,109 @@
# Expose Perplexica to a network
This guide will show you how to make Perplexica available over a network. Follow these steps to allow computers on the same network to interact with Perplexica. Choose the instructions that match the operating system you are using.
## Windows
1. Open PowerShell as Administrator
2. Navigate to the directory containing the `docker-compose.yaml` file
3. Stop and remove the existing Perplexica containers and images:
```bash
docker compose down --rmi all
```
4. Open the `docker-compose.yaml` file in a text editor like Notepad++
5. Replace `127.0.0.1` with the IP address of the server Perplexica is running on in these two lines:
```bash
args:
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
```
6. Save and close the `docker-compose.yaml` file
7. Rebuild and restart the Perplexica container:
```bash
docker compose up -d --build
```
## macOS
1. Open the Terminal application
2. Navigate to the directory with the `docker-compose.yaml` file:
```bash
cd /path/to/docker-compose.yaml
```
3. Stop and remove existing containers and images:
```bash
docker compose down --rmi all
```
4. Open `docker-compose.yaml` in a text editor such as nano:
```bash
nano docker-compose.yaml
```
5. Replace `127.0.0.1` with the server IP in these lines:
```bash
args:
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
```
6. Save and exit the editor
7. Rebuild and restart Perplexica:
```bash
docker compose up -d --build
```
## Linux
1. Open the terminal
2. Navigate to the `docker-compose.yaml` directory:
```bash
cd /path/to/docker-compose.yaml
```
3. Stop and remove containers and images:
```bash
docker compose down --rmi all
```
4. Edit `docker-compose.yaml`:
```bash
nano docker-compose.yaml
```
5. Replace `127.0.0.1` with the server IP:
```bash
args:
- NEXT_PUBLIC_API_URL=http://127.0.0.1:3001/api
- NEXT_PUBLIC_WS_URL=ws://127.0.0.1:3001
```
6. Save and exit the editor
7. Rebuild and restart Perplexica:
```bash
docker compose up -d --build
```

View File

@@ -0,0 +1,40 @@
# Update Perplexica to the latest version
To update Perplexica to the latest version, follow these steps:
## For Docker users
1. Clone the latest version of Perplexica from GitHub:
```bash
git clone https://github.com/ItzCrazyKns/Perplexica.git
```
2. Navigate to the Project Directory.
3. Pull latest images from registry.
```bash
docker compose pull
```
4. Update and Recreate containers.
```bash
docker compose up -d
```
5. Once the command finishes running, go to http://localhost:3000 and verify the latest changes.
## For non-Docker users
1. Clone the latest version of Perplexica from GitHub:
```bash
git clone https://github.com/ItzCrazyKns/Perplexica.git
```
2. Navigate to the Project Directory
3. Execute `npm i` in both the `ui` folder and the root directory.
4. Once packages are updated, execute `npm run build` in both the `ui` folder and the root directory.
5. Finally, start both the frontend and the backend by running `npm run start` in both the `ui` folder and the root directory.

10
drizzle.config.ts Normal file
View File

@@ -0,0 +1,10 @@
// Drizzle Kit configuration: tells the migration generator where the schema
// lives and which on-disk SQLite database the backend uses.
import { defineConfig } from 'drizzle-kit';
export default defineConfig({
dialect: 'sqlite', // local file-based SQLite
schema: './src/db/schema.ts', // source of truth for table definitions
out: './drizzle', // directory for generated migrations
dbCredentials: {
url: './data/db.sqlite', // database file path (relative to project root)
},
});

41
frontend/.gitignore vendored Normal file
View File

@@ -0,0 +1,41 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# env files (can opt-in for committing if needed)
.env*
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

36
frontend/README.md Normal file
View File

@@ -0,0 +1,36 @@
This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
## Getting Started
First, run the development server:
```bash
npm run dev
# or
yarn dev
# or
pnpm dev
# or
bun dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
## Learn More
To learn more about Next.js, take a look at the following resources:
- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
## Deploy on Vercel
The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.

View File

@@ -0,0 +1,16 @@
// ESLint flat-config for the Next.js frontend.
import { dirname } from "path";
import { fileURLToPath } from "url";
import { FlatCompat } from "@eslint/eslintrc";
// Recreate __dirname (not available in ES modules) so FlatCompat can resolve
// the legacy shareable configs relative to this file.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const compat = new FlatCompat({
baseDirectory: __dirname,
});
// Bridge the legacy Next.js "extends" presets into flat-config format.
const eslintConfig = [
...compat.extends("next/core-web-vitals", "next/typescript"),
];
export default eslintConfig;

13
frontend/next.config.js Normal file
View File

@@ -0,0 +1,13 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
  // Proxy frontend /api/* calls to the backend Express server so the browser
  // never needs CORS or a hard-coded backend origin.
  async rewrites() {
    return [
      {
        source: '/api/:path*',
        // The backend listens on port 3001 (see sample.config.toml PORT and the
        // NEXT_PUBLIC_API_URL docs). Pointing this at port 3000 — the Next.js
        // dev server itself — would loop /api requests back into the frontend.
        destination: 'http://localhost:3001/api/:path*',
      },
    ]
  }
}
module.exports = nextConfig

7
frontend/next.config.ts Normal file
View File

@@ -0,0 +1,7 @@
// TypeScript Next.js config (currently empty).
// NOTE(review): a next.config.js also exists in this directory; Next.js loads
// only one config file — confirm which is intended and remove the other.
import type { NextConfig } from "next";
const nextConfig: NextConfig = {
/* config options here */
};
export default nextConfig;

5848
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

33
frontend/package.json Normal file
View File

@@ -0,0 +1,33 @@
{
"name": "frontend",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@radix-ui/react-icons": "^1.3.2",
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"lucide-react": "^0.469.0",
"next": "15.1.3",
"react": "^19.0.0",
"react-dom": "^19.0.0",
"tailwind-merge": "^2.6.0",
"tailwindcss-animate": "^1.0.7"
},
"devDependencies": {
"@eslint/eslintrc": "^3",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"eslint": "^9",
"eslint-config-next": "15.1.3",
"postcss": "^8",
"tailwindcss": "^3.4.1",
"typescript": "^5"
}
}

View File

@@ -0,0 +1,8 @@
/** @type {import('postcss-load-config').Config} */
// PostCSS pipeline for the frontend build: Tailwind only (no autoprefixer here,
// unlike the backend postcss.config.js).
const config = {
plugins: {
tailwindcss: {},
},
};
export default config;

1
frontend/public/file.svg Normal file
View File

@@ -0,0 +1 @@
<svg fill="none" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M14.5 13.5V5.41a1 1 0 0 0-.3-.7L9.8.29A1 1 0 0 0 9.08 0H1.5v13.5A2.5 2.5 0 0 0 4 16h8a2.5 2.5 0 0 0 2.5-2.5m-1.5 0v-7H8v-5H3v12a1 1 0 0 0 1 1h8a1 1 0 0 0 1-1M9.5 5V2.12L12.38 5zM5.13 5h-.62v1.25h2.12V5zm-.62 3h7.12v1.25H4.5zm.62 3h-.62v1.25h7.12V11z" clip-rule="evenodd" fill="#666" fill-rule="evenodd"/></svg>

After

Width:  |  Height:  |  Size: 391 B

View File

@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><g clip-path="url(#a)"><path fill-rule="evenodd" clip-rule="evenodd" d="M10.27 14.1a6.5 6.5 0 0 0 3.67-3.45q-1.24.21-2.7.34-.31 1.83-.97 3.1M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16m.48-1.52a7 7 0 0 1-.96 0H7.5a4 4 0 0 1-.84-1.32q-.38-.89-.63-2.08a40 40 0 0 0 3.92 0q-.25 1.2-.63 2.08a4 4 0 0 1-.84 1.31zm2.94-4.76q1.66-.15 2.95-.43a7 7 0 0 0 0-2.58q-1.3-.27-2.95-.43a18 18 0 0 1 0 3.44m-1.27-3.54a17 17 0 0 1 0 3.64 39 39 0 0 1-4.3 0 17 17 0 0 1 0-3.64 39 39 0 0 1 4.3 0m1.1-1.17q1.45.13 2.69.34a6.5 6.5 0 0 0-3.67-3.44q.65 1.26.98 3.1M8.48 1.5l.01.02q.41.37.84 1.31.38.89.63 2.08a40 40 0 0 0-3.92 0q.25-1.2.63-2.08a4 4 0 0 1 .85-1.32 7 7 0 0 1 .96 0m-2.75.4a6.5 6.5 0 0 0-3.67 3.44 29 29 0 0 1 2.7-.34q.31-1.83.97-3.1M4.58 6.28q-1.66.16-2.95.43a7 7 0 0 0 0 2.58q1.3.27 2.95.43a18 18 0 0 1 0-3.44m.17 4.71q-1.45-.12-2.69-.34a6.5 6.5 0 0 0 3.67 3.44q-.65-1.27-.98-3.1" fill="#666"/></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h16v16H0z"/></clipPath></defs></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

1
frontend/public/next.svg Normal file
View File

@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>

After

Width:  |  Height:  |  Size: 1.3 KiB

View File

@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1155 1000"><path d="m577.3 0 577.4 1000H0z" fill="#fff"/></svg>

After

Width:  |  Height:  |  Size: 128 B

View File

@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.5 2.5h13v10a1 1 0 0 1-1 1h-11a1 1 0 0 1-1-1zM0 1h16v11.5a2.5 2.5 0 0 1-2.5 2.5h-11A2.5 2.5 0 0 1 0 12.5zm3.75 4.5a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5M7 4.75a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0m1.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5" fill="#666"/></svg>

After

Width:  |  Height:  |  Size: 385 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

View File

@@ -0,0 +1,76 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
@layer base {
:root {
--background: 0 0% 100%;
--foreground: 222.2 84% 4.9%;
--card: 0 0% 100%;
--card-foreground: 222.2 84% 4.9%;
--popover: 0 0% 100%;
--popover-foreground: 222.2 84% 4.9%;
--primary: 222.2 47.4% 11.2%;
--primary-foreground: 210 40% 98%;
--secondary: 210 40% 96.1%;
--secondary-foreground: 222.2 47.4% 11.2%;
--muted: 210 40% 96.1%;
--muted-foreground: 215.4 16.3% 46.9%;
--accent: 210 40% 96.1%;
--accent-foreground: 222.2 47.4% 11.2%;
--destructive: 0 84.2% 60.2%;
--destructive-foreground: 210 40% 98%;
--border: 214.3 31.8% 91.4%;
--input: 214.3 31.8% 91.4%;
--ring: 222.2 84% 4.9%;
--radius: 0.5rem;
}
.dark {
--background: 222.2 84% 4.9%;
--foreground: 210 40% 98%;
--card: 222.2 84% 4.9%;
--card-foreground: 210 40% 98%;
--popover: 222.2 84% 4.9%;
--popover-foreground: 210 40% 98%;
--primary: 210 40% 98%;
--primary-foreground: 222.2 47.4% 11.2%;
--secondary: 217.2 32.6% 17.5%;
--secondary-foreground: 210 40% 98%;
--muted: 217.2 32.6% 17.5%;
--muted-foreground: 215 20.2% 65.1%;
--accent: 217.2 32.6% 17.5%;
--accent-foreground: 210 40% 98%;
--destructive: 0 62.8% 30.6%;
--destructive-foreground: 210 40% 98%;
--border: 217.2 32.6% 17.5%;
--input: 217.2 32.6% 17.5%;
--ring: 212.7 26.8% 83.9%;
}
}
@layer base {
* {
@apply border-border;
}
body {
@apply bg-background text-foreground;
}
}

View File

@@ -0,0 +1,34 @@
import type { Metadata } from "next";
import { Geist, Geist_Mono } from "next/font/google";
import "./globals.css";
// Self-hosted Google fonts exposed as CSS variables for use in styles below.
const geistSans = Geist({
variable: "--font-geist-sans",
subsets: ["latin"],
});
const geistMono = Geist_Mono({
variable: "--font-geist-mono",
subsets: ["latin"],
});
// Default <head> metadata for every page.
// NOTE(review): still the create-next-app boilerplate title/description —
// probably should be updated to the app's real branding.
export const metadata: Metadata = {
title: "Create Next App",
description: "Generated by create next app",
};
// Root layout: wraps every route with the font variables and global styles.
export default function RootLayout({
children,
}: Readonly<{
children: React.ReactNode;
}>) {
return (
<html lang="en">
<body
className={`${geistSans.variable} ${geistMono.variable} antialiased`}
>
{children}
</body>
</html>
);
}

26
frontend/src/app/page.tsx Normal file
View File

@@ -0,0 +1,26 @@
'use client'
import { ServerStatus } from "@/components/server-status"
import { SearchForm } from "@/components/search-form"
import { SearchResults } from "@/components/search-results"
import { useState } from "react"
// Landing page: a business-search form, the result list, and a service panel.
export default function Home() {
// Results handed up by SearchForm after a /api/search request.
const [searchResults, setSearchResults] = useState([])
// True while a search request is in flight (drives the loading skeleton).
const [isSearching, setIsSearching] = useState(false)
// NOTE(review): statuses are hard-coded to "running" — presumably a
// placeholder until a real health-check endpoint feeds this; confirm.
const services = [
{ name: "Ollama", status: "running" as const },
{ name: "SearxNG", status: "running" as const },
{ name: "Supabase", status: "running" as const }
]
return (
<main className="container mx-auto p-4">
<h1 className="text-4xl font-bold text-center mb-8">Business Search</h1>
<SearchForm onSearch={setSearchResults} onSearchingChange={setIsSearching} />
<SearchResults results={searchResults} isLoading={isSearching} />
<ServerStatus services={services} />
</main>
)
}

View File

@@ -0,0 +1,79 @@
import { Search } from "lucide-react"
import { useState } from "react"
interface SearchFormProps {
// Receives the result list from /api/search; called with [] on failure.
onSearch: (results: any[]) => void;
// Toggled true while the request is in flight, false when it settles.
onSearchingChange: (isSearching: boolean) => void;
}
// Controlled search input that POSTs the trimmed query to /api/search and
// hands the results (or an empty list plus an error message) to the parent.
export function SearchForm({ onSearch, onSearchingChange }: SearchFormProps) {
const [query, setQuery] = useState("")
// User-visible error text; cleared at the start of each new search.
const [error, setError] = useState<string | null>(null)
const handleSearch = async (e: React.FormEvent) => {
e.preventDefault()
// Ignore empty / whitespace-only submissions.
if (!query.trim()) return
setError(null)
onSearchingChange(true)
try {
const response = await fetch("/api/search", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({ query: query.trim() }),
})
if (!response.ok) {
throw new Error("Search failed")
}
const data = await response.json()
// NOTE(review): assumes the API responds with { results: [...] } —
// confirm against the backend search route.
onSearch(data.results || [])
} catch (error) {
console.error("Search error:", error)
// Clear any stale results and surface the failure to the user.
onSearch([])
setError("Failed to perform search. Please try again.")
} finally {
onSearchingChange(false)
}
}
return (
<div className="w-full max-w-2xl mx-auto mt-8 mb-12">
<div className="flex flex-col gap-4">
<div className="flex flex-col gap-2">
<label htmlFor="search" className="text-lg font-medium text-center">
Find local businesses
</label>
<form onSubmit={handleSearch} className="relative">
<input
id="search"
type="text"
value={query}
onChange={(e) => setQuery(e.target.value)}
placeholder="e.g. plumbers in Denver, CO"
className="w-full px-4 py-3 text-lg rounded-lg border border-border bg-background focus:outline-none focus:ring-2 focus:ring-primary"
/>
<button
type="submit"
disabled={!query.trim()}
className="absolute right-2 top-1/2 -translate-y-1/2 p-3 rounded-md bg-primary text-primary-foreground hover:bg-primary/90 transition-colors disabled:opacity-50 disabled:cursor-not-allowed"
aria-label="Search"
>
<Search className="h-5 w-5" />
</button>
</form>
{error && (
<p className="text-sm text-destructive text-center">{error}</p>
)}
<p className="text-sm text-muted-foreground text-center mt-2">
Try searching for: restaurants, dentists, electricians, etc.
</p>
</div>
</div>
</div>
)
}

View File

@@ -0,0 +1,76 @@
// Shape of one search hit as returned by the backend /api/search endpoint.
// NOTE(review): field semantics inferred from rendering below — confirm
// against the backend's actual response schema.
interface Business {
id: string;
name: string;
address: string;
phone: string;
website?: string;
email?: string;
description?: string;
rating?: number;
}
interface SearchResultsProps {
results: Business[];
isLoading: boolean;
}
// Renders the result list: a pulsing 3-card skeleton while loading, nothing
// when there are no results, otherwise one card per business.
export function SearchResults({ results, isLoading }: SearchResultsProps) {
if (isLoading) {
return (
<div className="w-full max-w-4xl mx-auto mt-8">
<div className="animate-pulse space-y-4">
{/* Three placeholder cards approximating the final layout. */}
{[...Array(3)].map((_, i) => (
<div key={i} className="bg-muted rounded-lg p-6">
<div className="h-4 bg-muted-foreground/20 rounded w-3/4 mb-4"></div>
<div className="h-3 bg-muted-foreground/20 rounded w-1/2"></div>
</div>
))}
</div>
</div>
)
}
// Render nothing (not an empty shell) when there are no results yet.
if (!results.length) {
return null
}
return (
<div className="w-full max-w-4xl mx-auto mt-8">
<div className="space-y-4">
{results.map((business) => (
<div key={business.id} className="bg-card rounded-lg p-6 shadow-sm">
<h3 className="text-xl font-semibold mb-2">{business.name}</h3>
{business.address && (
<p className="text-muted-foreground mb-2">{business.address}</p>
)}
<div className="flex flex-wrap gap-4 text-sm">
{business.phone && (
<a
href={`tel:${business.phone}`}
className="text-primary hover:underline"
>
{business.phone}
</a>
)}
{business.website && (
<a
href={business.website}
target="_blank"
rel="noopener noreferrer"
className="text-primary hover:underline"
>
Visit Website
</a>
)}
</div>
{business.description && (
<p className="mt-4 text-sm text-muted-foreground">
{business.description}
</p>
)}
</div>
))}
</div>
</div>
)
}

View File

@@ -0,0 +1,59 @@
import { CheckCircle2, XCircle, AlertCircle } from "lucide-react"
import { Alert, AlertDescription, AlertTitle } from "@/components/ui/alert"
interface ServiceStatus {
name: string
status: "running" | "error" | "warning"
}
interface ServerStatusProps {
services: ServiceStatus[]
// Optional top-level error; when set, replaces the whole service list.
error?: string
}
// Panel listing backend services with a per-status icon and color. If `error`
// is provided it short-circuits to a single destructive alert instead.
export function ServerStatus({ services, error }: ServerStatusProps) {
if (error) {
return (
<Alert variant="destructive" className="max-w-md mx-auto mt-4">
<XCircle className="h-4 w-4" />
<AlertTitle>Server Error</AlertTitle>
<AlertDescription>{error}</AlertDescription>
</Alert>
)
}
return (
<div className="space-y-4 max-w-md mx-auto mt-4">
<h2 className="text-xl font-semibold text-center mb-6">Service Status</h2>
<div className="space-y-3">
{services.map((service) => (
<Alert
key={service.name}
variant={service.status === "error" ? "destructive" : "default"}
className="flex items-center justify-between hover:bg-accent/50 transition-colors"
>
<div className="flex items-center gap-3">
{/* One icon per status; exactly one of these renders. */}
{service.status === "running" && (
<CheckCircle2 className="h-5 w-5 text-green-500 shrink-0" />
)}
{service.status === "error" && (
<XCircle className="h-5 w-5 text-red-500 shrink-0" />
)}
{service.status === "warning" && (
<AlertCircle className="h-5 w-5 text-yellow-500 shrink-0" />
)}
<AlertTitle className="font-medium">{service.name}</AlertTitle>
</div>
{/* Capitalized status label, colored to match the icon. */}
<span className={`text-sm ${
service.status === "running" ? "text-green-600" :
service.status === "error" ? "text-red-600" :
"text-yellow-600"
}`}>
{service.status.charAt(0).toUpperCase() + service.status.slice(1)}
</span>
</Alert>
))}
</div>
</div>
)
}

View File

@@ -0,0 +1,58 @@
import * as React from "react"
import { cva, type VariantProps } from "class-variance-authority"
import { cn } from "@/lib/utils"
// Variant classes for the Alert container; the [&>svg] selectors position a
// leading icon placed as a direct child (see ServerStatus usage).
const alertVariants = cva(
"relative w-full rounded-lg border p-4 [&>svg~*]:pl-7 [&>svg+div]:translate-y-[-3px] [&>svg]:absolute [&>svg]:left-4 [&>svg]:top-4 [&>svg]:text-foreground",
{
variants: {
variant: {
default: "bg-background text-foreground",
destructive:
"border-destructive/50 text-destructive dark:border-destructive [&>svg]:text-destructive",
},
},
defaultVariants: {
variant: "default",
},
}
)
// Container with role="alert" so screen readers announce it.
const Alert = React.forwardRef<
HTMLDivElement,
React.HTMLAttributes<HTMLDivElement> & VariantProps<typeof alertVariants>
>(({ className, variant, ...props }, ref) => (
<div
ref={ref}
role="alert"
className={cn(alertVariants({ variant }), className)}
{...props}
/>
))
Alert.displayName = "Alert"
// Alert heading, rendered as an <h5>.
// NOTE(review): ref generic is HTMLParagraphElement while the element is an
// <h5> (HTMLHeadingElement) — refs attached here get a misleading type.
const AlertTitle = React.forwardRef<
HTMLParagraphElement,
React.HTMLAttributes<HTMLHeadingElement>
>(({ className, ...props }, ref) => (
<h5
ref={ref}
className={cn("mb-1 font-medium leading-none tracking-tight", className)}
{...props}
/>
))
AlertTitle.displayName = "AlertTitle"
// Alert body text; styles nested <p> elements for relaxed line height.
const AlertDescription = React.forwardRef<
HTMLParagraphElement,
React.HTMLAttributes<HTMLParagraphElement>
>(({ className, ...props }, ref) => (
<div
ref={ref}
className={cn("text-sm [&_p]:leading-relaxed", className)}
{...props}
/>
))
AlertDescription.displayName = "AlertDescription"
export { Alert, AlertTitle, AlertDescription }

View File

@@ -0,0 +1,6 @@
import { type ClassValue, clsx } from "clsx"
import { twMerge } from "tailwind-merge"
/**
 * Combine arbitrary class-name inputs (strings, arrays, conditional objects)
 * and resolve conflicting Tailwind utilities so the last one wins.
 */
export function cn(...inputs: ClassValue[]) {
  const combined = clsx(...inputs)
  return twMerge(combined)
}

View File

@@ -0,0 +1,79 @@
import type { Config } from "tailwindcss";
// Tailwind theme: semantic color tokens resolve to HSL CSS variables (declared
// in globals.css), so light/dark palettes swap without touching this file.
const config: Config = {
darkMode: ["class"],
content: [
'./pages/**/*.{ts,tsx}',
'./components/**/*.{ts,tsx}',
'./app/**/*.{ts,tsx}',
'./src/**/*.{ts,tsx}',
],
theme: {
container: {
center: true,
padding: "2rem",
screens: {
"2xl": "1400px",
},
},
extend: {
// Each color reads an HSL triple from a CSS custom property.
colors: {
border: "hsl(var(--border))",
input: "hsl(var(--input))",
ring: "hsl(var(--ring))",
background: "hsl(var(--background))",
foreground: "hsl(var(--foreground))",
primary: {
DEFAULT: "hsl(var(--primary))",
foreground: "hsl(var(--primary-foreground))",
},
secondary: {
DEFAULT: "hsl(var(--secondary))",
foreground: "hsl(var(--secondary-foreground))",
},
destructive: {
DEFAULT: "hsl(var(--destructive))",
foreground: "hsl(var(--destructive-foreground))",
},
muted: {
DEFAULT: "hsl(var(--muted))",
foreground: "hsl(var(--muted-foreground))",
},
accent: {
DEFAULT: "hsl(var(--accent))",
foreground: "hsl(var(--accent-foreground))",
},
popover: {
DEFAULT: "hsl(var(--popover))",
foreground: "hsl(var(--popover-foreground))",
},
card: {
DEFAULT: "hsl(var(--card))",
foreground: "hsl(var(--card-foreground))",
},
},
// Radii derive from the single --radius variable for consistent rounding.
borderRadius: {
lg: "var(--radius)",
md: "calc(var(--radius) - 2px)",
sm: "calc(var(--radius) - 4px)",
},
// Accordion keyframes consumed via the tailwindcss-animate plugin below.
keyframes: {
"accordion-down": {
from: { height: "0" },
to: { height: "var(--radix-accordion-content-height)" },
},
"accordion-up": {
from: { height: "var(--radix-accordion-content-height)" },
to: { height: "0" },
},
},
animation: {
"accordion-down": "accordion-down 0.2s ease-out",
"accordion-up": "accordion-up 0.2s ease-out",
},
},
},
plugins: [require("tailwindcss-animate")],
}
export default config;

27
frontend/tsconfig.json Normal file
View File

@@ -0,0 +1,27 @@
{
"compilerOptions": {
"target": "ES2017",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "bundler",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"plugins": [
{
"name": "next"
}
],
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"exclude": ["node_modules"]
}

17
jest.config.js Normal file
View File

@@ -0,0 +1,17 @@
// Jest configuration for the backend TypeScript sources (compiled by ts-jest).
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['<rootDir>/src'],
// Picks up __tests__ directories and *.spec.ts / *.test.ts files.
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
transform: {
'^.+\\.ts$': 'ts-jest',
},
moduleFileExtensions: ['ts', 'js', 'json', 'node'],
// Coverage spans src/, excluding the manual test scripts and node_modules.
collectCoverageFrom: [
'src/**/*.{ts,js}',
'!src/tests/**',
'!**/node_modules/**',
],
coverageDirectory: 'coverage',
// Shared setup, run after the test framework is installed in each worker.
setupFilesAfterEnv: ['<rootDir>/src/tests/setup.ts'],
};

14318
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,37 +1,80 @@
{
"name": "perplexica-backend",
"version": "1.4.0",
"version": "1.10.0-rc2",
"license": "MIT",
"author": "ItzCrazyKns",
"scripts": {
"start": "node dist/app.js",
"start": "ts-node src/index.ts",
"build": "tsc",
"dev": "nodemon src/app.ts" ,
"dev": "nodemon src/index.ts",
"db:push": "drizzle-kit push sqlite",
"format": "prettier . --check",
"format:write": "prettier . --write"
"format:write": "prettier . --write",
"test:search": "ts-node src/tests/testSearch.ts",
"test:supabase": "ts-node src/tests/supabaseTest.ts",
"test:deepseek": "ts-node src/tests/testDeepseek.ts",
"test:ollama": "ts-node src/tests/testOllama.ts",
"test": "jest",
"test:watch": "jest --watch",
"test:coverage": "jest --coverage",
"build:css": "tailwindcss -i ./src/styles/input.css -o ./public/styles/output.css",
"watch:css": "tailwindcss -i ./src/styles/input.css -o ./public/styles/output.css --watch"
},
"devDependencies": {
"@testing-library/jest-dom": "^6.1.5",
"@types/better-sqlite3": "^7.6.10",
"@types/cors": "^2.8.17",
"@types/express": "^4.17.21",
"@types/html-to-text": "^9.0.4",
"@types/jest": "^29.5.11",
"@types/multer": "^1.4.12",
"@types/node-fetch": "^2.6.12",
"@types/pdf-parse": "^1.1.4",
"@types/readable-stream": "^4.0.11",
"@types/supertest": "^6.0.2",
"@types/ws": "^8.5.12",
"autoprefixer": "^10.4.20",
"drizzle-kit": "^0.22.7",
"jest": "^29.7.0",
"nodemon": "^3.1.0",
"postcss": "^8.4.49",
"prettier": "^3.2.5",
"supertest": "^7.0.0",
"tailwindcss": "^3.4.17",
"ts-jest": "^29.1.1",
"ts-node": "^10.9.2",
"typescript": "^5.4.3"
},
"dependencies": {
"@huggingface/transformers": "latest",
"@iarna/toml": "^2.2.5",
"@langchain/anthropic": "^0.2.3",
"@langchain/community": "^0.2.16",
"@langchain/google-genai": "^0.0.23",
"@langchain/openai": "^0.0.25",
"@shadcn/ui": "^0.0.4",
"@supabase/supabase-js": "^2.47.10",
"@xenova/transformers": "^2.17.1",
"axios": "^1.6.8",
"better-sqlite3": "^11.7.0",
"cheerio": "^1.0.0",
"compute-cosine-similarity": "^1.1.0",
"compute-dot": "^1.1.0",
"cors": "^2.8.5",
"dotenv": "^16.4.5",
"dotenv": "^16.4.7",
"drizzle-orm": "^0.31.2",
"express": "^4.19.2",
"html-to-text": "^9.0.5",
"langchain": "^0.1.30",
"mammoth": "^1.8.0",
"multer": "^1.4.5-lts.1",
"node-fetch": "^2.7.0",
"pdf-parse": "^1.1.1",
"robots-parser": "^3.0.1",
"tesseract.js": "^4.1.4",
"torch": "latest",
"winston": "^3.13.0",
"ws": "^8.16.0",
"zod": "^3.22.4"
"ws": "^8.17.1",
"zod": "^3.24.1"
}
}

6
postcss.config.js Normal file
View File

@@ -0,0 +1,6 @@
// PostCSS config for the backend-served Tailwind build (see the build:css /
// watch:css npm scripts); includes autoprefixer unlike the frontend config.
module.exports = {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
}

214
public/index.html Normal file
View File

@@ -0,0 +1,214 @@
<!DOCTYPE html>
<html lang="en" class="h-full bg-gray-50">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OffMarket Pro - Business Search</title>
<link href="/styles/output.css" rel="stylesheet">
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body class="min-h-full">
<div class="bg-white">
<!-- Navigation -->
<nav class="bg-white shadow-sm">
<div class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8">
<div class="flex h-16 justify-between items-center">
<div class="flex-shrink-0 flex items-center">
<h1 class="text-xl font-bold text-gray-900">OffMarket Pro</h1>
</div>
</div>
</div>
</nav>
<!-- Main Content -->
<main class="mx-auto max-w-7xl px-4 sm:px-6 lg:px-8 py-8">
<!-- Search Form -->
<div class="mb-8">
<h2 class="text-2xl font-bold text-gray-900 mb-6">Find Off-Market Property Services</h2>
<div class="grid grid-cols-1 gap-4 sm:grid-cols-2">
<div>
<label for="searchQuery" class="block text-sm font-medium text-gray-700">Service Type</label>
<input type="text" id="searchQuery" class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-primary focus:ring-primary sm:text-sm" placeholder="e.g. plumber, electrician">
</div>
<div>
<label for="searchLocation" class="block text-sm font-medium text-gray-700">Location</label>
<input type="text" id="searchLocation" class="mt-1 block w-full rounded-md border-gray-300 shadow-sm focus:border-primary focus:ring-primary sm:text-sm" placeholder="e.g. Denver, CO">
</div>
</div>
<div class="mt-4">
<button onclick="performSearch()" class="inline-flex items-center px-4 py-2 border border-transparent text-sm font-medium rounded-md shadow-sm text-white bg-primary hover:bg-primary-hover focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-primary">
Search
</button>
</div>
</div>
<!-- Progress Indicator -->
<div id="searchProgress" class="hidden mb-8">
<div class="bg-white shadow sm:rounded-lg">
<div class="px-4 py-5 sm:p-6">
<h3 class="text-lg font-medium leading-6 text-gray-900">Search Progress</h3>
<div class="mt-4">
<div class="relative pt-1">
<div class="overflow-hidden h-2 mb-4 text-xs flex rounded bg-gray-200">
<div id="progressBar" class="shadow-none flex flex-col text-center whitespace-nowrap text-white justify-center bg-primary transition-all duration-500" style="width: 0%"></div>
</div>
<div id="progressText" class="text-sm text-gray-600"></div>
</div>
</div>
</div>
</div>
</div>
<!-- Error Display -->
<div id="errorDisplay" class="hidden mb-8">
<div class="rounded-md bg-red-50 p-4">
<div class="flex">
<div class="flex-shrink-0">
<svg class="h-5 w-5 text-red-400" viewBox="0 0 20 20" fill="currentColor">
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zM8.707 7.293a1 1 0 00-1.414 1.414L8.586 10l-1.293 1.293a1 1 0 101.414 1.414L10 11.414l1.293 1.293a1 1 0 001.414-1.414L11.414 10l1.293-1.293a1 1 0 00-1.414-1.414L10 8.586 8.707 7.293z" clip-rule="evenodd"/>
</svg>
</div>
<div class="ml-3">
<h3 class="text-sm font-medium text-red-800">Error</h3>
<div class="mt-2 text-sm text-red-700">
<p id="errorMessage"></p>
</div>
</div>
</div>
</div>
</div>
<!-- Results Table -->
<div id="resultsContainer" class="hidden">
<div class="bg-white shadow overflow-hidden sm:rounded-lg">
<div class="px-4 py-5 sm:px-6">
<h3 class="text-lg leading-6 font-medium text-gray-900">Search Results</h3>
</div>
<div class="border-t border-gray-200">
<div class="overflow-x-auto">
<table class="min-w-full divide-y divide-gray-200">
<thead class="bg-gray-50">
<tr>
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Business</th>
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Contact</th>
<th scope="col" class="px-6 py-3 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">Actions</th>
</tr>
</thead>
<tbody id="resultsBody" class="bg-white divide-y divide-gray-200">
<!-- Results will be inserted here -->
</tbody>
</table>
</div>
</div>
</div>
</div>
</main>
</div>
<script>
// Drives the "Search Progress" panel above: a percentage bar plus status text.
class SearchProgress {
constructor() {
this.progressBar = document.getElementById('progressBar');
this.progressText = document.getElementById('progressText');
this.container = document.getElementById('searchProgress');
}
// Reveal the panel and reset the bar to 0%.
show() {
this.container.classList.remove('hidden');
this.setProgress(0, 'Starting search...');
}
// Hide the whole panel (used once results are rendered).
hide() {
this.container.classList.add('hidden');
}
// Set bar width (0-100) and the status message beneath it.
setProgress(percent, message) {
this.progressBar.style.width = `${percent}%`;
this.progressText.textContent = message;
}
// Fill the bar, recolor it red, and show the error message in place.
showError(message) {
this.setProgress(100, `Error: ${message}`);
this.progressBar.classList.remove('bg-primary');
this.progressBar.classList.add('bg-red-500');
}
}
// Read the query/location inputs, POST them to /api/search, and render the
// results. Progress and failures are surfaced via SearchProgress + showError.
async function performSearch() {
const query = document.getElementById('searchQuery').value;
const location = document.getElementById('searchLocation').value;
if (!query || !location) {
showError('Please enter both search query and location');
return;
}
const progress = new SearchProgress();
progress.show();
try {
// Reset any leftovers from a previous search.
document.getElementById('errorDisplay').classList.add('hidden');
document.getElementById('resultsContainer').classList.add('hidden');
const response = await fetch('/api/search', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({ query, location })
});
// NOTE(review): assumes the API responds { success, results, error? } —
// confirm against the backend search route.
const data = await response.json();
if (!data.success) {
throw new Error(data.error || 'Search failed');
}
displayResults(data.results);
progress.hide();
} catch (error) {
console.error('Search error:', error);
// Show the failure both in the progress bar and the error banner.
progress.showError(error.message);
showError(error.message);
}
}
// Put `message` into the error banner and make the banner visible.
function showError(message) {
  const banner = document.getElementById('errorDisplay');
  document.getElementById('errorMessage').textContent = message;
  banner.classList.remove('hidden');
}
// Render search results into the results table.
// Every API-provided field is HTML-escaped before being interpolated into
// innerHTML so a malicious listing (name, description, website URL, ...)
// cannot inject markup or script (XSS).
function displayResults(results) {
  const container = document.getElementById('resultsContainer');
  const tbody = document.getElementById('resultsBody');
  // Minimal HTML entity escaper; also tolerates null/undefined fields.
  const esc = (value) => String(value ?? '')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
  tbody.innerHTML = results.map(business => `
    <tr>
      <td class="px-6 py-4">
        <div class="text-sm font-medium text-gray-900">${esc(business.name)}</div>
        <div class="text-sm text-gray-500">${esc(business.description)}</div>
      </td>
      <td class="px-6 py-4">
        <div class="text-sm text-gray-900">${esc(business.address)}</div>
        <div class="text-sm text-gray-500">${esc(business.phone)}</div>
      </td>
      <td class="px-6 py-4">
        ${business.website ?
          `<a href="${esc(business.website)}" target="_blank" rel="noopener noreferrer"
            class="inline-flex items-center px-3 py-2 border border-transparent text-sm leading-4 font-medium rounded-md text-white bg-primary hover:bg-primary-hover focus:outline-none focus:ring-2 focus:ring-offset-2 focus:ring-primary">
            Visit Website
          </a>` :
          '<span class="text-sm text-gray-500">No website available</span>'
        }
      </td>
    </tr>
  `).join('');
  container.classList.remove('hidden');
}
</script>
</body>
</html>

View File

@@ -1,10 +1,13 @@
[GENERAL]
PORT = 3001 # Port to run the server on
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded in memory (use "-1m" rather than -1 to keep them loaded indefinitely)
[API_KEYS]
OPENAI = "" # OpenAI API key - sk-1234567890abcdef1234567890abcdef
GROQ = "" # Groq API key - gsk_1234567890abcdef1234567890abcdef
ANTHROPIC = "" # Anthropic API key - sk-ant-1234567890abcdef1234567890abcdef
GEMINI = "" # Gemini API key - sk-1234567890abcdef1234567890abcdef
[API_ENDPOINTS]
SEARXNG = "http://localhost:32768" # SearxNG API URL

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +0,0 @@
FROM searxng/searxng
COPY searxng-settings.yml /etc/searxng/settings.yml

3
searxng/limiter.toml Normal file
View File

@@ -0,0 +1,3 @@
[botdetection.ip_limit]
# enable the link_token method within the ip_limit bot-detection plugin
link_token = true

59
searxng/settings.yml Normal file
View File

@@ -0,0 +1,59 @@
use_default_settings: true
general:
instance_name: 'searxng'
search:
autocomplete: 'google'
formats:
- html
- json
server:
secret_key: 'a2fb23f1b02e6ee83875b09826990de0f6bd908b6638e8c10277d415f6ab852b' # Is overwritten by ${SEARXNG_SECRET}
port: 8080
bind_address: "0.0.0.0"
base_url: http://localhost:8080/
engines:
- name: wolframalpha
disabled: false
- name: google
engine: google
shortcut: g
disabled: false
- name: bing
engine: bing
shortcut: b
disabled: false
- name: duckduckgo
engine: duckduckgo
shortcut: d
disabled: false
- name: yelp
engine: yelp
shortcut: y
disabled: false
ui:
static_path: ""
templates_path: ""
default_theme: simple
default_locale: en
results_on_new_tab: false
outgoing:
request_timeout: 6.0
max_request_timeout: 10.0
pool_connections: 100
pool_maxsize: 10
enable_http2: true
server:
limiter: false
image_proxy: false
http_protocol_version: "1.0"

50
searxng/uwsgi.ini Normal file
View File

@@ -0,0 +1,50 @@
[uwsgi]
# Who will run the code
uid = searxng
gid = searxng
# Number of workers (usually CPU count)
# default value: %k (= number of CPU core, see Dockerfile)
workers = %k
# Number of threads per worker
# default value: 4 (see Dockerfile)
threads = 4
# The right granted on the created socket
chmod-socket = 666
# Plugin to use and interpreter config
single-interpreter = true
master = true
plugin = python3
lazy-apps = true
enable-threads = 4
# Module to import
module = searx.webapp
# Virtualenv and python path
pythonpath = /usr/local/searxng/
chdir = /usr/local/searxng/searx/
# automatically set processes name to something meaningful
auto-procname = true
# Disable request logging for privacy
disable-logging = true
log-5xx = true
# Set the max size of a request (request-body excluded)
buffer-size = 8192
# No keep alive
# See https://github.com/searx/searx-docker/issues/24
add-header = Connection: close
# uwsgi serves the static files
static-map = /static=/usr/local/searxng/searx/static
# expires set to one day
static-expires = /* 86400
static-gzip-all = True
offload-threads = 4

View File

@@ -1,265 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import computeSimilarity from '../utils/computeSimilarity';
import logger from '../utils/logger';
// Prompt for rewriting a follow-up question into a standalone search query.
// The model must reply with the literal token `not_needed` for greetings or
// writing tasks so the chain can skip the search step.
const basicAcademicSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
Example:
1. Follow up question: How does stable diffusion work?
Rephrased: Stable diffusion working
2. Follow up question: What is linear algebra?
Rephrased: Linear algebra
3. Follow up question: What is the third law of thermodynamics?
Rephrased: Third law of thermodynamics
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
// System prompt for the answer-generation step (focus mode 'Academic').
// NOTE(review): the embedded ${new Date().toISOString()} is evaluated once at
// module load, so "today's date" goes stale in a long-running process —
// confirm whether that is intended.
const basicAcademicSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Academic', this means you will be searching for academic papers and articles on the web.
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
</context>
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from a search engine and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
// Shared parser that flattens LLM chat output into a plain string.
const strParser = new StringOutputParser();
/**
 * Forward LangChain stream events to the client-facing emitter as JSON
 * payloads: the final set of source documents, incremental response chunks,
 * and a completion signal. Failures raised while consuming the stream are
 * surfaced as an 'error' event instead of rejecting the promise — callers
 * invoke this fire-and-forget, so an uncaught rejection would otherwise go
 * unhandled.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  try {
    for await (const event of stream) {
      // Final, reranked set of source documents.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalSourceRetriever'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'sources', data: event.data.output }),
        );
      }
      // Incremental chunk of the generated answer.
      if (
        event.event === 'on_chain_stream' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
        );
      }
      // Answer generation finished.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit('end');
      }
    }
  } catch {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
  }
};

// Input shape fed to both the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};
/**
 * Build the retriever chain: rephrase the follow-up question into a
 * standalone query, run it against academic engines via SearxNG, and wrap
 * each hit in a LangChain Document. Yields `{ query: '', docs: [] }` when
 * the LLM answers `not_needed`.
 */
const createBasicAcademicSearchRetrieverChain = (llm: BaseChatModel) => {
  const runSearch = async (standaloneQuery: string) => {
    if (standaloneQuery === 'not_needed') {
      return { query: '', docs: [] };
    }
    const res = await searchSearxng(standaloneQuery, {
      language: 'en',
      engines: [
        'arxiv',
        'google scholar',
        'internetarchivescholar',
        'pubmed',
      ],
    });
    const docs = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );
    return { query: standaloneQuery, docs };
  };

  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicAcademicSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(runSearch),
  ]);
};
/**
 * Build the full answering chain: retrieve academic documents, rerank them by
 * embedding similarity to the query, then generate a cited answer with the
 * LLM. The run names 'FinalSourceRetriever' / 'FinalResponseGenerator' are
 * matched by handleStream to route events to the client.
 */
const createBasicAcademicSearchAnsweringChain = (
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const basicAcademicSearchRetrieverChain =
    createBasicAcademicSearchRetrieverChain(llm);
  // Format docs as a numbered list; the numbers are the [n] citation indices
  // the response prompt tells the model to use.
  const processDocs = async (docs: Document[]) => {
    return docs
      .map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
      .join('\n');
  };
  // Rerank by similarity between the query embedding and each doc embedding,
  // keeping the top 15. NOTE(review): unlike the Reddit/web agents in this
  // codebase, no minimum similarity threshold is applied here — confirm that
  // is intentional.
  const rerankDocs = async ({
    query,
    docs,
  }: {
    query: string;
    docs: Document[];
  }) => {
    if (docs.length === 0) {
      return docs;
    }
    // Docs with empty pageContent cannot be embedded meaningfully.
    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );
    const [docEmbeddings, queryEmbedding] = await Promise.all([
      embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
      embeddings.embedQuery(query),
    ]);
    const similarity = docEmbeddings.map((docEmbedding, i) => {
      const sim = computeSimilarity(queryEmbedding, docEmbedding);
      return {
        index: i,
        similarity: sim,
      };
    });
    const sortedDocs = similarity
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 15)
      .map((sim) => docsWithContent[sim.index]);
    return sortedDocs;
  };
  return RunnableSequence.from([
    RunnableMap.from({
      query: (input: BasicChainInput) => input.query,
      chat_history: (input: BasicChainInput) => input.chat_history,
      // The context branch runs retrieval + rerank + formatting; its
      // intermediate output is surfaced to the client as 'sources' through
      // the FinalSourceRetriever run name.
      context: RunnableSequence.from([
        (input) => ({
          query: input.query,
          chat_history: formatChatHistoryAsString(input.chat_history),
        }),
        basicAcademicSearchRetrieverChain
          .pipe(rerankDocs)
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          .pipe(processDocs),
      ]),
    }),
    ChatPromptTemplate.fromMessages([
      ['system', basicAcademicSearchResponsePrompt],
      new MessagesPlaceholder('chat_history'),
      ['user', '{query}'],
    ]),
    llm,
    strParser,
  ]).withConfig({
    runName: 'FinalResponseGenerator',
  });
};
/**
 * Run an academic-focused search for `query` given the chat history.
 * Returns an emitter that fires 'data' (JSON payloads), 'end', and 'error'.
 */
const basicAcademicSearch = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const emitter = new eventEmitter();

  // Defer the 'error' emission to the next loop turn so callers that attach
  // their listeners after this function returns do not miss the event
  // (the original synchronous emit fired before any listener existed).
  const emitError = (err: unknown) => {
    setImmediate(() => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
    });
    logger.error(`Error in academic search: ${err}`);
  };

  try {
    const basicAcademicSearchAnsweringChain =
      createBasicAcademicSearchAnsweringChain(llm, embeddings);
    const stream = basicAcademicSearchAnsweringChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is fire-and-forget; route async rejections to 'error'
    // instead of leaving an unhandled promise rejection.
    handleStream(stream, emitter).catch(emitError);
  } catch (err) {
    emitError(err);
  }
  return emitter;
};
/**
 * Public entry point for the 'Academic' focus mode; a thin alias over
 * basicAcademicSearch that preserves its (message, history, llm, embeddings)
 * signature and emitter-based result.
 */
const handleAcademicSearch = (
  message: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => basicAcademicSearch(message, history, llm, embeddings);
export default handleAcademicSearch;

View File

@@ -1,260 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import computeSimilarity from '../utils/computeSimilarity';
import logger from '../utils/logger';
// Prompt for rewriting a follow-up question into a standalone search query.
// The model must reply with the literal token `not_needed` for greetings or
// writing tasks so the chain can skip the search step.
const basicRedditSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
Example:
1. Follow up question: Which company is most likely to create an AGI
Rephrased: Which company is most likely to create an AGI
2. Follow up question: Is Earth flat?
Rephrased: Is Earth flat?
3. Follow up question: Is there life on Mars?
Rephrased: Is there life on Mars?
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
// System prompt for the answer-generation step (focus mode 'Reddit').
// NOTE(review): the embedded ${new Date().toISOString()} is evaluated once at
// module load, so "today's date" goes stale in a long-running process —
// confirm whether that is intended.
const basicRedditSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Reddit', this means you will be searching for information, opinions and discussions on the web using Reddit.
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Reddit and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
</context>
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from Reddit and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
// Shared parser that flattens LLM chat output into a plain string.
const strParser = new StringOutputParser();
/**
 * Forward LangChain stream events to the client-facing emitter as JSON
 * payloads: the final set of source documents, incremental response chunks,
 * and a completion signal. Failures raised while consuming the stream are
 * surfaced as an 'error' event instead of rejecting the promise — callers
 * invoke this fire-and-forget, so an uncaught rejection would otherwise go
 * unhandled.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  try {
    for await (const event of stream) {
      // Final, reranked set of source documents.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalSourceRetriever'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'sources', data: event.data.output }),
        );
      }
      // Incremental chunk of the generated answer.
      if (
        event.event === 'on_chain_stream' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
        );
      }
      // Answer generation finished.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit('end');
      }
    }
  } catch {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
  }
};

// Input shape fed to both the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};
/**
 * Build the retriever chain for Reddit search: rephrase the follow-up
 * question into a standalone query, search the 'reddit' engine via SearxNG,
 * and wrap each hit in a LangChain Document (falling back to the title when
 * a result has no content). Yields `{ query: '', docs: [] }` when the LLM
 * answers `not_needed`.
 */
const createBasicRedditSearchRetrieverChain = (llm: BaseChatModel) => {
  const runSearch = async (standaloneQuery: string) => {
    if (standaloneQuery === 'not_needed') {
      return { query: '', docs: [] };
    }
    const res = await searchSearxng(standaloneQuery, {
      language: 'en',
      engines: ['reddit'],
    });
    const docs = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content ? result.content : result.title,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );
    return { query: standaloneQuery, docs };
  };

  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicRedditSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(runSearch),
  ]);
};
/**
 * Build the full answering chain: retrieve Reddit documents, rerank them by
 * embedding similarity to the query, then generate a cited answer with the
 * LLM. The run names 'FinalSourceRetriever' / 'FinalResponseGenerator' are
 * matched by handleStream to route events to the client.
 */
const createBasicRedditSearchAnsweringChain = (
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const basicRedditSearchRetrieverChain =
    createBasicRedditSearchRetrieverChain(llm);
  // Format docs as a numbered list; the numbers are the [n] citation indices
  // the response prompt tells the model to use.
  const processDocs = async (docs: Document[]) => {
    return docs
      .map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
      .join('\n');
  };
  // Rerank by similarity to the query, keep at most 15 docs, and drop any
  // below the 0.3 similarity threshold. (Because the list is sorted in
  // descending order, slicing before filtering selects the same set as
  // filtering before slicing.)
  const rerankDocs = async ({
    query,
    docs,
  }: {
    query: string;
    docs: Document[];
  }) => {
    if (docs.length === 0) {
      return docs;
    }
    // Docs with empty pageContent cannot be embedded meaningfully.
    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );
    const [docEmbeddings, queryEmbedding] = await Promise.all([
      embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
      embeddings.embedQuery(query),
    ]);
    const similarity = docEmbeddings.map((docEmbedding, i) => {
      const sim = computeSimilarity(queryEmbedding, docEmbedding);
      return {
        index: i,
        similarity: sim,
      };
    });
    const sortedDocs = similarity
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 15)
      .filter((sim) => sim.similarity > 0.3)
      .map((sim) => docsWithContent[sim.index]);
    return sortedDocs;
  };
  return RunnableSequence.from([
    RunnableMap.from({
      query: (input: BasicChainInput) => input.query,
      chat_history: (input: BasicChainInput) => input.chat_history,
      // The context branch runs retrieval + rerank + formatting; its
      // intermediate output is surfaced to the client as 'sources' through
      // the FinalSourceRetriever run name.
      context: RunnableSequence.from([
        (input) => ({
          query: input.query,
          chat_history: formatChatHistoryAsString(input.chat_history),
        }),
        basicRedditSearchRetrieverChain
          .pipe(rerankDocs)
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          .pipe(processDocs),
      ]),
    }),
    ChatPromptTemplate.fromMessages([
      ['system', basicRedditSearchResponsePrompt],
      new MessagesPlaceholder('chat_history'),
      ['user', '{query}'],
    ]),
    llm,
    strParser,
  ]).withConfig({
    runName: 'FinalResponseGenerator',
  });
};
/**
 * Run a Reddit-focused search for `query` given the chat history.
 * Returns an emitter that fires 'data' (JSON payloads), 'end', and 'error'.
 */
const basicRedditSearch = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const emitter = new eventEmitter();

  // Defer the 'error' emission to the next loop turn so callers that attach
  // their listeners after this function returns do not miss the event
  // (the original synchronous emit fired before any listener existed).
  const emitError = (err: unknown) => {
    setImmediate(() => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
    });
    logger.error(`Error in RedditSearch: ${err}`);
  };

  try {
    const basicRedditSearchAnsweringChain =
      createBasicRedditSearchAnsweringChain(llm, embeddings);
    const stream = basicRedditSearchAnsweringChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is fire-and-forget; route async rejections to 'error'
    // instead of leaving an unhandled promise rejection.
    handleStream(stream, emitter).catch(emitError);
  } catch (err) {
    emitError(err);
  }
  return emitter;
};
/**
 * Public entry point for the 'Reddit' focus mode; a thin alias over
 * basicRedditSearch that preserves its (message, history, llm, embeddings)
 * signature and emitter-based result.
 */
const handleRedditSearch = (
  message: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => basicRedditSearch(message, history, llm, embeddings);
export default handleRedditSearch;

View File

@@ -1,261 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import computeSimilarity from '../utils/computeSimilarity';
import logger from '../utils/logger';
// Prompt for rewriting a follow-up question into a standalone search query.
// The model must reply with the literal token `not_needed` for greetings or
// writing tasks so the chain can skip the search step.
const basicSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
Example:
1. Follow up question: What is the capital of France?
Rephrased: Capital of france
2. Follow up question: What is the population of New York City?
Rephrased: Population of New York City
3. Follow up question: What is Docker?
Rephrased: What is Docker
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
// System prompt for the answer-generation step (general web search).
// NOTE(review): the embedded ${new Date().toISOString()} is evaluated once at
// module load, so "today's date" goes stale in a long-running process —
// confirm whether that is intended.
const basicWebSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries.
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
</context>
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from a search engine and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
// Shared parser that flattens LLM chat output into a plain string.
const strParser = new StringOutputParser();
/**
 * Forward LangChain stream events to the client-facing emitter as JSON
 * payloads: the final set of source documents, incremental response chunks,
 * and a completion signal. Failures raised while consuming the stream are
 * surfaced as an 'error' event instead of rejecting the promise — callers
 * invoke this fire-and-forget, so an uncaught rejection would otherwise go
 * unhandled.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  try {
    for await (const event of stream) {
      // Final, reranked set of source documents.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalSourceRetriever'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'sources', data: event.data.output }),
        );
      }
      // Incremental chunk of the generated answer.
      if (
        event.event === 'on_chain_stream' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
        );
      }
      // Answer generation finished.
      if (
        event.event === 'on_chain_end' &&
        event.name === 'FinalResponseGenerator'
      ) {
        emitter.emit('end');
      }
    }
  } catch {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
  }
};

// Input shape fed to both the retriever and answering chains.
type BasicChainInput = {
  chat_history: BaseMessage[];
  query: string;
};
/**
 * Build the retriever chain for general web search: rephrase the follow-up
 * question into a standalone query, run it through SearxNG, and wrap every
 * hit in a LangChain Document. Yields `{ query: '', docs: [] }` when the
 * LLM answers `not_needed`.
 */
const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
  const runSearch = async (standaloneQuery: string) => {
    if (standaloneQuery === 'not_needed') {
      return { query: '', docs: [] };
    }
    const res = await searchSearxng(standaloneQuery, {
      language: 'en',
    });
    const docs = res.results.map(
      (result) =>
        new Document({
          pageContent: result.content,
          metadata: {
            title: result.title,
            url: result.url,
            ...(result.img_src && { img_src: result.img_src }),
          },
        }),
    );
    return { query: standaloneQuery, docs };
  };

  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(runSearch),
  ]);
};
/**
 * Build the full answering chain: retrieve web documents, rerank them by
 * embedding similarity to the query, then generate a cited answer with the
 * LLM. The run names 'FinalSourceRetriever' / 'FinalResponseGenerator' are
 * matched by handleStream to route events to the client.
 */
const createBasicWebSearchAnsweringChain = (
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const basicWebSearchRetrieverChain = createBasicWebSearchRetrieverChain(llm);
  // Format docs as a numbered list; the numbers are the [n] citation indices
  // the response prompt tells the model to use.
  const processDocs = async (docs: Document[]) => {
    return docs
      .map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
      .join('\n');
  };
  // Rerank by similarity to the query: drop docs at or below the 0.5
  // similarity threshold, then keep at most the top 15.
  const rerankDocs = async ({
    query,
    docs,
  }: {
    query: string;
    docs: Document[];
  }) => {
    if (docs.length === 0) {
      return docs;
    }
    // Docs with empty pageContent cannot be embedded meaningfully.
    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );
    const [docEmbeddings, queryEmbedding] = await Promise.all([
      embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
      embeddings.embedQuery(query),
    ]);
    const similarity = docEmbeddings.map((docEmbedding, i) => {
      const sim = computeSimilarity(queryEmbedding, docEmbedding);
      return {
        index: i,
        similarity: sim,
      };
    });
    const sortedDocs = similarity
      .sort((a, b) => b.similarity - a.similarity)
      .filter((sim) => sim.similarity > 0.5)
      .slice(0, 15)
      .map((sim) => docsWithContent[sim.index]);
    return sortedDocs;
  };
  return RunnableSequence.from([
    RunnableMap.from({
      query: (input: BasicChainInput) => input.query,
      chat_history: (input: BasicChainInput) => input.chat_history,
      // The context branch runs retrieval + rerank + formatting; its
      // intermediate output is surfaced to the client as 'sources' through
      // the FinalSourceRetriever run name.
      context: RunnableSequence.from([
        (input) => ({
          query: input.query,
          chat_history: formatChatHistoryAsString(input.chat_history),
        }),
        basicWebSearchRetrieverChain
          .pipe(rerankDocs)
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          .pipe(processDocs),
      ]),
    }),
    ChatPromptTemplate.fromMessages([
      ['system', basicWebSearchResponsePrompt],
      new MessagesPlaceholder('chat_history'),
      ['user', '{query}'],
    ]),
    llm,
    strParser,
  ]).withConfig({
    runName: 'FinalResponseGenerator',
  });
};
/**
 * Run a general web search for `query` given the chat history.
 * Returns an emitter that fires 'data' (JSON payloads), 'end', and 'error'.
 */
const basicWebSearch = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const emitter = new eventEmitter();

  // Defer the 'error' emission to the next loop turn so callers that attach
  // their listeners after this function returns do not miss the event
  // (the original synchronous emit fired before any listener existed).
  const emitError = (err: unknown) => {
    setImmediate(() => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
    });
    logger.error(`Error in websearch: ${err}`);
  };

  try {
    const basicWebSearchAnsweringChain = createBasicWebSearchAnsweringChain(
      llm,
      embeddings,
    );
    const stream = basicWebSearchAnsweringChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is fire-and-forget; route async rejections to 'error'
    // instead of leaving an unhandled promise rejection.
    handleStream(stream, emitter).catch(emitError);
  } catch (err) {
    emitError(err);
  }
  return emitter;
};
/**
 * Public entry point for the web-search focus mode; a thin alias over
 * basicWebSearch that preserves its (message, history, llm, embeddings)
 * signature and emitter-based result.
 */
const handleWebSearch = (
  message: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => basicWebSearch(message, history, llm, embeddings);
export default handleWebSearch;

View File

@@ -1,219 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import logger from '../utils/logger';
// Prompt for rewriting a follow-up question into a standalone search query.
// The model must reply with the literal token `not_needed` for greetings or
// writing tasks so the chain can skip the search step.
const basicWolframAlphaSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
Example:
1. Follow up question: What is the atomic radius of S?
Rephrased: Atomic radius of S
2. Follow up question: What is linear algebra?
Rephrased: Linear algebra
3. Follow up question: What is the third law of thermodynamics?
Rephrased: Third law of thermodynamics
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
// System prompt for the answer-generation step (focus mode 'Wolfram Alpha').
// NOTE(review): the embedded ${new Date().toISOString()} is evaluated once at
// module load, so "today's date" goes stale in a long-running process —
// confirm whether that is intended.
const basicWolframAlphaSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Wolfram Alpha', this means you will be searching for information on the web using Wolfram Alpha. It is a computational knowledge engine that can answer factual queries and perform computations.
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
</context>
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from Wolfram Alpha and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
// Shared parser that flattens LLM chat output into a plain string.
const strParser = new StringOutputParser();
/**
 * Pipes LangChain stream events to the client-facing emitter.
 *
 * Emits, as JSON strings on the 'data' channel:
 *  - the retrieved sources once the source retriever finishes, and
 *  - each response chunk as the final generator streams it;
 * then emits 'end' when generation completes.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  for await (const event of stream) {
    const { event: kind, name } = event;

    if (name === 'FinalSourceRetriever' && kind === 'on_chain_end') {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'sources', data: event.data.output }),
      );
    } else if (name === 'FinalResponseGenerator' && kind === 'on_chain_stream') {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'response', data: event.data.chunk }),
      );
    } else if (name === 'FinalResponseGenerator' && kind === 'on_chain_end') {
      emitter.emit('end');
    }
  }
};
// Input contract shared by the retriever and answering chains below.
type BasicChainInput = {
  chat_history: BaseMessage[]; // prior conversation turns, used for rephrasing
  query: string; // the user's follow-up question
};
/**
 * Builds the retriever chain: the LLM rewrites the follow-up into a standalone
 * question, then SearxNG's Wolfram Alpha engine is searched and each hit is
 * wrapped in a Document. Returns `{ query: '', docs: [] }` when the LLM emits
 * the 'not_needed' sentinel (greeting / writing task — no search required).
 */
const createBasicWolframAlphaSearchRetrieverChain = (llm: BaseChatModel) => {
  const retrieve = async (rephrased: string) => {
    // 'not_needed' is the sentinel the retriever prompt instructs the LLM to emit.
    if (rephrased === 'not_needed') {
      return { query: '', docs: [] };
    }

    const searchResult = await searchSearxng(rephrased, {
      language: 'en',
      engines: ['wolframalpha'],
    });

    const docs = searchResult.results.map((result) => {
      const metadata: Record<string, unknown> = {
        title: result.title,
        url: result.url,
      };
      if (result.img_src) {
        metadata.img_src = result.img_src;
      }
      return new Document({ pageContent: result.content, metadata });
    });

    return { query: rephrased, docs };
  };

  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicWolframAlphaSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(retrieve),
  ]);
};
const createBasicWolframAlphaSearchAnsweringChain = (llm: BaseChatModel) => {
const basicWolframAlphaSearchRetrieverChain =
createBasicWolframAlphaSearchRetrieverChain(llm);
const processDocs = (docs: Document[]) => {
return docs
.map((_, index) => `${index + 1}. ${docs[index].pageContent}`)
.join('\n');
};
return RunnableSequence.from([
RunnableMap.from({
query: (input: BasicChainInput) => input.query,
chat_history: (input: BasicChainInput) => input.chat_history,
context: RunnableSequence.from([
(input) => ({
query: input.query,
chat_history: formatChatHistoryAsString(input.chat_history),
}),
basicWolframAlphaSearchRetrieverChain
.pipe(({ query, docs }) => {
return docs;
})
.withConfig({
runName: 'FinalSourceRetriever',
})
.pipe(processDocs),
]),
}),
ChatPromptTemplate.fromMessages([
['system', basicWolframAlphaSearchResponsePrompt],
new MessagesPlaceholder('chat_history'),
['user', '{query}'],
]),
llm,
strParser,
]).withConfig({
runName: 'FinalResponseGenerator',
});
};
/**
 * Runs the Wolfram Alpha focus-mode search and streams results to the caller
 * over the returned emitter ('data' for sources/chunks, 'end' on completion,
 * 'error' with a user-safe JSON payload on failure).
 */
const basicWolframAlphaSearch = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
) => {
  const emitter = new eventEmitter();

  try {
    const basicWolframAlphaSearchAnsweringChain =
      createBasicWolframAlphaSearchAnsweringChain(llm);
    const stream = basicWolframAlphaSearchAnsweringChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is async and was previously fire-and-forgotten: a failure
    // mid-stream became an unhandled promise rejection and the client never
    // received an 'error' event. The try/catch only covers synchronous setup.
    handleStream(stream, emitter).catch((err) => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
      logger.error(`Error in WolframAlphaSearch: ${err}`);
    });
  } catch (err) {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
    logger.error(`Error in WolframAlphaSearch: ${err}`);
  }
  return emitter;
};
/**
 * Focus-mode entry point for Wolfram Alpha search. The `embeddings` parameter
 * is part of the shared handler signature but unused by this mode.
 */
const handleWolframAlphaSearch = (
  message: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => basicWolframAlphaSearch(message, history, llm);
export default handleWolframAlphaSearch;

View File

@@ -1,90 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import { RunnableSequence } from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import eventEmitter from 'events';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import logger from '../utils/logger';
// System prompt for the 'Writing Assistant' focus mode: no web search is
// performed; the model answers from the conversation alone.
const writingAssistantPrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are currently set on focus mode 'Writing Assistant', this means you will be helping the user write a response to a given query.
Since you are a writing assistant, you would not perform web searches. If you think you lack information to answer the query, you can ask the user for more information or suggest them to switch to a different focus mode.
`;
// Parses the LLM's chat output into a plain string for streaming.
const strParser = new StringOutputParser();
/**
 * Forwards response-generation stream events to the client emitter:
 * chunks as JSON 'data' events, then a final 'end' once generation completes.
 * Events from any other run are ignored.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  for await (const event of stream) {
    if (event.name !== 'FinalResponseGenerator') continue;

    switch (event.event) {
      case 'on_chain_stream':
        emitter.emit(
          'data',
          JSON.stringify({ type: 'response', data: event.data.chunk }),
        );
        break;
      case 'on_chain_end':
        emitter.emit('end');
        break;
    }
  }
};
/**
 * Builds the writing-assistant chain: system prompt + chat history + user
 * query, piped through the LLM and a string parser. The run is named
 * 'FinalResponseGenerator' so handleStream can pick out its events.
 */
const createWritingAssistantChain = (llm: BaseChatModel) => {
  const prompt = ChatPromptTemplate.fromMessages([
    ['system', writingAssistantPrompt],
    new MessagesPlaceholder('chat_history'),
    ['user', '{query}'],
  ]);

  return RunnableSequence.from([prompt, llm, strParser]).withConfig({
    runName: 'FinalResponseGenerator',
  });
};
/**
 * Focus-mode entry point for 'Writing Assistant'. Streams the LLM's response
 * over the returned emitter ('data' chunks, then 'end'; 'error' on failure).
 * `embeddings` is part of the shared handler signature but unused here.
 */
const handleWritingAssistant = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const emitter = new eventEmitter();
  try {
    const writingAssistantChain = createWritingAssistantChain(llm);
    const stream = writingAssistantChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is async and was previously fire-and-forgotten: a failure
    // mid-stream became an unhandled promise rejection and the client never
    // received an 'error' event. The try/catch only covers synchronous setup.
    handleStream(stream, emitter).catch((err) => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
      logger.error(`Error in writing assistant: ${err}`);
    });
  } catch (err) {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
    logger.error(`Error in writing assistant: ${err}`);
  }
  return emitter;
};
export default handleWritingAssistant;

View File

@@ -1,261 +0,0 @@
import { BaseMessage } from '@langchain/core/messages';
import {
PromptTemplate,
ChatPromptTemplate,
MessagesPlaceholder,
} from '@langchain/core/prompts';
import {
RunnableSequence,
RunnableMap,
RunnableLambda,
} from '@langchain/core/runnables';
import { StringOutputParser } from '@langchain/core/output_parsers';
import { Document } from '@langchain/core/documents';
import { searchSearxng } from '../lib/searxng';
import type { StreamEvent } from '@langchain/core/tracers/log_stream';
import type { BaseChatModel } from '@langchain/core/language_models/chat_models';
import type { Embeddings } from '@langchain/core/embeddings';
import formatChatHistoryAsString from '../utils/formatHistory';
import eventEmitter from 'events';
import computeSimilarity from '../utils/computeSimilarity';
import logger from '../utils/logger';
// Prompt that rewrites a follow-up question into a standalone Youtube search
// query; the LLM must answer 'not_needed' for greetings and writing tasks.
const basicYoutubeSearchRetrieverPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question if needed so it is a standalone question that can be used by the LLM to search the web for information.
If it is a writing task or a simple hi, hello rather than a question, you need to return \`not_needed\` as the response.
Example:
1. Follow up question: How does an A.C work?
Rephrased: A.C working
2. Follow up question: Linear algebra explanation video
Rephrased: What is linear algebra?
3. Follow up question: What is theory of relativity?
Rephrased: What is theory of relativity?
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
// System prompt for the 'Youtube' focus mode answer generation.
// NOTE(review): contains typos ("consits", "containg", "unbaised", "Aything",
// "relevent") inherited from the shared template; fixing them would change the
// runtime prompt sent to the model, so they are left as-is here.
const basicYoutubeSearchResponsePrompt = `
You are Perplexica, an AI model who is expert at searching the web and answering user's queries. You are set on focus mode 'Youtube', this means you will be searching for videos on the web using Youtube and providing information based on the video's transcript.
Generate a response that is informative and relevant to the user's query based on provided context (the context consits of search results containg a brief description of the content of that page).
You must use this context to answer the user's query in the best way possible. Use an unbaised and journalistic tone in your response. Do not repeat the text.
You must not tell the user to open any link or visit any website to get the answer. You must provide the answer in the response itself. If the user asks for links you can provide them.
Your responses should be medium to long in length be informative and relevant to the user's query. You can use markdowns to format your response. You should use bullet points to list the information. Make sure the answer is not short and is informative.
You have to cite the answer using [number] notation. You must cite the sentences with their relevent context number. You must cite each and every part of the answer so the user can know where the information is coming from.
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Aything inside the following \`context\` HTML block provided below is for your knowledge returned by Youtube and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
</context>
If you think there's nothing relevant in the search results, you can say that 'Hmm, sorry I could not find any relevant information on this topic. Would you like me to search again or ask something else?'.
Anything between the \`context\` is retrieved from Youtube and is not a part of the conversation with the user. Today's date is ${new Date().toISOString()}
`;
// Parses the LLM's chat output into a plain string for streaming.
const strParser = new StringOutputParser();
/**
 * Pipes LangChain stream events to the client-facing emitter.
 *
 * Emits, as JSON strings on the 'data' channel:
 *  - the re-ranked sources once the source retriever finishes, and
 *  - each response chunk as the final generator streams it;
 * then emits 'end' when generation completes.
 */
const handleStream = async (
  stream: AsyncGenerator<StreamEvent, any, unknown>,
  emitter: eventEmitter,
) => {
  for await (const event of stream) {
    const { event: kind, name } = event;

    if (name === 'FinalSourceRetriever' && kind === 'on_chain_end') {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'sources', data: event.data.output }),
      );
    } else if (name === 'FinalResponseGenerator' && kind === 'on_chain_stream') {
      emitter.emit(
        'data',
        JSON.stringify({ type: 'response', data: event.data.chunk }),
      );
    } else if (name === 'FinalResponseGenerator' && kind === 'on_chain_end') {
      emitter.emit('end');
    }
  }
};
// Input contract shared by the retriever and answering chains below.
type BasicChainInput = {
  chat_history: BaseMessage[]; // prior conversation turns, used for rephrasing
  query: string; // the user's follow-up question
};
/**
 * Builds the Youtube retriever chain: the LLM rewrites the follow-up into a
 * standalone query, then SearxNG's Youtube engine is searched and each hit is
 * wrapped in a Document (falling back to the title when the result has no
 * description). Returns `{ query: '', docs: [] }` for the 'not_needed' sentinel.
 */
const createBasicYoutubeSearchRetrieverChain = (llm: BaseChatModel) => {
  const retrieve = async (rephrased: string) => {
    if (rephrased === 'not_needed') {
      return { query: '', docs: [] };
    }

    const searchResult = await searchSearxng(rephrased, {
      language: 'en',
      engines: ['youtube'],
    });

    const docs = searchResult.results.map((result) => {
      const metadata: Record<string, unknown> = {
        title: result.title,
        url: result.url,
      };
      if (result.img_src) {
        metadata.img_src = result.img_src;
      }
      return new Document({
        // Some Youtube results carry no description; use the title instead.
        pageContent: result.content || result.title,
        metadata,
      });
    });

    return { query: rephrased, docs };
  };

  return RunnableSequence.from([
    PromptTemplate.fromTemplate(basicYoutubeSearchRetrieverPrompt),
    llm,
    strParser,
    RunnableLambda.from(retrieve),
  ]);
};
/**
 * Builds the full Youtube answering chain: retrieve → embed & re-rank by
 * cosine similarity → render numbered context → prompt the LLM for a cited,
 * streamed answer (run name 'FinalResponseGenerator').
 */
const createBasicYoutubeSearchAnsweringChain = (
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const basicYoutubeSearchRetrieverChain =
    createBasicYoutubeSearchRetrieverChain(llm);

  // Render docs as "1. <content>" lines for the {context} prompt slot.
  const processDocs = async (docs: Document[]) => {
    return docs
      .map((doc, index) => `${index + 1}. ${doc.pageContent}`)
      .join('\n');
  };

  /**
   * Re-rank retrieved docs by cosine similarity against the query embedding,
   * keeping at most 15 docs whose similarity exceeds 0.3.
   */
  const rerankDocs = async ({
    query,
    docs,
  }: {
    query: string;
    docs: Document[];
  }) => {
    if (docs.length === 0) {
      return docs;
    }

    const docsWithContent = docs.filter(
      (doc) => doc.pageContent && doc.pageContent.length > 0,
    );

    // Guard: every doc may have empty content, and some embedding providers
    // reject an empty batch — skip the embedding round-trip entirely.
    if (docsWithContent.length === 0) {
      return [];
    }

    const [docEmbeddings, queryEmbedding] = await Promise.all([
      embeddings.embedDocuments(docsWithContent.map((doc) => doc.pageContent)),
      embeddings.embedQuery(query),
    ]);

    const similarity = docEmbeddings.map((docEmbedding, i) => ({
      index: i,
      similarity: computeSimilarity(queryEmbedding, docEmbedding),
    }));

    // Take the top 15 first, then apply the 0.3 threshold — this matches the
    // original ordering of slice before filter.
    return similarity
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 15)
      .filter((sim) => sim.similarity > 0.3)
      .map((sim) => docsWithContent[sim.index]);
  };

  return RunnableSequence.from([
    RunnableMap.from({
      query: (input: BasicChainInput) => input.query,
      chat_history: (input: BasicChainInput) => input.chat_history,
      context: RunnableSequence.from([
        (input) => ({
          query: input.query,
          chat_history: formatChatHistoryAsString(input.chat_history),
        }),
        basicYoutubeSearchRetrieverChain
          .pipe(rerankDocs)
          .withConfig({
            runName: 'FinalSourceRetriever',
          })
          .pipe(processDocs),
      ]),
    }),
    ChatPromptTemplate.fromMessages([
      ['system', basicYoutubeSearchResponsePrompt],
      new MessagesPlaceholder('chat_history'),
      ['user', '{query}'],
    ]),
    llm,
    strParser,
  ]).withConfig({
    runName: 'FinalResponseGenerator',
  });
};
/**
 * Runs the Youtube focus-mode search and streams results to the caller over
 * the returned emitter ('data' for sources/chunks, 'end' on completion,
 * 'error' with a user-safe JSON payload on failure).
 */
const basicYoutubeSearch = (
  query: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => {
  const emitter = new eventEmitter();
  try {
    const basicYoutubeSearchAnsweringChain =
      createBasicYoutubeSearchAnsweringChain(llm, embeddings);
    const stream = basicYoutubeSearchAnsweringChain.streamEvents(
      {
        chat_history: history,
        query: query,
      },
      {
        version: 'v1',
      },
    );
    // handleStream is async and was previously fire-and-forgotten: a failure
    // mid-stream became an unhandled promise rejection and the client never
    // received an 'error' event. The try/catch only covers synchronous setup.
    handleStream(stream, emitter).catch((err) => {
      emitter.emit(
        'error',
        JSON.stringify({ data: 'An error has occurred please try again later' }),
      );
      logger.error(`Error in youtube search: ${err}`);
    });
  } catch (err) {
    emitter.emit(
      'error',
      JSON.stringify({ data: 'An error has occurred please try again later' }),
    );
    logger.error(`Error in youtube search: ${err}`);
  }
  return emitter;
};
/**
 * Focus-mode entry point for Youtube search; delegates straight to
 * basicYoutubeSearch and returns its event emitter.
 */
const handleYoutubeSearch = (
  message: string,
  history: BaseMessage[],
  llm: BaseChatModel,
  embeddings: Embeddings,
) => basicYoutubeSearch(message, history, llm, embeddings);
export default handleYoutubeSearch;

View File

@@ -1,30 +1,16 @@
import { startWebSocketServer } from './websocket';
import express from 'express';
import cors from 'cors';
import http from 'http';
import routes from './routes';
import { getPort } from './config';
import logger from './utils/logger';
const port = getPort();
import searchRoutes from './routes/search';
import businessRoutes from './routes/business';
const app = express();
const server = http.createServer(app);
const corsOptions = {
origin: '*',
};
app.use(cors(corsOptions));
// Middleware
app.use(cors());
app.use(express.json());
app.use('/api', routes);
app.get('/api', (_, res) => {
res.status(200).json({ status: 'ok' });
});
// Routes
app.use('/api/search', searchRoutes);
app.use('/api/business', businessRoutes);
server.listen(port, () => {
logger.info(`Server is running on port ${port}`);
});
startWebSocketServer(server);
export default app;

View File

@@ -0,0 +1,55 @@
import { RunnableSequence, RunnableMap } from '@langchain/core/runnables';
import ListLineOutputParser from '../lib/outputParsers/listLineOutputParser';
import { PromptTemplate } from '@langchain/core/prompts';
import formatChatHistoryAsString from '../utils/formatHistory';
import { BaseMessage } from '@langchain/core/messages';
import { BaseChatModel } from '@langchain/core/language_models/chat_models';
import { ChatOpenAI } from '@langchain/openai';
// Prompt that asks the model for 4-5 follow-up suggestions, emitted between
// <suggestions> XML tags so ListLineOutputParser can extract them.
const suggestionGeneratorPrompt = `
You are an AI suggestion generator for an AI powered search engine. You will be given a conversation below. You need to generate 4-5 suggestions based on the conversation. The suggestion should be relevant to the conversation that can be used by the user to ask the chat model for more information.
You need to make sure the suggestions are relevant to the conversation and are helpful to the user. Keep a note that the user might use these suggestions to ask a chat model for more information.
Make sure the suggestions are medium in length and are informative and relevant to the conversation.
Provide these suggestions separated by newlines between the XML tags <suggestions> and </suggestions>. For example:
<suggestions>
Tell me more about SpaceX and their recent projects
What is the latest news on SpaceX?
Who is the CEO of SpaceX?
</suggestions>
Conversation:
{chat_history}
`;
// Input for suggestion generation: just the conversation so far.
type SuggestionGeneratorInput = {
  chat_history: BaseMessage[];
};
// Extracts the newline-separated lines found between the <suggestions> tags.
const outputParser = new ListLineOutputParser({
  key: 'suggestions',
});
/**
 * Builds the suggestion chain: flatten the chat history to a string, fill the
 * suggestion prompt, run the LLM, and parse out the <suggestions> lines.
 */
const createSuggestionGeneratorChain = (llm: BaseChatModel) => {
  const historyMap = RunnableMap.from({
    chat_history: (input: SuggestionGeneratorInput) =>
      formatChatHistoryAsString(input.chat_history),
  });

  return RunnableSequence.from([
    historyMap,
    PromptTemplate.fromTemplate(suggestionGeneratorPrompt),
    llm,
    outputParser,
  ]);
};
/**
 * Generates follow-up suggestions for a conversation.
 *
 * @param input - the chat history to base suggestions on
 * @param llm - the chat model to use (its temperature is zeroed for
 *   deterministic suggestions when the provider exposes that field)
 * @returns promise resolving to the parsed list of suggestion strings
 */
const generateSuggestions = (
  input: SuggestionGeneratorInput,
  llm: BaseChatModel,
) => {
  // Guard the mutation: the previous blind `as unknown as ChatOpenAI` cast
  // assumed every provider has a `temperature` field; only set it when the
  // underlying model actually exposes one.
  if ('temperature' in llm) {
    (llm as unknown as ChatOpenAI).temperature = 0;
  }
  const suggestionGeneratorChain = createSuggestionGeneratorChain(llm);
  return suggestionGeneratorChain.invoke(input);
};
export default generateSuggestions;

View File

@@ -8,10 +8,13 @@ interface Config {
GENERAL: {
PORT: number;
SIMILARITY_MEASURE: string;
KEEP_ALIVE: string;
};
API_KEYS: {
OPENAI: string;
GROQ: string;
ANTHROPIC: string;
GEMINI: string;
};
API_ENDPOINTS: {
SEARXNG: string;
@@ -33,11 +36,18 @@ export const getPort = () => loadConfig().GENERAL.PORT;
export const getSimilarityMeasure = () =>
loadConfig().GENERAL.SIMILARITY_MEASURE;
export const getKeepAlive = () => loadConfig().GENERAL.KEEP_ALIVE;
export const getOpenaiApiKey = () => loadConfig().API_KEYS.OPENAI;
export const getGroqApiKey = () => loadConfig().API_KEYS.GROQ;
export const getSearxngApiEndpoint = () => loadConfig().API_ENDPOINTS.SEARXNG;
export const getAnthropicApiKey = () => loadConfig().API_KEYS.ANTHROPIC;
export const getGeminiApiKey = () => loadConfig().API_KEYS.GEMINI;
export const getSearxngApiEndpoint = () =>
process.env.SEARXNG_API_URL || loadConfig().API_ENDPOINTS.SEARXNG;
export const getOllamaApiEndpoint = () => loadConfig().API_ENDPOINTS.OLLAMA;
@@ -67,3 +77,16 @@ export const updateConfig = (config: RecursivePartial<Config>) => {
toml.stringify(config),
);
};
export const config = {
ollama: {
url: process.env.OLLAMA_URL || 'http://localhost:11434',
model: process.env.OLLAMA_MODEL || 'mistral',
options: {
temperature: 0.1,
top_p: 0.9,
timeout: 30000 // 30 seconds timeout
}
},
// ... other config
};

40
src/config/env.ts Normal file
View File

@@ -0,0 +1,40 @@
import dotenv from 'dotenv';
// Load environment variables
dotenv.config();
// Environment configuration: a single typed snapshot of process.env with
// defaults applied once at module load. Numeric values are parsed with an
// explicit base-10 radix.
const env = {
  // Supabase Configuration
  SUPABASE_URL: process.env.SUPABASE_URL || '',
  SUPABASE_KEY: process.env.SUPABASE_KEY || '',
  // Server Configuration
  PORT: parseInt(process.env.PORT || '3001', 10),
  NODE_ENV: process.env.NODE_ENV || 'development',
  // Search Configuration
  MAX_RESULTS_PER_QUERY: parseInt(process.env.MAX_RESULTS_PER_QUERY || '50', 10),
  CACHE_DURATION_HOURS: parseInt(process.env.CACHE_DURATION_HOURS || '24', 10),
  CACHE_DURATION_DAYS: parseInt(process.env.CACHE_DURATION_DAYS || '7', 10),
  // SearxNG Configuration
  SEARXNG_URL: process.env.SEARXNG_URL || 'http://localhost:4000',
  // Ollama Configuration
  OLLAMA_URL: process.env.OLLAMA_URL || 'http://localhost:11434',
  OLLAMA_MODEL: process.env.OLLAMA_MODEL || 'deepseek-coder:6.7b',
  // Hugging Face Configuration
  HUGGING_FACE_API_KEY: process.env.HUGGING_FACE_API_KEY || ''
};
// Validate required environment variables: fail fast at startup rather than
// crashing later with an opaque connection error. Note the checks run against
// the defaulted values above, so only keys whose default is '' can fail.
const requiredEnvVars = ['SUPABASE_URL', 'SUPABASE_KEY', 'SEARXNG_URL'];
for (const envVar of requiredEnvVars) {
  if (!env[envVar as keyof typeof env]) {
    throw new Error(`Missing required environment variable: ${envVar}`);
  }
}
export { env };

77
src/config/index.ts Normal file
View File

@@ -0,0 +1,77 @@
import dotenv from 'dotenv';
import path from 'path';
// Load .env file from the repository root (two levels above this module).
dotenv.config({ path: path.resolve(__dirname, '../../.env') });
// Application configuration shape. All values are resolved once at load time
// in the `config` constant below.
export interface Config {
  supabase: {
    url: string;
    anonKey: string;
  };
  server: {
    port: number;
    nodeEnv: string;
  };
  search: {
    maxResultsPerQuery: number;
    cacheDurationHours: number;
    searxngUrl?: string; // optional: undefined when SEARXNG_URL is unset
  };
  rateLimit: {
    windowMs: number;
    maxRequests: number;
  };
  security: {
    corsOrigin: string;
    jwtSecret: string;
  };
  proxy?: {
    http?: string;
    https?: string;
  };
  logging: {
    level: string;
  };
}
// Resolved configuration, built once from process.env with defaults.
const config: Config = {
  supabase: {
    url: process.env.SUPABASE_URL || '',
    anonKey: process.env.SUPABASE_ANON_KEY || '',
  },
  server: {
    port: parseInt(process.env.PORT || '3000', 10),
    nodeEnv: process.env.NODE_ENV || 'development',
  },
  search: {
    maxResultsPerQuery: parseInt(process.env.MAX_RESULTS_PER_QUERY || '20', 10),
    cacheDurationHours: parseInt(process.env.CACHE_DURATION_HOURS || '24', 10),
    searxngUrl: process.env.SEARXNG_URL
  },
  rateLimit: {
    // Defaults: 15-minute window, 100 requests.
    windowMs: parseInt(process.env.RATE_LIMIT_WINDOW_MS || '900000', 10),
    maxRequests: parseInt(process.env.RATE_LIMIT_MAX_REQUESTS || '100', 10),
  },
  security: {
    corsOrigin: process.env.CORS_ORIGIN || 'http://localhost:3000',
    // NOTE(review): insecure placeholder fallback — JWT_SECRET must be set in
    // any real deployment; consider failing validation when it is missing.
    jwtSecret: process.env.JWT_SECRET || 'your_jwt_secret_key',
  },
  logging: {
    level: process.env.LOG_LEVEL || 'info',
  },
};
// Validate required configuration: fail fast at module load so a misconfigured
// deployment cannot start.
const validateConfig = () => {
  if (!config.supabase.url) {
    throw new Error('SUPABASE_URL is required');
  }
  if (!config.supabase.anonKey) {
    throw new Error('SUPABASE_ANON_KEY is required');
  }
};
validateConfig();
export { config };

10
src/db/index.ts Normal file
View File

@@ -0,0 +1,10 @@
import { drizzle } from 'drizzle-orm/better-sqlite3';
import Database from 'better-sqlite3';
import * as schema from './schema';
// Single shared SQLite connection for the app; drizzle wraps it with the
// schema so query results are typed. Path is relative to the process cwd.
const sqlite = new Database('data/db.sqlite');
const db = drizzle(sqlite, {
  schema: schema,
});
export default db;

28
src/db/schema.ts Normal file
View File

@@ -0,0 +1,28 @@
import { sql } from 'drizzle-orm';
import { text, integer, sqliteTable } from 'drizzle-orm/sqlite-core';
// Chat messages, one row per turn.
export const messages = sqliteTable('messages', {
  id: integer('id').primaryKey(),
  content: text('content').notNull(),
  chatId: text('chatId').notNull(), // FK-by-convention to chats.id (no constraint declared)
  messageId: text('messageId').notNull(),
  // NOTE(review): the TS property is `role` but the underlying column is named
  // 'type' — confirm before writing raw SQL against this table.
  role: text('type', { enum: ['assistant', 'user'] }),
  metadata: text('metadata', {
    mode: 'json',
  }),
});
// Shape of entries stored in chats.files (JSON column).
interface File {
  name: string;
  fileId: string;
}
// One row per conversation; `files` defaults to an empty JSON array.
export const chats = sqliteTable('chats', {
  id: text('id').primaryKey(),
  title: text('title').notNull(),
  createdAt: text('createdAt').notNull(),
  focusMode: text('focusMode').notNull(),
  files: text('files', { mode: 'json' })
    .$type<File[]>()
    .default(sql`'[]'`),
});

24
src/index.ts Normal file
View File

@@ -0,0 +1,24 @@
import './config/env'; // Load environment variables first
import { startServer } from './server';
import { isPortAvailable } from './utils/portCheck';
import { testConnection } from './lib/supabase';
// Coerce PORT to a number once: process.env values are strings, and the
// previous `string | number` union leaked into isPortAvailable's caller.
const PORT = Number(process.env.PORT || 3001);

/**
 * Startup sequence: verify the port is free, verify Supabase connectivity,
 * then start the HTTP server. Exits non-zero on any precondition failure.
 */
const init = async () => {
  if (!(await isPortAvailable(PORT))) {
    console.error(`Port ${PORT} is in use. Please try a different port or free up the current one.`);
    process.exit(1);
  }

  // Test Supabase connection
  const isConnected = await testConnection();
  if (!isConnected) {
    console.error('Failed to connect to Supabase. Please check your configuration.');
    process.exit(1);
  }

  startServer();
};

// Previously `init().catch(console.error)` logged the failure but let the
// process linger/exit 0; exit non-zero so supervisors (Docker, systemd)
// notice the crashed startup.
init().catch((err) => {
  console.error(err);
  process.exit(1);
});

116
src/lib/categories.ts Normal file
View File

@@ -0,0 +1,116 @@
// A top-level business category shown in the directory UI.
export interface Category {
  id: string; // stable slug used in routes/queries
  name: string; // display name
  icon: string; // emoji rendered next to the name
  subcategories: SubCategory[];
}
// A selectable refinement within a Category.
export interface SubCategory {
  id: string;
  name: string;
}
// Static catalog of real-estate-oriented business categories. IDs are stable
// identifiers; names and icons are display-only.
export const categories: Category[] = [
  {
    id: 'real-estate-pros',
    name: 'Real Estate Professionals',
    icon: '🏢',
    subcategories: [
      { id: 'wholesalers', name: 'Real Estate Wholesalers' },
      { id: 'agents', name: 'Real Estate Agents' },
      { id: 'attorneys', name: 'Real Estate Attorneys' },
      { id: 'scouts', name: 'Property Scouts' },
      { id: 'brokers', name: 'Real Estate Brokers' },
      { id: 'consultants', name: 'Real Estate Consultants' }
    ]
  },
  {
    id: 'legal-title',
    name: 'Legal & Title Services',
    icon: '⚖️',
    subcategories: [
      { id: 'title-companies', name: 'Title Companies' },
      { id: 'closing-attorneys', name: 'Closing Attorneys' },
      { id: 'zoning-consultants', name: 'Zoning Consultants' },
      { id: 'probate-specialists', name: 'Probate Specialists' },
      { id: 'eviction-specialists', name: 'Eviction Specialists' }
    ]
  },
  {
    id: 'financial',
    name: 'Financial Services',
    icon: '💰',
    subcategories: [
      { id: 'hard-money', name: 'Hard Money Lenders' },
      { id: 'private-equity', name: 'Private Equity Investors' },
      { id: 'mortgage-brokers', name: 'Mortgage Brokers' },
      { id: 'tax-advisors', name: 'Tax Advisors' },
      { id: 'appraisers', name: 'Appraisers' }
    ]
  },
  {
    id: 'contractors',
    name: 'Specialist Contractors',
    icon: '🔨',
    subcategories: [
      { id: 'general', name: 'General Contractors' },
      { id: 'plumbers', name: 'Plumbers' },
      { id: 'electricians', name: 'Electricians' },
      { id: 'hvac', name: 'HVAC Technicians' },
      { id: 'roofers', name: 'Roofers' },
      { id: 'foundation', name: 'Foundation Specialists' },
      { id: 'asbestos', name: 'Asbestos Removal' },
      { id: 'mold', name: 'Mold Remediation' }
    ]
  },
  {
    id: 'property-services',
    name: 'Property Services',
    icon: '🏠',
    subcategories: [
      { id: 'surveyors', name: 'Surveyors' },
      { id: 'inspectors', name: 'Inspectors' },
      { id: 'property-managers', name: 'Property Managers' },
      { id: 'environmental', name: 'Environmental Consultants' },
      { id: 'junk-removal', name: 'Junk Removal Services' },
      { id: 'cleaning', name: 'Property Cleaning' }
    ]
  },
  {
    id: 'marketing',
    name: 'Marketing & Lead Gen',
    icon: '📢',
    subcategories: [
      { id: 'direct-mail', name: 'Direct Mail Services' },
      { id: 'social-media', name: 'Social Media Marketing' },
      { id: 'seo', name: 'SEO Specialists' },
      { id: 'ppc', name: 'PPC Advertising' },
      { id: 'lead-gen', name: 'Lead Generation' },
      { id: 'skip-tracing', name: 'Skip Tracing Services' }
    ]
  },
  {
    id: 'data-tech',
    name: 'Data & Technology',
    icon: '💻',
    subcategories: [
      { id: 'data-providers', name: 'Property Data Providers' },
      { id: 'crm', name: 'CRM Systems' },
      { id: 'valuation', name: 'Valuation Tools' },
      { id: 'virtual-tours', name: 'Virtual Tour Services' },
      { id: 'automation', name: 'Automation Tools' }
    ]
  },
  {
    id: 'specialty',
    name: 'Specialty Services',
    icon: '🎯',
    subcategories: [
      { id: 'auction', name: 'Auction Companies' },
      { id: 'relocation', name: 'Relocation Services' },
      { id: 'staging', name: 'Home Staging' },
      { id: 'photography', name: 'Real Estate Photography' },
      { id: 'virtual-assistant', name: 'Virtual Assistants' }
    ]
  }
];

51
src/lib/db/optOutDb.ts Normal file
View File

@@ -0,0 +1,51 @@
import { Database } from 'better-sqlite3';
import path from 'path';
// A single opt-out registration keyed by domain.
interface OptOutEntry {
  domain: string;
  email: string;
  reason?: string;
  timestamp: Date;
}

/**
 * SQLite-backed registry of domains that have opted out.
 * The table is created on first use; `domain` is the primary key, so
 * re-registering a domain replaces its previous entry.
 */
export class OptOutDatabase {
  private db: Database;

  constructor() {
    this.db = new Database(path.join(__dirname, '../../../data/optout.db'));
    this.initializeDatabase();
  }

  // Create the table and domain index if they do not exist (idempotent).
  private initializeDatabase() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS opt_outs (
        domain TEXT PRIMARY KEY,
        email TEXT NOT NULL,
        reason TEXT,
        timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
      );
      CREATE INDEX IF NOT EXISTS idx_domain ON opt_outs(domain);
    `);
  }

  /** Insert or replace the opt-out record for `entry.domain`. */
  async addOptOut(entry: OptOutEntry): Promise<void> {
    const stmt = this.db.prepare(
      'INSERT OR REPLACE INTO opt_outs (domain, email, reason, timestamp) VALUES (?, ?, ?, ?)'
    );
    // better-sqlite3 throws on `undefined` bind values; a missing optional
    // reason must be bound as NULL.
    stmt.run(entry.domain, entry.email, entry.reason ?? null, entry.timestamp.toISOString());
  }

  /** True when `domain` has an opt-out record. */
  isOptedOut(domain: string): boolean {
    const stmt = this.db.prepare('SELECT 1 FROM opt_outs WHERE domain = ?');
    return stmt.get(domain) !== undefined;
  }

  /** Remove the opt-out record for `domain` (no-op when absent). */
  removeOptOut(domain: string): void {
    const stmt = this.db.prepare('DELETE FROM opt_outs WHERE domain = ?');
    stmt.run(domain);
  }

  /** All opt-out records, with timestamps converted back to Date objects. */
  getOptOutList(): OptOutEntry[] {
    // Rows come back with `timestamp` as the stored string; the previous code
    // returned them raw, so the declared `timestamp: Date` type was a lie.
    const rows = this.db.prepare('SELECT * FROM opt_outs').all() as Array<{
      domain: string;
      email: string;
      reason: string | null;
      timestamp: string;
    }>;
    return rows.map((row) => ({
      domain: row.domain,
      email: row.email,
      reason: row.reason ?? undefined,
      timestamp: new Date(row.timestamp),
    }));
  }
}

74
src/lib/db/supabase.ts Normal file
View File

@@ -0,0 +1,74 @@
import { createClient } from '@supabase/supabase-js';
import { BusinessData } from '../searxng';
import { env } from '../../config/env';
// Create the Supabase client with validated environment variables.
// The imported env module (src/config/env.ts) exports flat SCREAMING_CASE
// keys (SUPABASE_URL / SUPABASE_KEY); there is no nested `env.supabase`
// object, so the previous `env.supabase.url` access crashed at module load
// with "Cannot read properties of undefined".
export const supabase = createClient(
  env.SUPABASE_URL,
  env.SUPABASE_KEY,
  {
    auth: {
      persistSession: false // Since this is a server environment
    }
  }
);
// Define the cache record type: one row of the `search_cache` table.
export interface CacheRecord {
  id: string;
  query: string; // the human-readable "<category> in <location>" string
  results: BusinessData[];
  location: string; // stored lowercased
  category: string; // stored lowercased
  created_at: string; // ISO timestamps managed by the database
  updated_at: string;
  expires_at: string; // rows past this instant are treated as misses
}
// Export database helper functions
/**
 * Look up the newest non-expired cache row for a category+location pair.
 * Returns null on a miss or on any query error (the error is logged).
 * Note: `.single()` reports an error when zero rows match, which this
 * function deliberately treats as a cache miss.
 */
export async function getCacheEntry(
  category: string,
  location: string
): Promise<CacheRecord | null> {
  const lookup = supabase
    .from('search_cache')
    .select('*')
    .eq('category', category.toLowerCase())
    .eq('location', location.toLowerCase())
    .gt('expires_at', new Date().toISOString())
    .order('created_at', { ascending: false })
    .limit(1)
    .single();

  const { data, error } = await lookup;
  if (error) {
    console.error('Cache lookup failed:', error);
    return null;
  }
  return data;
}
/**
 * Insert a cache row for a category+location search, expiring after
 * `expiresInDays` (default 7). Logs and rethrows on insert failure so the
 * caller can decide whether a failed cache write is fatal.
 */
export async function saveCacheEntry(
  category: string,
  location: string,
  results: BusinessData[],
  expiresInDays: number = 7
): Promise<void> {
  const expiresAt = new Date();
  expiresAt.setDate(expiresAt.getDate() + expiresInDays);

  const row = {
    query: `${category} in ${location}`,
    category: category.toLowerCase(),
    location: location.toLowerCase(),
    results,
    expires_at: expiresAt.toISOString()
  };

  const { error } = await supabase.from('search_cache').insert(row);
  if (error) {
    console.error('Failed to save cache entry:', error);
    throw error;
  }
}

195
src/lib/emailScraper.ts Normal file
View File

@@ -0,0 +1,195 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Cache } from './utils/cache';
import { RateLimiter } from './utils/rateLimiter';
import robotsParser from 'robots-parser';
interface ScrapingResult {
emails: string[];
phones: string[];
addresses: string[];
socialLinks: string[];
source: string;
timestamp: Date;
attribution: string;
}
export class EmailScraper {
private cache: Cache<ScrapingResult>;
private rateLimiter: RateLimiter;
private robotsCache = new Map<string, any>();
constructor(private options = {
timeout: 5000,
cacheTTL: 60,
rateLimit: { windowMs: 60000, maxRequests: 10 }, // More conservative rate limiting
userAgent: 'BizSearch/1.0 (+https://your-domain.com/about) - Business Directory Service'
}) {
this.cache = new Cache<ScrapingResult>(options.cacheTTL);
this.rateLimiter = new RateLimiter(options.rateLimit.windowMs, options.rateLimit.maxRequests);
}
private async checkRobotsPermission(url: string): Promise<boolean> {
try {
const { protocol, host } = new URL(url);
const robotsUrl = `${protocol}//${host}/robots.txt`;
let parser = this.robotsCache.get(host);
if (!parser) {
const response = await axios.get(robotsUrl);
parser = robotsParser(robotsUrl, response.data);
this.robotsCache.set(host, parser);
}
return parser.isAllowed(url, this.options.userAgent);
} catch (error) {
console.warn(`Could not check robots.txt for ${url}:`, error);
return true; // Assume allowed if robots.txt is unavailable
}
}
/**
 * Scrapes publicly listed contact details (emails, phones, addresses) from a
 * business page, honoring robots.txt, noindex directives, a rate limiter, and
 * a local cache.
 *
 * The order of side effects matters: cache lookup -> robots.txt check ->
 * rate-limiter slot -> HTTP fetch. Failures never throw; an empty result with
 * an explanatory `attribution` string is returned instead.
 * NOTE(review): `socialLinks` is declared but never populated anywhere in this
 * method — it is always returned empty.
 */
async scrapeEmails(url: string): Promise<ScrapingResult> {
  // Check cache first
  const cached = this.cache.get(url);
  if (cached) return cached;
  // Check robots.txt before touching the page at all
  const allowed = await this.checkRobotsPermission(url);
  if (!allowed) {
    console.log(`Respecting robots.txt disallow for ${url}`);
    return {
      emails: [],
      phones: [],
      addresses: [],
      socialLinks: [],
      source: url,
      timestamp: new Date(),
      attribution: 'Restricted by robots.txt'
    };
  }
  // Wait for rate limiting slot
  await this.rateLimiter.waitForSlot();
  try {
    const response = await axios.get(url, {
      timeout: this.options.timeout,
      headers: {
        'User-Agent': this.options.userAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      }
    });
    // Check for noindex meta tag and bail out early if present
    const $ = cheerio.load(response.data);
    if ($('meta[name="robots"][content*="noindex"]').length > 0) {
      return {
        emails: [],
        phones: [],
        addresses: [],
        socialLinks: [],
        source: url,
        timestamp: new Date(),
        attribution: 'Respecting noindex directive'
      };
    }
    // Only extract contact information from public contact pages or structured data
    const isContactPage = /contact|about/i.test(url) ||
      $('h1, h2').text().toLowerCase().includes('contact');
    // Sets deduplicate values while collecting; converted to arrays below.
    const result = {
      emails: new Set<string>(),
      phones: new Set<string>(),
      addresses: new Set<string>(),
      socialLinks: new Set<string>(),
      source: url,
      timestamp: new Date(),
      attribution: `Data from public business listing at ${new URL(url).hostname}`
    };
    // Extract from structured data (Schema.org JSON-LD blocks)
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const data = JSON.parse($(element).html() || '{}');
        if (data['@type'] === 'LocalBusiness' || data['@type'] === 'Organization') {
          if (data.email) result.emails.add(data.email.toLowerCase());
          if (data.telephone) result.phones.add(this.formatPhoneNumber(data.telephone));
          if (data.address) {
            const fullAddress = this.formatAddress(data.address);
            if (fullAddress) result.addresses.add(fullAddress);
          }
        }
      } catch (e) {
        console.error('Error parsing JSON-LD:', e);
      }
    });
    // Only scrape additional info if it's a contact page
    if (isContactPage) {
      // Extract clearly marked contact information
      $('[itemprop="email"], .contact-email, .email').each((_, element) => {
        const email = $(element).text().trim();
        if (this.isValidEmail(email)) {
          result.emails.add(email.toLowerCase());
        }
      });
      $('[itemprop="telephone"], .phone, .contact-phone').each((_, element) => {
        const phone = $(element).text().trim();
        const formatted = this.formatPhoneNumber(phone);
        if (formatted) result.phones.add(formatted);
      });
    }
    // Convert the Sets into plain arrays to match ScrapingResult
    const finalResult = {
      ...result,
      emails: Array.from(result.emails),
      phones: Array.from(result.phones),
      addresses: Array.from(result.addresses),
      socialLinks: Array.from(result.socialLinks)
    };
    this.cache.set(url, finalResult);
    return finalResult;
  } catch (error) {
    console.error(`Failed to scrape ${url}:`, error);
    return {
      emails: [],
      phones: [],
      addresses: [],
      socialLinks: [],
      source: url,
      timestamp: new Date(),
      attribution: 'Error accessing page'
    };
  }
}
/**
 * Lightweight syntactic check for an email address (local@domain.tld).
 * Not full RFC 5322 validation — just enough to reject obvious junk.
 */
private isValidEmail(email: string): boolean {
  const emailPattern = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
  return emailPattern.test(email);
}
/**
 * Formats a number containing exactly 10 digits as "(AAA) EEE-NNNN".
 * Any other input is returned unchanged.
 */
private formatPhoneNumber(phone: string): string {
  const digits = phone.replace(/\D/g, '');
  if (digits.length !== 10) {
    return phone;
  }
  const area = digits.slice(0, 3);
  const exchange = digits.slice(3, 6);
  const line = digits.slice(6);
  return `(${area}) ${exchange}-${line}`;
}
/**
 * Normalizes a Schema.org-style address into a single display string.
 * Accepts either a plain string or a PostalAddress-like object with
 * streetAddress/addressLocality/addressRegion/postalCode fields.
 * Returns null when nothing usable is present.
 */
private formatAddress(address: any): string | null {
  if (typeof address === 'string') return address;
  // `typeof null === 'object'` in JS, so guard explicitly — the previous
  // version threw a TypeError when JSON-LD contained `"address": null`.
  if (address !== null && typeof address === 'object') {
    const parts = [
      address.streetAddress,
      address.addressLocality,
      address.addressRegion,
      address.postalCode
    ].filter(Boolean);
    if (parts.length > 0) return parts.join(', ');
  }
  return null;
}
}

View File

@@ -0,0 +1,48 @@
import { BaseOutputParser } from '@langchain/core/output_parsers';
interface LineOutputParserArgs {
key?: string;
}
/**
 * Output parser that extracts the single text span wrapped in
 * `<key>...</key>` tags from an LLM response and strips a leading
 * list-bullet prefix (-, *, numbered, or •) from it.
 */
class LineOutputParser extends BaseOutputParser<string> {
  private key = 'questions';

  constructor(args?: LineOutputParserArgs) {
    super();
    // `args` is optional — use optional chaining so `new LineOutputParser()`
    // does not throw (the previous `args.key` dereferenced undefined).
    this.key = args?.key ?? this.key;
  }

  static lc_name() {
    return 'LineOutputParser';
  }

  lc_namespace = ['langchain', 'output_parsers', 'line_output_parser'];

  /**
   * Returns the trimmed, de-bulleted content between the tags,
   * or '' when either tag is missing.
   */
  async parse(text: string): Promise<string> {
    text = text.trim() || '';
    const regex = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;
    const startKeyIndex = text.indexOf(`<${this.key}>`);
    const endKeyIndex = text.indexOf(`</${this.key}>`);

    if (startKeyIndex === -1 || endKeyIndex === -1) {
      return '';
    }

    // Both indices are known to be valid here (guarded above), so slice
    // directly between the tags — no defensive ternaries needed.
    const contentStart = startKeyIndex + `<${this.key}>`.length;
    return text.slice(contentStart, endKeyIndex).trim().replace(regex, '');
  }

  getFormatInstructions(): string {
    throw new Error('Not implemented.');
  }
}
export default LineOutputParser;

View File

@@ -0,0 +1,50 @@
import { BaseOutputParser } from '@langchain/core/output_parsers';
interface LineListOutputParserArgs {
key?: string;
}
/**
 * Output parser that extracts the lines wrapped in `<key>...</key>` tags
 * from an LLM response, dropping blank lines and stripping leading
 * list-bullet prefixes (-, *, numbered, or •).
 */
class LineListOutputParser extends BaseOutputParser<string[]> {
  private key = 'questions';

  constructor(args?: LineListOutputParserArgs) {
    super();
    // `args` is optional — use optional chaining so `new LineListOutputParser()`
    // does not throw (the previous `args.key` dereferenced undefined).
    this.key = args?.key ?? this.key;
  }

  static lc_name() {
    return 'LineListOutputParser';
  }

  lc_namespace = ['langchain', 'output_parsers', 'line_list_output_parser'];

  /**
   * Returns the non-empty, de-bulleted lines between the tags,
   * or [] when either tag is missing.
   */
  async parse(text: string): Promise<string[]> {
    text = text.trim() || '';
    const regex = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;
    const startKeyIndex = text.indexOf(`<${this.key}>`);
    const endKeyIndex = text.indexOf(`</${this.key}>`);

    if (startKeyIndex === -1 || endKeyIndex === -1) {
      return [];
    }

    // Both indices are known to be valid here (guarded above).
    const contentStart = startKeyIndex + `<${this.key}>`.length;
    return text
      .slice(contentStart, endKeyIndex)
      .trim()
      .split('\n')
      .filter((line) => line.trim() !== '')
      .map((line) => line.replace(regex, ''));
  }

  getFormatInstructions(): string {
    throw new Error('Not implemented.');
  }
}
export default LineListOutputParser;

View File

@@ -1,183 +0,0 @@
import { ChatOpenAI, OpenAIEmbeddings } from '@langchain/openai';
import { ChatOllama } from '@langchain/community/chat_models/ollama';
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
import { HuggingFaceTransformersEmbeddings } from './huggingfaceTransformer';
import {
getGroqApiKey,
getOllamaApiEndpoint,
getOpenaiApiKey,
} from '../config';
import logger from '../utils/logger';
/**
 * Builds the provider -> { display name -> chat model } map.
 * A provider is included only when its API key / endpoint is configured;
 * per-provider failures are logged and that provider is simply omitted.
 */
export const getAvailableChatModelProviders = async () => {
  const openAIApiKey = getOpenaiApiKey();
  const groqApiKey = getGroqApiKey();
  const ollamaEndpoint = getOllamaApiEndpoint();

  const models = {};

  if (openAIApiKey) {
    try {
      // Fixed OpenAI catalog; all entries share the same temperature.
      models['openai'] = {
        'GPT-3.5 turbo': new ChatOpenAI({
          openAIApiKey,
          modelName: 'gpt-3.5-turbo',
          temperature: 0.7,
        }),
        'GPT-4': new ChatOpenAI({
          openAIApiKey,
          modelName: 'gpt-4',
          temperature: 0.7,
        }),
        'GPT-4 turbo': new ChatOpenAI({
          openAIApiKey,
          modelName: 'gpt-4-turbo',
          temperature: 0.7,
        }),
      };
    } catch (err) {
      logger.error(`Error loading OpenAI models: ${err}`);
    }
  }

  if (groqApiKey) {
    try {
      // Groq is served through an OpenAI-compatible API, hence ChatOpenAI
      // with a custom baseURL.
      models['groq'] = {
        'LLaMA3 8b': new ChatOpenAI(
          {
            openAIApiKey: groqApiKey,
            modelName: 'llama3-8b-8192',
            temperature: 0.7,
          },
          {
            baseURL: 'https://api.groq.com/openai/v1',
          },
        ),
        'LLaMA3 70b': new ChatOpenAI(
          {
            openAIApiKey: groqApiKey,
            modelName: 'llama3-70b-8192',
            temperature: 0.7,
          },
          {
            baseURL: 'https://api.groq.com/openai/v1',
          },
        ),
        'Mixtral 8x7b': new ChatOpenAI(
          {
            openAIApiKey: groqApiKey,
            modelName: 'mixtral-8x7b-32768',
            temperature: 0.7,
          },
          {
            baseURL: 'https://api.groq.com/openai/v1',
          },
        ),
        'Gemma 7b': new ChatOpenAI(
          {
            openAIApiKey: groqApiKey,
            modelName: 'gemma-7b-it',
            temperature: 0.7,
          },
          {
            baseURL: 'https://api.groq.com/openai/v1',
          },
        ),
      };
    } catch (err) {
      logger.error(`Error loading Groq models: ${err}`);
    }
  }

  if (ollamaEndpoint) {
    try {
      // Discover locally installed models from the Ollama /api/tags endpoint.
      const response = await fetch(`${ollamaEndpoint}/api/tags`, {
        headers: {
          'Content-Type': 'application/json',
        },
      });

      const { models: ollamaModels } = (await response.json()) as any;

      models['ollama'] = ollamaModels.reduce((acc, model) => {
        acc[model.model] = new ChatOllama({
          baseUrl: ollamaEndpoint,
          model: model.model,
          temperature: 0.7,
        });
        return acc;
      }, {});
    } catch (err) {
      logger.error(`Error loading Ollama models: ${err}`);
    }
  }

  // Always present: user-supplied OpenAI-compatible endpoints are configured elsewhere.
  models['custom_openai'] = {};

  return models;
};
/**
 * Builds the provider -> { display name -> embeddings model } map.
 * OpenAI and Ollama require configuration; the local Transformers models
 * are always attempted.
 */
export const getAvailableEmbeddingModelProviders = async () => {
  const openAIApiKey = getOpenaiApiKey();
  const ollamaEndpoint = getOllamaApiEndpoint();

  const models = {};

  if (openAIApiKey) {
    try {
      models['openai'] = {
        'Text embedding 3 small': new OpenAIEmbeddings({
          openAIApiKey,
          modelName: 'text-embedding-3-small',
        }),
        'Text embedding 3 large': new OpenAIEmbeddings({
          openAIApiKey,
          modelName: 'text-embedding-3-large',
        }),
      };
    } catch (err) {
      logger.error(`Error loading OpenAI embeddings: ${err}`);
    }
  }

  if (ollamaEndpoint) {
    try {
      // Discover installed Ollama models at runtime via the /api/tags endpoint.
      const response = await fetch(`${ollamaEndpoint}/api/tags`, {
        headers: {
          'Content-Type': 'application/json',
        },
      });

      const { models: ollamaModels } = (await response.json()) as any;

      models['ollama'] = ollamaModels.reduce((acc, model) => {
        acc[model.model] = new OllamaEmbeddings({
          baseUrl: ollamaEndpoint,
          model: model.model,
        });
        return acc;
      }, {});
    } catch (err) {
      logger.error(`Error loading Ollama embeddings: ${err}`);
    }
  }

  // Local in-process embedding models (no API key needed).
  try {
    models['local'] = {
      'BGE Small': new HuggingFaceTransformersEmbeddings({
        modelName: 'Xenova/bge-small-en-v1.5',
      }),
      'GTE Small': new HuggingFaceTransformersEmbeddings({
        modelName: 'Xenova/gte-small',
      }),
      'Bert Multilingual': new HuggingFaceTransformersEmbeddings({
        modelName: 'Xenova/bert-base-multilingual-uncased'
      }),
    };
  } catch(err) {
    logger.error(`Error loading local embeddings: ${err}`);
  }

  return models;
};

View File

@@ -0,0 +1,59 @@
import { ChatAnthropic } from '@langchain/anthropic';
import { getAnthropicApiKey } from '../../config';
import logger from '../../utils/logger';
/**
 * Builds the catalog of Anthropic chat models available to the app.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadAnthropicChatModels = async () => {
  const anthropicApiKey = getAnthropicApiKey();

  if (!anthropicApiKey) return {};

  try {
    // [model id, display name] pairs; every entry shares the same temperature.
    const catalog: Array<[string, string]> = [
      ['claude-3-5-sonnet-20241022', 'Claude 3.5 Sonnet'],
      ['claude-3-5-haiku-20241022', 'Claude 3.5 Haiku'],
      ['claude-3-opus-20240229', 'Claude 3 Opus'],
      ['claude-3-sonnet-20240229', 'Claude 3 Sonnet'],
      ['claude-3-haiku-20240307', 'Claude 3 Haiku'],
    ];

    const chatModels: Record<string, { displayName: string; model: ChatAnthropic }> = {};
    for (const [modelId, displayName] of catalog) {
      chatModels[modelId] = {
        displayName,
        model: new ChatAnthropic({
          temperature: 0.7,
          anthropicApiKey: anthropicApiKey,
          model: modelId,
        }),
      };
    }

    return chatModels;
  } catch (err) {
    logger.error(`Error loading Anthropic models: ${err}`);
    return {};
  }
};

View File

@@ -0,0 +1,19 @@
import { Business, SearchParams } from '../../../types/business';
import { WebScraperProvider } from './webScraper';
/**
 * Facade over the concrete business-data providers.
 * Currently delegates all searching to WebScraperProvider.
 */
export class BusinessProvider {
  private scraper: WebScraperProvider;

  constructor() {
    this.scraper = new WebScraperProvider();
  }

  /** Runs a business search via the web-scraper provider. */
  async search(params: SearchParams): Promise<Business[]> {
    return this.scraper.search(params);
  }

  /**
   * Looks up a single business by id.
   * NOTE(review): not implemented yet — always returns null.
   */
  async getDetails(businessId: string): Promise<Business | null> {
    // Implement detailed business lookup using stored data or additional scraping
    return null;
  }
}

View File

@@ -0,0 +1,111 @@
import { Business, SearchParams } from '../../../types/business';
import { searchWeb } from '../search'; // This is Perplexica's existing search function
import { parseHTML } from '../utils/parser';
export class WebScraperProvider {
  /**
   * Searches for businesses by running several query variants through the
   * shared web search, scraping each result page, and deduplicating.
   * Per-page extraction failures are logged and skipped.
   */
  async search(params: SearchParams): Promise<Business[]> {
    const searchQueries = this.generateQueries(params);
    const businesses: Business[] = [];

    for (const query of searchQueries) {
      // Use Perplexica's existing search functionality
      const results = await searchWeb(query, {
        maxResults: 20,
        type: 'general' // or 'news' depending on what we want
      });

      for (const result of results) {
        try {
          const html = await fetch(result.url).then(res => res.text());
          const businessData = await this.extractBusinessData(html, result.url);
          if (businessData) {
            businesses.push(businessData);
          }
        } catch (error) {
          console.error(`Failed to extract data from ${result.url}:`, error);
        }
      }
    }

    return this.deduplicateBusinesses(businesses);
  }

  /** Builds the query variants used to widen search coverage. */
  private generateQueries(params: SearchParams): string[] {
    const { location, category } = params;
    return [
      `${category} in ${location}`,
      `${category} business ${location}`,
      `best ${category} near ${location}`,
      `${category} services ${location} reviews`
    ];
  }

  /** Dispatches to a source-specific extractor based on the URL's host. */
  private async extractBusinessData(html: string, sourceUrl: string): Promise<Business | null> {
    const $ = parseHTML(html);

    // Different extraction logic based on source
    if (sourceUrl.includes('yelp.com')) {
      return this.extractYelpData($);
    } else if (sourceUrl.includes('yellowpages.com')) {
      return this.extractYellowPagesData($);
    }
    // ... other source-specific extractors
    return null;
  }

  private extractYelpData($: any): Business | null {
    try {
      return {
        id: crypto.randomUUID(),
        name: $('.business-name').text().trim(),
        phone: $('.phone-number').text().trim(),
        address: $('.address').text().trim(),
        city: $('.city').text().trim(),
        state: $('.state').text().trim(),
        zip: $('.zip').text().trim(),
        category: $('.category-str-list').text().split(',').map((s: string) => s.trim()),
        rating: parseFloat($('.rating').text()),
        // Explicit radix avoids surprises with leading zeros.
        reviewCount: parseInt($('.review-count').text(), 10),
        services: $('.services-list').text().split(',').map((s: string) => s.trim()),
        hours: this.extractHours($),
        website: $('.website-link').attr('href'),
        verified: false,
        lastUpdated: new Date()
      };
    } catch (error) {
      return null;
    }
  }

  /**
   * Extractor for yellowpages.com result pages.
   * This method was previously referenced by extractBusinessData but never
   * defined, so every yellowpages.com URL crashed extraction at runtime.
   * NOTE(review): selectors are a best-effort guess — verify against live markup.
   */
  private extractYellowPagesData($: any): Business | null {
    try {
      return {
        id: crypto.randomUUID(),
        name: $('.business-name, .sales-info h1').first().text().trim(),
        phone: $('.phone, .phones').first().text().trim(),
        address: $('.street-address, .address').first().text().trim(),
        city: $('.locality').first().text().trim(),
        state: $('.region').first().text().trim(),
        zip: $('.postal-code').first().text().trim(),
        category: $('.categories').text().split(',').map((s: string) => s.trim()),
        rating: parseFloat($('.rating').text()) || 0,
        reviewCount: parseInt($('.count').text(), 10) || 0,
        services: [],
        hours: this.extractHours($),
        website: $('.website-link, a.track-visit-website').attr('href'),
        verified: false,
        lastUpdated: new Date()
      };
    } catch (error) {
      return null;
    }
  }

  /**
   * Pulls opening-hours text from the page, if present.
   * Previously referenced by extractYelpData but never defined (runtime crash).
   * NOTE(review): confirm the expected shape of Business['hours'] — returning
   * raw text here and letting callers normalize.
   */
  private extractHours($: any): any {
    const hoursText = $('.hours, .hours-table, [itemprop="openingHours"]').text().trim();
    return hoursText || undefined;
  }

  /** Collapses near-duplicate listings keyed by lowercased phone + address. */
  private deduplicateBusinesses(businesses: Business[]): Business[] {
    // Group by phone number and address to identify duplicates
    const uniqueBusinesses = new Map<string, Business>();

    for (const business of businesses) {
      const key = `${business.phone}-${business.address}`.toLowerCase();
      if (!uniqueBusinesses.has(key)) {
        uniqueBusinesses.set(key, business);
      } else {
        // Merge data if we have additional information
        const existing = uniqueBusinesses.get(key)!;
        uniqueBusinesses.set(key, this.mergeBusinessData(existing, business));
      }
    }

    return Array.from(uniqueBusinesses.values());
  }

  /** Merges two records for the same business, preferring existing values. */
  private mergeBusinessData(existing: Business, newData: Business): Business {
    return {
      ...existing,
      services: [...new Set([...existing.services, ...newData.services])],
      // NOTE(review): a plain average skews when one side has no rating and
      // drifts when merging repeatedly — consider weighting by reviewCount.
      rating: (existing.rating + newData.rating) / 2,
      reviewCount: existing.reviewCount + newData.reviewCount,
      // Keep the most complete data for other fields
      website: existing.website || newData.website,
      email: existing.email || newData.email,
      hours: existing.hours || newData.hours
    };
  }
}

View File

@@ -0,0 +1,69 @@
import {
ChatGoogleGenerativeAI,
GoogleGenerativeAIEmbeddings,
} from '@langchain/google-genai';
import { getGeminiApiKey } from '../../config';
import logger from '../../utils/logger';
/**
 * Builds the catalog of Gemini chat models available to the app.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadGeminiChatModels = async () => {
  const geminiApiKey = getGeminiApiKey();

  if (!geminiApiKey) return {};

  try {
    // [model id, display name] pairs; every entry shares the same temperature.
    const catalog: Array<[string, string]> = [
      ['gemini-1.5-flash', 'Gemini 1.5 Flash'],
      ['gemini-1.5-flash-8b', 'Gemini 1.5 Flash 8B'],
      ['gemini-1.5-pro', 'Gemini 1.5 Pro'],
    ];

    const chatModels: Record<string, { displayName: string; model: ChatGoogleGenerativeAI }> = {};
    for (const [modelId, displayName] of catalog) {
      chatModels[modelId] = {
        displayName,
        model: new ChatGoogleGenerativeAI({
          modelName: modelId,
          temperature: 0.7,
          apiKey: geminiApiKey,
        }),
      };
    }

    return chatModels;
  } catch (err) {
    logger.error(`Error loading Gemini models: ${err}`);
    return {};
  }
};
/**
 * Builds the catalog of Gemini embeddings models.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadGeminiEmbeddingsModels = async () => {
  const geminiApiKey = getGeminiApiKey();

  if (!geminiApiKey) return {};

  try {
    // Gemini currently exposes a single embeddings model here.
    const embeddingModel = new GoogleGenerativeAIEmbeddings({
      apiKey: geminiApiKey,
      modelName: 'text-embedding-004',
    });

    return {
      'text-embedding-004': {
        displayName: 'Text Embedding',
        model: embeddingModel,
      },
    };
  } catch (err) {
    logger.error(`Error loading Gemini embeddings model: ${err}`);
    return {};
  }
};

136
src/lib/providers/groq.ts Normal file
View File

@@ -0,0 +1,136 @@
import { ChatOpenAI } from '@langchain/openai';
import { getGroqApiKey } from '../../config';
import logger from '../../utils/logger';
/**
 * Builds the catalog of Groq-hosted chat models.
 * Groq serves models through an OpenAI-compatible endpoint, so each entry is
 * a ChatOpenAI instance pointed at the Groq baseURL.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadGroqChatModels = async () => {
  const groqApiKey = getGroqApiKey();

  if (!groqApiKey) return {};

  try {
    // [model id, display name] pairs; every entry shares temperature and baseURL.
    const catalog: Array<[string, string]> = [
      ['llama-3.3-70b-versatile', 'Llama 3.3 70B'],
      ['llama-3.2-3b-preview', 'Llama 3.2 3B'],
      ['llama-3.2-11b-vision-preview', 'Llama 3.2 11B Vision'],
      ['llama-3.2-90b-vision-preview', 'Llama 3.2 90B Vision'],
      ['llama-3.1-8b-instant', 'Llama 3.1 8B'],
      ['llama3-8b-8192', 'LLaMA3 8B'],
      ['llama3-70b-8192', 'LLaMA3 70B'],
      ['mixtral-8x7b-32768', 'Mixtral 8x7B'],
      ['gemma2-9b-it', 'Gemma2 9B'],
    ];

    const chatModels: Record<string, { displayName: string; model: ChatOpenAI }> = {};
    for (const [modelId, displayName] of catalog) {
      chatModels[modelId] = {
        displayName,
        model: new ChatOpenAI(
          {
            openAIApiKey: groqApiKey,
            modelName: modelId,
            temperature: 0.7,
          },
          {
            baseURL: 'https://api.groq.com/openai/v1',
          },
        ),
      };
    }

    return chatModels;
  } catch (err) {
    logger.error(`Error loading Groq models: ${err}`);
    return {};
  }
};

View File

@@ -0,0 +1,49 @@
import { loadGroqChatModels } from './groq';
import { loadOllamaChatModels, loadOllamaEmbeddingsModels } from './ollama';
import { loadOpenAIChatModels, loadOpenAIEmbeddingsModels } from './openai';
import { loadAnthropicChatModels } from './anthropic';
import { loadTransformersEmbeddingsModels } from './transformers';
import { loadGeminiChatModels, loadGeminiEmbeddingsModels } from './gemini';
// Registry of chat-model loaders, keyed by provider id.
// Each loader returns {} when its provider is not configured, which the
// aggregate functions below use to omit the provider entirely.
const chatModelProviders = {
  openai: loadOpenAIChatModels,
  groq: loadGroqChatModels,
  ollama: loadOllamaChatModels,
  anthropic: loadAnthropicChatModels,
  gemini: loadGeminiChatModels,
};

// Registry of embedding-model loaders, keyed by provider id.
const embeddingModelProviders = {
  openai: loadOpenAIEmbeddingsModels,
  local: loadTransformersEmbeddingsModels,
  ollama: loadOllamaEmbeddingsModels,
  gemini: loadGeminiEmbeddingsModels,
};
export const getAvailableChatModelProviders = async () => {
const models = {};
for (const provider in chatModelProviders) {
const providerModels = await chatModelProviders[provider]();
if (Object.keys(providerModels).length > 0) {
models[provider] = providerModels;
}
}
models['custom_openai'] = {};
return models;
};
export const getAvailableEmbeddingModelProviders = async () => {
const models = {};
for (const provider in embeddingModelProviders) {
const providerModels = await embeddingModelProviders[provider]();
if (Object.keys(providerModels).length > 0) {
models[provider] = providerModels;
}
}
return models;
};

View File

@@ -0,0 +1,74 @@
import { OllamaEmbeddings } from '@langchain/community/embeddings/ollama';
import { getKeepAlive, getOllamaApiEndpoint } from '../../config';
import logger from '../../utils/logger';
import { ChatOllama } from '@langchain/community/chat_models/ollama';
import axios from 'axios';
/**
 * Discovers the models installed on the configured Ollama endpoint (via
 * /api/tags) and wraps each one as a chat model.
 * Returns {} when no endpoint is configured or discovery fails.
 */
export const loadOllamaChatModels = async () => {
  const ollamaEndpoint = getOllamaApiEndpoint();
  const keepAlive = getKeepAlive();

  if (!ollamaEndpoint) return {};

  try {
    const response = await axios.get(`${ollamaEndpoint}/api/tags`, {
      headers: { 'Content-Type': 'application/json' },
    });

    const { models: ollamaModels } = response.data;

    const chatModels: Record<string, { displayName: string; model: ChatOllama }> = {};
    for (const installed of ollamaModels) {
      chatModels[installed.model] = {
        displayName: installed.name,
        model: new ChatOllama({
          baseUrl: ollamaEndpoint,
          model: installed.model,
          temperature: 0.7,
          keepAlive: keepAlive,
        }),
      };
    }

    return chatModels;
  } catch (err) {
    logger.error(`Error loading Ollama models: ${err}`);
    return {};
  }
};
/**
 * Discovers the models installed on the configured Ollama endpoint (via
 * /api/tags) and wraps each one as an embeddings model.
 * Returns {} when no endpoint is configured or discovery fails.
 */
export const loadOllamaEmbeddingsModels = async () => {
  const ollamaEndpoint = getOllamaApiEndpoint();

  if (!ollamaEndpoint) return {};

  try {
    const response = await axios.get(`${ollamaEndpoint}/api/tags`, {
      headers: { 'Content-Type': 'application/json' },
    });

    const { models: ollamaModels } = response.data;

    const embeddingsModels: Record<string, { displayName: string; model: OllamaEmbeddings }> = {};
    for (const installed of ollamaModels) {
      embeddingsModels[installed.model] = {
        displayName: installed.name,
        model: new OllamaEmbeddings({
          baseUrl: ollamaEndpoint,
          model: installed.model,
        }),
      };
    }

    return embeddingsModels;
  } catch (err) {
    logger.error(`Error loading Ollama embeddings model: ${err}`);
    return {};
  }
};

View File

@@ -0,0 +1,89 @@
import { ChatOpenAI, OpenAIEmbeddings } from '@langchain/openai';
import { getOpenaiApiKey } from '../../config';
import logger from '../../utils/logger';
/**
 * Builds the catalog of OpenAI chat models available to the app.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadOpenAIChatModels = async () => {
  const openAIApiKey = getOpenaiApiKey();

  if (!openAIApiKey) return {};

  try {
    // [model id, display name] pairs; every entry shares the same temperature.
    const catalog: Array<[string, string]> = [
      ['gpt-3.5-turbo', 'GPT-3.5 Turbo'],
      ['gpt-4', 'GPT-4'],
      ['gpt-4-turbo', 'GPT-4 turbo'],
      ['gpt-4o', 'GPT-4 omni'],
      ['gpt-4o-mini', 'GPT-4 omni mini'],
    ];

    const chatModels: Record<string, { displayName: string; model: ChatOpenAI }> = {};
    for (const [modelId, displayName] of catalog) {
      chatModels[modelId] = {
        displayName,
        model: new ChatOpenAI({
          openAIApiKey,
          modelName: modelId,
          temperature: 0.7,
        }),
      };
    }

    return chatModels;
  } catch (err) {
    logger.error(`Error loading OpenAI models: ${err}`);
    return {};
  }
};
/**
 * Builds the catalog of OpenAI embeddings models.
 * Returns {} when no API key is configured or when loading fails.
 */
export const loadOpenAIEmbeddingsModels = async () => {
  const openAIApiKey = getOpenaiApiKey();

  if (!openAIApiKey) return {};

  try {
    const catalog: Array<[string, string]> = [
      ['text-embedding-3-small', 'Text Embedding 3 Small'],
      ['text-embedding-3-large', 'Text Embedding 3 Large'],
    ];

    const embeddingModels: Record<string, { displayName: string; model: OpenAIEmbeddings }> = {};
    for (const [modelId, displayName] of catalog) {
      embeddingModels[modelId] = {
        displayName,
        model: new OpenAIEmbeddings({
          openAIApiKey,
          modelName: modelId,
        }),
      };
    }

    return embeddingModels;
  } catch (err) {
    logger.error(`Error loading OpenAI embeddings model: ${err}`);
    return {};
  }
};

View File

@@ -0,0 +1,32 @@
import logger from '../../utils/logger';
import { HuggingFaceTransformersEmbeddings } from '../huggingfaceTransformer';
/**
 * Builds the catalog of local (in-process) Transformers embeddings models.
 * No API key needed; returns {} only if model construction fails.
 */
export const loadTransformersEmbeddingsModels = async () => {
  try {
    // [registry key, display name, HF model name] triples.
    const catalog: Array<[string, string, string]> = [
      ['xenova-bge-small-en-v1.5', 'BGE Small', 'Xenova/bge-small-en-v1.5'],
      ['xenova-gte-small', 'GTE Small', 'Xenova/gte-small'],
      [
        'xenova-bert-base-multilingual-uncased',
        'Bert Multilingual',
        'Xenova/bert-base-multilingual-uncased',
      ],
    ];

    const embeddingModels: Record<
      string,
      { displayName: string; model: HuggingFaceTransformersEmbeddings }
    > = {};
    for (const [key, displayName, modelName] of catalog) {
      embeddingModels[key] = {
        displayName,
        model: new HuggingFaceTransformersEmbeddings({ modelName }),
      };
    }

    return embeddingModels;
  } catch (err) {
    logger.error(`Error loading Transformers embeddings model: ${err}`);
    return {};
  }
};

54
src/lib/search.ts Normal file
View File

@@ -0,0 +1,54 @@
import axios from 'axios';
import { config } from '../config';
/** Options accepted by searchWeb. */
interface SearchOptions {
  maxResults?: number; // cap on returned results (default 20)
  type?: 'general' | 'news'; // SearxNG category to query
  engines?: string[]; // SearxNG engine names (default google/bing/duckduckgo)
}

/** One normalized row returned by searchWeb. */
interface SearchResult {
  url: string;
  title: string;
  content: string; // snippet text; may be empty
  score?: number; // engine-provided relevance, when available
}
/**
 * Queries the configured SearxNG instance and returns normalized results.
 *
 * @param query   free-text search query
 * @param options see SearchOptions; maxResults is now enforced client-side
 * @returns up to maxResults normalized results; [] on a malformed response
 * @throws rethrows axios/network errors after logging them
 */
export async function searchWeb(
  query: string,
  options: SearchOptions = {}
): Promise<SearchResult[]> {
  const {
    maxResults = 20,
    type = 'general',
    engines = ['google', 'bing', 'duckduckgo']
  } = options;

  try {
    const response = await axios.get(`${config.search.searxngUrl || process.env.SEARXNG_URL}/search`, {
      params: {
        q: query,
        format: 'json',
        categories: type,
        engines: engines.join(','),
        limit: maxResults
      }
    });

    if (!response.data || !response.data.results) {
      console.error('Invalid response from SearxNG:', response.data);
      return [];
    }

    // SearxNG does not reliably honor the `limit` param, so enforce
    // maxResults here as well — previously callers could get far more rows.
    return response.data.results
      .slice(0, maxResults)
      .map((result: any) => ({
        url: result.url,
        title: result.title,
        content: result.content || result.snippet || '',
        score: result.score
      }));
  } catch (error) {
    console.error('Search failed:', error);
    throw error;
  }
}

View File

@@ -1,47 +1,313 @@
import axios from 'axios';
import { getSearxngApiEndpoint } from '../config';
import * as cheerio from 'cheerio';
import { createWorker } from 'tesseract.js';
import { env } from '../config/env';
import { OllamaService } from './services/ollamaService';
import { BusinessData } from './types';
import { db } from './services/databaseService';
import { generateBusinessId } from './utils';
import { extractContactFromHtml, extractCleanAddress } from './utils/scraper';
import { GeocodingService } from './services/geocodingService';
import { cleanAddress, formatPhoneNumber, cleanEmail, cleanDescription } from './utils/dataCleanup';
import { CleanupService } from './services/cleanupService';
interface SearxngSearchOptions {
categories?: string[];
engines?: string[];
language?: string;
pageno?: number;
}
interface SearxngSearchResult {
title: string;
// Define interfaces used only in this file
interface SearchResult {
url: string;
img_src?: string;
thumbnail_src?: string;
thumbnail?: string;
content?: string;
author?: string;
iframe_src?: string;
}
export const searchSearxng = async (
query: string,
opts?: SearxngSearchOptions,
) => {
const searxngURL = getSearxngApiEndpoint();
const url = new URL(`${searxngURL}/search?format=json`);
url.searchParams.append('q', query);
if (opts) {
Object.keys(opts).forEach((key) => {
if (Array.isArray(opts[key])) {
url.searchParams.append(key, opts[key].join(','));
return;
}
url.searchParams.append(key, opts[key]);
});
}
const res = await axios.get(url.toString());
const results: SearxngSearchResult[] = res.data.results;
const suggestions: string[] = res.data.suggestions;
return { results, suggestions };
title: string;
content: string;
phone?: string;
email?: string;
address?: string;
website?: string;
rating?: number;
coordinates?: {
lat: number;
lng: number;
};
}
interface ContactInfo {
phone?: string;
email?: string;
address?: string;
description?: string;
openingHours?: string[];
}
/**
 * Main entry point: searches for businesses matching "«term» in «location»".
 * Pipeline: cache -> database + live web search (merged) -> LLM cleanup.
 * Progress callbacks report coarse stages (0 / 50 / 75 / 100).
 * Returns [] on any error rather than throwing.
 */
export async function searchBusinesses(
  query: string,
  options: { onProgress?: (status: string, progress: number) => void } = {}
): Promise<BusinessData[]> {
  try {
    console.log('Processing search query:', query);
    // Query must look like "<search term> in <location>".
    const [searchTerm, location] = query.split(' in ').map(s => s.trim());
    if (!searchTerm || !location) {
      throw new Error('Invalid search query format. Use: "search term in location"');
    }
    options.onProgress?.('Checking cache', 0);
    // Check cache first
    const cacheKey = `search:${searchTerm}:${location}`;
    let results = await db.getFromCache(cacheKey);
    if (!results) {
      // Check database for existing businesses
      console.log('Searching database for:', searchTerm, 'in', location);
      const existingBusinesses = await db.searchBusinesses(searchTerm, location);
      // Kick off the web search immediately; it runs while we report DB hits below.
      console.log('Starting web search');
      const searchPromise = performSearch(searchTerm, location, options);
      if (existingBusinesses.length > 0) {
        console.log(`Found ${existingBusinesses.length} existing businesses`);
        options.onProgress?.('Retrieved from database', 50);
      }
      // Wait for new results
      const newResults = await searchPromise;
      console.log(`Got ${newResults.length} new results from search`);
      // Merge results, removing duplicates by ID (DB records win).
      const allResults = [...existingBusinesses];
      for (const result of newResults) {
        if (!allResults.some(b => b.id === result.id)) {
          allResults.push(result);
        }
      }
      console.log(`Total unique results: ${allResults.length}`);
      // Cache combined results (TTL configured in hours, converted to ms here).
      await db.saveToCache(cacheKey, allResults, env.cache.durationHours * 60 * 60 * 1000);
      console.log(`Returning ${allResults.length} total results (${existingBusinesses.length} existing + ${newResults.length} new)`);
      results = allResults;
    }
    // Clean all results using LLM
    options.onProgress?.('Cleaning data', 75);
    const cleanedResults = await CleanupService.cleanBusinessRecords(results);
    options.onProgress?.('Search complete', 100);
    return cleanedResults;
  } catch (error) {
    console.error('Search error:', error);
    return [];
  }
}
/**
 * Runs the actual multi-query SearxNG search, filters and enriches the raw
 * hits, and persists each processed business to the database.
 * Individual query or save failures are logged and skipped, never thrown.
 */
async function performSearch(
  searchTerm: string,
  location: string,
  options: any
): Promise<BusinessData[]> {
  // Query variants widen coverage; results are deduplicated by URL below.
  const queries = [
    searchTerm + ' ' + location,
    searchTerm + ' business near ' + location,
    searchTerm + ' services ' + location,
    'local ' + searchTerm + ' ' + location
  ];
  options.onProgress?.('Searching multiple sources', 25);
  let allResults: SearchResult[] = [];
  // Tracks URLs seen across ALL queries so later queries can't re-add them.
  const seenUrls = new Set<string>();
  for (const q of queries) {
    try {
      const response = await axios.get(`${env.searxng.currentUrl}/search`, {
        params: {
          q,
          format: 'json',
          engines: 'google,google_maps',
          language: 'en-US',
          time_range: '',
          safesearch: 1
        }
      });
      if (response.data?.results) {
        // Deduplicate results
        const newResults = response.data.results.filter((result: SearchResult) => {
          if (seenUrls.has(result.url)) {
            return false;
          }
          seenUrls.add(result.url);
          return true;
        });
        console.log(`Found ${newResults.length} unique results from ${response.data.results[0]?.engine}`);
        allResults = allResults.concat(newResults);
      }
    } catch (error) {
      console.error(`Search failed for query "${q}":`, error);
    }
  }
  options.onProgress?.('Processing results', 50);
  const filteredResults = allResults.filter(isValidBusinessResult);
  const processedResults = await processResults(filteredResults, location);
  // Save results to database (best-effort; individual failures only logged).
  for (const result of processedResults) {
    await db.saveBusiness(result).catch(console.error);
  }
  options.onProgress?.('Search complete', 100);
  return processedResults;
}
// Add other necessary functions (isValidBusinessResult, processResults, etc.)
/**
 * Heuristic filter that keeps only search hits that look like an actual
 * business's own page: rejects directory/aggregator domains, article/listicle
 * titles, and pages with no business-page markers in URL or snippet.
 */
function isValidBusinessResult(result: SearchResult): boolean {
  const lowerUrl = result.url.toLowerCase();

  // Skip listing/directory pages and search results
  const skipPatterns = [
    'tripadvisor.com',
    'yelp.com',
    'opentable.com',
    'restaurants-for-sale',
    'guide.michelin.com',
    'denver.org',
    '/blog/',
    '/maps/',
    'search?',
    'features/',
    '/lists/',
    'reddit.com',
    'eater.com'
  ];
  for (const pattern of skipPatterns) {
    if (lowerUrl.includes(pattern)) {
      console.log(`Skipping listing page: ${result.url}`);
      return false;
    }
  }

  // Must have a title
  if (!result.title || result.title.length < 2) {
    return false;
  }

  // Skip results that look like articles or lists
  const articlePatterns = [
    'Best',
    'Top',
    'Guide',
    'Where to',
    'Welcome to',
    'Updated',
    'Near',
    'Restaurants in'
  ];
  for (const pattern of articlePatterns) {
    if (result.title.includes(pattern)) {
      console.log(`Skipping article: ${result.title}`);
      return false;
    }
  }

  // Only accept results that look like actual business pages
  const businessPatterns = [
    'menu',
    'reservation',
    'location',
    'contact',
    'about-us',
    'home'
  ];
  const lowerContent = result.content.toLowerCase();
  const looksLikeBusiness = businessPatterns.some(
    (pattern) => lowerUrl.includes(pattern) || lowerContent.includes(pattern)
  );
  if (!looksLikeBusiness) {
    console.log(`Skipping non-business page: ${result.url}`);
    return false;
  }

  return true;
}
/**
 * Converts raw search hits into cleaned BusinessData records: scrapes contact
 * info from each page, runs LLM cleanup, geocodes the cleaned address, and
 * keeps only records that have a name plus a phone or address.
 * Per-result failures are logged and skipped.
 */
async function processResults(results: SearchResult[], location: string): Promise<BusinessData[]> {
  const processedResults: BusinessData[] = [];
  // Get coordinates for the search location; fall back to fixed Denver, CO
  // coordinates when geocoding returns nothing.
  const locationGeo = await GeocodingService.geocode(location);
  const defaultCoords = locationGeo || { lat: 39.7392, lng: -104.9903 };
  for (const result of results) {
    try {
      // Extract contact info from webpage
      const contactInfo = await extractContactFromHtml(result.url);
      // Create initial business record; search-result fields win over scraped ones.
      const business: BusinessData = {
        id: generateBusinessId(result),
        name: cleanBusinessName(result.title),
        phone: result.phone || contactInfo.phone || '',
        email: result.email || contactInfo.email || '',
        address: result.address || contactInfo.address || '',
        rating: result.rating || 0,
        website: result.website || result.url || '',
        logo: '',
        source: 'web',
        description: result.content || contactInfo.description || '',
        location: defaultCoords,
        openingHours: contactInfo.openingHours
      };
      // Clean up the record using LLM
      const cleanedBusiness = await CleanupService.cleanBusinessRecord(business);
      // Get coordinates for cleaned address (more precise than the city default).
      if (cleanedBusiness.address) {
        const addressGeo = await GeocodingService.geocode(cleanedBusiness.address);
        if (addressGeo) {
          cleanedBusiness.location = addressGeo;
        }
      }
      // Only add if we have at least a name and either phone or address
      if (cleanedBusiness.name && (cleanedBusiness.phone || cleanedBusiness.address)) {
        processedResults.push(cleanedBusiness);
      }
    } catch (error) {
      console.error(`Error processing result ${result.title}:`, error);
    }
  }
  return processedResults;
}
// Helper functions
/**
 * Normalizes a raw result title into a bare business name.
 *
 * Removes a leading English article, a trailing separator-delimited tagline
 * ("Acme Bakery - Denver" → "Acme Bakery"), and parenthesized qualifiers
 * ("Joe's Pizza (Downtown)" → "Joe's Pizza").
 *
 * @param name raw title text from a search result
 * @returns the trimmed, cleaned business name
 */
function cleanBusinessName(name: string): string {
  return name
    // Drop a leading English article ("The", "A", "An").
    .replace(/^(The|A|An)\s+/i, '')
    // Drop a trailing " - …" / " – …" / " — …" / " : …" segment.
    // The separator must follow whitespace so hyphenated names ("Co-op")
    // survive. BUG FIX: the previous pattern /\s+(-||—|:).*$/ contained an
    // empty alternative, which made it match the first run of whitespace
    // and truncate every multi-word name after its first word.
    .replace(/\s+[-–—:].*$/, '')
    // Drop parenthesized qualifiers, e.g. "(Downtown)".
    .replace(/\s*\([^)]*\)/g, '')
    .trim();
}
/**
 * Resolves an address to latitude/longitude coordinates.
 *
 * Geocoding is not implemented yet: every address currently maps to the
 * same default location (Denver, CO), and the parameter is unused.
 *
 * @param address street address to resolve (currently ignored)
 * @returns default Denver coordinates
 */
async function getLocationCoordinates(address: string): Promise<{lat: number, lng: number}> {
  // TODO: wire up a real geocoder (see GeocodingService).
  const DENVER_COORDS = { lat: 39.7392, lng: -104.9903 };
  return DENVER_COORDS;
}
/**
 * Fire-and-forget refresh: re-runs the search for a term/location pair and
 * logs how many businesses came back. Errors are logged and swallowed so
 * the caller (typically responding to a user request) is never affected.
 */
async function searchAndUpdateInBackground(searchTerm: string, location: string) {
  try {
    const businesses = await performSearch(searchTerm, location, {});
    console.log(`Updated ${businesses.length} businesses in background`);
  } catch (error) {
    console.error('Background search error:', error);
  }
}
// ... rest of the file remains the same

View File

@@ -0,0 +1,111 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Cache } from '../utils/cache';
import { RateLimiter } from '../utils/rateLimiter';
/** Aggregated content scraped from one business website by BusinessCrawler. */
interface CrawlResult {
  mainContent: string;  // full text of the landing page's <body>
  contactInfo: string;  // raw HTML of the contact page ('' when none was found)
  aboutInfo: string;    // raw HTML of the about page ('' when none was found)
  structuredData: any;  // JSON-LD payloads from the landing page (array on success, {} on crawl failure)
}
/**
 * Crawls a business website and gathers the material needed for downstream
 * extraction: landing-page text, contact/about page HTML, and any JSON-LD
 * structured data. Results are cached and outbound requests are throttled
 * through the shared RateLimiter.
 */
export class BusinessCrawler {
  private cache: Cache<CrawlResult>;
  private rateLimiter: RateLimiter;

  constructor() {
    // TTL passed as 60 — presumably minutes ("1 hour" per the original
    // author); confirm against the Cache implementation.
    this.cache = new Cache<CrawlResult>(60);
    this.rateLimiter = new RateLimiter();
  }

  /**
   * Fetches the landing page of `url` plus, when discoverable, its contact
   * and about pages. Never throws: any failure is logged and yields an
   * empty CrawlResult so callers can proceed.
   */
  async crawlBusinessSite(url: string): Promise<CrawlResult> {
    // Serve from cache when possible to avoid re-hitting the site.
    const hit = this.cache.get(url);
    if (hit) return hit;

    await this.rateLimiter.waitForSlot();

    try {
      const landingHtml = await this.fetchPage(url);
      const $ = cheerio.load(landingHtml);

      // Locate secondary pages worth crawling.
      const contactUrl = this.findContactPage($, url);
      const aboutUrl = this.findAboutPage($, url);

      // Fetch both secondary pages concurrently; missing links become ''.
      const [contactHtml, aboutHtml] = await Promise.all([
        contactUrl ? this.fetchPage(contactUrl) : '',
        aboutUrl ? this.fetchPage(aboutUrl) : ''
      ]);

      const crawled: CrawlResult = {
        mainContent: $('body').text(),
        contactInfo: contactHtml,
        aboutInfo: aboutHtml,
        structuredData: this.extractStructuredData($)
      };

      this.cache.set(url, crawled);
      return crawled;
    } catch (error) {
      console.error(`Failed to crawl ${url}:`, error);
      // Degrade gracefully: empty result instead of a thrown error.
      return {
        mainContent: '',
        contactInfo: '',
        aboutInfo: '',
        structuredData: {}
      };
    }
  }

  /** GETs a page with a 10s timeout; returns '' on any failure. */
  private async fetchPage(url: string): Promise<string> {
    try {
      const response = await axios.get(url, {
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; BizSearch/1.0; +http://localhost:3000/about)',
        }
      });
      return response.data;
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return '';
    }
  }

  /** Absolute URL of the first anchor matching `selector`, or null. */
  private resolveFirstLink($: cheerio.CheerioAPI, selector: string, baseUrl: string): string | null {
    const links = $(selector);
    if (links.length === 0) return null;
    const href = links.first().attr('href');
    return href ? new URL(href, baseUrl).toString() : null;
  }

  /** Finds a "contact" page link on the landing page, if any. */
  private findContactPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    return this.resolveFirstLink($, 'a[href*="contact"], a:contains("Contact")', baseUrl);
  }

  /** Finds an "about" page link on the landing page, if any. */
  private findAboutPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    return this.resolveFirstLink($, 'a[href*="about"], a:contains("About")', baseUrl);
  }

  /** Collects every parseable JSON-LD <script> payload on the page. */
  private extractStructuredData($: cheerio.CheerioAPI): any {
    const payloads: any[] = [];
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        payloads.push(JSON.parse($(element).html() || '{}'));
      } catch (error) {
        console.error('Failed to parse structured data:', error);
      }
    });
    return payloads;
  }
}

View File

@@ -0,0 +1,71 @@
import { supabase } from '../supabase';
import { BusinessData } from '../searxng';
/**
 * Supabase-backed cache for search results, keyed by (category, location).
 * Keys are lower-cased on both read and write. All methods are best-effort:
 * failures are logged and never propagated to callers.
 */
export class CacheService {
  /**
   * Returns the newest unexpired cached result set for a category/location
   * pair, or null on a miss or lookup failure.
   *
   * NOTE: .single() rejects when no row matches, so a plain cache miss
   * lands in the catch block below and surfaces as null as well.
   */
  static async getCachedResults(category: string, location: string): Promise<BusinessData[] | null> {
    try {
      const { data, error } = await supabase
        .from('search_cache')
        .select('results')
        .eq('category', category.toLowerCase())
        .eq('location', location.toLowerCase())
        .gt('expires_at', new Date().toISOString())  // skip expired rows
        .order('created_at', { ascending: false })   // newest entry wins
        .limit(1)
        .single();
      if (error) throw error;
      return data ? data.results : null;
    } catch (error) {
      console.error('Cache lookup failed:', error);
      return null;
    }
  }

  /**
   * Inserts a fresh cache row for a category/location pair.
   *
   * @param expiresInDays time-to-live for the row (default: one week)
   */
  static async cacheResults(
    category: string,
    location: string,
    results: BusinessData[],
    expiresInDays: number = 7
  ): Promise<void> {
    try {
      // Compute the absolute expiry timestamp stored with the row.
      const expiresAt = new Date();
      expiresAt.setDate(expiresAt.getDate() + expiresInDays);
      const { error } = await supabase
        .from('search_cache')
        .insert({
          query: `${category} in ${location}`,
          category: category.toLowerCase(),
          location: location.toLowerCase(),
          results,
          expires_at: expiresAt.toISOString()
        });
      if (error) throw error;
    } catch (error) {
      console.error('Failed to cache results:', error);
    }
  }

  /**
   * Overwrites the cached results for existing category/location rows.
   * Does not extend expires_at — only results and updated_at change.
   */
  static async updateCache(
    category: string,
    location: string,
    newResults: BusinessData[]
  ): Promise<void> {
    try {
      const { error } = await supabase
        .from('search_cache')
        .update({
          results: newResults,
          updated_at: new Date().toISOString()
        })
        .eq('category', category.toLowerCase())
        .eq('location', location.toLowerCase());
      if (error) throw error;
    } catch (error) {
      console.error('Failed to update cache:', error);
    }
  }
}

View File

@@ -0,0 +1,235 @@
import { DeepSeekService } from './deepseekService';
import { Business } from '../types';
import { db } from './databaseService';
// Constants for validation and scoring
const BATCH_SIZE = 3; // Process businesses in small batches to avoid overwhelming LLM
const LLM_TIMEOUT = 30000; // 30 second timeout for LLM requests
const MIN_CONFIDENCE_SCORE = 0.7; // Minimum score required to cache results
// Basic "local-part@domain.tld" shape; deliberately simpler than full RFC 5322.
const VALID_EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
// Exactly the "(XXX) XXX-XXXX" format that validateAndClean produces.
const VALID_PHONE_REGEX = /^\(\d{3}\) \d{3}-\d{4}$/;
// US street address: house number, a street-type keyword, city, 2-letter state, 5-digit ZIP.
const VALID_ADDRESS_REGEX = /^\d+.*(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|way|parkway|pkwy|place|pl),?\s+[a-z ]+,\s*[a-z]{2}\s+\d{5}$/i;
/**
 * Post-processing pipeline for scraped business records: LLM-assisted
 * cleanup followed by deterministic validation, confidence scoring, and
 * caching of high-confidence results.
 */
export class CleanupService {
  /**
   * Attempts to clean business data using LLM with timeout protection.
   * Falls back to original data if LLM fails or times out.
   *
   * NOTE(review): the timeout timer is never cleared, so a fast LLM reply
   * still leaves a pending 30s timer behind — confirm this doesn't delay
   * process shutdown. Also, DeepSeekService.chat is called statically here
   * while the DeepSeekService shown elsewhere defines chat as an instance
   * method — verify which variant this file actually imports.
   */
  private static async cleanWithLLM(prompt: string, originalBusiness: Business): Promise<string> {
    try {
      // Rejects after LLM_TIMEOUT ms; races against the real request below.
      const timeoutPromise = new Promise((_, reject) => {
        setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT);
      });
      const llmPromise = DeepSeekService.chat([{
        role: 'user',
        content: prompt
      }]);
      // Whichever settles first wins: the LLM answer or the timeout.
      const response = await Promise.race([llmPromise, timeoutPromise]);
      return (response as string).trim();
    } catch (error) {
      console.error('LLM cleanup error:', error);
      // On timeout, return the original values
      return `
Address: ${originalBusiness.address}
Phone: ${originalBusiness.phone}
Email: ${originalBusiness.email}
Description: ${originalBusiness.description}
`;
    }
  }

  /**
   * Calculates a confidence score (0-1) for the cleaned business data.
   * Score is based on:
   * - Valid email format (0.25)
   * - Valid phone format (0.25)
   * - Valid address format (0.25)
   * - Description quality (0.25)
   */
  private static calculateConfidenceScore(business: Business): number {
    let score = 0;
    // Valid email adds 0.25
    if (business.email && VALID_EMAIL_REGEX.test(business.email)) {
      score += 0.25;
    }
    // Valid phone adds 0.25
    if (business.phone && VALID_PHONE_REGEX.test(business.phone)) {
      score += 0.25;
    }
    // Valid address adds 0.25
    if (business.address && VALID_ADDRESS_REGEX.test(business.address)) {
      score += 0.25;
    }
    // Description quality checks (0.25 max)
    if (business.description) {
      // Length check (0.1): long enough to be useful, short enough to be a blurb.
      if (business.description.length > 30 && business.description.length < 200) {
        score += 0.1;
      }
      // Relevance check (0.1): description mentions the inferred business type.
      const businessType = this.getBusinessType(business.name);
      if (business.description.toLowerCase().includes(businessType)) {
        score += 0.1;
      }
      // No HTML/markdown (0.05): penalize leftover markup characters.
      if (!/[<>[\]()]/.test(business.description)) {
        score += 0.05;
      }
    }
    return score;
  }

  /**
   * Determines the type of business based on name keywords.
   * Used for validating and generating descriptions.
   * Returns 'business' when no keyword matches.
   */
  private static getBusinessType(name: string): string {
    const types = [
      'restaurant', 'plumber', 'electrician', 'cafe', 'bar',
      'salon', 'shop', 'store', 'service'
    ];
    const nameLower = name.toLowerCase();
    return types.find(type => nameLower.includes(type)) || 'business';
  }

  /**
   * Parses LLM response into structured business data.
   * Expects format: "field: value" for each line.
   * Unrecognized lines are ignored; values may themselves contain colons
   * (they are re-joined after the first split).
   */
  private static parseResponse(response: string): Partial<Business> {
    const cleaned: Partial<Business> = {};
    const lines = response.split('\n');
    for (const line of lines) {
      // Split on the first colon only; re-join the rest as the value.
      const [field, ...values] = line.split(':');
      const value = values.join(':').trim();
      switch (field.toLowerCase().trim()) {
        case 'address':
          cleaned.address = value;
          break;
        case 'phone':
          cleaned.phone = value;
          break;
        case 'email':
          cleaned.email = value;
          break;
        case 'description':
          cleaned.description = value;
          break;
      }
    }
    return cleaned;
  }

  /**
   * Applies validation rules and cleaning to each field.
   * - Standardizes formats
   * - Removes invalid data (invalid fields are blanked, not rejected)
   * - Ensures consistent formatting
   * Does not mutate its argument; returns a cleaned copy.
   */
  private static validateAndClean(business: Business): Business {
    const cleaned = { ...business };
    // Email validation and cleaning
    if (cleaned.email) {
      cleaned.email = cleaned.email
        .toLowerCase()
        // Strip markdown link artifacts like "[x]" and "(mailto:…)".
        .replace(/\[|\]|\(mailto:.*?\)/g, '')
        // Strip a leading "NN-NN" numeric prefix.
        .replace(/^\d+-\d+/, '')
        .trim();
      // Blank out anything that still isn't a plausible address, plus
      // known junk values the LLM emits.
      if (!VALID_EMAIL_REGEX.test(cleaned.email) ||
        ['none', 'n/a', 'union office', ''].includes(cleaned.email.toLowerCase())) {
        cleaned.email = '';
      }
    }
    // Phone validation and cleaning: keep only exact 10-digit numbers,
    // reformatted to "(XXX) XXX-XXXX"; everything else is blanked.
    if (cleaned.phone) {
      const digits = cleaned.phone.replace(/\D/g, '');
      if (digits.length === 10) {
        cleaned.phone = `(${digits.slice(0,3)}) ${digits.slice(3,6)}-${digits.slice(6)}`;
      } else {
        cleaned.phone = '';
      }
    }
    // Address validation and cleaning
    if (cleaned.address) {
      cleaned.address = cleaned.address
        // Drop any preamble before the first digit or capital letter.
        .replace(/^.*?(?=\d|[A-Z])/s, '')
        // Drop LLM chatter prefixes ("Sure! …:", "The business …:", etc.);
        // the \\n matches a literal backslash-n left in the text.
        .replace(/^(Sure!.*?:|The business.*?:|.*?address.*?:)(?:\s*\\n)*\s*/si, '')
        .replace(/\s+/g, ' ')
        .trim();
      // Standardize state abbreviations
      cleaned.address = cleaned.address.replace(/\b(Colorado|Colo|Col)\b/gi, 'CO');
    }
    // Description validation and cleaning
    if (cleaned.description) {
      cleaned.description = cleaned.description
        .replace(/\$\d+(\.\d{2})?/g, '') // Remove prices
        .replace(/\b(call|email|website|click|visit)\b.*$/i, '') // Remove calls to action
        .replace(/\s+/g, ' ')
        .trim();
      // If the type is known but the description doesn't mention it,
      // replace the description with a generic type-based one.
      const businessType = this.getBusinessType(cleaned.name);
      if (businessType !== 'business' &&
        !cleaned.description.toLowerCase().includes(businessType)) {
        cleaned.description = `${businessType.charAt(0).toUpperCase() + businessType.slice(1)} services in the Denver area.`;
      }
    }
    return cleaned;
  }

  /**
   * Cleans one business record, using a per-record cache.
   * Only results scoring >= MIN_CONFIDENCE_SCORE are cached (TTL value is
   * 24*60*60*1000 — presumably milliseconds; confirm against db.saveToCache).
   * NOTE(review): DeepSeekService.cleanBusinessData is not visible in this
   * file — its contract is assumed to be a Partial<Business> of cleaned fields.
   */
  static async cleanBusinessRecord(business: Business): Promise<Business> {
    // Check cache first
    const cacheKey = `clean:${business.id}`;
    const cached = await db.getFromCache(cacheKey);
    if (cached) {
      console.log('Using cached clean data for:', business.name);
      return cached;
    }
    // Clean using DeepSeek
    const cleaned = await DeepSeekService.cleanBusinessData(business);
    const validated = this.validateAndClean({ ...business, ...cleaned });
    // Only cache if confidence score is high enough
    const confidence = this.calculateConfidenceScore(validated);
    if (confidence >= MIN_CONFIDENCE_SCORE) {
      await db.saveToCache(cacheKey, validated, 24 * 60 * 60 * 1000);
    }
    return validated;
  }

  /**
   * Cleans many records, BATCH_SIZE at a time (batches run sequentially;
   * records within a batch run in parallel). Order is preserved.
   */
  static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> {
    const cleanedBusinesses: Business[] = [];
    // Process in batches
    for (let i = 0; i < businesses.length; i += BATCH_SIZE) {
      const batch = businesses.slice(i, i + BATCH_SIZE);
      const cleanedBatch = await Promise.all(
        batch.map(business => this.cleanBusinessRecord(business))
      );
      cleanedBusinesses.push(...cleanedBatch);
    }
    return cleanedBusinesses;
  }
}

View File

@@ -0,0 +1,107 @@
import { OllamaService } from './ollamaService';
/** Business record after LLM extraction and deterministic validation. */
interface ValidatedBusinessData {
  name: string;         // verified business name, or 'Unknown' on failure
  phone: string;        // "(XXX) XXX-XXXX" or 'N/A'
  email: string;        // validated email address or 'N/A'
  address: string;      // formatted physical address or 'N/A'
  description: string;  // short description ('' when unavailable)
  hours?: string;       // business hours, when the source text had them
  isValid: boolean;     // true only when a name plus at least one contact method was found
}
/**
 * Uses a local Ollama model to extract and validate business details from
 * free-form text, then applies deterministic formatting on top of the
 * model's JSON output.
 */
export class DataValidationService {
  private ollama: OllamaService;

  constructor() {
    this.ollama = new OllamaService();
  }

  /**
   * Extracts a structured, validated business record from raw text.
   * Never throws: on any failure (LLM error, unparseable output) it
   * returns a placeholder record with isValid = false.
   */
  async validateAndCleanData(rawText: string): Promise<ValidatedBusinessData> {
    try {
      // Prompt instructs the model to emit ONLY a JSON object; the rules
      // below define the output contract that validateResult() re-checks.
      const prompt = `
You are a business data validation expert. Extract and validate business information from the following text.
Return ONLY a JSON object with the following format, nothing else:
{
"name": "verified business name",
"phone": "formatted phone number or N/A",
"email": "verified email address or N/A",
"address": "verified physical address or N/A",
"description": "short business description",
"hours": "business hours if available",
"isValid": boolean
}
Rules:
1. Phone numbers should be in (XXX) XXX-XXXX format
2. Addresses should be properly formatted with street, city, state, zip
3. Remove any irrelevant text from descriptions
4. Set isValid to true only if name and at least one contact method is found
5. Clean up any obvious formatting issues
6. Validate email addresses for proper format
Text to analyze:
${rawText}
`;
      const response = await this.ollama.generateResponse(prompt);
      try {
        // Find the JSON object in the response (models often wrap it in prose).
        const jsonMatch = response.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
          throw new Error('No JSON found in response');
        }
        const result = JSON.parse(jsonMatch[0]);
        return this.validateResult(result);
      } catch (parseError) {
        console.error('Failed to parse Ollama response:', parseError);
        throw parseError;
      }
    } catch (error) {
      console.error('Data validation failed:', error);
      // Fallback placeholder: marks the record invalid rather than throwing.
      return {
        name: 'Unknown',
        phone: 'N/A',
        email: 'N/A',
        address: 'N/A',
        description: '',
        hours: '',
        isValid: false
      };
    }
  }

  /**
   * Re-checks the model output field by field, substituting defaults for
   * anything missing or non-string so the returned shape is always complete.
   */
  private validateResult(result: any): ValidatedBusinessData {
    // Ensure all required fields are present
    const validated: ValidatedBusinessData = {
      name: this.cleanField(result.name) || 'Unknown',
      phone: this.formatPhone(result.phone) || 'N/A',
      email: this.cleanField(result.email) || 'N/A',
      address: this.cleanField(result.address) || 'N/A',
      description: this.cleanField(result.description) || '',
      hours: this.cleanField(result.hours),
      isValid: Boolean(result.isValid)
    };
    return validated;
  }

  /** Trims and collapses whitespace; non-strings become ''. */
  private cleanField(value: any): string {
    if (!value || typeof value !== 'string') return '';
    return value.trim().replace(/\s+/g, ' ');
  }

  /**
   * Reformats exact 10-digit numbers to "(XXX) XXX-XXXX".
   * Anything else is passed through unchanged (note: NOT blanked — callers
   * may receive unnormalized phone strings here).
   */
  private formatPhone(phone: string): string {
    if (!phone || phone === 'N/A') return 'N/A';
    // Extract digits
    const digits = phone.replace(/\D/g, '');
    if (digits.length === 10) {
      return `(${digits.slice(0,3)}) ${digits.slice(3,6)}-${digits.slice(6)}`;
    }
    return phone;
  }
}

View File

@@ -0,0 +1,80 @@
import { createClient } from '@supabase/supabase-js';
import { Business } from '../types';
import env from '../../config/env';
/**
 * Subset of Business fields accepted by DatabaseService.saveBusiness;
 * optional fields receive defaults at save time.
 */
interface PartialBusiness {
  name: string;
  address: string;
  phone: string;
  description: string;
  website?: string;
  rating?: number;   // defaults to 4.5 when omitted (see saveBusiness)
  source?: string;   // origin tag; defaults to 'deepseek'
  location?: {       // stored as a Postgres point literal "(lng,lat)"
    lat: number;
    lng: number;
  };
}
/**
 * Thin data-access layer over the Supabase `businesses` table.
 */
export class DatabaseService {
  private supabase;

  constructor() {
    this.supabase = createClient(env.SUPABASE_URL, env.SUPABASE_KEY);
  }

  /**
   * Inserts or updates a business row and returns the stored record.
   *
   * @throws Error when the upsert fails.
   */
  async saveBusiness(business: PartialBusiness): Promise<Business> {
    const { data, error } = await this.supabase
      .from('businesses')
      .upsert({
        name: business.name,
        address: business.address,
        phone: business.phone,
        description: business.description,
        website: business.website,
        source: business.source || 'deepseek',
        // BUG FIX: was `business.rating || 4.5`, which silently replaced a
        // legitimate 0 rating with the 4.5 default. `??` only substitutes
        // for null/undefined.
        rating: business.rating ?? 4.5,
        // Postgres point literal is "(x,y)", i.e. (lng,lat) here.
        location: business.location ? `(${business.location.lng},${business.location.lat})` : '(0,0)'
      })
      .select()
      .single();
    if (error) {
      console.error('Error saving business:', error);
      throw new Error('Failed to save business');
    }
    return data;
  }

  /**
   * Looks up businesses whose name/description match `query` and whose
   * address matches `location`, highest rated first.
   *
   * @throws Error when the query fails.
   */
  async findBusinessesByQuery(query: string, location: string): Promise<Business[]> {
    // PostgREST parses the .or() filter string itself, so commas and
    // parentheses in user input would corrupt (or redefine) the filter.
    // Neutralize them before interpolating.
    const safeQuery = query.replace(/[,()]/g, ' ');
    const safeLocation = location.replace(/[,()]/g, ' ');
    const { data, error } = await this.supabase
      .from('businesses')
      .select('*')
      .or(`name.ilike.%${safeQuery}%,description.ilike.%${safeQuery}%`)
      .ilike('address', `%${safeLocation}%`)
      .order('rating', { ascending: false });
    if (error) {
      console.error('Error finding businesses:', error);
      throw new Error('Failed to find businesses');
    }
    return data || [];
  }

  /**
   * Fetches one business by primary key.
   * Returns null when the row is missing or the lookup fails.
   */
  async getBusinessById(id: string): Promise<Business | null> {
    const { data, error } = await this.supabase
      .from('businesses')
      .select('*')
      .eq('id', id)
      .single();
    if (error) {
      console.error('Error getting business:', error);
      return null;
    }
    return data;
  }
}

View File

@@ -0,0 +1,285 @@
import axios from 'axios';
import EventEmitter from 'events';
import { Business } from '../types';
/** Business listing shape emitted by the LLM before full validation. */
interface PartialBusiness {
  name: string;
  address: string;
  phone: string;
  description: string;
  website?: string;
  rating?: number; // 1-5 stars, may be fractional (per the prompt contract)
}
/**
 * Client for a local Ollama server running a Deepseek model. Generates
 * (fictional) business listings as JSON, either streamed one business at a
 * time (streamChat) or as a single batched response (chat).
 */
export class DeepSeekService extends EventEmitter {
  private readonly baseUrl: string; // Ollama server base URL
  private readonly model: string;   // Ollama model tag

  constructor() {
    super();
    this.baseUrl = process.env.OLLAMA_URL || 'http://localhost:11434';
    this.model = process.env.OLLAMA_MODEL || 'deepseek-coder:6.7b';
    console.log('DeepSeekService initialized with:', {
      baseUrl: this.baseUrl,
      model: this.model
    });
  }

  /**
   * Streams businesses from the model, invoking `onResult` once per
   * complete JSON object extracted from the stream.
   *
   * NOTE(review): raw stream chunks are appended directly to the JSON
   * buffer. Ollama's /api/chat stream wraps content in per-chunk JSON
   * envelopes — verify that the business JSON is actually being unwrapped
   * correctly rather than matched out of the envelope by accident.
   *
   * @param messages chat history to send after the injected system prompt
   * @param onResult async callback invoked for each parsed business
   * @throws Error when the request or stream fails
   */
  async streamChat(messages: any[], onResult: (business: PartialBusiness) => Promise<void>): Promise<void> {
    try {
      console.log('\nStarting streaming chat request...');
      // Enhanced system prompt with more explicit instructions
      const enhancedMessages = [
        {
          role: "system",
          content: `You are a business search assistant powered by Deepseek Coder. Your task is to generate sample business listings in JSON format.
When asked about businesses in a location, return business listings one at a time in this exact JSON format:
\`\`\`json
{
"name": "Example Plumbing Co",
"address": "123 Main St, Denver, CO 80202",
"phone": "(303) 555-0123",
"description": "Licensed plumbing contractor specializing in residential and commercial services",
"website": "https://exampleplumbing.com",
"rating": 4.8
}
\`\`\`
Important rules:
1. Return ONE business at a time in JSON format
2. Generate realistic but fictional business data
3. Use proper formatting for phone numbers and addresses
4. Include ratings from 1-5 stars (can use decimals)
5. When sorting by rating, return highest rated first
6. Make each business unique with different names, addresses, and phone numbers
7. Keep descriptions concise and professional
8. Use realistic website URLs based on business names
9. Return exactly the number of businesses requested`
        },
        ...messages
      ];
      console.log('Sending streaming request to Ollama with messages:', JSON.stringify(enhancedMessages, null, 2));
      const response = await axios.post(`${this.baseUrl}/api/chat`, {
        model: this.model,
        messages: enhancedMessages,
        stream: true,
        temperature: 0.7,
        max_tokens: 1000,
        system: "You are a business search assistant that returns one business at a time in JSON format."
      }, {
        responseType: 'stream'
      });
      // Accumulates stream text until a complete business object appears.
      let currentJson = '';
      response.data.on('data', async (chunk: Buffer) => {
        const text = chunk.toString();
        currentJson += text;
        // Try to find and process complete JSON objects
        try {
          const business = await this.extractNextBusiness(currentJson);
          if (business) {
            currentJson = ''; // Reset for next business
            await onResult(business);
          }
        } catch (error) {
          // Continue collecting more data if JSON is incomplete
          console.debug('Collecting more data for complete JSON');
        }
      });
      // Resolve when the stream ends; reject on stream error.
      return new Promise((resolve, reject) => {
        response.data.on('end', () => resolve());
        response.data.on('error', (error: Error) => reject(error));
      });
    } catch (error) {
      console.error('\nDeepseek streaming chat error:', error);
      if (error instanceof Error) {
        console.error('Error stack:', error.stack);
        throw new Error(`AI model streaming error: ${error.message}`);
      }
      throw new Error('Failed to get streaming response from AI model');
    }
  }

  /**
   * Extracts the first complete, flat JSON object from `text`.
   * The regex /\{[^{]*\}/ only matches objects with no nested braces, so
   * nested structures would not be recognized. Returns null when no valid
   * object with all required fields (name/address/phone/description) is found.
   */
  private async extractNextBusiness(text: string): Promise<PartialBusiness | null> {
    // Try to find a complete JSON object
    const jsonMatch = text.match(/\{[^{]*\}/);
    if (!jsonMatch) return null;
    try {
      const jsonStr = jsonMatch[0];
      const business = JSON.parse(jsonStr);
      // Validate required fields
      if (!business.name || !business.address || !business.phone || !business.description) {
        return null;
      }
      return business;
    } catch (e) {
      return null;
    }
  }

  /**
   * Non-streaming chat: sends the conversation with an injected system
   * prompt and parses the model's reply into an array of businesses.
   *
   * @returns parsed PartialBusiness[] (typed `any` here; see sanitizeJsonResponse)
   * @throws Error when the request fails or no JSON can be recovered
   */
  async chat(messages: any[]): Promise<any> {
    try {
      console.log('\nStarting chat request...');
      // Enhanced system prompt with more explicit instructions
      const enhancedMessages = [
        {
          role: "system",
          content: `You are a business search assistant powered by Deepseek Coder. Your task is to generate sample business listings in JSON format.
When asked about businesses in a location, return business listings in this exact JSON format, with no additional text or comments:
\`\`\`json
[
{
"name": "Example Plumbing Co",
"address": "123 Main St, Denver, CO 80202",
"phone": "(303) 555-0123",
"description": "Licensed plumbing contractor specializing in residential and commercial services",
"website": "https://exampleplumbing.com",
"rating": 4.8
}
]
\`\`\`
Important rules:
1. Return ONLY the JSON array inside code blocks - no explanations or comments
2. Generate realistic but fictional business data
3. Use proper formatting for phone numbers (e.g., "(303) 555-XXXX") and addresses
4. Include ratings from 1-5 stars (can use decimals, e.g., 4.8)
5. When sorting by rating, sort from highest to lowest rating
6. When asked for a specific number of results, always return exactly that many
7. Make each business unique with different names, addresses, and phone numbers
8. Keep descriptions concise and professional
9. Use realistic website URLs based on business names`
        },
        ...messages
      ];
      console.log('Sending request to Ollama with messages:', JSON.stringify(enhancedMessages, null, 2));
      const response = await axios.post(`${this.baseUrl}/api/chat`, {
        model: this.model,
        messages: enhancedMessages,
        stream: false,
        temperature: 0.7,
        max_tokens: 1000,
        system: "You are a business search assistant that always responds with JSON data."
      });
      if (!response.data) {
        throw new Error('Empty response from AI model');
      }
      console.log('\nRaw response data:', JSON.stringify(response.data, null, 2));
      if (!response.data.message?.content) {
        throw new Error('No content in AI model response');
      }
      console.log('\nParsing AI response...');
      const results = await this.sanitizeJsonResponse(response.data.message.content);
      console.log('Parsed results:', JSON.stringify(results, null, 2));
      return results;
    } catch (error) {
      console.error('\nDeepseek chat error:', error);
      if (error instanceof Error) {
        console.error('Error stack:', error.stack);
        throw new Error(`AI model error: ${error.message}`);
      }
      throw new Error('Failed to get response from AI model');
    }
  }

  /**
   * Recovers business JSON from a model reply using three strategies in
   * order: fenced ```json blocks, bare JSON-looking spans, then regex-based
   * field extraction. Single objects are wrapped into a one-element array.
   *
   * @throws Error when none of the strategies yields usable data
   */
  private async sanitizeJsonResponse(text: string): Promise<PartialBusiness[]> {
    console.log('Attempting to parse response:', text);
    // First try to find JSON blocks
    const jsonBlockMatch = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/);
    if (jsonBlockMatch) {
      try {
        const jsonStr = jsonBlockMatch[1].trim();
        console.log('Found JSON block:', jsonStr);
        const parsed = JSON.parse(jsonStr);
        return Array.isArray(parsed) ? parsed : [parsed];
      } catch (e) {
        console.error('Failed to parse JSON block:', e);
      }
    }
    // Then try to find any JSON-like structure
    const jsonPatterns = [
      /\[\s*\{[\s\S]*\}\s*\]/, // Array of objects
      /\{[\s\S]*\}/ // Single object
    ];
    for (const pattern of jsonPatterns) {
      const match = text.match(pattern);
      if (match) {
        try {
          const jsonStr = match[0].trim();
          console.log('Found JSON pattern:', jsonStr);
          const parsed = JSON.parse(jsonStr);
          return Array.isArray(parsed) ? parsed : [parsed];
        } catch (e) {
          console.error('Failed to parse JSON pattern:', e);
          continue;
        }
      }
    }
    // If no valid JSON found, try to extract structured data
    try {
      const extractedData = this.extractBusinessData(text);
      if (extractedData) {
        console.log('Extracted business data:', extractedData);
        return [extractedData];
      }
    } catch (e) {
      console.error('Failed to extract business data:', e);
    }
    throw new Error('No valid JSON or business information found in response');
  }

  /**
   * Last-resort extraction: pulls individual "field: value" pairs out of
   * free text with regexes, substituting placeholder values for anything
   * missing so the result always has the required fields.
   */
  private extractBusinessData(text: string): PartialBusiness {
    // Extract business information using regex patterns
    const businessInfo: PartialBusiness = {
      name: this.extractField(text, 'name', '[^"\\n]+') || 'Unknown Business',
      address: this.extractField(text, 'address', '[^"\\n]+') || 'Address not available',
      phone: this.extractField(text, 'phone', '[^"\\n]+') || 'Phone not available',
      description: this.extractField(text, 'description', '[^"\\n]+') || 'No description available'
    };
    const website = this.extractField(text, 'website', '[^"\\n]+');
    if (website) {
      businessInfo.website = website;
    }
    const rating = this.extractField(text, 'rating', '[0-9.]+');
    if (rating) {
      businessInfo.rating = parseFloat(rating);
    }
    return businessInfo;
  }

  /**
   * Matches `"field": "value"` or `field = value` (quotes optional, case
   * insensitive) and returns the captured value, or '' when absent.
   */
  private extractField(text: string, field: string, pattern: string): string {
    const regex = new RegExp(`"?${field}"?\\s*[:=]\\s*"?(${pattern})"?`, 'i');
    const match = text.match(regex);
    return match ? match[1].trim() : '';
  }
}

View File

@@ -0,0 +1,63 @@
import axios from 'axios';
import { sleep } from '../utils/helpers';
/** Successful geocoder lookup for one address. */
interface GeocodingResult {
  lat: number;              // latitude in decimal degrees
  lng: number;              // longitude in decimal degrees
  formattedAddress: string; // display name returned by the geocoder
}
/**
 * Address → coordinate resolution backed by OpenStreetMap's Nominatim API.
 * Lookups are memoized per address string, and outbound requests are
 * throttled to at most one per second (Nominatim's usage requirement).
 */
export class GeocodingService {
  private static cache = new Map<string, GeocodingResult>();
  private static lastRequestTime = 0;
  private static RATE_LIMIT_MS = 1000; // 1 second between requests (Nominatim requirement)

  /** Sleeps until at least RATE_LIMIT_MS has passed since the last request. */
  private static async throttle(): Promise<void> {
    const elapsed = Date.now() - this.lastRequestTime;
    if (elapsed < this.RATE_LIMIT_MS) {
      await sleep(this.RATE_LIMIT_MS - elapsed);
    }
    this.lastRequestTime = Date.now();
  }

  /**
   * Resolves an address to coordinates.
   *
   * @returns the geocoded result, or null when Nominatim has no match or
   *          the request fails (failures are logged, never thrown).
   */
  static async geocode(address: string): Promise<GeocodingResult | null> {
    // Serve repeats from the in-memory cache without touching the API.
    const hit = this.cache.get(address);
    if (hit) return hit;

    try {
      await this.throttle();

      const response = await axios.get(
        'https://nominatim.openstreetmap.org/search',
        {
          params: {
            q: address,
            format: 'json',
            limit: 1,
            addressdetails: 1
          },
          headers: {
            'User-Agent': 'BusinessFinder/1.0'
          }
        }
      );

      // Nominatim returns an array of candidates; take the best one.
      if (!(response.data?.length > 0)) {
        return null;
      }
      const best = response.data[0];
      const coords: GeocodingResult = {
        lat: parseFloat(best.lat),
        lng: parseFloat(best.lon),
        formattedAddress: best.display_name
      };
      this.cache.set(address, coords);
      return coords;
    } catch (error) {
      console.error('Geocoding error:', error);
      return null;
    }
  }
}

Some files were not shown because too many files have changed in this diff Show More