Remove unused output parsers and document utility

This commit is contained in:
ItzCrazyKns
2025-11-23 19:21:16 +05:30
parent d6c364fdcb
commit 6d35d60b49
3 changed files with 0 additions and 197 deletions

View File

@@ -1,48 +0,0 @@
import { BaseOutputParser } from '@langchain/core/output_parsers';
/** Constructor options accepted by `LineOutputParser`. */
interface LineOutputParserArgs {
/** Tag name whose `<key>…</key>` content is extracted; the parser uses 'questions' when omitted. */
key?: string;
}
/**
 * Output parser that extracts the text enclosed in a `<key>…</key>` tag pair
 * and strips any leading list marker (`-`, `*`, `1.`, `1)`, `•`) from it.
 *
 * The tag name defaults to `questions` and can be overridden through the
 * constructor's `key` option.
 */
class LineOutputParser extends BaseOutputParser<string | undefined> {
  private key = 'questions';

  /**
   * @param args Optional settings; `args.key` replaces the default tag name.
   */
  constructor(args?: LineOutputParserArgs) {
    super();
    this.key = args?.key ?? this.key;
  }

  static lc_name() {
    return 'LineOutputParser';
  }

  lc_namespace = ['langchain', 'output_parsers', 'line_output_parser'];

  /**
   * Extracts the tagged content from `text`.
   *
   * @param text Raw text expected to contain `<key>…</key>`.
   * @returns The trimmed content between the tags with leading list markers
   *   removed, or `undefined` when the opening tag is missing or no closing
   *   tag follows it.
   */
  async parse(text: string): Promise<string | undefined> {
    const openTag = `<${this.key}>`;
    const closeTag = `</${this.key}>`;
    // One or more leading list markers, each optionally surrounded by whitespace.
    const listMarkerPrefix = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;

    const trimmed = text.trim();

    const openIndex = trimmed.indexOf(openTag);
    if (openIndex === -1) {
      return undefined;
    }

    // Look for the closing tag only AFTER the opening tag, so a stray closing
    // tag earlier in the text cannot produce an empty or inverted slice.
    const contentStart = openIndex + openTag.length;
    const contentEnd = trimmed.indexOf(closeTag, contentStart);
    if (contentEnd === -1) {
      return undefined;
    }

    return trimmed
      .slice(contentStart, contentEnd)
      .trim()
      .replace(listMarkerPrefix, '');
  }

  /**
   * Format instructions are not provided by this parser.
   *
   * @throws Error always ('Not implemented.').
   */
  getFormatInstructions(): string {
    throw new Error('Not implemented.');
  }
}
export default LineOutputParser;

View File

@@ -1,50 +0,0 @@
import { BaseOutputParser } from '@langchain/core/output_parsers';
/** Constructor options accepted by `LineListOutputParser`. */
interface LineListOutputParserArgs {
/** Tag name whose `<key>…</key>` content is extracted; the parser uses 'questions' when omitted. */
key?: string;
}
/**
 * Output parser that extracts the text enclosed in a `<key>…</key>` tag pair
 * and returns it as a list of lines, with any leading list marker
 * (`-`, `*`, `1.`, `1)`, `•`) removed from each line.
 *
 * The tag name defaults to `questions` and can be overridden through the
 * constructor's `key` option.
 */
class LineListOutputParser extends BaseOutputParser<string[]> {
  private key = 'questions';

  /**
   * @param args Optional settings; `args.key` replaces the default tag name.
   */
  constructor(args?: LineListOutputParserArgs) {
    super();
    this.key = args?.key ?? this.key;
  }

  static lc_name() {
    return 'LineListOutputParser';
  }

  lc_namespace = ['langchain', 'output_parsers', 'line_list_output_parser'];

  /**
   * Extracts the tagged content from `text` and splits it into lines.
   *
   * @param text Raw text expected to contain `<key>…</key>`.
   * @returns The non-blank lines found between the tags, each with its
   *   leading list markers removed. Returns an empty array when the opening
   *   tag is missing or no closing tag follows it.
   */
  async parse(text: string): Promise<string[]> {
    const openTag = `<${this.key}>`;
    const closeTag = `</${this.key}>`;
    // One or more leading list markers, each optionally surrounded by whitespace.
    const listMarkerPrefix = /^(\s*(-|\*|\d+\.\s|\d+\)\s|\u2022)\s*)+/;

    const trimmed = text.trim();

    const openIndex = trimmed.indexOf(openTag);
    if (openIndex === -1) {
      return [];
    }

    // Look for the closing tag only AFTER the opening tag, so a stray closing
    // tag earlier in the text cannot produce an empty or inverted slice.
    const contentStart = openIndex + openTag.length;
    const contentEnd = trimmed.indexOf(closeTag, contentStart);
    if (contentEnd === -1) {
      return [];
    }

    return (
      trimmed
        .slice(contentStart, contentEnd)
        .trim()
        // Accept both LF and CRLF so entries never keep a trailing '\r'.
        .split(/\r?\n/)
        .filter((line) => line.trim() !== '')
        .map((line) => line.replace(listMarkerPrefix, ''))
    );
  }

  /**
   * Format instructions are not provided by this parser.
   *
   * @throws Error always ('Not implemented.').
   */
  getFormatInstructions(): string {
    throw new Error('Not implemented.');
  }
}
export default LineListOutputParser;

View File

@@ -1,99 +0,0 @@
import axios from 'axios';
import { htmlToText } from 'html-to-text';
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import { Document } from '@langchain/core/documents';
import pdfParse from 'pdf-parse';
/**
 * Downloads every link, converts the response body to plain text, splits the
 * text into chunks and wraps each chunk in a `Document` carrying `title` and
 * `url` metadata.
 *
 * - Links without an `http://` or `https://` prefix are fetched over `https://`.
 * - Responses whose Content-Type media type is `application/pdf` are parsed
 *   with `pdf-parse` and titled 'PDF Document'.
 * - Any other response is treated as HTML: it is converted with `html-to-text`
 *   (anchor hrefs omitted) and titled with the page's `<title>`, falling back
 *   to the link itself.
 * - A link that fails at any step is logged and contributes a single
 *   placeholder document describing the failure; the function does not reject
 *   because of it.
 *
 * Links are fetched concurrently, but the returned documents are grouped in
 * the same order as `links`.
 *
 * @param links URLs (with or without scheme) to retrieve.
 * @returns All generated documents, flattened into one array.
 */
export const getDocumentsFromLinks = async ({ links }: { links: string[] }) => {
  const splitter = new RecursiveCharacterTextSplitter();

  // Collapses every whitespace run (including line breaks) into one space.
  const collapseWhitespace = (raw: string): string =>
    raw.replace(/\s+/g, ' ').trim();

  // Splits `text` into chunks and wraps each chunk in a Document.
  const toDocuments = async (
    text: string,
    title: string,
    url: string,
  ): Promise<Document[]> => {
    const chunks = await splitter.splitText(text);
    return chunks.map(
      (chunk) =>
        new Document({
          pageContent: chunk,
          metadata: {
            title: title,
            url: url,
          },
        }),
    );
  };

  // Each callback returns its own documents instead of pushing into a shared
  // array, so the final order follows `links` rather than network timing.
  const docsPerLink = await Promise.all(
    links.map(async (rawLink): Promise<Document[]> => {
      const link =
        rawLink.startsWith('http://') || rawLink.startsWith('https://')
          ? rawLink
          : `https://${rawLink}`;

      try {
        const res = await axios.get(link, {
          responseType: 'arraybuffer',
        });

        // Content-Type may carry parameters ("application/pdf; charset=...")
        // and is case-insensitive, so compare only the media type itself.
        const contentType = String(res.headers['content-type'] ?? '');
        const isPdf = /^\s*application\/pdf\s*(;|$)/i.test(contentType);

        if (isPdf) {
          const pdf = await pdfParse(res.data);
          // `return await` keeps splitter failures inside this try/catch.
          return await toDocuments(
            collapseWhitespace(pdf.text),
            'PDF Document',
            link,
          );
        }

        // Decode the body once and reuse it for both text and title.
        const html: string = res.data.toString('utf8');

        const pageText = collapseWhitespace(
          htmlToText(html, {
            selectors: [
              {
                selector: 'a',
                options: {
                  ignoreHref: true,
                },
              },
            ],
          }),
        );

        // Non-greedy, case-insensitive, newline-tolerant match of the first
        // <title> element (attributes allowed on the opening tag).
        const rawTitle = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i)?.[1];
        const title = rawTitle ? collapseWhitespace(rawTitle) : '';

        return await toDocuments(pageText, title || link, link);
      } catch (err) {
        console.error(
          'An error occurred while getting documents from links: ',
          err,
        );
        return [
          new Document({
            pageContent: `Failed to retrieve content from the link: ${err}`,
            metadata: {
              title: 'Failed to retrieve content',
              url: link,
            },
          }),
        ];
      }
    }),
  );

  return docsPerLink.flat();
};