forked from langchain-ai/langchainjs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
langchain[minor]: Multi-file loader (langchain-ai#5584)
* Multi-file loader * Update imports, add entrypoint, format --------- Co-authored-by: jacoblee93 <[email protected]>
- Loading branch information
1 parent
5984a6d
commit 3f07d61
Showing
7 changed files
with
240 additions
and
0 deletions.
There are no files selected for viewing
49 changes: 49 additions & 0 deletions
49
docs/core_docs/docs/integrations/document_loaders/file_loaders/multi_file.mdx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
--- | ||
sidebar_position: 2 | ||
hide_table_of_contents: true | ||
--- | ||
|
||
# Multiple individual files | ||
|
||
This example goes over how to load data from multiple individual file paths with `MultiFileLoader`. The first argument is an array of file paths, and the second argument is a map of file extensions to loader factories. Each file will be passed to the loader matching its extension, and the resulting documents will be concatenated together. | ||
|
||
Example files: | ||
|
||
```text | ||
src/document_loaders/example_data/example/ | ||
├── example.txt | ||
└── example.csv | ||
src/document_loaders/example_data/example2/ | ||
├── example.json | ||
└── example.jsonl | ||
``` | ||
|
||
Example code: | ||
|
||
```typescript | ||
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file"; | ||
import { | ||
JSONLoader, | ||
JSONLinesLoader, | ||
} from "langchain/document_loaders/fs/json"; | ||
import { TextLoader } from "langchain/document_loaders/fs/text"; | ||
import { CSVLoader } from "langchain/document_loaders/fs/csv"; | ||
|
||
const loader = new MultiFileLoader( | ||
[ | ||
"src/document_loaders/example_data/example/example.txt", | ||
"src/document_loaders/example_data/example/example.csv", | ||
"src/document_loaders/example_data/example2/example.json", | ||
"src/document_loaders/example_data/example2/example.jsonl", | ||
], | ||
{ | ||
".json": (path) => new JSONLoader(path, "/texts"), | ||
".jsonl": (path) => new JSONLinesLoader(path, "/html"), | ||
".txt": (path) => new TextLoader(path), | ||
".csv": (path) => new CSVLoader(path, "text"), | ||
} | ||
); | ||
const docs = await loader.load(); | ||
console.log({ docs }); | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import { extname, resolve } from "node:path"; | ||
import { stat } from "node:fs/promises"; | ||
import { Document } from "@langchain/core/documents"; | ||
import { BaseDocumentLoader } from "../base.js"; | ||
import { type LoadersMapping, UnknownHandling } from "./directory.js"; | ||
|
||
/** | ||
* A document loader that loads documents from multiple files. It extends the | ||
* `BaseDocumentLoader` class and implements the `load()` method. | ||
* @example | ||
* ```typescript | ||
* | ||
* const multiFileLoader = new MultiFileLoader( | ||
* ["path/to/file1.pdf", "path/to/file2.txt"], | ||
* { | ||
* ".pdf": (path: string) => new PDFLoader(path), | ||
* }, | ||
* ); | ||
* | ||
* const docs = await multiFileLoader.load(); | ||
* console.log({ docs }); | ||
* | ||
* ``` | ||
*/ | ||
export class MultiFileLoader extends BaseDocumentLoader { | ||
constructor( | ||
public filePaths: string[], | ||
public loaders: LoadersMapping, | ||
public unknown: UnknownHandling = UnknownHandling.Warn | ||
) { | ||
super(); | ||
|
||
if (Object.keys(loaders).length === 0) { | ||
throw new Error("Must provide at least one loader"); | ||
} | ||
for (const extension in loaders) { | ||
if (Object.hasOwn(loaders, extension)) { | ||
if (extension[0] !== ".") { | ||
throw new Error(`Extension must start with a dot: ${extension}`); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Loads the documents from the provided file paths. It checks if the file | ||
* is a directory and ignores it. If a file is a file, it checks if there | ||
* is a corresponding loader function for the file extension in the `loaders` | ||
* mapping. If there is, it loads the documents. If there is no | ||
* corresponding loader function and `unknown` is set to `Warn`, it logs a | ||
* warning message. If `unknown` is set to `Error`, it throws an error. | ||
* @returns A promise that resolves to an array of loaded documents. | ||
*/ | ||
public async load(): Promise<Document[]> { | ||
const documents: Document[] = []; | ||
|
||
for (const filePath of this.filePaths) { | ||
const fullPath = resolve(filePath); | ||
const fileStat = await stat(fullPath); | ||
|
||
if (fileStat.isDirectory()) { | ||
console.warn(`Ignoring directory: ${fullPath}`); | ||
continue; | ||
} | ||
|
||
const loaderFactory = this.loaders[extname(fullPath)]; | ||
if (loaderFactory) { | ||
const loader = loaderFactory(fullPath); | ||
documents.push(...(await loader.load())); | ||
} else { | ||
switch (this.unknown) { | ||
case UnknownHandling.Ignore: | ||
break; | ||
case UnknownHandling.Warn: | ||
console.warn(`Unknown file type: ${fullPath}`); | ||
break; | ||
case UnknownHandling.Error: | ||
throw new Error(`Unknown file type: ${fullPath}`); | ||
default: | ||
throw new Error(`Unknown unknown handling: ${this.unknown}`); | ||
} | ||
} | ||
} | ||
|
||
return documents; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import * as url from "node:url"; | ||
import * as path from "node:path"; | ||
import { test, expect } from "@jest/globals"; | ||
import { MultiFileLoader } from "../fs/multi_file.js"; | ||
import { CSVLoader } from "../fs/csv.js"; | ||
import { PDFLoader } from "../fs/pdf.js"; | ||
import { TextLoader } from "../fs/text.js"; | ||
import { JSONLoader } from "../fs/json.js"; | ||
import { UnknownHandling } from "../fs/directory.js"; | ||
|
||
// Loads a mixed set of example files through MultiFileLoader and checks both
// the total document count and the per-file source metadata.
test("Test MultiFileLoader", async () => {
  const exampleDataDir = path.resolve(
    path.dirname(url.fileURLToPath(import.meta.url)),
    "./example_data"
  );
  const inExampleData = (fileName: string): string =>
    path.resolve(exampleDataDir, fileName);

  // Each input file paired with the number of documents it should yield,
  // listed in the order the sorted sources are expected to appear.
  const expectedCounts: Array<[string, number]> = [
    ["1706.03762.pdf", 15],
    ["Jacob_Lee_Resume_2023.pdf", 1],
    // CSV
    ["Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv", 32],
    // JSON
    ["Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json", 32],
    ["complex.json", 10],
    // TXT
    ["example.txt", 1],
    // CSV
    ["example_separator.csv", 32],
  ];

  const loader = new MultiFileLoader(
    expectedCounts.map(([fileName]) => inExampleData(fileName)),
    {
      // The pipe-separated fixture needs an explicit separator option.
      ".csv": (p) =>
        p.includes("separator.csv")
          ? new CSVLoader(p, { column: "html", separator: "|" })
          : new CSVLoader(p, "html"),
      ".pdf": (p) => new PDFLoader(p),
      ".txt": (p) => new TextLoader(p),
      ".json": (p) => new JSONLoader(p),
    },
    UnknownHandling.Ignore
  );

  const docs = await loader.load();
  expect(docs.length).toBe(123);

  const expectedSources = expectedCounts.flatMap(([fileName, count]) =>
    Array.from({ length: count }, () => inExampleData(fileName))
  );

  const actualSources = docs.map((d) => d.metadata.source).sort();
  expect(actualSources).toEqual(expectedSources);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters