Skip to content

Commit eb07b3f

Browse files
authored
Add support for (dis)allowed special tokens in token text splitter (langchain-ai#134)
1 parent 917c158 commit eb07b3f

File tree

5 files changed

+20
-22
lines changed

5 files changed

+20
-22
lines changed

examples/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
"author": "Langchain",
2020
"license": "MIT",
2121
"dependencies": {
22-
"@dqbd/tiktoken": "^0.2.1",
22+
"@dqbd/tiktoken": "^0.4.0",
2323
"chromadb": "^1.3.0",
2424
"langchain": "workspace:*",
2525
"openai": "^3.1.0",

examples/src/indexes/token_text_splitter.ts

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ export const run = async () => {
1414
encodingName: "r50k_base",
1515
chunkSize: 10,
1616
chunkOverlap: 0,
17+
allowedSpecial: ["<|endoftext|>"],
18+
disallowedSpecial: [],
1719
});
1820

1921
const output = splitter.createDocuments([text]);

langchain/package.json

+2-2
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
"devDependencies": {
7171
"@babel/core": "^7.20.12",
7272
"@babel/preset-env": "^7.20.2",
73-
"@dqbd/tiktoken": "^0.2.1",
73+
"@dqbd/tiktoken": "^0.4.0",
7474
"@jest/globals": "^29.4.2",
7575
"@tsconfig/recommended": "^1.0.2",
7676
"@types/node-fetch": "2",
@@ -103,7 +103,7 @@
103103
"typescript": "^4.9.5"
104104
},
105105
"peerDependencies": {
106-
"@dqbd/tiktoken": "^0.2.1",
106+
"@dqbd/tiktoken": "^0.4.0",
107107
"cheerio": "^1.0.0-rc.12",
108108
"chromadb": "^1.3.0",
109109
"cohere-ai": "^5.0.2",

langchain/text_splitter.ts

+8-12
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ export class RecursiveCharacterTextSplitter
185185

186186
export interface TokenTextSplitterParams extends TextSplitterParams {
187187
encodingName: tiktoken.TiktokenEmbedding;
188-
allowedSpecial: "all" | Set<string>;
188+
allowedSpecial: "all" | Array<string>;
189189
disallowedSpecial: "all" | Array<string>;
190190
}
191191

@@ -198,7 +198,7 @@ export class TokenTextSplitter
198198
{
199199
encodingName: tiktoken.TiktokenEmbedding;
200200

201-
allowedSpecial: "all" | Set<string>;
201+
allowedSpecial: "all" | Array<string>;
202202

203203
disallowedSpecial: "all" | Array<string>;
204204

@@ -208,17 +208,9 @@ export class TokenTextSplitter
208208
super(fields);
209209

210210
this.encodingName = fields?.encodingName ?? "gpt2";
211-
this.allowedSpecial = fields?.allowedSpecial ?? new Set();
211+
this.allowedSpecial = fields?.allowedSpecial ?? [];
212212
this.disallowedSpecial = fields?.disallowedSpecial ?? "all";
213213

214-
if (fields?.allowedSpecial != null) {
215-
throw new Error("allowedSpecial is not implemented yet.");
216-
}
217-
218-
if (fields?.disallowedSpecial != null) {
219-
throw new Error("disallowedSpecial is not implemented yet.");
220-
}
221-
222214
try {
223215
const tiktoken =
224216
// eslint-disable-next-line @typescript-eslint/no-var-requires, global-require
@@ -235,7 +227,11 @@ export class TokenTextSplitter
235227
splitText(text: string): string[] {
236228
const splits: string[] = [];
237229

238-
const input_ids = this.tokenizer.encode(text);
230+
const input_ids = this.tokenizer.encode(
231+
text,
232+
this.allowedSpecial,
233+
this.disallowedSpecial
234+
);
239235

240236
let start_idx = 0;
241237
let cur_idx = Math.min(start_idx + this.chunkSize, input_ids.length);

yarn.lock

+7-7
Original file line numberDiff line numberDiff line change
@@ -2248,10 +2248,10 @@ __metadata:
22482248
languageName: node
22492249
linkType: hard
22502250

2251-
"@dqbd/tiktoken@npm:^0.2.1":
2252-
version: 0.2.1
2253-
resolution: "@dqbd/tiktoken@npm:0.2.1"
2254-
checksum: 1d3fd243112b154ced97985585a439837401614e0498c9798899d925f781de769da9a0918b3f4d4c4dded1e172a546b6ec4713ad988a7990c9e1872d341b7098
2251+
"@dqbd/tiktoken@npm:^0.4.0":
2252+
version: 0.4.0
2253+
resolution: "@dqbd/tiktoken@npm:0.4.0"
2254+
checksum: 2d708e70b86bd09fc9f5a5cb5e9d3bee34574e18b4c28911f311011f6673008b91510eea658e2769e29421c8e8cd7bf2c0938ffdcad58fa082871f696717383f
22552255
languageName: node
22562256
linkType: hard
22572257

@@ -9429,7 +9429,7 @@ __metadata:
94299429
version: 0.0.0-use.local
94309430
resolution: "langchain-examples@workspace:examples"
94319431
dependencies:
9432-
"@dqbd/tiktoken": ^0.2.1
9432+
"@dqbd/tiktoken": ^0.4.0
94339433
"@tsconfig/recommended": ^1.0.2
94349434
"@typescript-eslint/eslint-plugin": ^5.51.0
94359435
"@typescript-eslint/parser": ^5.51.0
@@ -9455,7 +9455,7 @@ __metadata:
94559455
dependencies:
94569456
"@babel/core": ^7.20.12
94579457
"@babel/preset-env": ^7.20.2
9458-
"@dqbd/tiktoken": ^0.2.1
9458+
"@dqbd/tiktoken": ^0.4.0
94599459
"@jest/globals": ^29.4.2
94609460
"@tsconfig/recommended": ^1.0.2
94619461
"@types/node-fetch": 2
@@ -9496,7 +9496,7 @@ __metadata:
94969496
uuid: ^9.0.0
94979497
yaml: ^2.2.1
94989498
peerDependencies:
9499-
"@dqbd/tiktoken": ^0.2.1
9499+
"@dqbd/tiktoken": ^0.4.0
95009500
cheerio: ^1.0.0-rc.12
95019501
chromadb: ^1.3.0
95029502
cohere-ai: ^5.0.2

0 commit comments

Comments
 (0)