1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import { fetch } from "https://esm.town/v/std/fetch";
import { fetchJSON } from "https://esm.town/v/stevekrouse/fetchJSON?v=41";
/**
 * Tokenize `text` with the tiktoken BPE encoder for the given OpenAI model.
 *
 * Loads the @dqbd/tiktoken lite wasm build plus the encoder registry at call
 * time (everything is fetched from esm.sh, so each invocation is self-contained).
 *
 * @param text      The string to tokenize.
 * @param modelName OpenAI model name (e.g. "gpt-3.5-turbo", "text-davinci-003").
 *                  Defaults to "gpt-3.5-turbo".
 * @returns The encoded token ids produced by `Tiktoken.encode`.
 */
export const Tokenizer = async (text: string, modelName?: string) => {
  const { init, Tiktoken } = await import(
    "https://esm.sh/@dqbd/tiktoken@1.0.7/lite/init"
  );
  const { load } = await import("https://esm.sh/@dqbd/tiktoken@1.0.7/load");
  const registry = await fetchJSON(
    "https://esm.sh/@dqbd/tiktoken@1.0.7/registry.json",
  );
  const models = await fetchJSON(
    "https://esm.sh/@dqbd/tiktoken@1.0.7/model_to_encoding.json",
  );
  type ObjectValues<T> = T[keyof T];
  type Model = keyof typeof models;
  type Encoder = ObjectValues<typeof models>;
  type EncoderConfig = ObjectValues<typeof registry>;
  type Registry = Record<Encoder, EncoderConfig>;
  // Initialize the wasm via discussion in https://github.com/dqbd/tiktoken/issues/22
  // (runs once per invocation — every state here is local to this call, so there
  // is nothing meaningful to cache across calls).
  await init(async (imports) => {
    const req = await fetch(
      "https://esm.sh/@dqbd/tiktoken@1.0.7/lite/tiktoken_bg.wasm",
    );
    return WebAssembly.instantiate(await req.arrayBuffer(), imports);
  });
  // MARK: gpt-3.5-turbo uses the cl100k_base encoding whereas text-davinci-003 uses the p50k_base
  const model = await load(
    (registry as Registry)[models[(modelName ?? "gpt-3.5-turbo") as Model]],
  );
  const encoder = new Tiktoken(
    model.bpe_ranks,
    model.special_tokens,
    model.pat_str,
  );
  try {
    return encoder.encode(text);
  } finally {
    // Release wasm-side memory. The previous static-cache path recreated the
    // class on every call and never freed the encoder, leaking per invocation.
    encoder.free();
  }
};