import {
  BaseTokenizer,
  Encoding,
  PaddingConfiguration,
  TruncationConfiguration,
  TruncationOptions
} from "tokenizers";
import { ModelType } from "../models";

/**
 * Base options accepted by tokenizers: the directory holding the tokenizer
 * files and the model type (both required), plus the optional `lowercase`,
 * `mergesFile` and `vocabFile` settings.
 */
export interface TokenizerBaseOptions {
  /**
   * NOTE(review): presumably toggles lowercasing of the input text; the
   * consuming code is not visible here, so confirm before relying on it.
   * @default true
   */
  lowercase?: boolean;
  /**
   * Name of the merges file (if applicable to the tokenizer)
   * @default "merges.txt"
   */
  mergesFile?: string;
  /**
   * Directory under which the files needed by the tokenizer are located.
   * Must be an absolute path.
   */
  filesDir: string;
  /**
   * Model type associated with the tokenizer, as declared in `../models`.
   * NOTE(review): how this value is used is not visible here; confirm.
   */
  modelType: ModelType;
  /**
   * Name of the vocab file (if applicable to the tokenizer)
   * @default "vocab.txt" | "vocab.json"
   */
  vocabFile?: string;
}

/**
 * The base options intersected with `Partial`.
 *
 * NOTE(review): `Partial` is written without a type argument and this alias
 * declares no type parameter, although `Partial` requires exactly one.
 * Generic parameters look to have been lost from this declaration; confirm
 * against the original source before relying on it.
 */
export type FullTokenizerOptions = TokenizerBaseOptions & Partial;

// NOTE(review): this class header is incomplete as written. No type-parameter
// list or class body is visible here, and `Tokenizer = BaseTokenizer` looks like
// the remains of a defaulted type parameter. TODO confirm against the original.
export abstract class Tokenizer = BaseTokenizer