diff --git a/ktoken/src/commonMain/kotlin/com/aallam/ktoken/Encoding.kt b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/Encoding.kt
index 316193e..9b40978 100644
--- a/ktoken/src/commonMain/kotlin/com/aallam/ktoken/Encoding.kt
+++ b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/Encoding.kt
@@ -29,6 +29,11 @@ public interface Encoding {
          */
         public val O200K_BASE: Encoding = O200KBase()
 
+        /**
+         * A predefined [Encoding] instance representing the [O200KHarmony] encoding type.
+         */
+        public val O200K_HARMONY: Encoding = O200KHarmony()
+
         /**
          * A predefined [Encoding] instance representing the [CL100KBase] encoding type.
          */
diff --git a/ktoken/src/commonMain/kotlin/com/aallam/ktoken/encoding/O200KHarmony.kt b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/encoding/O200KHarmony.kt
new file mode 100644
index 0000000..951a4b9
--- /dev/null
+++ b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/encoding/O200KHarmony.kt
@@ -0,0 +1,50 @@
+package com.aallam.ktoken.encoding
+
+import com.aallam.ktoken.Encoding
+import com.aallam.ktoken.EncodingConfig
+import com.aallam.ktoken.internal.Patterns
+import com.aallam.ktoken.internal.Tokens
+import okio.ByteString
+
+/**
+ * Default configuration of `o200k_harmony` encoding.
+ *
+ * Reads its ranks from the `o200k_base.tiktoken` file by default, splits text with
+ * `Patterns.O200K`, and registers the harmony special tokens on top of those ranks.
+ */
+public data class O200KHarmony(
+    override val file: String = "o200k_base.tiktoken"
+) : Encoding {
+
+    override fun encodingConfig(ranks: Map<ByteString, Int>): EncodingConfig {
+        val specialTokens = mutableMapOf(
+            Tokens.ENDOFTEXT to 199999,
+            Tokens.ENDOFPROMPT to 200018,
+            Tokens.STARTOFTEXT to 199998,
+            Tokens.RESERVED_200000 to 200000,
+            Tokens.RESERVED_200001 to 200001,
+            Tokens.RETURN to 200002,
+            Tokens.CONSTRAIN to 200003,
+            Tokens.RESERVED_200004 to 200004,
+            Tokens.CHANNEL to 200005,
+            Tokens.START to 200006,
+            Tokens.END to 200007,
+            Tokens.MESSAGE to 200008,
+            Tokens.RESERVED_200009 to 200009,
+            Tokens.RESERVED_200010 to 200010,
+            Tokens.RESERVED_200011 to 200011,
+            Tokens.CALL to 200012,
+        )
+
+        // Every remaining id is a placeholder named `<|reserved_N|>`. The range includes 200018,
+        // so that id is shared by `<|endofprompt|>` and `<|reserved_200018|>`.
+        for (i in 200013..201087) {
+            specialTokens[Tokens.reserved(i)] = i
+        }
+
+        return EncodingConfig(
+            pattern = Patterns.O200K,
+            mergeableRanks = ranks,
+            specialTokens = specialTokens,
+        )
+    }
+}
diff --git a/ktoken/src/commonMain/kotlin/com/aallam/ktoken/internal/Encoding.kt b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/internal/Encoding.kt
index 287d749..fe0bf1f 100644
--- a/ktoken/src/commonMain/kotlin/com/aallam/ktoken/internal/Encoding.kt
+++ b/ktoken/src/commonMain/kotlin/com/aallam/ktoken/internal/Encoding.kt
@@ -50,13 +50,94 @@ internal object Tokens {
      * Represents the end of a prompt.
      */
     val ENDOFPROMPT = "<|endofprompt|>".encodeUtf8()
+
+    /**
+     * Represents the start of the text.
+     */
+    val STARTOFTEXT = "<|startoftext|>".encodeUtf8()
+
+    /**
+     * Represents the harmony return token.
+     */
+    val RETURN = "<|return|>".encodeUtf8()
+
+    /**
+     * Represents the harmony constrain token.
+     */
+    val CONSTRAIN = "<|constrain|>".encodeUtf8()
+
+    /**
+     * Represents the harmony channel token.
+     */
+    val CHANNEL = "<|channel|>".encodeUtf8()
+
+    /**
+     * Represents the harmony start token.
+     */
+    val START = "<|start|>".encodeUtf8()
+
+    /**
+     * Represents the harmony end token.
+     */
+    val END = "<|end|>".encodeUtf8()
+
+    /**
+     * Represents the harmony message token.
+     */
+    val MESSAGE = "<|message|>".encodeUtf8()
+
+    /**
+     * Represents the harmony call token.
+     */
+    val CALL = "<|call|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200000.
+     */
+    val RESERVED_200000 = "<|reserved_200000|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200001.
+     */
+    val RESERVED_200001 = "<|reserved_200001|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200004.
+     */
+    val RESERVED_200004 = "<|reserved_200004|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200009.
+     */
+    val RESERVED_200009 = "<|reserved_200009|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200010.
+     */
+    val RESERVED_200010 = "<|reserved_200010|>".encodeUtf8()
+
+    /**
+     * Represents reserved harmony token 200011.
+     */
+    val RESERVED_200011 = "<|reserved_200011|>".encodeUtf8()
+
+    /**
+     * Represents a reserved harmony token with the given [id].
+     */
+    fun reserved(id: Int) = "<|reserved_$id|>".encodeUtf8()
 }
 
 /**
  * Mapping of model names to their corresponding encoding settings.
  */
 internal val modelToEncoding: Map<String, Encoding> = mapOf(
+    // reasoning
+    "o1" to Encoding.O200K_BASE,
+    "o3" to Encoding.O200K_BASE,
+    "o4-mini" to Encoding.O200K_BASE,
     // chat
+    "gpt-5" to Encoding.O200K_BASE,
+    "gpt-4.1" to Encoding.O200K_BASE,
     "gpt-4o" to Encoding.O200K_BASE,
     "gpt-4" to Encoding.CL100K_BASE,
     "gpt-3.5-turbo" to Encoding.CL100K_BASE,
@@ -105,12 +186,22 @@ internal val modelToEncoding: Map<String, Encoding> = mapOf(
  * Mapping of model prefixes to their corresponding encoding settings.
  */
 internal val modelPrefixToEncoding = mapOf(
+    // reasoning
+    "o1-" to Encoding.O200K_BASE,
+    "o3-" to Encoding.O200K_BASE,
+    "o4-mini-" to Encoding.O200K_BASE,
     // chat
+    "gpt-5-" to Encoding.O200K_BASE,
+    "gpt-4.5-" to Encoding.O200K_BASE,
+    "gpt-4.1-" to Encoding.O200K_BASE,
+    "chatgpt-4o-" to Encoding.O200K_BASE,
     "gpt-4o-" to Encoding.O200K_BASE,
+    "gpt-oss-" to Encoding.O200K_HARMONY,
     "gpt-4-" to Encoding.CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-" to Encoding.CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-" to Encoding.CL100K_BASE, // Azure deployment name
     // fine-tuned
+    "ft:gpt-4o" to Encoding.O200K_BASE,
     "ft:gpt-4" to Encoding.CL100K_BASE,
     "ft:gpt-3.5-turbo" to Encoding.CL100K_BASE,
     "ft:davinci-002" to Encoding.CL100K_BASE,
diff --git a/ktoken/src/commonTest/kotlin/AbstractEncoding.kt b/ktoken/src/commonTest/kotlin/AbstractEncoding.kt
index f0e0f73..f4a44a2 100644
--- a/ktoken/src/commonTest/kotlin/AbstractEncoding.kt
+++ b/ktoken/src/commonTest/kotlin/AbstractEncoding.kt
@@ -34,6 +34,47 @@ abstract class AbstractEncoding(private val loader: BpeLoader) {
         assertContentEquals(listOf(24912, 2375), tokenizer.encode("hello world"))
     }
 
+    @Test
+    fun recentModelAliasesMapToO200K() = runTest(timeout = 1.minutes) {
+        val reference = Tokenizer.of(Encoding.O200K_BASE, loader).encode("hello world")
+        val models = listOf(
+            "o1",
+            "o3",
+            "o4-mini",
+            "gpt-4.1",
+            "gpt-5",
+            "ft:gpt-4o:example",
+            "o1-2024-12-17",
+            "o3-2025-02-01",
+            "o4-mini-2025-02-01",
+            "gpt-4.1-2025-04-14",
+            "gpt-4.5-preview-2025-02-27",
+            "gpt-5-2025-08-07",
+            "chatgpt-4o-latest",
+        )
+        for (model in models) {
+            val tokens = Tokenizer.of(model = model, loader = loader).encode("hello world")
+            assertContentEquals(reference, tokens)
+        }
+    }
+
+    @Test
+    fun o200KHarmonyEncoding() = runTest(timeout = 1.minutes) {
+        val tokenizer = Tokenizer.of(Encoding.O200K_HARMONY, loader)
+        assertContentEquals(listOf(24912, 2375), tokenizer.encode("hello world"))
+        assertEquals(200002, tokenizer.encodeSingleToken("<|return|>"))
+        assertEquals(200500, tokenizer.encodeSingleToken("<|reserved_200500|>"))
+        val specialTokens = tokenizer.encode("<|return|><|reserved_200500|>", allowedSpecial = setOf("all"))
+        assertContains(specialTokens, 200002)
+        assertContains(specialTokens, 200500)
+    }
+
+    @Test
+    fun gptOssModelsMapToO200KHarmony() = runTest(timeout = 1.minutes) {
+        val tokenizer = Tokenizer.of(model = "gpt-oss-120b", loader = loader)
+        assertEquals(200002, tokenizer.encodeSingleToken("<|return|>"))
+    }
+
     internal suspend fun tokenizer() = Tokenizer.of(
         model = "gpt-4o",
         loader = loader