Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions ktoken/src/commonMain/kotlin/com/aallam/ktoken/Encoding.kt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ public interface Encoding {
*/
public val O200K_BASE: Encoding = O200KBase()

/**
* A predefined [Encoding] instance representing the [O200KHarmony] encoding type.
*/
public val O200K_HARMONY: Encoding = O200KHarmony()

/**
* A predefined [Encoding] instance representing the [CL100KBase] encoding type.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.aallam.ktoken.encoding

import com.aallam.ktoken.Encoding
import com.aallam.ktoken.EncodingConfig
import com.aallam.ktoken.internal.Patterns
import com.aallam.ktoken.internal.Tokens
import okio.ByteString

/**
* Default configuration of `o200k_harmony` encoding.
*/
/**
 * Default configuration of `o200k_harmony` encoding.
 *
 * Reuses the `o200k_base` ranks file and layers the harmony chat special
 * tokens (`<|start|>`, `<|message|>`, `<|return|>`, ...) on top, together
 * with the generic `<|reserved_NNNNNN|>` range used by harmony-format models.
 */
public data class O200KHarmony(
    override val file: String = "o200k_base.tiktoken"
) : Encoding {

    override fun encodingConfig(ranks: Map<ByteString, Int>): EncodingConfig {
        // Named special tokens, listed once each in ascending id order.
        // (The original listing repeated ENDOFTEXT -> 199999 twice; a map
        // literal silently keeps only one entry, so the duplicate is dropped.)
        val specialTokens = mutableMapOf(
            Tokens.STARTOFTEXT to 199998,
            Tokens.ENDOFTEXT to 199999,
            Tokens.RESERVED_200000 to 200000,
            Tokens.RESERVED_200001 to 200001,
            Tokens.RETURN to 200002,
            Tokens.CONSTRAIN to 200003,
            Tokens.RESERVED_200004 to 200004,
            Tokens.CHANNEL to 200005,
            Tokens.START to 200006,
            Tokens.END to 200007,
            Tokens.MESSAGE to 200008,
            Tokens.RESERVED_200009 to 200009,
            Tokens.RESERVED_200010 to 200010,
            Tokens.RESERVED_200011 to 200011,
            Tokens.CALL to 200012,
            Tokens.ENDOFPROMPT to 200018,
        )

        // Fill the remaining ids with generic reserved tokens. Distinct byte
        // strings may share an id: 200018 is reachable both as <|endofprompt|>
        // and as <|reserved_200018|>.
        for (i in 200013..201087) {
            specialTokens[Tokens.reserved(i)] = i
        }

        return EncodingConfig(
            pattern = Patterns.O200K,
            mergeableRanks = ranks,
            specialTokens = specialTokens,
        )
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,94 @@ internal object Tokens {
* Represents the end of a prompt.
*/
val ENDOFPROMPT = "<|endofprompt|>".encodeUtf8()

/**
* Represents the start of the text.
*/
val STARTOFTEXT = "<|startoftext|>".encodeUtf8()

/**
* Represents the harmony return token.
*/
val RETURN = "<|return|>".encodeUtf8()

/**
* Represents the harmony constrain token.
*/
val CONSTRAIN = "<|constrain|>".encodeUtf8()

/**
* Represents the harmony channel token.
*/
val CHANNEL = "<|channel|>".encodeUtf8()

/**
* Represents the harmony start token.
*/
val START = "<|start|>".encodeUtf8()

/**
* Represents the harmony end token.
*/
val END = "<|end|>".encodeUtf8()

/**
* Represents the harmony message token.
*/
val MESSAGE = "<|message|>".encodeUtf8()

/**
* Represents the harmony call token.
*/
val CALL = "<|call|>".encodeUtf8()

/**
* Represents reserved harmony token 200000.
*/
val RESERVED_200000 = "<|reserved_200000|>".encodeUtf8()

/**
* Represents reserved harmony token 200001.
*/
val RESERVED_200001 = "<|reserved_200001|>".encodeUtf8()

/**
* Represents reserved harmony token 200004.
*/
val RESERVED_200004 = "<|reserved_200004|>".encodeUtf8()

/**
* Represents reserved harmony token 200009.
*/
val RESERVED_200009 = "<|reserved_200009|>".encodeUtf8()

/**
* Represents reserved harmony token 200010.
*/
val RESERVED_200010 = "<|reserved_200010|>".encodeUtf8()

/**
* Represents reserved harmony token 200011.
*/
val RESERVED_200011 = "<|reserved_200011|>".encodeUtf8()

/**
* Represents a reserved harmony token with the given [id].
*/
fun reserved(id: Int) = "<|reserved_$id|>".encodeUtf8()
}

/**
* Mapping of model names to their corresponding encoding settings.
*/
internal val modelToEncoding: Map<String, Encoding> = mapOf(
// reasoning
"o1" to Encoding.O200K_BASE,
"o3" to Encoding.O200K_BASE,
"o4-mini" to Encoding.O200K_BASE,
// chat
"gpt-5" to Encoding.O200K_BASE,
"gpt-4.1" to Encoding.O200K_BASE,
"gpt-4o" to Encoding.O200K_BASE,
"gpt-4" to Encoding.CL100K_BASE,
"gpt-3.5-turbo" to Encoding.CL100K_BASE,
Expand Down Expand Up @@ -105,12 +186,22 @@ internal val modelToEncoding: Map<String, Encoding> = mapOf(
* Mapping of model prefixes to their corresponding encoding settings.
*/
internal val modelPrefixToEncoding = mapOf(
// reasoning
"o1-" to Encoding.O200K_BASE,
"o3-" to Encoding.O200K_BASE,
"o4-mini-" to Encoding.O200K_BASE,
// chat
"gpt-5-" to Encoding.O200K_BASE,
"gpt-4.5-" to Encoding.O200K_BASE,
"gpt-4.1-" to Encoding.O200K_BASE,
"chatgpt-4o-" to Encoding.O200K_BASE,
"gpt-4o-" to Encoding.O200K_BASE,
"gpt-oss-" to Encoding.O200K_HARMONY,
"gpt-4-" to Encoding.CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k
"gpt-3.5-turbo-" to Encoding.CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc.
"gpt-35-turbo-" to Encoding.CL100K_BASE, // Azure deployment name
// fine-tuned
"ft:gpt-4o" to Encoding.O200K_BASE,
"ft:gpt-4" to Encoding.CL100K_BASE,
"ft:gpt-3.5-turbo" to Encoding.CL100K_BASE,
"ft:davinci-002" to Encoding.CL100K_BASE,
Expand Down
40 changes: 40 additions & 0 deletions ktoken/src/commonTest/kotlin/AbstractEncoding.kt
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,46 @@ abstract class AbstractEncoding(private val loader: BpeLoader) {
assertContentEquals(listOf(24912, 2375), tokenizer.encode("hello world"))
}

@Test
fun recentModelAliasesMapToO200K() = runTest(timeout = 1.minutes) {
val reference = Tokenizer.of(Encoding.O200K_BASE, loader).encode("hello world")
val models = listOf(
"o1",
"o3",
"o4-mini",
"gpt-4.1",
"gpt-5",
"ft:gpt-4o:example",
"o1-2024-12-17",
"o3-2025-02-01",
"o4-mini-2025-02-01",
"gpt-4.1-2025-04-14",
"gpt-4.5-preview-2025-02-27",
"gpt-5-2025-08-07",
"chatgpt-4o-latest",
)
for (model in models) {
val tokens = Tokenizer.of(model = model, loader = loader).encode("hello world")
assertContentEquals(reference, tokens)
}
}

@Test
fun o200KHarmonyEncoding() = runTest(timeout = 1.minutes) {
val tokenizer = Tokenizer.of(Encoding.O200K_HARMONY, loader)
assertContentEquals(listOf(24912, 2375), tokenizer.encode("hello world"))
assertEquals(200002, tokenizer.encodeSingleToken("<|return|>"))
assertEquals(200500, tokenizer.encodeSingleToken("<|reserved_200500|>"))
val specialTokens = tokenizer.encode("<|return|><|reserved_200500|>", allowedSpecial = setOf("all"))
assertContains(specialTokens, 200002, 200500)
}

@Test
fun gptOssModelsMapToO200KHarmony() = runTest(timeout = 1.minutes) {
val tokenizer = Tokenizer.of(model = "gpt-oss-120b", loader = loader)
assertEquals(200002, tokenizer.encodeSingleToken("<|return|>"))
}

internal suspend fun tokenizer() = Tokenizer.of(
model = "gpt-4o",
loader = loader
Expand Down
Loading