Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Examples/transformers-cli/Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ let package = Package(
// If you copy this manifest as a template, use the following line instead
//.package(url: "https://github.com/huggingface/swift-transformers", from: "1.0.0"),
.package(url: "https://github.com/apple/swift-argument-parser", from: "1.3.0"),
.package(url: "https://github.com/apple/swift-container-plugin", from: "1.0.0"),
],
targets: [
.executableTarget(
Expand All @@ -19,6 +20,14 @@ let package = Package(
.product(name: "Transformers", package: "swift-transformers"),
.product(name: "ArgumentParser", package: "swift-argument-parser"),
]
),
.executableTarget(
name: "transformers-cli-linux",
dependencies: [
.product(name: "Hub", package: "swift-transformers"),
.product(name: "Tokenizers", package: "swift-transformers"),
.product(name: "ArgumentParser", package: "swift-argument-parser"),
]
)
]
)
227 changes: 227 additions & 0 deletions Examples/transformers-cli/Sources/transformers-cli-linux/main.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
import ArgumentParser
import Foundation
import Hub
import Tokenizers

/// Builds a `HubApi`, preferring persistent storage on WendyOS.
///
/// When `/mnt/app` exists (the WendyOS persistent volume), downloads are
/// cached under `/mnt/app/huggingface`; otherwise the library's default
/// cache location is used.
func createHubApi() -> HubApi {
    let persistentRoot = "/mnt/app"
    guard FileManager.default.fileExists(atPath: persistentRoot) else {
        return HubApi()
    }
    let cacheBase = URL(filePath: persistentRoot).appending(component: "huggingface")
    return HubApi(downloadBase: cacheBase)
}

// Root command for the cross-platform (Linux-friendly) CLI. ArgumentParser
// dispatches to one of the subcommands below; invoking the binary with no
// subcommand runs `Demo`.
@main
struct TransformersLinuxCLI: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "transformers-cli-linux",
        abstract: "Cross-platform CLI for HuggingFace Transformers (tokenization & Hub)",
        version: "0.0.1",
        // Order here is the order subcommands appear in `--help` output.
        subcommands: [Demo.self, Tokenize.self, Decode.self, Download.self, ChatTemplate.self],
        defaultSubcommand: Demo.self
    )
}

/// Default subcommand: loads a tokenizer and walks through three
/// tokenization examples, then lists the other available commands.
struct Demo: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Run a demo showcasing tokenizer capabilities"
    )

    @Option(name: .shortAndLong, help: "HuggingFace model ID")
    var model: String = "bert-base-uncased"

    func run() async throws {
        print("Swift Transformers Demo")
        print("=======================\n")

        let api = createHubApi()
        print("Loading tokenizer for '\(model)'...")
        print("Cache location: \(api.downloadBase.path)")
        let tok = try await AutoTokenizer.from(pretrained: model, hubApi: api)
        print("Tokenizer loaded successfully!\n")

        // Demo 1: encode a simple sentence and round-trip it back to text.
        let basicText = "Hello, world! Welcome to Swift Transformers."
        print("Demo 1: Basic Tokenization")
        print("---------------------------")
        print("Input: \"\(basicText)\"")
        let basicIds = tok.encode(text: basicText)
        print("Tokens: \(basicIds)")
        print("Token count: \(basicIds.count)")
        print("Decoded: \"\(tok.decode(tokens: basicIds))\"\n")

        // Demo 2: words outside the vocabulary get split into subword pieces.
        let subwordText = "Tokenization handles unknownwords and subwords nicely."
        print("Demo 2: Subword Tokenization")
        print("-----------------------------")
        print("Input: \"\(subwordText)\"")
        let subwordIds = tok.encode(text: subwordText)
        print("Tokens: \(subwordIds)")
        print("Token count: \(subwordIds.count)\n")

        // Demo 3: encode() includes the model's special tokens in its output.
        let specialText = "Testing special tokens"
        print("Demo 3: Encoding with Special Tokens")
        print("-------------------------------------")
        print("Input: \"\(specialText)\"")
        let specialIds = tok.encode(text: specialText)
        print("Tokens (with special tokens): \(specialIds)")
        print("Token count: \(specialIds.count)\n")

        print("Demo complete! Try other commands:")
        print(" tokenize <text> - Tokenize custom text")
        print(" decode <ids> - Decode token IDs")
        print(" download <model> - Download a tokenizer")
        print(" chat-template <msg> - Apply chat template")
    }
}

/// Encodes a piece of text into token IDs using a Hub-hosted or local tokenizer.
struct Tokenize: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Tokenize text using a HuggingFace tokenizer"
    )

    @Argument(help: "Text to tokenize")
    var text: String

    @Option(name: .shortAndLong, help: "HuggingFace model ID (e.g., 'bert-base-uncased')")
    var model: String = "bert-base-uncased"

    @Option(name: .shortAndLong, help: "Path to local tokenizer folder")
    var localPath: String?

    @Flag(name: .shortAndLong, help: "Show token strings alongside IDs")
    var verbose: Bool = false

    func run() async throws {
        let api = createHubApi()
        // A local folder takes precedence over downloading from the Hub.
        let tokenizer: Tokenizer =
            if let localPath {
                try await AutoTokenizer.from(
                    modelFolder: URL(filePath: localPath, directoryHint: .isDirectory),
                    hubApi: api
                )
            } else {
                try await AutoTokenizer.from(pretrained: model, hubApi: api)
            }

        let ids = tokenizer.encode(text: text)

        print("Input: \"\(text)\"")
        print("Token count: \(ids.count)")
        print("Token IDs: \(ids)")

        // NOTE(review): despite the flag's help text, this prints the full
        // decoded string rather than per-token strings — confirm whether a
        // per-token lookup API should be used here instead.
        if verbose {
            let roundTrip = tokenizer.decode(tokens: ids)
            print("Decoded: \"\(roundTrip)\"")
        }
    }
}

/// Decodes a comma-separated list of token IDs back into text.
struct Decode: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Decode token IDs back to text"
    )

    @Argument(help: "Token IDs to decode (comma-separated)")
    var tokenIds: String

    @Option(name: .shortAndLong, help: "HuggingFace model ID")
    var model: String = "bert-base-uncased"

    @Option(name: .shortAndLong, help: "Path to local tokenizer folder")
    var localPath: String?

    /// Loads the tokenizer (local folder wins over Hub), validates the ID
    /// list, and prints the decoded text.
    ///
    /// - Throws: `ValidationError` if any comma-separated entry is not an
    ///   integer (previously such entries were silently dropped by
    ///   `compactMap`, which could skew the output after a typo).
    func run() async throws {
        let hub = createHubApi()
        let tokenizer: Tokenizer
        if let localPath {
            let url = URL(filePath: localPath, directoryHint: .isDirectory)
            tokenizer = try await AutoTokenizer.from(modelFolder: url, hubApi: hub)
        } else {
            tokenizer = try await AutoTokenizer.from(pretrained: model, hubApi: hub)
        }

        // Parse strictly: collect malformed entries instead of ignoring them.
        var ids: [Int] = []
        var invalid: [String] = []
        for piece in tokenIds.split(separator: ",") {
            let trimmed = piece.trimmingCharacters(in: .whitespaces)
            if trimmed.isEmpty { continue }
            if let id = Int(trimmed) {
                ids.append(id)
            } else {
                invalid.append(trimmed)
            }
        }
        guard invalid.isEmpty else {
            throw ValidationError("Not valid integer token IDs: \(invalid.joined(separator: ", "))")
        }

        let decoded = tokenizer.decode(tokens: ids)

        print("Token IDs: \(ids)")
        print("Decoded: \"\(decoded)\"")
    }
}

/// Downloads tokenizer files for a model from the HuggingFace Hub and
/// verifies the result by loading the tokenizer and encoding a test string.
struct Download: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Download a tokenizer from HuggingFace Hub"
    )

    @Argument(help: "HuggingFace model ID to download")
    var model: String

    @Option(name: .shortAndLong, help: "Output directory (defaults to HF cache)")
    var output: String?

    func run() async throws {
        // Honor --output when given (previously the option was parsed but
        // ignored); otherwise fall back to the default/WendyOS cache.
        let hub: HubApi
        if let output {
            hub = HubApi(downloadBase: URL(filePath: output, directoryHint: .isDirectory))
        } else {
            hub = createHubApi()
        }
        print("Downloading tokenizer for '\(model)'...")
        print("Cache location: \(hub.downloadBase.path)")

        let repo = Hub.Repo(id: model)

        // Try each known tokenizer artifact; which ones exist depends on the
        // tokenizer type (e.g. BPE has merges.txt, WordPiece has vocab.txt).
        let files = ["tokenizer.json", "tokenizer_config.json", "vocab.txt", "vocab.json", "merges.txt"]

        for file in files {
            do {
                let url = try await hub.snapshot(from: repo, matching: [file])
                print(" Downloaded: \(file) -> \(url.path)")
            } catch {
                // Deliberate best-effort: a missing file is expected for
                // tokenizer types that don't ship it, so skip silently.
            }
        }

        print("Done! Tokenizer cached locally.")

        // Verify the cached files actually load into a working tokenizer.
        let tokenizer = try await AutoTokenizer.from(pretrained: model, hubApi: hub)
        let testTokens = tokenizer.encode(text: "Hello, world!")
        print("Verification: \"Hello, world!\" -> \(testTokens.count) tokens")
    }
}

/// Renders a user (and optional system) message through a model's chat template.
struct ChatTemplate: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Apply a chat template to messages"
    )

    @Option(name: .shortAndLong, help: "HuggingFace model ID")
    var model: String = "microsoft/Phi-3-mini-4k-instruct"

    @Option(name: .shortAndLong, help: "System message")
    var system: String?

    @Argument(help: "User message")
    var message: String

    func run() async throws {
        let api = createHubApi()
        let tokenizer = try await AutoTokenizer.from(pretrained: model, hubApi: api)

        // Assemble the conversation: optional system turn first, then the user turn.
        var conversation: [[String: String]] = []
        if let system {
            conversation.append(["role": "system", "content": system])
        }
        conversation.append(["role": "user", "content": message])

        let promptIds = try tokenizer.applyChatTemplate(messages: conversation)
        let rendered = tokenizer.decode(tokens: promptIds)

        print("Formatted prompt:")
        print("---")
        print(rendered)
        print("---")
        print("\nToken count: \(promptIds.count)")
    }
}
2 changes: 2 additions & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ let package = Package(
dependencies: [
.package(url: "https://github.com/huggingface/swift-jinja.git", from: "2.0.0"),
.package(url: "https://github.com/apple/swift-collections.git", from: "1.0.0"),
.package(url: "https://github.com/apple/swift-crypto.git", "3.0.0"..<"5.0.0"),
],
targets: [
.target(name: "Generation", dependencies: ["Tokenizers"]),
Expand All @@ -27,6 +28,7 @@ let package = Package(
dependencies: [
.product(name: "Jinja", package: "swift-jinja"),
.product(name: "OrderedCollections", package: "swift-collections"),
.product(name: "Crypto", package: "swift-crypto"),
],
resources: [
.process("Resources")
Expand Down
22 changes: 10 additions & 12 deletions Sources/Generation/Generation.swift
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@
// Created by Pedro Cuenca on 7/5/23.
//

#if canImport(CoreML)
import CoreML

import CoreML
import Tokenizers

/// Supported text generation modes.
public enum GenerationMode {
/// Contrastive search generation mode
Expand All @@ -33,6 +27,16 @@ public typealias InputTokens = [Int]
/// Array of token IDs representing generated output tokens.
public typealias GenerationOutput = [Int]

/// Callback for receiving generated tokens during streaming.
public typealias PredictionTokensCallback = (GenerationOutput) -> Void

/// Callback for receiving generated text during streaming.
public typealias PredictionStringCallback = (String) -> Void

#if canImport(CoreML)
import CoreML
import Tokenizers

/// A callable model that predicts the next token after a given sequence.
///
/// - Parameter tokens: Input token sequence
Expand All @@ -41,12 +45,6 @@ public typealias GenerationOutput = [Int]
@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
public typealias NextTokenModel = (MLTensor, GenerationConfig) async -> MLTensor

/// Callback for receiving generated tokens during streaming.
public typealias PredictionTokensCallback = (GenerationOutput) -> Void

/// Callback for receiving generated text during streaming.
public typealias PredictionStringCallback = (String) -> Void

/// Protocol for text generation implementations.
@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
public protocol Generation {
Expand Down
32 changes: 21 additions & 11 deletions Sources/Hub/Config.swift
Original file line number Diff line number Diff line change
Expand Up @@ -234,31 +234,41 @@ public struct Config: Hashable, Sendable,

private static func convertToBinaryDistinctKeys(_ object: Any) -> Config {
if let dict = object as? [NSString: Any] {
Config(Dictionary(uniqueKeysWithValues: dict.map { (BinaryDistinctString($0.key), convertToBinaryDistinctKeys($0.value)) }))
return Config(Dictionary(uniqueKeysWithValues: dict.map { (BinaryDistinctString($0.key), convertToBinaryDistinctKeys($0.value)) }))
} else if let array = object as? [Any] {
Config(array.map { convertToBinaryDistinctKeys($0) })
return Config(array.map { convertToBinaryDistinctKeys($0) })
} else {
switch object {
case let obj as String:
Config(obj)
return Config(obj)
case let obj as Int:
Config(obj)
return Config(obj)
case let obj as Float:
Config(obj)
return Config(obj)
case let obj as Bool:
Config(obj)
return Config(obj)
case let obj as NSNumber:
#if os(macOS) || os(iOS) || os(tvOS) || os(watchOS) || os(visionOS)
if CFNumberIsFloatType(obj) {
Config(obj.floatValue)
return Config(obj.floatValue)
} else {
Config(obj.intValue)
return Config(obj.intValue)
}
#else
// On Linux, check objCType to determine if it's a floating point number
let type = String(cString: obj.objCType)
if type == "f" || type == "d" {
return Config(obj.floatValue)
} else {
return Config(obj.intValue)
}
#endif
case _ as NSNull:
Config()
return Config()
case let obj as Config:
obj
return obj
case let obj as (UInt, String):
Config((obj.0, BinaryDistinctString(obj.1)))
return Config((obj.0, BinaryDistinctString(obj.1)))
default:
fatalError("unknown type: \(type(of: object)) \(object)")
}
Expand Down
Loading
Loading