
Commit 521e51d

Begin play.
1 parent 73b63bd commit 521e51d

File tree

12 files changed: +1899, -1458 lines


examples/constrained_generation.rs

Lines changed: 88 additions & 88 deletions
@@ -1,91 +1,91 @@
 // examples/constrained_generation.rs
 //! Generate a constrained output.
 
-use candle_core::IntDType;
-use std::fs;
-use syncode_core::{
-    bytes::restore_bytes, grammar::EBNFParser, lexer::Lexer, mask::grammar_mask, mask_store,
-    parser::Parser,
-};
-use tokenizers::{Tokenizer, tokenizer};
-
-fn main() {
-    let model_id = "Qwen/Qwen3-4B-Thinking-2507";
-    let grammar_file = "./grammars/json.lark";
-
-    let store = mask_store(model_id, grammar_file);
-
-    // Harness to avoid generating with the model.
-    let sample = r#"{"menu": {
-  "id": "file",
-  "value": "File",
-  "popup": {
-    "menuitem": [
-      {"value": "New", "onclick": "CreateNewDoc()"},
-      {"value": "Open", "onclick": "OpenDoc()"},
-      {"value": "Close", "onclick": "CloseDoc()"}
-    ]
-  }
-}}"#;
-
-    let tokenizer = Tokenizer::from_pretrained(model_id, None).unwrap();
-
-    let vocab = tokenizer.get_vocab(false);
-    let tokens: Vec<&String> = vocab.keys().collect();
-    let byte_tokens: Vec<Vec<u8>> = tokens.into_iter().map(|t| restore_bytes(t)).collect();
-
-    let Ok(encoding) = tokenizer.encode(sample, false) else {
-        panic!()
-    };
-
-    let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
-    else {
-        panic!()
-    };
-
-    let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
-        panic!()
-    };
-
-    let Ok(parser) = Parser::new(&grammar) else {
-        panic!()
-    };
-
-    let tokens_ids = encoding.get_ids();
-
-    for (idx, token) in tokens_ids.iter().enumerate() {
-        let mut sequence_so_far: Vec<u8> = Vec::new();
-
-        for token in &encoding.get_tokens()[..idx] {
-            sequence_so_far.extend(restore_bytes(token));
-        }
-
-        let Ok((terminals, remainder)) = lexer.lex(&sequence_so_far[..]) else {
-            panic!()
-        };
-
-        let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
-            panic!()
-        };
-
-        println!("{:#?}", accept_sequences);
-
-        let mask = grammar_mask(
-            &accept_sequences,
-            &remainder,
-            &store,
-            &byte_tokens,
-            &grammar,
-        );
-
-        // assert!(
-        //     mask[token.as_usize()],
-        println!(
-            "Mask value: {}\nToken: {} {}\nIter: {}",
-            mask[token.as_usize()],
-            tokenizer.decode(&[*token], false).unwrap(),
-            token,
-            idx
-        );
-    }
-}
+// use candle_core::IntDType;
+// use std::fs;
+// use syncode_core::{
+//     bytes::restore_bytes, grammar::EBNFParser, lexer::Lexer, mask::grammar_mask, mask_store,
+//     parser::Parser,
+// };
+// use tokenizers::{Tokenizer, tokenizer};
+
+// fn main() {
+//     let model_id = "Qwen/Qwen3-4B-Thinking-2507";
+//     let grammar_file = "./grammars/json.lark";
+
+//     let store = mask_store(model_id, grammar_file);
+
+//     // Harness to avoid generating with the model.
+//     let sample = r#"{"menu": {
+//   "id": "file",
+//   "value": "File",
+//   "popup": {
+//     "menuitem": [
+//       {"value": "New", "onclick": "CreateNewDoc()"},
+//       {"value": "Open", "onclick": "OpenDoc()"},
+//       {"value": "Close", "onclick": "CloseDoc()"}
+//     ]
+//   }
+// }}"#;
+
+//     let tokenizer = Tokenizer::from_pretrained(model_id, None).unwrap();
+
+//     let vocab = tokenizer.get_vocab(false);
+//     let tokens: Vec<&String> = vocab.keys().collect();
+//     let byte_tokens: Vec<Vec<u8>> = tokens.into_iter().map(|t| restore_bytes(t)).collect();
+
+//     let Ok(encoding) = tokenizer.encode(sample, false) else {
+//         panic!()
+//     };
+
+//     let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
+//     else {
+//         panic!()
+//     };
+
+//     let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
+//         panic!()
+//     };
+
+//     let Ok(parser) = Parser::new(&grammar) else {
+//         panic!()
+//     };
+
+//     let tokens_ids = encoding.get_ids();
+
+//     for (idx, token) in tokens_ids.iter().enumerate() {
+//         let mut sequence_so_far: Vec<u8> = Vec::new();
+
+//         for token in &encoding.get_tokens()[..idx] {
+//             sequence_so_far.extend(restore_bytes(token));
+//         }
+
+//         let Ok((terminals, remainder)) = lexer.lex(&sequence_so_far[..]) else {
+//             panic!()
+//         };
+
+//         let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
+//             panic!()
+//         };
+
+//         println!("{:#?}", accept_sequences);
+
+//         let mask = grammar_mask(
+//             &accept_sequences,
+//             &remainder,
+//             &store,
+//             &byte_tokens,
+//             &grammar,
+//         );
+
+//         // assert!(
+//         //     mask[token.as_usize()],
+//         println!(
+//             "Mask value: {}\nToken: {} {}\nIter: {}",
+//             mask[token.as_usize()],
+//             tokenizer.decode(&[*token], false).unwrap(),
+//             token,
+//             idx
+//         );
+//     }
+// }
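The example above replays a pre-tokenized sample instead of sampling from a model, so the mask is only printed. A minimal sketch of how a boolean mask like the one `grammar_mask` returns would gate real sampling — assuming plain `Vec<f32>` logits and greedy decoding, neither of which appears in this commit:

// Sketch: force the next sampled token to be grammar-legal by knocking
// out masked logits. `mask` holds one bool per vocab id, as `grammar_mask`
// returns above; the logits vector and the greedy argmax are assumptions.
fn apply_mask(logits: &mut [f32], mask: &[bool]) {
    assert_eq!(logits.len(), mask.len());
    for (logit, allowed) in logits.iter_mut().zip(mask) {
        if !allowed {
            // Negative infinity becomes probability zero after softmax.
            *logit = f32::NEG_INFINITY;
        }
    }
}

fn main() {
    let mut logits = vec![0.3, 1.2, -0.5, 2.0];
    let mask = vec![true, false, true, false];
    apply_mask(&mut logits, &mask);

    // Greedy pick over the masked logits: only grammar-legal tokens survive.
    let next = logits
        .iter()
        .enumerate()
        .max_by(|a, b| a.1.total_cmp(b.1))
        .map(|(i, _)| i)
        .unwrap();
    println!("next token id: {next}"); // prints 0
}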

examples/construct_mask_store.rs

Lines changed: 53 additions & 53 deletions
@@ -1,66 +1,66 @@
-use rayon::prelude::*;
-use std::fs;
-use syncode_core::bytes::restore_bytes;
-use syncode_core::grammar::EBNFParser;
-use syncode_core::mask::{dfa_mask_store, grammar_mask};
-use syncode_core::{lexer::Lexer, parser::Parser};
-use tokenizers::Tokenizer;
+// use rayon::prelude::*;
+// use std::fs;
+// use syncode_core::bytes::restore_bytes;
+// use syncode_core::grammar::EBNFParser;
+// use syncode_core::mask::{dfa_mask_store, grammar_mask};
+// use syncode_core::{lexer::Lexer, parser::Parser};
+// use tokenizers::Tokenizer;
 
-fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
-    let model_id = "Qwen/Qwen3-235B-A22B";
-    let language = "json";
-    let grammar_file = format!("./grammars/{language}.lark");
-    let tokenizer = Tokenizer::from_pretrained(model_id, None)?;
-    let vocab = tokenizer.get_vocab(false);
-    let tokens: Vec<&String> = vocab.keys().collect();
-    let byte_tokens: Vec<Vec<u8>> = tokens.into_par_iter().map(|t| restore_bytes(t)).collect();
+// fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
+//     let model_id = "Qwen/Qwen3-235B-A22B";
+//     let language = "json";
+//     let grammar_file = format!("./grammars/{language}.lark");
+//     let tokenizer = Tokenizer::from_pretrained(model_id, None)?;
+//     let vocab = tokenizer.get_vocab(false);
+//     let tokens: Vec<&String> = vocab.keys().collect();
+//     let byte_tokens: Vec<Vec<u8>> = tokens.into_par_iter().map(|t| restore_bytes(t)).collect();
 
-    let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
-    else {
-        panic!()
-    };
+//     let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
+//     else {
+//         panic!()
+//     };
 
-    let Ok(parser) = Parser::new(&grammar.clone()) else {
-        panic!()
-    };
+//     let Ok(parser) = Parser::new(&grammar.clone()) else {
+//         panic!()
+//     };
 
-    let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
-        panic!()
-    };
+//     let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
+//         panic!()
+//     };
 
-    // println!("{:#?}", byte_tokens[32]);
-    let mask_store = dfa_mask_store(&grammar.terminals, &byte_tokens, &parser, 2);
+//     // println!("{:#?}", byte_tokens[32]);
+//     let mask_store = dfa_mask_store(&grammar.terminals, &byte_tokens, &parser, 2);
 
-    // let mut cache = fs::File::open("./cache/{model_id}/{language}.json")?;
+//     // let mut cache = fs::File::open("./cache/{model_id}/{language}.json")?;
 
-    // ser::to_writer(cache, &mask_store);
+//     // ser::to_writer(cache, &mask_store);
 
-    let candidate = r#"{
-  "basics": {
-    "name": "Preston Firestone",
-    "label": "Programmer",
-    "image": "",
-    "email": "[email protected]",
-    "phone": "+1 (224) 688-2924","#;
+//     let candidate = r#"{
+//   "basics": {
+//     "name": "Preston Firestone",
+//     "label": "Programmer",
+//     "image": "",
+//     "email": "[email protected]",
+//     "phone": "+1 (224) 688-2924","#;
 
-    let tokens = tokenizer.encode(candidate, false);
+//     let tokens = tokenizer.encode(candidate, false);
 
-    let Ok((terminals, remainder)) = lexer.lex(candidate.as_bytes()) else {
-        panic!()
-    };
+//     let Ok((terminals, remainder)) = lexer.lex(candidate.as_bytes()) else {
+//         panic!()
+//     };
 
-    let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
-        panic!()
-    };
+//     let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
+//         panic!()
+//     };
 
-    let mask = grammar_mask(
-        &accept_sequences,
-        &remainder,
-        &mask_store,
-        &byte_tokens,
-        &grammar,
-    );
+//     let mask = grammar_mask(
+//         &accept_sequences,
+//         &remainder,
+//         &mask_store,
+//         &byte_tokens,
+//         &grammar,
+//     );
 
-    println!("{:#?}", mask);
-    Ok(())
-}
+//     println!("{:#?}", mask);
+//     Ok(())
+// }
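The commented-out `fs::File::open` / `ser::to_writer` lines gesture at caching the store under `./cache/{model_id}/{language}.json`. A minimal sketch of that step, assuming the mask-store type implements `serde::Serialize` and that `serde_json` is a dependency (both assumptions; the commit leaves serialization unresolved):

// Sketch: persist a freshly built mask store to ./cache/<model_id>/<language>.json
// so it can be reloaded instead of recomputed. `serde` support on the store
// type is an assumption, not something this commit adds.
use std::fs;
use std::path::Path;

fn save_store<S: serde::Serialize>(
    store: &S,
    model_id: &str,
    language: &str,
) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // Model ids such as "Qwen/Qwen3-235B-A22B" contain a slash, so the
    // cache path nests one directory per path segment.
    let dir = Path::new("./cache").join(model_id);
    fs::create_dir_all(&dir)?;
    let file = fs::File::create(dir.join(format!("{language}.json")))?;
    serde_json::to_writer(file, store)?;
    Ok(())
}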

grammars/json_sugar.lark

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+// Adapted from https://www.crockford.com/mckeeman.html, with regular expressions on the right-hand side.
+
+json: element
+
+value: object
+    | array
+    | string
+    | number
+    | "true"
+    | "false"
+    | "null"
+
+object: "{" ws* "}"
+    | "{" member ("," member)* "}"
+
+member: ws* string ws* ":" element
+
+array: "[" ws* "]"
+    | "[" element ("," element)* "]"
+
+element: ws* value ws*
+
+string: /"/ character* /"/
+
+character: /[\x{20}-\x{10FFFF}--["\\]]/
+    | "\\" escape
+
+escape: /"/
+    | "\\"
+    | "/"
+    | "b"
+    | "f"
+    | "n"
+    | "r"
+    | "t"
+    | "u" hex hex hex hex
+
+hex: digit
+    | "A".."F"
+    | "a".."f"
+
+number: integer fraction? exponent?
+
+integer: "-"? digit
+    | "-"? "1".."9" digit+
+
+digit: "0".."9"
+
+fraction: "." digit+
+
+exponent: ("E"|"e") ("+"|"-")? digit+
+
+ws: /\x{0020}/
+    | /\x{000A}/
+    | /\x{000D}/
+    | /\x{0009}/
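Note the start rule of this file is `json`, not the `start` symbol the examples pass for `json.lark`. A hedged sketch of loading this grammar through the same `EBNFParser`/`Lexer`/`Parser` API the examples use, with the `"json"` start-symbol argument inferred from the file's first rule:

// Sketch: lex and parse a partial JSON prefix against json_sugar.lark,
// mirroring examples/constrained_generation.rs. The "json" start symbol
// is an inference from this grammar's first rule, not stated by the commit.
use std::fs;
use syncode_core::{grammar::EBNFParser, lexer::Lexer, parser::Parser};

fn main() {
    let src = fs::read_to_string("./grammars/json_sugar.lark").unwrap();

    let Ok(grammar) = EBNFParser::new(&src, "json").parse() else {
        panic!()
    };
    let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
        panic!()
    };
    let Ok(parser) = Parser::new(&grammar) else {
        panic!()
    };

    // Lex an incomplete prefix, then ask the parser which terminal
    // sequences may legally continue it.
    let Ok((terminals, remainder)) = lexer.lex(br#"{"id": 1"#) else {
        panic!()
    };
    let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
        panic!()
    };
    println!("{accept_sequences:#?}");
}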
