// examples/constrained_generation.rs
//! Generate a constrained output.

-use candle_core::IntDType;
-use std::fs;
-use syncode_core::{
-    bytes::restore_bytes, grammar::EBNFParser, lexer::Lexer, mask::grammar_mask, mask_store,
-    parser::Parser,
-};
-use tokenizers::{Tokenizer, tokenizer};
-
-fn main() {
-    let model_id = "Qwen/Qwen3-4B-Thinking-2507";
-    let grammar_file = "./grammars/json.lark";
-
-    let store = mask_store(model_id, grammar_file);
-
-    // Harness to avoid generating with the model.
-    let sample = r#"{"menu": {
-  "id": "file",
-  "value": "File",
-  "popup": {
-    "menuitem": [
-      {"value": "New", "onclick": "CreateNewDoc()"},
-      {"value": "Open", "onclick": "OpenDoc()"},
-      {"value": "Close", "onclick": "CloseDoc()"}
-    ]
-  }
-}}"#;
-
-    let tokenizer = Tokenizer::from_pretrained(model_id, None).unwrap();
-
-    let vocab = tokenizer.get_vocab(false);
-    let tokens: Vec<&String> = vocab.keys().collect();
-    let byte_tokens: Vec<Vec<u8>> = tokens.into_iter().map(|t| restore_bytes(t)).collect();
-
-    let Ok(encoding) = tokenizer.encode(sample, false) else {
-        panic!()
-    };
-
-    let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
-    else {
-        panic!()
-    };
-
-    let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
-        panic!()
-    };
-
-    let Ok(parser) = Parser::new(&grammar) else {
-        panic!()
-    };
-
-    let tokens_ids = encoding.get_ids();
-
-    for (idx, token) in tokens_ids.iter().enumerate() {
-        let mut sequence_so_far: Vec<u8> = Vec::new();
-
-        for token in &encoding.get_tokens()[..idx] {
-            sequence_so_far.extend(restore_bytes(token));
-        }
-
-        let Ok((terminals, remainder)) = lexer.lex(&sequence_so_far[..]) else {
-            panic!()
-        };
-
-        let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
-            panic!()
-        };
-
-        println!("{:#?}", accept_sequences);
-
-        let mask = grammar_mask(
-            &accept_sequences,
-            &remainder,
-            &store,
-            &byte_tokens,
-            &grammar,
-        );
-
-        // assert!(
-        //     mask[token.as_usize()],
-        println!(
-            "Mask value: {}\nToken: {} {}\nIter: {}",
-            mask[token.as_usize()],
-            tokenizer.decode(&[*token], false).unwrap(),
-            token,
-            idx
-        );
-    }
-}
+// use candle_core::IntDType;
+// use std::fs;
+// use syncode_core::{
+//     bytes::restore_bytes, grammar::EBNFParser, lexer::Lexer, mask::grammar_mask, mask_store,
+//     parser::Parser,
+// };
+// use tokenizers::{Tokenizer, tokenizer};
+
+// fn main() {
+//     let model_id = "Qwen/Qwen3-4B-Thinking-2507";
+//     let grammar_file = "./grammars/json.lark";
+
+//     let store = mask_store(model_id, grammar_file);
+
+//     // Harness to avoid generating with the model.
+//     let sample = r#"{"menu": {
+//   "id": "file",
+//   "value": "File",
+//   "popup": {
+//     "menuitem": [
+//       {"value": "New", "onclick": "CreateNewDoc()"},
+//       {"value": "Open", "onclick": "OpenDoc()"},
+//       {"value": "Close", "onclick": "CloseDoc()"}
+//     ]
+//   }
+// }}"#;
+
+//     let tokenizer = Tokenizer::from_pretrained(model_id, None).unwrap();
+
+//     let vocab = tokenizer.get_vocab(false);
+//     let tokens: Vec<&String> = vocab.keys().collect();
+//     let byte_tokens: Vec<Vec<u8>> = tokens.into_iter().map(|t| restore_bytes(t)).collect();
+
+//     let Ok(encoding) = tokenizer.encode(sample, false) else {
+//         panic!()
+//     };
+
+//     let Ok(grammar) = EBNFParser::new(&fs::read_to_string(grammar_file).unwrap(), "start").parse()
+//     else {
+//         panic!()
+//     };
+
+//     let Ok(lexer) = Lexer::new(&grammar.terminals, &grammar.ignore_terminals) else {
+//         panic!()
+//     };
+
+//     let Ok(parser) = Parser::new(&grammar) else {
+//         panic!()
+//     };
+
+//     let tokens_ids = encoding.get_ids();
+
+//     for (idx, token) in tokens_ids.iter().enumerate() {
+//         let mut sequence_so_far: Vec<u8> = Vec::new();
+
+//         for token in &encoding.get_tokens()[..idx] {
+//             sequence_so_far.extend(restore_bytes(token));
+//         }
+
+//         let Ok((terminals, remainder)) = lexer.lex(&sequence_so_far[..]) else {
+//             panic!()
+//         };
+
+//         let Ok(accept_sequences) = parser.parse(&terminals, &remainder) else {
+//             panic!()
+//         };
+
+//         println!("{:#?}", accept_sequences);
+
+//         let mask = grammar_mask(
+//             &accept_sequences,
+//             &remainder,
+//             &store,
+//             &byte_tokens,
+//             &grammar,
+//         );
+
+//         // assert!(
+//         //     mask[token.as_usize()],
+//         println!(
+//             "Mask value: {}\nToken: {} {}\nIter: {}",
+//             mask[token.as_usize()],
+//             tokenizer.decode(&[*token], false).unwrap(),
+//             token,
+//             idx
+//         );
+//     }
+// }
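
The commented-out harness only checks, token by token, that each reference token of the JSON sample is allowed by the grammar mask; it never samples from a model. For context, below is a minimal, self-contained sketch of how a boolean mask like the one returned by grammar_mask is typically used at decode time: tokens the grammar forbids are excluded before the next token id is chosen. The helper name masked_argmax and the plain slice-based logits are illustrative assumptions for this sketch, not APIs of syncode_core or candle.

// Illustrative sketch only: applying a boolean grammar mask during decoding.
// `masked_argmax` is a hypothetical helper, not part of this crate.
fn masked_argmax(logits: &[f32], mask: &[bool]) -> Option<usize> {
    let mut best: Option<(usize, f32)> = None;
    for (idx, (&logit, &allowed)) in logits.iter().zip(mask.iter()).enumerate() {
        // Skip every token id the grammar forbids.
        if !allowed {
            continue;
        }
        // Greedy selection among the remaining, grammatically valid tokens.
        if best.map_or(true, |(_, b)| logit > b) {
            best = Some((idx, logit));
        }
    }
    best.map(|(idx, _)| idx)
}

fn main() {
    // Toy vocabulary of four token ids; the grammar only allows ids 1 and 3.
    let logits = [2.5_f32, 1.0, 3.0, 0.5];
    let mask = [false, true, false, true];

    // Token 2 has the highest raw logit but is masked out, so token 1 is chosen.
    assert_eq!(masked_argmax(&logits, &mask), Some(1));
    println!("next token id: {:?}", masked_argmax(&logits, &mask));
}

In a real generation loop the same effect is usually achieved by setting masked-out logits to f32::NEG_INFINITY before softmax or sampling, so any decoding strategy (not just greedy argmax) respects the grammar.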