Skip to content

Commit 2dd0db9

Browse files
authored
feat: allow zero or multiple amino acids as cleavage restrictions (#213)
1 parent 65666b8 commit 2dd0db9

File tree

5 files changed

+55
-41
lines changed

5 files changed

+55
-41
lines changed

DOCS.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ For additional information about configuration options and output file formats,
160160
"min_len": 5, // Optional[int] {default=5}, Minimum AA length of peptides to search
161161
"max_len": 50, // Optional[int] {default=50}, Maximum AA length of peptides to search
162162
"cleave_at": "KR", // Optional[str] {default='KR'}. Amino acids to cleave at
163-
"restrict": "P", // Optional[char/single AA] {default='P'}. Do not cleave if this AA follows the cleavage site
163+
"restrict": "P", // Optional[str] {default='P'}. Do not cleave if one of these AAs follows the cleavage site
164164
"c_terminal": false, // Optional[bool] {default=true}. Cleave at c terminus of matching amino acid
165165
"semi_enzymatic": false // Optional[bool] {default=false}. Generate semi-enzymatic peptides
166166
},
@@ -270,7 +270,7 @@ The enzyme section contains parameters related to the enzyme used for digestion.
270270
- **min_len**: Integer. The minimum amino acid (AA) length of peptides to search (default: 5).
271271
- **max_len**: Integer. The maximum AA length of peptides to search (default: 50).
272272
- **cleave_at**: String. Amino acids to cleave at (default: 'KR').
273-
- **restrict**: Single character string. Do not cleave if this amino acid follows the cleavage site (default: 'P').
273+
- **restrict**: String. Do not cleave if one of these amino acids follows the cleavage site (default: 'P').
274274
- **c_terminal**: Boolean. Cleave at the C-terminus of matching amino acids (default: true).
275275

276276
Example:

crates/sage-cli/src/input.rs

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -357,6 +357,8 @@ impl Input {
357357

358358
#[cfg(test)]
359359
mod test {
360+
361+
360362
use sage_core::{database::EnzymeBuilder, enzyme::EnzymeParameters};
361363

362364
#[test]
@@ -368,12 +370,22 @@ mod test {
368370
"cleave_at": "KR",
369371
"restrict": "P",
370372
}))?;
373+
let c: EnzymeBuilder = serde_json::from_value(serde_json::json!({
374+
"cleave_at": "KR",
375+
"restrict": "",
376+
}))?;
371377

372378
let a: EnzymeParameters = a.into();
373379
let b: EnzymeParameters = b.into();
380+
let c: EnzymeParameters = c.into();
374381

375-
assert_eq!(a.enyzme.and_then(|e| e.skip_suffix), None);
376-
assert_eq!(b.enyzme.and_then(|e| e.skip_suffix), Some('P'));
382+
assert_eq!(a.enzyme.map(|e| e.skip_suffix), Some([false; 26]));
383+
{
384+
let mut expected = [false; 26];
385+
expected[(b'P' - b'A') as usize] = true;
386+
assert_eq!(b.enzyme.map(|e| e.skip_suffix), Some(expected));
387+
}
388+
assert_eq!(c.enzyme.map(|e| e.skip_suffix), Some([false; 26]));
377389

378390
Ok(())
379391
}

crates/sage/src/database.rs

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@ pub struct EnzymeBuilder {
2121
/// Maximum peptide length that will be fragmented
2222
pub max_len: Option<usize>,
2323
pub cleave_at: Option<String>,
24-
pub restrict: Option<char>,
24+
pub restrict: Option<String>,
2525
pub c_terminal: Option<bool>,
2626
pub semi_enzymatic: Option<bool>,
2727
}
@@ -33,7 +33,7 @@ impl Default for EnzymeBuilder {
3333
min_len: Some(5),
3434
max_len: Some(50),
3535
cleave_at: Some("KR".into()),
36-
restrict: Some('P'),
36+
restrict: Some("P".into()),
3737
c_terminal: Some(true),
3838
semi_enzymatic: Some(false),
3939
}
@@ -46,9 +46,9 @@ impl From<EnzymeBuilder> for EnzymeParameters {
4646
missed_cleavages: en.missed_cleavages.unwrap_or(1),
4747
min_len: en.min_len.unwrap_or(5),
4848
max_len: en.max_len.unwrap_or(50),
49-
enyzme: Enzyme::new(
49+
enzyme: Enzyme::new(
5050
&en.cleave_at.unwrap_or_else(|| "KR".into()),
51-
en.restrict,
51+
&en.restrict.unwrap_or_else(|| "".into()),
5252
en.c_terminal.unwrap_or(true),
5353
en.semi_enzymatic.unwrap_or(false),
5454
),

crates/sage/src/enzyme.rs

Lines changed: 33 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -122,13 +122,13 @@ pub struct EnzymeParameters {
122122
pub min_len: usize,
123123
/// Inclusive
124124
pub max_len: usize,
125-
pub enyzme: Option<Enzyme>,
125+
pub enzyme: Option<Enzyme>,
126126
}
127127

128128
#[derive(Clone)]
129129
pub struct Enzyme {
130-
// Skip cleaving if the site is followed matching this AA
131-
pub skip_suffix: Option<char>,
130+
// Skip cleaving if the site is followed by one of these AAs
131+
pub skip_suffix: [bool; 26],
132132
// Regex for matching cleavage sites
133133
regex: Regex,
134134
// Cleave at c-terminal?
@@ -150,7 +150,7 @@ pub struct DigestSite {
150150
impl Enzyme {
151151
pub fn new(
152152
cleave: &str,
153-
skip_suffix: Option<char>,
153+
skip_suffix: &str,
154154
c_terminal: bool,
155155
semi_enzymatic: bool,
156156
) -> Option<Self> {
@@ -160,27 +160,31 @@ impl Enzyme {
160160
cleave
161161
);
162162
assert!(
163-
skip_suffix
164-
.map(|x| VALID_AA.contains(&(x as u8)))
165-
.unwrap_or(true),
166-
"Enzyme cleavage restriction is non-amino acid character: {}",
167-
skip_suffix.unwrap(),
163+
skip_suffix.chars().all(|x| VALID_AA.contains(&(x as u8))),
164+
"Enzyme cleavage restriction contains non-amino acid characters: {}",
165+
skip_suffix,
168166
);
169167

170168
// At this point, cleave can be three things: empty, "$", or a string of valid AA's
171169
match cleave {
172170
"" => None,
173171
"$" => Some(Enzyme {
174172
regex: Regex::new("$").unwrap(),
175-
skip_suffix: None,
173+
skip_suffix: [false; 26],
176174
// Allowing this to be set to false could cause unexpected behavior
177175
c_terminal: true,
178176
// Do not allow strange behavior
179177
semi_enzymatic: false,
180178
}),
181179
_ => Some(Enzyme {
182180
regex: Regex::new(&format!("[{}]", cleave.replace('?', ""))).unwrap(),
183-
skip_suffix,
181+
skip_suffix: {
182+
let mut arr = [false; 26];
183+
for b in skip_suffix.bytes() {
184+
arr[(b - b'A') as usize] = true;
185+
}
186+
arr
187+
},
184188
c_terminal,
185189
semi_enzymatic,
186190
}),
@@ -195,10 +199,8 @@ impl Enzyme {
195199
true => mat.end(),
196200
false => mat.start(),
197201
};
198-
if let Some(skip) = self.skip_suffix {
199-
if right < sequence.len() && sequence[right..].starts_with(skip) {
200-
continue;
201-
}
202+
if sequence.as_bytes().get(right).map_or(false, |b| self.skip_suffix[(b - b'A') as usize]) {
203+
continue;
202204
}
203205
sites.push(DigestSite {
204206
site: left..right,
@@ -218,7 +220,7 @@ impl Enzyme {
218220

219221
impl EnzymeParameters {
220222
pub fn cleavage_sites(&self, sequence: &str) -> Vec<DigestSite> {
221-
match &self.enyzme {
223+
match &self.enzyme {
222224
Some(enzyme) => enzyme.cleavage_sites(sequence),
223225
None => {
224226
// Perform a non-specific digest
@@ -260,7 +262,7 @@ impl EnzymeParameters {
260262
}
261263

262264
fn is_semi_enzymatic(&self) -> bool {
263-
match &self.enyzme {
265+
match &self.enzyme {
264266
Some(enzyme) => enzyme.semi_enzymatic,
265267
None => false,
266268
}
@@ -297,7 +299,7 @@ impl EnzymeParameters {
297299
let mut sites = self.cleavage_sites(sequence);
298300
// Allowing missed_cleavages with non-specific digest causes OOB panics
299301
// in the below indexing code
300-
let missed_cleavages = match self.enyzme {
302+
let missed_cleavages = match self.enzyme {
301303
None => 0,
302304
_ => self.missed_cleavages,
303305
};
@@ -420,7 +422,7 @@ mod test {
420422
min_len: 2,
421423
max_len: 50,
422424
missed_cleavages: 0,
423-
enyzme: Enzyme::new("KR", Some('P'), true, false),
425+
enzyme: Enzyme::new("KR", "P", true, false),
424426
};
425427

426428
assert_eq!(
@@ -453,7 +455,7 @@ mod test {
453455
min_len: 0,
454456
max_len: 50,
455457
missed_cleavages: 1,
456-
enyzme: Enzyme::new("KR", Some('P'), true, false),
458+
enzyme: Enzyme::new("KR", "P", true, false),
457459
};
458460

459461
assert_eq!(
@@ -490,7 +492,7 @@ mod test {
490492
min_len: 0,
491493
max_len: 50,
492494
missed_cleavages: 2,
493-
enyzme: Enzyme::new("KR", Some('P'), true, false),
495+
enzyme: Enzyme::new("KR", "P", true, false),
494496
};
495497

496498
assert_eq!(
@@ -518,7 +520,7 @@ mod test {
518520
min_len: 2,
519521
max_len: 50,
520522
missed_cleavages: 0,
521-
enyzme: Enzyme::new("KR", None, true, false),
523+
enzyme: Enzyme::new("KR", "", true, false),
522524
};
523525

524526
assert_eq!(
@@ -539,7 +541,7 @@ mod test {
539541
min_len: 1,
540542
max_len: 50,
541543
missed_cleavages: 0,
542-
enyzme: Enzyme::new("D", None, false, false),
544+
enzyme: Enzyme::new("D", "", false, false),
543545
};
544546

545547
assert_eq!(
@@ -568,7 +570,7 @@ mod test {
568570
min_len: 1,
569571
max_len: 50,
570572
missed_cleavages: 0,
571-
enyzme: Enzyme::new("FYWL", None, true, false),
573+
enzyme: Enzyme::new("FYWL", "", true, false),
572574
};
573575

574576
assert_eq!(
@@ -594,7 +596,7 @@ mod test {
594596
min_len: 5,
595597
max_len: 5,
596598
missed_cleavages: 0,
597-
enyzme: None,
599+
enzyme: None,
598600
};
599601

600602
assert_eq!(
@@ -623,7 +625,7 @@ mod test {
623625
min_len: 5,
624626
max_len: 7,
625627
missed_cleavages: 0,
626-
enyzme: Enzyme::new("", None, true, false),
628+
enzyme: Enzyme::new("", "", true, false),
627629
};
628630

629631
assert_eq!(
@@ -644,7 +646,7 @@ mod test {
644646
min_len: 0,
645647
max_len: usize::MAX,
646648
missed_cleavages: 0,
647-
enyzme: Enzyme::new("$", None, true, false),
649+
enzyme: Enzyme::new("$", "", true, false),
648650
};
649651

650652
assert_eq!(
@@ -665,7 +667,7 @@ mod test {
665667
min_len: 2,
666668
max_len: usize::MAX,
667669
missed_cleavages: 0,
668-
enyzme: Enzyme::new("KR", None, true, false),
670+
enzyme: Enzyme::new("KR", "", true, false),
669671
};
670672

671673
assert_eq!(
@@ -688,7 +690,7 @@ mod test {
688690
min_len: 2,
689691
max_len: 50,
690692
missed_cleavages: 0,
691-
enyzme: Enzyme::new("KR", None, true, true),
693+
enzyme: Enzyme::new("KR", "P", true, true),
692694
};
693695

694696
assert_eq!(
@@ -738,7 +740,7 @@ mod test {
738740
min_len: 3,
739741
max_len: 50,
740742
missed_cleavages: 1,
741-
enyzme: Enzyme::new("KR", None, true, true),
743+
enzyme: Enzyme::new("KR", "P", true, true),
742744
};
743745

744746
for (digest, expected) in tryp
@@ -794,7 +796,7 @@ mod test {
794796
min_len: 3,
795797
max_len: 50,
796798
missed_cleavages: 2,
797-
enyzme: Enzyme::new("KR", None, true, true),
799+
enzyme: Enzyme::new("KR", "", true, true),
798800
};
799801

800802
for digest in tryp.digest(&sequence, Arc::default()) {

crates/sage/src/peptide.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,7 @@ mod test {
433433
min_len: 0,
434434
max_len: 50,
435435
missed_cleavages: 0,
436-
enyzme: Enzyme::new("KR", Some('P'), true, false),
436+
enzyme: Enzyme::new("KR", "P", true, false),
437437
};
438438

439439
let peptides = tryp
@@ -625,7 +625,7 @@ mod test {
625625
missed_cleavages: 0,
626626
min_len: 3,
627627
max_len: 30,
628-
enyzme: Enzyme::new("KR", Some('P'), true, false),
628+
enzyme: Enzyme::new("KR", "P", true, false),
629629
};
630630

631631
let fwd = "MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGN";

0 commit comments

Comments
 (0)