From 91fa3f4bb507dfd5d5733b8b780c1e83ed9f482b Mon Sep 17 00:00:00 2001 From: Ali Alimohammadi Date: Thu, 4 Dec 2025 18:48:46 -0800 Subject: [PATCH 1/4] feat: add Smith-Waterman algorithm for local sequence alignment --- DIRECTORY.md | 1 + src/dynamic_programming/mod.rs | 2 + src/dynamic_programming/smith_waterman.rs | 346 ++++++++++++++++++++++ 3 files changed, 349 insertions(+) create mode 100644 src/dynamic_programming/smith_waterman.rs diff --git a/DIRECTORY.md b/DIRECTORY.md index 1b8c4b0c316..16359ba1f8e 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -103,6 +103,7 @@ * [Minimum Cost Path](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/minimum_cost_path.rs) * [Optimal Bst](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/optimal_bst.rs) * [Rod Cutting](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/rod_cutting.rs) + * [Smith-Waterman](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/smith_waterman.rs) * [Snail](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/snail.rs) * [Subset Generation](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_generation.rs) * [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs) diff --git a/src/dynamic_programming/mod.rs b/src/dynamic_programming/mod.rs index b3dc169201a..1bad6b4018f 100644 --- a/src/dynamic_programming/mod.rs +++ b/src/dynamic_programming/mod.rs @@ -14,6 +14,7 @@ mod maximum_subarray; mod minimum_cost_path; mod optimal_bst; mod rod_cutting; +mod smith_waterman; mod snail; mod subset_generation; mod subset_sum; @@ -45,6 +46,7 @@ pub use self::maximum_subarray::maximum_subarray; pub use self::minimum_cost_path::minimum_cost_path; pub use self::optimal_bst::optimal_search_tree; pub use self::rod_cutting::rod_cut; +pub use self::smith_waterman::{score_function, smith_waterman, traceback}; pub use self::snail::snail; pub use self::subset_generation::list_subset; pub use self::subset_sum::is_sum_subset; diff --git a/src/dynamic_programming/smith_waterman.rs b/src/dynamic_programming/smith_waterman.rs new file mode 100644 index 00000000000..9ab03c5069a --- /dev/null +++ b/src/dynamic_programming/smith_waterman.rs @@ -0,0 +1,346 @@ +//! This module contains the Smith-Waterman algorithm implementation for local sequence alignment. +//! +//! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining +//! similar regions between two sequences (nucleotide or protein sequences). It is particularly +//! useful in bioinformatics for identifying optimal local alignments. +//! +//! # Algorithm Overview +//! +//! The algorithm works by: +//! 1. Creating a scoring matrix where each cell represents the maximum alignment score +//! ending at that position +//! 2. Using match, mismatch, and gap penalties to calculate scores +//! 3. Allowing scores to reset to 0 (ensuring local rather than global alignment) +//! 4. Tracing back from the highest scoring position to reconstruct the alignment +//! +//! # Time Complexity +//! +//! O(m * n) where m and n are the lengths of the two sequences +//! +//! # Space Complexity +//! +//! O(m * n) for the scoring matrix +//! +//! # References +//! +//! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5) +//! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) + +use std::cmp::max; + +/// Calculates the score for a character pair based on match, mismatch, or gap scoring. +/// +/// # Arguments +/// +/// * `source_char` - Character from the source sequence +/// * `target_char` - Character from the target sequence +/// * `match_score` - Score awarded for matching characters (typically positive) +/// * `mismatch_score` - Score penalty for mismatching characters (typically negative) +/// * `gap_score` - Score penalty for gaps (typically negative) +/// +/// # Returns +/// +/// The calculated score for the character pair +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::score_function; +/// +/// let score = score_function('A', 'A', 1, -1, -2); +/// assert_eq!(score, 1); // Match +/// +/// let score = score_function('A', 'C', 1, -1, -2); +/// assert_eq!(score, -1); // Mismatch +/// +/// let score = score_function('-', 'A', 1, -1, -2); +/// assert_eq!(score, -2); // Gap +/// ``` +pub fn score_function( + source_char: char, + target_char: char, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> i32 { + if source_char == '-' || target_char == '-' { + gap_score + } else if source_char == target_char { + match_score + } else { + mismatch_score + } +} + +/// Performs the Smith-Waterman local sequence alignment algorithm. +/// +/// This function creates a scoring matrix using dynamic programming to find the +/// optimal local alignment between two sequences. The algorithm is case-insensitive. +/// +/// # Arguments +/// +/// * `query` - The query sequence (e.g., DNA, protein) +/// * `subject` - The subject sequence to align against +/// * `match_score` - Score for matching characters (default: 1) +/// * `mismatch_score` - Penalty for mismatching characters (default: -1) +/// * `gap_score` - Penalty for gaps/indels (default: -2) +/// +/// # Returns +/// +/// A 2D vector representing the dynamic programming scoring matrix +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::smith_waterman; +/// +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); +/// assert_eq!(score_matrix.len(), 5); // query length + 1 +/// assert_eq!(score_matrix[0].len(), 3); // subject length + 1 +/// ``` +pub fn smith_waterman( + query: &str, + subject: &str, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> Vec> { + let query_upper: Vec = query.to_uppercase().chars().collect(); + let subject_upper: Vec = subject.to_uppercase().chars().collect(); + + let m = query_upper.len(); + let n = subject_upper.len(); + + // Initialize scoring matrix with zeros + let mut score = vec![vec![0; n + 1]; m + 1]; + + // Fill the scoring matrix using dynamic programming + for i in 1..=m { + for j in 1..=n { + // Calculate score for match/mismatch + let match_or_mismatch = score[i - 1][j - 1] + + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ); + + // Calculate score for deletion (gap in subject) + let delete = score[i - 1][j] + gap_score; + + // Calculate score for insertion (gap in query) + let insert = score[i][j - 1] + gap_score; + + // Take maximum of all options, but never go below 0 (local alignment) + score[i][j] = max(0, max(match_or_mismatch, max(delete, insert))); + } + } + + score +} + +/// Performs traceback on the Smith-Waterman score matrix to reconstruct the optimal alignment. +/// +/// This function starts from the highest scoring cell and traces back through the matrix +/// to reconstruct the aligned sequences. The traceback stops when a cell with score 0 +/// is encountered. +/// +/// # Arguments +/// +/// * `score` - The score matrix from the Smith-Waterman algorithm +/// * `query` - Original query sequence used in alignment +/// * `subject` - Original subject sequence used in alignment +/// +/// # Returns +/// +/// A String containing the two aligned sequences separated by a newline, +/// or an empty string if no significant alignment is found +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback}; +/// +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); +/// let alignment = traceback(&score_matrix, "ACAC", "CA"); +/// assert_eq!(alignment, "CA\nCA"); +/// ``` +pub fn traceback(score: &[Vec], query: &str, subject: &str) -> String { + let query_upper: Vec = query.to_uppercase().chars().collect(); + let subject_upper: Vec = subject.to_uppercase().chars().collect(); + + // Find the cell with maximum score + let mut max_value = i32::MIN; + let (mut i_max, mut j_max) = (0, 0); + + for (i, row) in score.iter().enumerate() { + for (j, &value) in row.iter().enumerate() { + if value > max_value { + max_value = value; + i_max = i; + j_max = j; + } + } + } + + // If no significant alignment found, return empty string + if i_max == 0 || j_max == 0 { + return String::new(); + } + + // Traceback from the maximum scoring cell + let (mut i, mut j) = (i_max, j_max); + let mut align1 = String::new(); + let mut align2 = String::new(); + + // Continue tracing back until we hit a cell with score 0 + while i > 0 && j > 0 && score[i][j] > 0 { + let current_score = score[i][j]; + + // Check if we came from diagonal (match/mismatch) + if current_score + == score[i - 1][j - 1] + + score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2) + { + align1.insert(0, query_upper[i - 1]); + align2.insert(0, subject_upper[j - 1]); + i -= 1; + j -= 1; + } + // Check if we came from above (deletion/gap in subject) + else if current_score == score[i - 1][j] - 2 { + align1.insert(0, query_upper[i - 1]); + align2.insert(0, '-'); + i -= 1; + } + // Otherwise we came from left (insertion/gap in query) + else { + align1.insert(0, '-'); + align2.insert(0, subject_upper[j - 1]); + j -= 1; + } + } + + format!("{align1}\n{align2}") +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! smith_waterman_tests { + ($($name:ident: $test_cases:expr,)*) => { + $( + #[test] + fn $name() { + let (query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases; + assert_eq!(smith_waterman(query, subject, match_score, mismatch_score, gap_score), expected); + } + )* + } + } + + macro_rules! traceback_tests { + ($($name:ident: $test_cases:expr,)*) => { + $( + #[test] + fn $name() { + let (score, query, subject, expected) = $test_cases; + assert_eq!(traceback(&score, query, subject), expected); + } + )* + } + } + + smith_waterman_tests! { + test_acac_ca: ("ACAC", "CA", 1, -1, -2, vec![ + vec![0, 0, 0], + vec![0, 0, 1], + vec![0, 1, 0], + vec![0, 0, 2], + vec![0, 1, 0], + ]), + test_agt_agt: ("AGT", "AGT", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + vec![0, 0, 0, 3], + ]), + test_agt_gta: ("AGT", "GTA", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 0, 0, 1], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + ]), + test_agt_g: ("AGT", "G", 1, -1, -2, vec![ + vec![0, 0], + vec![0, 0], + vec![0, 1], + vec![0, 0], + ]), + test_g_agt: ("G", "AGT", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 0, 1, 0], + ]), + test_empty_query: ("", "CA", 1, -1, -2, vec![vec![0, 0, 0]]), + test_empty_subject: ("ACAC", "", 1, -1, -2, vec![vec![0], vec![0], vec![0], vec![0], vec![0]]), + test_both_empty: ("", "", 1, -1, -2, vec![vec![0]]), + } + + traceback_tests! { + test_traceback_acac_ca: ( + vec![ + vec![0, 0, 0], + vec![0, 0, 1], + vec![0, 1, 0], + vec![0, 0, 2], + vec![0, 1, 0], + ], + "ACAC", + "CA", + "CA\nCA", + ), + test_traceback_agt_agt: ( + vec![ + vec![0, 0, 0, 0], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + vec![0, 0, 0, 3], + ], + "AGT", + "AGT", + "AGT\nAGT", + ), + test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""), + } + + #[test] + fn test_score_function_match() { + assert_eq!(score_function('A', 'A', 1, -1, -2), 1); + assert_eq!(score_function('G', 'G', 2, -1, -1), 2); + } + + #[test] + fn test_score_function_mismatch() { + assert_eq!(score_function('A', 'C', 1, -1, -2), -1); + assert_eq!(score_function('G', 'T', 1, -2, -1), -2); + } + + #[test] + fn test_score_function_gap() { + assert_eq!(score_function('-', 'A', 1, -1, -2), -2); + assert_eq!(score_function('A', '-', 1, -1, -2), -2); + } + + #[test] + fn test_case_insensitive() { + let result1 = smith_waterman("acac", "CA", 1, -1, -2); + let result2 = smith_waterman("ACAC", "ca", 1, -1, -2); + let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2); + + assert_eq!(result1, result2); + assert_eq!(result2, result3); + } +} From ca1dd1ea9d9ff02e35bc9022a2cf349ba31bdedd Mon Sep 17 00:00:00 2001 From: Ali Alimohammadi <41567902+AliAlimohammadi@users.noreply.github.com> Date: Thu, 4 Dec 2025 22:19:53 -0800 Subject: [PATCH 2/4] Update DIRECTORY.md to fix ordering. --- DIRECTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIRECTORY.md b/DIRECTORY.md index 16359ba1f8e..bd76d7d153a 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -106,8 +106,8 @@ * [Smith-Waterman](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/smith_waterman.rs) * [Snail](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/snail.rs) * [Subset Generation](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_generation.rs) - * [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs) * [Subset Sum](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_sum.rs) + * [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs) * [Trapped Rainwater](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/trapped_rainwater.rs) * [Word Break](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/word_break.rs) * Financial From 2eea7a411e856bc6711f7d4c461368ea501626cd Mon Sep 17 00:00:00 2001 From: Ali Alimohammadi <41567902+AliAlimohammadi@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:18:00 -0800 Subject: [PATCH 3/4] Fix: address Copilot feedback - custom scoring, efficiency, and edge cases. --- src/dynamic_programming/smith_waterman.rs | 178 +++++++++++++++------- 1 file changed, 124 insertions(+), 54 deletions(-) diff --git a/src/dynamic_programming/smith_waterman.rs b/src/dynamic_programming/smith_waterman.rs index 9ab03c5069a..93e8b5439ca 100644 --- a/src/dynamic_programming/smith_waterman.rs +++ b/src/dynamic_programming/smith_waterman.rs @@ -1,11 +1,11 @@ //! This module contains the Smith-Waterman algorithm implementation for local sequence alignment. -//! +//! //! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining //! similar regions between two sequences (nucleotide or protein sequences). It is particularly //! useful in bioinformatics for identifying optimal local alignments. //! //! # Algorithm Overview -//! +//! //! The algorithm works by: //! 1. Creating a scoring matrix where each cell represents the maximum alignment score //! ending at that position @@ -14,15 +14,15 @@ //! 4. Tracing back from the highest scoring position to reconstruct the alignment //! //! # Time Complexity -//! +//! //! O(m * n) where m and n are the lengths of the two sequences //! //! # Space Complexity -//! +//! //! O(m * n) for the scoring matrix //! //! # References -//! +//! //! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5) //! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) @@ -46,7 +46,7 @@ use std::cmp::max; /// /// ``` /// use the_algorithms_rust::dynamic_programming::score_function; -/// +/// /// let score = score_function('A', 'A', 1, -1, -2); /// assert_eq!(score, 1); // Match /// @@ -93,7 +93,7 @@ pub fn score_function( /// /// ``` /// use the_algorithms_rust::dynamic_programming::smith_waterman; -/// +/// /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); /// assert_eq!(score_matrix.len(), 5); // query length + 1 /// assert_eq!(score_matrix[0].len(), 3); // subject length + 1 @@ -107,37 +107,36 @@ pub fn smith_waterman( ) -> Vec> { let query_upper: Vec = query.to_uppercase().chars().collect(); let subject_upper: Vec = subject.to_uppercase().chars().collect(); - + let m = query_upper.len(); let n = subject_upper.len(); - + // Initialize scoring matrix with zeros let mut score = vec![vec![0; n + 1]; m + 1]; - + // Fill the scoring matrix using dynamic programming for i in 1..=m { for j in 1..=n { // Calculate score for match/mismatch - let match_or_mismatch = score[i - 1][j - 1] - + score_function( - query_upper[i - 1], - subject_upper[j - 1], - match_score, - mismatch_score, - gap_score, - ); - + let match_or_mismatch = score[i - 1][j - 1] + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ); + // Calculate score for deletion (gap in subject) let delete = score[i - 1][j] + gap_score; - + // Calculate score for insertion (gap in query) let insert = score[i][j - 1] + gap_score; - + // Take maximum of all options, but never go below 0 (local alignment) score[i][j] = max(0, max(match_or_mismatch, max(delete, insert))); } } - + score } @@ -152,6 +151,9 @@ pub fn smith_waterman( /// * `score` - The score matrix from the Smith-Waterman algorithm /// * `query` - Original query sequence used in alignment /// * `subject` - Original subject sequence used in alignment +/// * `match_score` - Score for matching characters (should match smith_waterman call) +/// * `mismatch_score` - Score for mismatching characters (should match smith_waterman call) +/// * `gap_score` - Penalty for gaps (should match smith_waterman call) /// /// # Returns /// @@ -162,19 +164,26 @@ pub fn smith_waterman( /// /// ``` /// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback}; -/// +/// /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); -/// let alignment = traceback(&score_matrix, "ACAC", "CA"); +/// let alignment = traceback(&score_matrix, "ACAC", "CA", 1, -1, -2); /// assert_eq!(alignment, "CA\nCA"); /// ``` -pub fn traceback(score: &[Vec], query: &str, subject: &str) -> String { +pub fn traceback( + score: &[Vec], + query: &str, + subject: &str, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> String { let query_upper: Vec = query.to_uppercase().chars().collect(); let subject_upper: Vec = subject.to_uppercase().chars().collect(); - + // Find the cell with maximum score - let mut max_value = i32::MIN; + let mut max_value = 0; let (mut i_max, mut j_max) = (0, 0); - + for (i, row) in score.iter().enumerate() { for (j, &value) in row.iter().enumerate() { if value > max_value { @@ -184,46 +193,57 @@ pub fn traceback(score: &[Vec], query: &str, subject: &str) -> String { } } } - + // If no significant alignment found, return empty string - if i_max == 0 || j_max == 0 { + if max_value <= 0 { return String::new(); } - + // Traceback from the maximum scoring cell let (mut i, mut j) = (i_max, j_max); - let mut align1 = String::new(); - let mut align2 = String::new(); - + let mut align1 = Vec::new(); + let mut align2 = Vec::new(); + // Continue tracing back until we hit a cell with score 0 while i > 0 && j > 0 && score[i][j] > 0 { let current_score = score[i][j]; - + // Check if we came from diagonal (match/mismatch) - if current_score - == score[i - 1][j - 1] - + score_function(query_upper[i - 1], subject_upper[j - 1], 1, -1, -2) - { - align1.insert(0, query_upper[i - 1]); - align2.insert(0, subject_upper[j - 1]); + if current_score == score[i - 1][j - 1] + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ) { + align1.push(query_upper[i - 1]); + align2.push(subject_upper[j - 1]); i -= 1; j -= 1; - } + } // Check if we came from above (deletion/gap in subject) - else if current_score == score[i - 1][j] - 2 { - align1.insert(0, query_upper[i - 1]); - align2.insert(0, '-'); + else if current_score == score[i - 1][j] + gap_score { + align1.push(query_upper[i - 1]); + align2.push('-'); i -= 1; - } + } // Otherwise we came from left (insertion/gap in query) else { - align1.insert(0, '-'); - align2.insert(0, subject_upper[j - 1]); + align1.push('-'); + align2.push(subject_upper[j - 1]); j -= 1; } } - - format!("{align1}\n{align2}") + + // Reverse the sequences (we built them backwards) + align1.reverse(); + align2.reverse(); + + format!( + "{}\n{}", + align1.into_iter().collect::(), + align2.into_iter().collect::() + ) } #[cfg(test)] @@ -247,8 +267,8 @@ mod tests { $( #[test] fn $name() { - let (score, query, subject, expected) = $test_cases; - assert_eq!(traceback(&score, query, subject), expected); + let (score, query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases; + assert_eq!(traceback(&score, query, subject, match_score, mismatch_score, gap_score), expected); } )* } @@ -300,6 +320,7 @@ mod tests { ], "ACAC", "CA", + 1, -1, -2, "CA\nCA", ), test_traceback_agt_agt: ( @@ -311,9 +332,23 @@ mod tests { ], "AGT", "AGT", + 1, -1, -2, "AGT\nAGT", ), - test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", ""), + test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", 1, -1, -2, ""), + test_traceback_custom_scoring: ( + vec![ + vec![0, 0, 0], + vec![0, 0, 2], + vec![0, 2, 0], + vec![0, 0, 4], + vec![0, 2, 0], + ], + "ACAC", + "CA", + 2, -2, -3, + "CA\nCA", + ), } #[test] @@ -339,8 +374,43 @@ mod tests { let result1 = smith_waterman("acac", "CA", 1, -1, -2); let result2 = smith_waterman("ACAC", "ca", 1, -1, -2); let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2); - + assert_eq!(result1, result2); assert_eq!(result2, result3); } + + #[test] + fn test_custom_scoring_end_to_end() { + // Test with custom scoring parameters (match=2, mismatch=-2, gap=-3) + let query = "ACGT"; + let subject = "ACGT"; + let match_score = 2; + let mismatch_score = -2; + let gap_score = -3; + + // Generate score matrix with custom parameters + let score_matrix = smith_waterman(query, subject, match_score, mismatch_score, gap_score); + + // Traceback using the same custom parameters + let alignment = traceback(&score_matrix, query, subject, match_score, mismatch_score, gap_score); + + // With perfect match and match_score=2, we expect alignment "ACGT\nACGT" + assert_eq!(alignment, "ACGT\nACGT"); + + // Verify the score is correct (4 matches × 2 = 8) + assert_eq!(score_matrix[4][4], 8); + } + + #[test] + fn test_alignment_at_boundary() { + // Test case where optimal alignment might be at row/column 0 + let query = "A"; + let subject = "A"; + let score_matrix = smith_waterman(query, subject, 1, -1, -2); + let alignment = traceback(&score_matrix, query, subject, 1, -1, -2); + + // Should find the alignment even though it's near the boundary + assert_eq!(alignment, "A\nA"); + assert_eq!(score_matrix[1][1], 1); + } } From 7a849eea50ba712d265a58f97fb03d202945f02b Mon Sep 17 00:00:00 2001 From: Ali Alimohammadi <41567902+AliAlimohammadi@users.noreply.github.com> Date: Fri, 5 Dec 2025 15:22:28 -0800 Subject: [PATCH 4/4] Fix: cargo fmt --- src/dynamic_programming/smith_waterman.rs | 103 ++++++++++++---------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/src/dynamic_programming/smith_waterman.rs b/src/dynamic_programming/smith_waterman.rs index 93e8b5439ca..3bd7fef2896 100644 --- a/src/dynamic_programming/smith_waterman.rs +++ b/src/dynamic_programming/smith_waterman.rs @@ -1,11 +1,11 @@ //! This module contains the Smith-Waterman algorithm implementation for local sequence alignment. -//! +//! //! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining //! similar regions between two sequences (nucleotide or protein sequences). It is particularly //! useful in bioinformatics for identifying optimal local alignments. //! //! # Algorithm Overview -//! +//! //! The algorithm works by: //! 1. Creating a scoring matrix where each cell represents the maximum alignment score //! ending at that position @@ -14,15 +14,15 @@ //! 4. Tracing back from the highest scoring position to reconstruct the alignment //! //! # Time Complexity -//! +//! //! O(m * n) where m and n are the lengths of the two sequences //! //! # Space Complexity -//! +//! //! O(m * n) for the scoring matrix //! //! # References -//! +//! //! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5) //! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) @@ -46,7 +46,7 @@ use std::cmp::max; /// /// ``` /// use the_algorithms_rust::dynamic_programming::score_function; -/// +/// /// let score = score_function('A', 'A', 1, -1, -2); /// assert_eq!(score, 1); // Match /// @@ -93,7 +93,7 @@ pub fn score_function( /// /// ``` /// use the_algorithms_rust::dynamic_programming::smith_waterman; -/// +/// /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); /// assert_eq!(score_matrix.len(), 5); // query length + 1 /// assert_eq!(score_matrix[0].len(), 3); // subject length + 1 @@ -107,36 +107,37 @@ pub fn smith_waterman( ) -> Vec> { let query_upper: Vec = query.to_uppercase().chars().collect(); let subject_upper: Vec = subject.to_uppercase().chars().collect(); - + let m = query_upper.len(); let n = subject_upper.len(); - + // Initialize scoring matrix with zeros let mut score = vec![vec![0; n + 1]; m + 1]; - + // Fill the scoring matrix using dynamic programming for i in 1..=m { for j in 1..=n { // Calculate score for match/mismatch - let match_or_mismatch = score[i - 1][j - 1] + score_function( - query_upper[i - 1], - subject_upper[j - 1], - match_score, - mismatch_score, - gap_score, - ); - + let match_or_mismatch = score[i - 1][j - 1] + + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ); + // Calculate score for deletion (gap in subject) let delete = score[i - 1][j] + gap_score; - + // Calculate score for insertion (gap in query) let insert = score[i][j - 1] + gap_score; - + // Take maximum of all options, but never go below 0 (local alignment) score[i][j] = max(0, max(match_or_mismatch, max(delete, insert))); } } - + score } @@ -164,7 +165,7 @@ pub fn smith_waterman( /// /// ``` /// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback}; -/// +/// /// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); /// let alignment = traceback(&score_matrix, "ACAC", "CA", 1, -1, -2); /// assert_eq!(alignment, "CA\nCA"); @@ -179,11 +180,11 @@ pub fn traceback( ) -> String { let query_upper: Vec = query.to_uppercase().chars().collect(); let subject_upper: Vec = subject.to_uppercase().chars().collect(); - + // Find the cell with maximum score let mut max_value = 0; let (mut i_max, mut j_max) = (0, 0); - + for (i, row) in score.iter().enumerate() { for (j, &value) in row.iter().enumerate() { if value > max_value { @@ -193,40 +194,43 @@ pub fn traceback( } } } - + // If no significant alignment found, return empty string if max_value <= 0 { return String::new(); } - + // Traceback from the maximum scoring cell let (mut i, mut j) = (i_max, j_max); let mut align1 = Vec::new(); let mut align2 = Vec::new(); - + // Continue tracing back until we hit a cell with score 0 while i > 0 && j > 0 && score[i][j] > 0 { let current_score = score[i][j]; - + // Check if we came from diagonal (match/mismatch) - if current_score == score[i - 1][j - 1] + score_function( - query_upper[i - 1], - subject_upper[j - 1], - match_score, - mismatch_score, - gap_score, - ) { + if current_score + == score[i - 1][j - 1] + + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ) + { align1.push(query_upper[i - 1]); align2.push(subject_upper[j - 1]); i -= 1; j -= 1; - } + } // Check if we came from above (deletion/gap in subject) else if current_score == score[i - 1][j] + gap_score { align1.push(query_upper[i - 1]); align2.push('-'); i -= 1; - } + } // Otherwise we came from left (insertion/gap in query) else { align1.push('-'); @@ -234,11 +238,11 @@ pub fn traceback( j -= 1; } } - + // Reverse the sequences (we built them backwards) align1.reverse(); align2.reverse(); - + format!( "{}\n{}", align1.into_iter().collect::(), @@ -374,7 +378,7 @@ mod tests { let result1 = smith_waterman("acac", "CA", 1, -1, -2); let result2 = smith_waterman("ACAC", "ca", 1, -1, -2); let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2); - + assert_eq!(result1, result2); assert_eq!(result2, result3); } @@ -387,16 +391,23 @@ mod tests { let match_score = 2; let mismatch_score = -2; let gap_score = -3; - + // Generate score matrix with custom parameters let score_matrix = smith_waterman(query, subject, match_score, mismatch_score, gap_score); - + // Traceback using the same custom parameters - let alignment = traceback(&score_matrix, query, subject, match_score, mismatch_score, gap_score); - + let alignment = traceback( + &score_matrix, + query, + subject, + match_score, + mismatch_score, + gap_score, + ); + // With perfect match and match_score=2, we expect alignment "ACGT\nACGT" assert_eq!(alignment, "ACGT\nACGT"); - + // Verify the score is correct (4 matches × 2 = 8) assert_eq!(score_matrix[4][4], 8); } @@ -408,7 +419,7 @@ mod tests { let subject = "A"; let score_matrix = smith_waterman(query, subject, 1, -1, -2); let alignment = traceback(&score_matrix, query, subject, 1, -1, -2); - + // Should find the alignment even though it's near the boundary assert_eq!(alignment, "A\nA"); assert_eq!(score_matrix[1][1], 1);