soedinglab · rosekim2001-dev · Mar 24, 2026 · Mar 25, 2026 · Mar 25, 2026 · Mar 26, 2026
diff --git a/countNUM.py b/countNUM.py
@@ -0,0 +1,123 @@
+#Counting the unique target ID from result2.m8 & reclass_result.txt
+from collections import defaultdict
+import pdb
+
+PATH1 = '/home/yakim/benchmark/result_mmseq.m8'    # mmseq
+PATH2 = '/home/yakim/benchmark/reclass_bit.m8'     # after reclassification - bit score
+PATH3 = '/home/yakim/benchmark/reclass_seqid.m8'  # after reclassification - seq id
+PATH4 = '/home/yakim/benchmark/reclass0922.m8'  # after reclassification - C++ (seqid)
+
+
+# MMseq2 result count
+qandt = []
+hit_num = 0
+redund1 = []
+with open(PATH1, 'r') as f:
+    for line in f:
+        a = line.split()
+        qandt.append([a[0],a[1]])
+
+for q, t in qandt:
+    query = q.split('_')[-1]  #query
+    qnum = q.split('_')[0]
+    if qnum not in redund1:
+      redund1.append(qnum)
+      if query == t:
+          hit_num += 1
+print('Hit # of mmseqs result:', hit_num, '&& Total #:', len(redund1))
+
+
+# Mmseq2 result count (count also for every top 10)
+'''
+hit_num1 = 0
+grouped = defaultdict(list)
+
+with open(PATH1, 'r') as f:
+  for line in f:
+      a = line.split()
+      query_full = a[0]      # query705_from_UniRef90_A0AA46S9G1
+      target = a[1]          # A0AA46S9G1
+      seq_identity = round(float(a[2]), 3)
+      query_num = query_full.split('_')[0]      # query705
+      query_last_id = query_full.split('_')[-1] # A0AA46S9G1
+
+        # query 번호별로 그룹핑
+      grouped[query_num].append({
+            'query_last_id': query_last_id,
+            'target': target,
+            'seq_identity': seq_identity
+        })        # row[0]                       row[1]                          row[2]
+        #{query1: (A0AA46S9G1, A0AA46S9R1, 0.9), (A0AA46S9G1, A0AA46S9G1, 0.9), (A0AA46S9G1, A0AA34S9T1, 0.85)}
+
+for query_num, rows in grouped.items():
+    if not rows:
+        continue
+    top_seq_identity = rows[0]['seq_identity']    # 0.9
+    top_query_last_id = rows[0]['query_last_id']  # A0AA46S9G1
+
+    count = sum(
+        (row['query_last_id'] == row['target']) and
+        (row['seq_identity'] == top_seq_identity)
+        for row in rows
+    )
+    #print(f'{query_num}: {count}')   # 각 그룹별로 개수 출력
+    hit_num1 += count
+
+print('Hit # of mmseqs result(top 10):', hit_num1, '&& Total #:', len(redund))
+'''
+
+# After reclassification result (bit score)
+qandt2 = []
+hit_num2 = 0
+redund2 = []
+with open(PATH2, 'r') as f:
+    for line in f:
+        a = line.split()
+        qandt2.append([a[0],a[1]])
+
+for q, t in qandt2:
+    query = q.split('_')[-1]  #query
+    qnum = q.split('_')[0]
+    if qnum not in redund2:
+      redund2.append(qnum)
+      if query == t:
+          hit_num2 += 1
+print('Hit # of reclassification result(bit score):', hit_num2, '&& Total #:', len(redund2))
+
+
+# After reclassification result (seq id)
+qandt3 = []
+hit_num3 = 0
+redund3 = []
+with open(PATH3, 'r') as f:
+    for line in f:
+        a = line.split()
+        qandt3.append([a[0],a[1]])
+
+for q, t in qandt3:
+    query = q.split('_')[-1]  #query
+    qnum = q.split('_')[0]
+    if qnum not in redund3:
+      redund3.append(qnum)
+      if query == t:
+          hit_num3 += 1
+print('Hit # of reclassification result(seqid):', hit_num3, '&& Total #:', len(redund3))
+
+# after reclass result count (C++)
+qandt4 = []
+hit_num4 = 0
+redund4 = []
+with open(PATH4, 'r') as f:
+    for line in f:
+        a = line.split()
+        qandt4.append([a[0],a[1]])
+
+for q, t in qandt4:
+    query = q.split('_')[-1]  #query
+    qnum = q.split('_')[0]
+    if qnum not in redund4:
+      redund4.append(qnum)
+      if query == t:
+          hit_num4 += 1
+print('Hit # of reclassification result(C++):', hit_num4, '&& Total #:', len(redund4))
+
diff --git a/src/CommandDeclarations.h b/src/CommandDeclarations.h
@@ -101,6 +101,8 @@ extern int gappedprefilter(int argc, const char **argv, const Command& command);
 extern int unpackdb(int argc, const char **argv, const Command& command);
 extern int rbh(int argc, const char **argv, const Command& command);
 extern int recoverlongestorf(int argc, const char **argv, const Command& command);
+extern int emreclassify(int argc, const char **argv, const Command& command);
+extern int emabundance(int argc, const char **argv, const Command& command);
 extern int result2flat(int argc, const char **argv, const Command& command);
 extern int result2msa(int argc, const char **argv, const Command& command);
 extern int result2dnamsa(int argc, const char **argv, const Command& command);

diff --git a/src/MMseqsBase.cpp b/src/MMseqsBase.cpp
@@ -1072,6 +1072,26 @@ std::vector<Command> baseCommands = {
                 "<i:resultbDB> <o:resultDB>",
                 CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                           {"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
+        {"reclassify",           emreclassify,         &par.reclassify,           COMMAND_RESULT | COMMAND_FORMAT_CONVERSION,
+                "Reclassify alignments and write a new alignment DB with posterior probabilities(convertalis will convert it into column 3(instead of seqId))",
+                "mmseqs reclassify queryDB targetDB alignmentDB newDB\n",
+                "Yaeji Kim",
+                "<i:queryDB> <i:targetDB> <i:alignmentDB> <o:newDB>",
+                CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"newDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        {"abundance",            emabundance,          &par.abundance,            COMMAND_RESULT | COMMAND_FORMAT_CONVERSION,
+                "Summarize abundance from reclassified alignment DB",
+                "mmseqs abundance queryDB targetDB newDB abundance.tsv\n"
+                "# targetDB_mapping and targetDB_taxonomy are only required with --taxonomy 1\n"
+                "mmseqs abundance queryDB targetDB newDB abundance.report --taxonomy 1\n",
+                "Yaeji Kim",
+                "<i:queryDB> <i:targetDB> <i:alignmentDB> <o:abundanceFile>",
+                CITATION_MMSEQS2 | CITATION_TAXONOMY, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
+                                          {"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::taxSequenceDb },
+                                          {"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                          {"abundanceFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
         {"summarizealis",      summarizealis,      &par.threadsandcompression,     COMMAND_RESULT,
                 "Summarize alignment result to one row (uniq. cov., cov., avg. seq. id.)",
                 NULL,

diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp
@@ -308,6 +308,14 @@ Parameters::Parameters():
         // unpackdb
         PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"),
         PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"),
+        // reclassify
+        PARAM_RECLASSIFY_LAMBDA(PARAM_RECLASSIFY_LAMBDA_ID, "--lambda", "Reclassify lambda", "Lambda scaling factor for the reclassification bit score term", typeid(double), (void *) &reclassifyLambda, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
+        PARAM_RECLASSIFY_ALPHA(PARAM_RECLASSIFY_ALPHA_ID, "--alpha", "Reclassify alpha", "Exponent applied to abundance during reclassification", typeid(double), (void *) &reclassifyAlpha, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
+        PARAM_RECLASSIFY_GAMMA(PARAM_RECLASSIFY_GAMMA_ID, "--gamma", "Reclassify gamma", "Exponent applied to coverage confidence during reclassification", typeid(double), (void *) &reclassifyGamma, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
+        PARAM_RECLASSIFY_MAX_ITER(PARAM_RECLASSIFY_MAX_ITER_ID, "--max-iter", "Reclassify max iterations", "Maximum number of SQUAREM iterations for reclassification", typeid(int), (void *) &reclassifyMaxIterations, "^[1-9]{1}[0-9]*$"),
+        PARAM_RECLASSIFY_TOL(PARAM_RECLASSIFY_TOL_ID, "--tol", "Reclassify tolerance", "Convergence tolerance for reclassification", typeid(double), (void *) &reclassifyTolerance, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
+        PARAM_RECLASSIFY_TAXONOMY(PARAM_RECLASSIFY_TAXONOMY_ID, "--taxonomy", "Abundance taxonomy output", "0: write alignment and protein abundance only; taxonomy files are not required. 1: also write taxonomy_abundance.tsv and taxonomic columns in protein_abundance.tsv; requires targetDB_mapping and targetDB_taxonomy", typeid(int), (void *) &reclassifyTaxonomy, "^[0-1]{1}$"),
+        PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE(PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE_ID, "--drop-percentage", "Max drop percentage", "Maximum percentage of cumulative low-tail abundance mass that the automatic jump-based filter may drop (range 0.0-100.0, default 10.0)", typeid(double), (void *) &reclassifyMaxDropPercentage, "^100(\\.0+)?$|^([0-9]|[1-9][0-9])(\\.[0-9]+)?$"),
         // for modules that should handle -h themselves
         PARAM_HELP(PARAM_HELP_ID, "-h", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN),
         PARAM_HELP_LONG(PARAM_HELP_LONG_ID, "--help", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN)
@@ -335,6 +343,29 @@ Parameters::Parameters():
     threadsandcompression.push_back(&PARAM_COMPRESSED);
     threadsandcompression.push_back(&PARAM_V);
 
+    // reclassify
+    reclassify.push_back(&PARAM_RECLASSIFY_LAMBDA);
+    reclassify.push_back(&PARAM_RECLASSIFY_ALPHA);
+    reclassify.push_back(&PARAM_RECLASSIFY_GAMMA);
+    reclassify.push_back(&PARAM_RECLASSIFY_MAX_ITER);
+    reclassify.push_back(&PARAM_RECLASSIFY_TOL);
+    reclassify.push_back(&PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE);
+    reclassify.push_back(&PARAM_THREADS);
+    reclassify.push_back(&PARAM_COMPRESSED);
+    reclassify.push_back(&PARAM_V);
+
+    // abundance
+    abundance.push_back(&PARAM_RECLASSIFY_LAMBDA);
+    abundance.push_back(&PARAM_RECLASSIFY_ALPHA);
+    abundance.push_back(&PARAM_RECLASSIFY_GAMMA);
+    abundance.push_back(&PARAM_RECLASSIFY_MAX_ITER);
+    abundance.push_back(&PARAM_RECLASSIFY_TOL);
+    abundance.push_back(&PARAM_RECLASSIFY_TAXONOMY);
+    abundance.push_back(&PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE);
+    abundance.push_back(&PARAM_THREADS);
+    abundance.push_back(&PARAM_COMPRESSED);
+    abundance.push_back(&PARAM_V);
+
     // createclusearchdb
     createclusearchdb.push_back(&PARAM_THREADS);
     createclusearchdb.push_back(&PARAM_COMPRESSED);
@@ -2637,6 +2668,15 @@ void Parameters::setDefaults() {
     unpackSuffix = "";
     unpackNameMode = Parameters::UNPACK_NAME_ACCESSION;
 
+    // reclassify
+    reclassifyLambda = 1.0;
+    reclassifyAlpha = 1.0;
+    reclassifyGamma = 1.0;
+    reclassifyMaxIterations = 100;
+    reclassifyTolerance = 1e-5;
+    reclassifyTaxonomy = 0;
+    reclassifyMaxDropPercentage = 10.0;
+
     lcaRanks = "";
     showTaxLineage = 0;
     // bin for all unclassified sequences

diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h
@@ -724,6 +724,15 @@ class Parameters {
     std::string unpackSuffix;
     int unpackNameMode;
 
+    // reclassify
+    double reclassifyLambda;
+    double reclassifyAlpha;
+    double reclassifyGamma;
+    int reclassifyMaxIterations;
+    double reclassifyTolerance;
+    int reclassifyTaxonomy;
+    double reclassifyMaxDropPercentage;
+
     // for modules that should handle -h themselves
     bool help;
 
@@ -1081,6 +1090,15 @@ class Parameters {
     PARAMETER(PARAM_UNPACK_SUFFIX)
     PARAMETER(PARAM_UNPACK_NAME_MODE)
 
+    // reclassify
+    PARAMETER(PARAM_RECLASSIFY_LAMBDA)
+    PARAMETER(PARAM_RECLASSIFY_ALPHA)
+    PARAMETER(PARAM_RECLASSIFY_GAMMA)
+    PARAMETER(PARAM_RECLASSIFY_MAX_ITER)
+    PARAMETER(PARAM_RECLASSIFY_TOL)
+    PARAMETER(PARAM_RECLASSIFY_TAXONOMY)
+    PARAMETER(PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE)
+
     // for modules that should handle -h themselves
     PARAMETER(PARAM_HELP)
     PARAMETER(PARAM_HELP_LONG)
@@ -1207,6 +1225,8 @@ class Parameters {
     std::vector<MMseqsParameter*> touchdb;
     std::vector<MMseqsParameter*> gpuserver;
     std::vector<MMseqsParameter*> tsv2exprofiledb;
+    std::vector<MMseqsParameter*> reclassify;
+    std::vector<MMseqsParameter*> abundance;
 
     std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
                                              const std::vector<MMseqsParameter*> &par2);

diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
@@ -48,6 +48,8 @@ set(util_source_files
         util/profile2pssm.cpp
         util/profile2neff.cpp
         util/profile2seq.cpp
+        util/EM_reclassify.cpp
+        util/EM_abundnace.cpp
         util/recoverlongestorf.cpp
         util/result2dnamsa.cpp
         util/result2flat.cpp