Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions countNUM.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#Counting the unique target ID from result2.m8 & reclass_result.txt
from collections import defaultdict
import pdb

PATH1 = '/home/yakim/benchmark/result_mmseq.m8' # mmseq
PATH2 = '/home/yakim/benchmark/reclass_bit.m8' # after reclassification - bit score
PATH3 = '/home/yakim/benchmark/reclass_seqid.m8' # after reclassification - seq id
PATH4 = '/home/yakim/benchmark/reclass0922.m8' # after reclassification - C++ (seqid)


# MMseq2 result count
qandt = []
hit_num = 0
redund1 = []
with open(PATH1, 'r') as f:
for line in f:
a = line.split()
qandt.append([a[0],a[1]])

for q, t in qandt:
query = q.split('_')[-1] #query
qnum = q.split('_')[0]
if qnum not in redund1:
redund1.append(qnum)
if query == t:
hit_num += 1
print('Hit # of mmseqs result:', hit_num, '&& Total #:', len(redund1))


# Mmseq2 result count (count also for every top 10)
'''
hit_num1 = 0
grouped = defaultdict(list)

with open(PATH1, 'r') as f:
for line in f:
a = line.split()
query_full = a[0] # query705_from_UniRef90_A0AA46S9G1
target = a[1] # A0AA46S9G1
seq_identity = round(float(a[2]), 3)
query_num = query_full.split('_')[0] # query705
query_last_id = query_full.split('_')[-1] # A0AA46S9G1

# query 번호별로 그룹핑
grouped[query_num].append({
'query_last_id': query_last_id,
'target': target,
'seq_identity': seq_identity
}) # row[0] row[1] row[2]
#{query1: (A0AA46S9G1, A0AA46S9R1, 0.9), (A0AA46S9G1, A0AA46S9G1, 0.9), (A0AA46S9G1, A0AA34S9T1, 0.85)}

for query_num, rows in grouped.items():
if not rows:
continue
top_seq_identity = rows[0]['seq_identity'] # 0.9
top_query_last_id = rows[0]['query_last_id'] # A0AA46S9G1

count = sum(
(row['query_last_id'] == row['target']) and
(row['seq_identity'] == top_seq_identity)
for row in rows
)
#print(f'{query_num}: {count}') # 각 그룹별로 개수 출력
hit_num1 += count

print('Hit # of mmseqs result(top 10):', hit_num1, '&& Total #:', len(redund))
'''

# After reclassification result (bit score)
qandt2 = []
hit_num2 = 0
redund2 = []
with open(PATH2, 'r') as f:
for line in f:
a = line.split()
qandt2.append([a[0],a[1]])

for q, t in qandt2:
query = q.split('_')[-1] #query
qnum = q.split('_')[0]
if qnum not in redund2:
redund2.append(qnum)
if query == t:
hit_num2 += 1
print('Hit # of reclassification result(bit score):', hit_num2, '&& Total #:', len(redund2))


# After reclassification result (seq id)
qandt3 = []
hit_num3 = 0
redund3 = []
with open(PATH3, 'r') as f:
for line in f:
a = line.split()
qandt3.append([a[0],a[1]])

for q, t in qandt3:
query = q.split('_')[-1] #query
qnum = q.split('_')[0]
if qnum not in redund3:
redund3.append(qnum)
if query == t:
hit_num3 += 1
print('Hit # of reclassification result(seqid):', hit_num3, '&& Total #:', len(redund3))

# after reclass result count (C++)
qandt4 = []
hit_num4 = 0
redund4 = []
with open(PATH4, 'r') as f:
for line in f:
a = line.split()
qandt4.append([a[0],a[1]])

for q, t in qandt4:
query = q.split('_')[-1] #query
qnum = q.split('_')[0]
if qnum not in redund4:
redund4.append(qnum)
if query == t:
hit_num4 += 1
print('Hit # of reclassification result(C++):', hit_num4, '&& Total #:', len(redund4))

2 changes: 2 additions & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int recoverlongestorf(int argc, const char **argv, const Command& command);
extern int emreclassify(int argc, const char **argv, const Command& command);
extern int emabundance(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
extern int result2msa(int argc, const char **argv, const Command& command);
extern int result2dnamsa(int argc, const char **argv, const Command& command);
Expand Down
20 changes: 20 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1072,6 +1072,26 @@ std::vector<Command> baseCommands = {
"<i:resultbDB> <o:resultDB>",
CITATION_MMSEQS2, {{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
{"resultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::resultDb }}},
{"reclassify", emreclassify, &par.reclassify, COMMAND_RESULT | COMMAND_FORMAT_CONVERSION,
"Reclassify alignments and write a new alignment DB with posterior probabilities(convertalis will convert it into column 3(instead of seqId))",
"mmseqs reclassify queryDB targetDB alignmentDB newDB\n",
"Yaeji Kim",
"<i:queryDB> <i:targetDB> <i:alignmentDB> <o:newDB>",
CITATION_MMSEQS2, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"newDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
{"abundance", emabundance, &par.abundance, COMMAND_RESULT | COMMAND_FORMAT_CONVERSION,
"Summarize abundance from reclassified alignment DB",
"mmseqs abundance queryDB targetDB newDB abundance.tsv\n"
"# targetDB_mapping and targetDB_taxonomy are only required with --taxonomy 1\n"
"mmseqs abundance queryDB targetDB newDB abundance.report --taxonomy 1\n",
"Yaeji Kim",
"<i:queryDB> <i:targetDB> <i:alignmentDB> <o:abundanceFile>",
CITATION_MMSEQS2 | CITATION_TAXONOMY, {{"queryDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::sequenceDb },
{"targetDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::NEED_HEADER, &DbValidator::taxSequenceDb },
{"alignmentDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
{"abundanceFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile }}},
{"summarizealis", summarizealis, &par.threadsandcompression, COMMAND_RESULT,
"Summarize alignment result to one row (uniq. cov., cov., avg. seq. id.)",
NULL,
Expand Down
40 changes: 40 additions & 0 deletions src/commons/Parameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,14 @@ Parameters::Parameters():
// unpackdb
PARAM_UNPACK_SUFFIX(PARAM_UNPACK_SUFFIX_ID, "--unpack-suffix", "Unpack suffix", "File suffix for unpacked files.\nAdd .gz suffix to write compressed files.", typeid(std::string), (void *) &unpackSuffix, "^.*$"),
PARAM_UNPACK_NAME_MODE(PARAM_UNPACK_NAME_MODE_ID, "--unpack-name-mode", "Unpack name mode", "Name unpacked files by 0: DB key, 1: accession (through .lookup)", typeid(int), (void *) &unpackNameMode, "^[0-1]{1}$"),
// reclassify
PARAM_RECLASSIFY_LAMBDA(PARAM_RECLASSIFY_LAMBDA_ID, "--lambda", "Reclassify lambda", "Lambda scaling factor for the reclassification bit score term", typeid(double), (void *) &reclassifyLambda, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
PARAM_RECLASSIFY_ALPHA(PARAM_RECLASSIFY_ALPHA_ID, "--alpha", "Reclassify alpha", "Exponent applied to abundance during reclassification", typeid(double), (void *) &reclassifyAlpha, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
PARAM_RECLASSIFY_GAMMA(PARAM_RECLASSIFY_GAMMA_ID, "--gamma", "Reclassify gamma", "Exponent applied to coverage confidence during reclassification", typeid(double), (void *) &reclassifyGamma, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
PARAM_RECLASSIFY_MAX_ITER(PARAM_RECLASSIFY_MAX_ITER_ID, "--max-iter", "Reclassify max iterations", "Maximum number of SQUAREM iterations for reclassification", typeid(int), (void *) &reclassifyMaxIterations, "^[1-9]{1}[0-9]*$"),
PARAM_RECLASSIFY_TOL(PARAM_RECLASSIFY_TOL_ID, "--tol", "Reclassify tolerance", "Convergence tolerance for reclassification", typeid(double), (void *) &reclassifyTolerance, "^([-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?)|([0-9]*(\\.[0-9]+)?)$"),
PARAM_RECLASSIFY_TAXONOMY(PARAM_RECLASSIFY_TAXONOMY_ID, "--taxonomy", "Abundance taxonomy output", "0: write alignment and protein abundance only; taxonomy files are not required. 1: also write taxonomy_abundance.tsv and taxonomic columns in protein_abundance.tsv; requires targetDB_mapping and targetDB_taxonomy", typeid(int), (void *) &reclassifyTaxonomy, "^[0-1]{1}$"),
PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE(PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE_ID, "--drop-percentage", "Max drop percentage", "Maximum percentage of cumulative low-tail abundance mass that the automatic jump-based filter may drop (range 0.0-100.0, default 10.0)", typeid(double), (void *) &reclassifyMaxDropPercentage, "^100(\\.0+)?$|^([0-9]|[1-9][0-9])(\\.[0-9]+)?$"),
// for modules that should handle -h themselves
PARAM_HELP(PARAM_HELP_ID, "-h", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN),
PARAM_HELP_LONG(PARAM_HELP_LONG_ID, "--help", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN)
Expand Down Expand Up @@ -335,6 +343,29 @@ Parameters::Parameters():
threadsandcompression.push_back(&PARAM_COMPRESSED);
threadsandcompression.push_back(&PARAM_V);

// reclassify
reclassify.push_back(&PARAM_RECLASSIFY_LAMBDA);
reclassify.push_back(&PARAM_RECLASSIFY_ALPHA);
reclassify.push_back(&PARAM_RECLASSIFY_GAMMA);
reclassify.push_back(&PARAM_RECLASSIFY_MAX_ITER);
reclassify.push_back(&PARAM_RECLASSIFY_TOL);
reclassify.push_back(&PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE);
reclassify.push_back(&PARAM_THREADS);
reclassify.push_back(&PARAM_COMPRESSED);
reclassify.push_back(&PARAM_V);

// abundance
abundance.push_back(&PARAM_RECLASSIFY_LAMBDA);
abundance.push_back(&PARAM_RECLASSIFY_ALPHA);
abundance.push_back(&PARAM_RECLASSIFY_GAMMA);
abundance.push_back(&PARAM_RECLASSIFY_MAX_ITER);
abundance.push_back(&PARAM_RECLASSIFY_TOL);
abundance.push_back(&PARAM_RECLASSIFY_TAXONOMY);
abundance.push_back(&PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE);
abundance.push_back(&PARAM_THREADS);
abundance.push_back(&PARAM_COMPRESSED);
abundance.push_back(&PARAM_V);

// createclusearchdb
createclusearchdb.push_back(&PARAM_THREADS);
createclusearchdb.push_back(&PARAM_COMPRESSED);
Expand Down Expand Up @@ -2637,6 +2668,15 @@ void Parameters::setDefaults() {
unpackSuffix = "";
unpackNameMode = Parameters::UNPACK_NAME_ACCESSION;

// reclassify
reclassifyLambda = 1.0;
reclassifyAlpha = 1.0;
reclassifyGamma = 1.0;
reclassifyMaxIterations = 100;
reclassifyTolerance = 1e-5;
reclassifyTaxonomy = 0;
reclassifyMaxDropPercentage = 10.0;

lcaRanks = "";
showTaxLineage = 0;
// bin for all unclassified sequences
Expand Down
20 changes: 20 additions & 0 deletions src/commons/Parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,15 @@ class Parameters {
std::string unpackSuffix;
int unpackNameMode;

// reclassify
double reclassifyLambda;
double reclassifyAlpha;
double reclassifyGamma;
int reclassifyMaxIterations;
double reclassifyTolerance;
int reclassifyTaxonomy;
double reclassifyMaxDropPercentage;

// for modules that should handle -h themselves
bool help;

Expand Down Expand Up @@ -1081,6 +1090,15 @@ class Parameters {
PARAMETER(PARAM_UNPACK_SUFFIX)
PARAMETER(PARAM_UNPACK_NAME_MODE)

// reclassify
PARAMETER(PARAM_RECLASSIFY_LAMBDA)
PARAMETER(PARAM_RECLASSIFY_ALPHA)
PARAMETER(PARAM_RECLASSIFY_GAMMA)
PARAMETER(PARAM_RECLASSIFY_MAX_ITER)
PARAMETER(PARAM_RECLASSIFY_TOL)
PARAMETER(PARAM_RECLASSIFY_TAXONOMY)
PARAMETER(PARAM_RECLASSIFY_MAX_DROP_PERCENTAGE)

// for modules that should handle -h themselves
PARAMETER(PARAM_HELP)
PARAMETER(PARAM_HELP_LONG)
Expand Down Expand Up @@ -1207,6 +1225,8 @@ class Parameters {
std::vector<MMseqsParameter*> touchdb;
std::vector<MMseqsParameter*> gpuserver;
std::vector<MMseqsParameter*> tsv2exprofiledb;
std::vector<MMseqsParameter*> reclassify;
std::vector<MMseqsParameter*> abundance;

std::vector<MMseqsParameter*> combineList(const std::vector<MMseqsParameter*> &par1,
const std::vector<MMseqsParameter*> &par2);
Expand Down
2 changes: 2 additions & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ set(util_source_files
util/profile2pssm.cpp
util/profile2neff.cpp
util/profile2seq.cpp
util/EM_reclassify.cpp
util/EM_abundnace.cpp
util/recoverlongestorf.cpp
util/result2dnamsa.cpp
util/result2flat.cpp
Expand Down
Loading