Skip to content

Commit 2b0d3af

Browse files
Merge pull request #5 from PyEED/mmseqs
Mmseqs
2 parents f533666 + 2241ccb commit 2b0d3af

File tree

3 files changed

+115
-117
lines changed

3 files changed

+115
-117
lines changed

mmseqs2/Dockerfile

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@ RUN apt-get update && apt-get install -y python3 python3-pip
1010
RUN pip3 install fastapi uvicorn
1111

1212
# Copy the FastAPI app to the container
13-
COPY app.py app.py
13+
COPY app.py /usr/local/bin/app.py
14+
15+
# Set the working directory
16+
WORKDIR /usr/local/bin
1417

1518
# Expose the port on which FastAPI will run
16-
EXPOSE 8000
19+
EXPOSE 8001
1720

1821
# Start the FastAPI server when the container starts
1922
CMD ["python3", "app.py"]

mmseqs2/app.py

Lines changed: 106 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -1,133 +1,128 @@
1-
from fastapi import FastAPI, HTTPException
2-
from pydantic import BaseModel
1+
from fastapi import FastAPI, HTTPException, Request
2+
from starlette.responses import FileResponse
3+
import logging
4+
35
import subprocess
46
import os
5-
from uuid import uuid4
67
import shutil
78

89
app = FastAPI()
9-
10-
# Define a model for the input parameters
11-
class MMSeqsParams(BaseModel):
12-
query: str # The query sequence
13-
database: str
14-
output: str # The output directory
15-
sensitivity: float = 7.5 # Sensitivity parameter for mmseqs2
16-
threads: int = 4 # Number of threads to use
17-
blast_format: bool = True # Option to convert to BLAST+ format
18-
19-
# Dictionary to keep track of running jobs and results
20-
job_results = {}
21-
22-
def create_fastas_file_from_seq(seq, filename):
23-
with open(filename, 'w') as file:
24-
file.write(f">seq\n{seq}\n")
25-
26-
def create_queryDB_from_seq(filename):
27-
# this will create a db from a single sequence file
28-
# the command is mmseqs createdb <input> <output>
29-
# the output should be a file with the same name as the input but with the extension .db
30-
31-
command = [
32-
"mmseqs", "createdb",
33-
filename,
34-
filename.replace('fasta', '') + ".db"
35-
]
36-
37-
try:
38-
subprocess.run(command, check=True)
39-
40-
except subprocess.CalledProcessError as e:
41-
raise HTTPException(status_code=600, detail=str(e))
10+
logging.basicConfig(level=logging.INFO)
11+
logger = logging.getLogger(__name__)
12+
logger.info("FastAPI server is running...")
13+
14+
15+
def create_fastas_file_from_seq(query_string, filename):
    """Write the FASTA records contained in ``query_string`` to ``filename``.

    A record is a header line starting with ``>`` followed by one or more
    sequence lines. The sequence lines of a record are joined, so each record
    is written as exactly two lines (header, sequence). Blank lines and
    surrounding whitespace are ignored. The original letter case of the
    sequence is preserved in the output.

    Args:
        query_string (str): String containing FASTA-formatted sequences.
        filename (str): Path to the output FASTA file.

    Raises:
        ValueError: If a sequence contains characters outside the accepted
            amino-acid alphabet, if sequence data appears before the first
            header, if a header has no sequence, or if the input contains no
            records at all. Nothing is written to ``filename`` in these cases.
    """
    # Amino acids + stop codon (*) + unknown (X), accepted in either case.
    # Checking raw characters (instead of upper-casing the whole string first)
    # avoids characters such as "ß" slipping through because they upper-case
    # to several valid letters.
    alphabet = "ACDEFGHIKLMNPQRSTVWY*X"
    valid_chars = frozenset(alphabet + alphabet.lower())

    def finish_record(header, chunks):
        """Validate one record and return it as ``header\\nsequence``."""
        sequence = "".join(chunks)
        if not sequence:
            raise ValueError(f"No sequence found under {header}")
        if not valid_chars.issuperset(sequence):
            raise ValueError(f"Invalid characters in sequence under {header}")
        return f"{header}\n{sequence}"

    records = []
    current_header = None
    current_chunks = []

    for raw_line in query_string.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            # A new header closes the previous record, if there is one.
            if current_header is not None:
                records.append(finish_record(current_header, current_chunks))
            current_header = line
            current_chunks = []
        elif current_header is None:
            # Previously such lines were silently discarded.
            raise ValueError("Sequence data found before the first FASTA header")
        else:
            current_chunks.append(line)

    if current_header is None:
        raise ValueError("No FASTA records found in query string")

    # The last record goes through the same checks as every other record.
    records.append(finish_record(current_header, current_chunks))

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(records) + "\n")  # Ensure newline at end of file

    logging.getLogger(__name__).info("FASTA file created: %s", filename)
4264

43-
4465
@app.get("/")
async def read_root():
    """Return the static welcome payload served at the API root."""
    greeting = {"message": "Welcome to the MMSeqs2 API!"}
    return greeting
4768

48-
@app.post("/run_mmseqs")
49-
async def run_mmseqs(params: MMSeqsParams):
50-
# Create a unique job id
51-
job_id = str(uuid4())
52-
output_dir = f"/tmp/{job_id}"
69+
@app.get("/help")
def help():
    """Return the usage text printed by ``mmseqs -h``.

    NOTE: the function name shadows the ``help`` builtin inside this module;
    it is kept unchanged because it is the registered route handler.

    Returns:
        dict: ``{"help": <stdout of the command>}``.

    Raises:
        HTTPException: 400 if the command exits with a non-zero status,
            500 if the ``mmseqs`` executable cannot be found.
    """
    try:
        # check=True is required for CalledProcessError to ever be raised;
        # without it a failing command silently returned its (possibly empty)
        # stdout with a 200 response.
        results = subprocess.run(
            ["mmseqs", "-h"],
            capture_output=True,
            text=True,
            check=True,
        )
    except FileNotFoundError as e:
        raise HTTPException(status_code=500, detail="mmseqs executable not found") from e
    except subprocess.CalledProcessError as e:
        raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}") from e
    return {"help": results.stdout}
5380

54-
# Prepare the output directory
55-
os.makedirs(output_dir, exist_ok=True)
81+
@app.post("/easycluster")
async def easycluster(request: Request):
    """Cluster the submitted sequences with ``mmseqs easy-cluster``.

    The JSON request body must be an object providing:
        query: FASTA-formatted sequences as a single string.
        min_seq_id: value passed to ``--min-seq-id``.
        coverage: value passed to ``-c``.
        cov_mode: value passed to ``--cov-mode``.

    Returns:
        str: Contents of the ``<prefix>_all_seqs.fasta`` file written by mmseqs.

    Raises:
        HTTPException: 400 if the body is not a JSON object, a required field
            is missing, or the query is rejected by
            ``create_fastas_file_from_seq``; 500 if mmseqs fails or does not
            produce the expected result file.
    """
    # Function-scope import so this handler does not depend on a change to
    # the file-level import block.
    import tempfile

    try:
        data = await request.json()
    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(status_code=400, detail="Request body is not valid JSON") from e
    logger.info(f"Received request data: {data}")

    if not isinstance(data, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
    required_fields = ("query", "min_seq_id", "coverage", "cov_mode")
    missing = [field for field in required_fields if field not in data]
    if missing:
        raise HTTPException(
            status_code=400,
            detail=f"Missing required field(s): {', '.join(missing)}",
        )
    if not isinstance(data["query"], str):
        raise HTTPException(status_code=400, detail="Field 'query' must be a string")

    BASE_DIR = "/app"
    os.makedirs(BASE_DIR, exist_ok=True)

    # Every request works in its own directory, so parallel workers cannot
    # overwrite each other's input/result files, and everything is removed
    # again when the request is done.
    with tempfile.TemporaryDirectory(dir=BASE_DIR) as work_dir:
        query_filename = os.path.join(work_dir, "in.fasta")
        result_filename = os.path.join(work_dir, "output")  # mmseqs output prefix
        tmp_dir = os.path.join(work_dir, "tmp")
        os.makedirs(tmp_dir, exist_ok=True)

        # Create the FASTA file from the query string
        try:
            create_fastas_file_from_seq(data['query'], query_filename)
        except ValueError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e

        # Run the mmseqs2 command
        command = [
            "mmseqs",
            "easy-cluster",
            query_filename,
            result_filename,
            '--min-seq-id', str(data['min_seq_id']),
            '-c', str(data['coverage']),
            '--cov-mode', str(data['cov_mode']),
            tmp_dir,
        ]
        logger.info(f"Running command: {' '.join(command)}")

        # NOTE(review): this call blocks the event loop for the duration of
        # the clustering run, so requests in one process are served one at a
        # time. Left as-is; moving it to a thread pool would change how many
        # mmseqs processes can run in parallel.
        try:
            result = subprocess.run(command, capture_output=True, text=True, check=True)
        except subprocess.CalledProcessError as e:
            logger.error(f"Command failed with return code {e.returncode}")
            logger.error(f"STDOUT: {e.stdout}")
            logger.error(f"STDERR: {e.stderr}")
            raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}") from e
        logger.info(f"Command output: {result.stdout}")

        all_seqs_path = f"{result_filename}_all_seqs.fasta"
        logger.info(f"Reading result file: {all_seqs_path}")
        try:
            with open(all_seqs_path, 'r') as file:
                clustered = file.read()
        except FileNotFoundError as e:
            raise HTTPException(
                status_code=500,
                detail="mmseqs finished without producing the expected result file",
            ) from e

    return clustered
129124

130125
if __name__ == '__main__':
131126
import uvicorn
132127

133-
uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
128+
uvicorn.run("app:app", host="0.0.0.0", port=8001, reload=True)

mmseqs2/reload_development.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
sudo docker stop mmseq_docker
2-
sudo docker remove mmseq_docker
3-
sudo docker build --no-cache -t mmseq_docker .
4-
sudo docker run --name mmseq_docker --volume /mnt/databases:/app -p 8000:8000 mmseq_docker
1+
sudo docker stop mmseqs
2+
sudo docker remove mmseqs
3+
sudo docker build --no-cache -t mmseqs_docker .
4+
sudo docker run -d --name mmseqs -p 8001:8001 mmseqs_docker

0 commit comments

Comments
 (0)