Merge pull request #5 from PyEED/mmseqs

NiklasAbraham · web-flow · commit 2b0d3af14dde · 2025-01-16T14:08:54.000+01:00
Mmseqs
diff --git a/mmseqs2/Dockerfile b/mmseqs2/Dockerfile
@@ -10,10 +10,13 @@ RUN apt-get update && apt-get install -y python3 python3-pip
 RUN pip3 install fastapi uvicorn
 
 # Copy the FastAPI app to the container
-COPY app.py app.py
+COPY app.py /usr/local/bin/app.py
+
+# Set the working directory
+WORKDIR /usr/local/bin
 
 # Expose the port on which FastAPI will run
-EXPOSE 8000
+EXPOSE 8001
 
 # Start the FastAPI server when the container starts
 CMD ["python3", "app.py"]
diff --git a/mmseqs2/app.py b/mmseqs2/app.py
@@ -1,133 +1,128 @@
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
+from fastapi import FastAPI, HTTPException, Request
+from starlette.responses import FileResponse
+import logging
+
 import subprocess
 import os
-from uuid import uuid4
 import shutil
 
 app = FastAPI()
-
-# Define a model for the input parameters
-class MMSeqsParams(BaseModel):
-    query: str  # The query sequence
-    database: str
-    output: str  # The output directory
-    sensitivity: float = 7.5  # Sensitivity parameter for mmseqs2
-    threads: int = 4  # Number of threads to use
-    blast_format: bool = True  # Option to convert to BLAST+ format
-
-# Dictionary to keep track of running jobs and results
-job_results = {}
-
-def create_fastas_file_from_seq(seq, filename):
-    with open(filename, 'w') as file:
-        file.write(f">seq\n{seq}\n")
-
-def create_queryDB_from_seq(filename):
-    # this will create a db from a single sequence file
-    # the command is mmseqs createdb <input> <output>
-    # the output should be a file with the same name as the input but with the extension .db
-
-    command = [
-        "mmseqs", "createdb",
-        filename,
-        filename.replace('fasta', '') + ".db"
-    ]
-
-    try:
-        subprocess.run(command, check=True)
-    
-    except subprocess.CalledProcessError as e:
-        raise HTTPException(status_code=600, detail=str(e))
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+logger.info("FastAPI server is running...")
+
+
+def create_fastas_file_from_seq(query_string, filename):
+    """
+    Creates a FASTA file from a single string containing FASTA-formatted sequences.
+
+    Args:
+        query_string (str): String containing FASTA-formatted sequences.
+        filename (str): Path to the output FASTA file.
+
+    Raises:
+        ValueError: If any sequence contains invalid characters.
+    """
+    def validate_sequence(sequence: str) -> bool:
+        """Validate that a sequence contains only valid amino acid characters."""
+        valid_chars = set("ACDEFGHIKLMNPQRSTVWY*X")  # Allow amino acids + stop codon (*), unknown (X)
+        sequence = sequence.upper().strip().replace("\n", "")  # Remove whitespace and newlines
+        return all(char in valid_chars for char in sequence)
+
+    # Split query string into lines
+    lines = query_string.strip().split("\n")
+
+    # Parse headers and sequences
+    multifasta = []
+    current_header = None
+    current_sequence = []
+
+    for line in lines:
+        if line.startswith(">"):  # Header line
+            if current_header:  # Save the previous sequence
+                sequence = "".join(current_sequence)
+                if not validate_sequence(sequence):
+                    raise ValueError(f"Invalid characters in sequence under {current_header}")
+                multifasta.append(f"{current_header}\n{sequence}")
+            current_header = line.strip()  # Update header
+            current_sequence = []  # Reset sequence buffer
+        else:  # Sequence line
+            current_sequence.append(line.strip())
+
+    # Add the last sequence
+    if current_header and current_sequence:
+        sequence = "".join(current_sequence)
+        if not validate_sequence(sequence):
+            raise ValueError(f"Invalid characters in sequence under {current_header}")
+        multifasta.append(f"{current_header}\n{sequence}")
+
+    # Write to file
+    with open(filename, 'w', encoding='utf-8') as f:
+        f.write("\n".join(multifasta) + "\n")  # Ensure newline at end of file
+
+    print(f"FASTA file created: {filename}")
     
-
 @app.get("/")
 async def read_root():
     return {"message": "Welcome to the MMSeqs2 API!"}
 
-@app.post("/run_mmseqs")
-async def run_mmseqs(params: MMSeqsParams):
-    # Create a unique job id
-    job_id = str(uuid4())
-    output_dir = f"/tmp/{job_id}"
+@app.get("/help")
+def help():
+    try: 
+        results = subprocess.run(
+            ["mmseqs", "-h"],
+            capture_output=True,
+            text=True,
+        )
+        return {"help": results.stdout}
+    except subprocess.CalledProcessError as e:
+        raise HTTPException(status_code=400, detail=f"Command failed {e.stderr}")
 
-    # Prepare the output directory
-    os.makedirs(output_dir, exist_ok=True)
+@app.post("/easycluster")
+async def easycluster(request: Request):
+    data = await request.json()
+    logger.info(f"Received request data: {data}")
+    
+    BASE_DIR = "/app"
+    query_filename = os.path.join(BASE_DIR, "in.fasta")
+    result_filename = os.path.join(BASE_DIR, "output")
+    tmp_dir = os.path.join(BASE_DIR, "tmp")
 
-    # Prepare paths
-    result_m8_path = os.path.join(output_dir, "result.m8")
-    result_tsv_path = os.path.join(output_dir, "result.tsv")
+    os.makedirs(tmp_dir, exist_ok=True)
+    open(result_filename, 'w').close()  # Clear or create result file
 
-    # Create the FASTA file
-    path_query = os.path.join(output_dir, "query.fasta")
-    path_queryDB = path_query.replace('fasta', '') + ".db"
-    create_fastas_file_from_seq(params.query, path_query)
-    create_queryDB_from_seq(path_query)
+    # Create the FASTA file from the query string
+    create_fastas_file_from_seq(data['query'], query_filename)
 
-    # Run the mmseqs2 search command
+    # Run the mmseqs2 command
     command = [
-        "mmseqs", "search", 
-        path_queryDB, 
-        params.database, 
-        os.path.join(output_dir, "result"), 
-        output_dir, 
-        "--threads", str(params.threads), 
-        "--sensitivity", str(params.sensitivity)
-    ]
-
+        "mmseqs", 
+        "easy-cluster", 
+        query_filename, 
+        result_filename, 
+        '--min-seq-id', str(data['min_seq_id']),
+        '-c', str(data['coverage']),
+        '--cov-mode', str(data['cov_mode']),
+        tmp_dir]
+    logger.info(f"Running command: {' '.join(command)}")
+    
     try:
-        # Execute mmseqs search
-        subprocess.run(command, check=True)
-
-        # Convert the results to BLAST+ format if requested
-        if params.blast_format:
-            # mmseqs convertalis queryDB targetDB resultDB resultDB.m8
-            # Convert to BLAST tabular format (BLAST m8 format)
-            convert_command = [
-                "mmseqs", "convertalis", 
-                params.query, 
-                params.database, 
-                os.path.join(output_dir, "result"), 
-                result_m8_path, 
-            ]
-            subprocess.run(convert_command, check=True)
-            
-            # Store the result path for m8 format
-            job_results[job_id] = {
-                "status": "completed",
-                "result_path": result_m8_path
-            }
-        else:
-            # Store the result path for standard mmseqs2 output (TSV format)
-            job_results[job_id] = {
-                "status": "completed",
-                "result_path": result_tsv_path
-            }
-
-        return {"job_id": job_id}
-    except subprocess.CalledProcessError as e:
-        raise HTTPException(status_code=500, detail=f"mmseqs2 failed: {str(e)}")
-
-@app.get("/results/{job_id}")
-async def get_results(job_id: str):
-    # Check if the job exists
-    if job_id not in job_results:
-        raise HTTPException(status_code=404, detail="Job not found")
+        result = subprocess.run(command, capture_output=True, text=True, check=True)
+        logger.info(f"Command output: {result.stdout}")
 
-    # Get the result path
-    result = job_results[job_id]
-
-    # Read and return the result (assuming it's a text file you want to read and return)
-    result_file = result["result_path"]
-    if os.path.exists(result_file):
-        with open(result_file, "r") as file:
-            data = file.read()
-        return {"status": result["status"], "results": data}
-    else:
-        raise HTTPException(status_code=404, detail="Result file not found")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Command failed with return code {e.returncode}")
+        logger.error(f"STDOUT: {e.stdout}")
+        logger.error(f"STDERR: {e.stderr}")
+        raise HTTPException(status_code=500, detail=f"Command failed: {e.stderr}")
 
+    with open("/app/output_all_seqs.fasta", 'r') as file:
+        logger.info(f"Reading result file: /app/output_all_seqs.fasta")
+        result = file.read()
+        
+    return result
 
 if __name__ == '__main__':
     import uvicorn
     
-    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
+    uvicorn.run("app:app", host="0.0.0.0", port=8001, reload=True)
diff --git a/mmseqs2/reload_development.sh b/mmseqs2/reload_development.sh
@@ -1,4 +1,4 @@
-sudo docker stop mmseq_docker
-sudo docker remove mmseq_docker
-sudo docker build --no-cache -t mmseq_docker .
-sudo docker run --name mmseq_docker --volume /mnt/databases:/app -p 8000:8000 mmseq_docker
+sudo docker stop mmseqs
+sudo docker remove mmseqs
+sudo docker build --no-cache -t mmseqs_docker .
+sudo docker run -d --name mmseqs -p 8001:8001 mmseqs_docker