-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
microbe command line script initialized
- Loading branch information
1 parent
6a851c7
commit 32c629b
Showing
1 changed file
with
125 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import os | ||
import multiprocessing | ||
import argparse | ||
import pandas as pd | ||
from microberx import MetabolitePredictor | ||
from rdkit import Chem | ||
from tqdm import tqdm | ||
import warnings | ||
|
||
# Install a blanket "ignore" filter: no category or module restriction is
# given, so every Python warning raised after this point is suppressed.
# NOTE(review): presumably meant to quiet noisy third-party imports; this
# also hides deprecation warnings from our own code - confirm it is intended.
warnings.filterwarnings("ignore")
|
||
""" | ||
MicrobeRX Predictions Script | ||
============================= | ||
This script allows users to perform predictions using the MicrobeRX library either with a single smiles string or a file containing multiple smiles strings. The predictions can be customized with various parameters such as the biosystem, cutoff value, number of processing cores, and output directory. | ||
Usage | ||
----- | ||
To run the script, use the following command format: | ||
```bash | ||
python microberx_cli.py --smiles <smiles_string> --query_name <query_name> --biosystem <biosystem> --cutoff <cutoff_value> --out <output_directory> | ||
python microberx_cli.py --file <input_file> --biosystem <biosystem> --cutoff <cutoff_value> --num_cores <num_cores> --out <output_directory> | ||
``` | ||
Arguments | ||
--smiles : str, optional | ||
Input smiles string for prediction. If provided, predictions will be made for this single molecule. | ||
--query_name : str, optional | ||
Query name for the provided smiles string. This name will be used in the output file name. | ||
--file : str, optional | ||
Input file containing multiple smiles strings and names. The file should be in tab-separated values (TSV) format with columns "name" and "smiles". | ||
--biosystem : str, optional | ||
Biosystem to use for predictions. Default is 'all'. | ||
--cutoff : float, optional | ||
Cutoff value for predictions. Default is 0.6. | ||
--num_cores : int, optional | ||
Number of cores to use for multiprocessing. Default is 4. | ||
--out : str, optional | ||
Output directory for predictions. Default is the current directory ('./'). | ||
Description | ||
The script can be run in two modes: | ||
- Single Prediction Mode: When the --smiles argument is provided, the script will perform predictions for the single molecule specified by the smiles string. | ||
- Batch Prediction Mode: When the --file argument is provided, the script will read the input file and perform predictions for each molecule listed in the file using multiprocessing. | ||
Examples | ||
Single Prediction Mode: | ||
python microberx_cli.py --smiles "CCO" --query_name "ethanol" --biosystem "all" --cutoff 0.6 --out "./predictions/" | ||
This command will perform predictions for ethanol and save the results in the specified output directory. | ||
Batch Prediction Mode: | ||
python microberx_cli.py --file "input_molecules.tsv" --biosystem "all" --cutoff 0.6 --num_cores 4 --out "./predictions/" | ||
This command will read the input_molecules.tsv file, perform predictions for each molecule using 4 cores, and save the results in the specified output directory. | ||
Output | ||
The predictions are saved as TSV files in the specified output directory. Each file is named after the query name provided or the names listed in the input file. | ||
""" | ||
|
||
def _predict_to_file(task):
    """Predict metabolites for one molecule and write them to a TSV file.

    This worker lives at module level on purpose: ``multiprocessing.Pool``
    pickles the callable it sends to worker processes, and a function nested
    inside ``main`` cannot be pickled.

    Parameters
    ----------
    task : tuple
        ``(name, smiles, biosystem, cutoff, out)`` - the molecule name, its
        SMILES string, the biosystem and cutoff forwarded to
        ``MetabolitePredictor``, and the output directory.

    Returns
    -------
    bool
        ``True`` when ``<out>/<name>.tsv`` was written, ``False`` when the
        molecule failed (the error is printed, not raised).
    """
    name, smiles, biosystem, cutoff, out = task
    try:
        query = Chem.MolFromSmiles(smiles)
        if query is None:
            # RDKit signals an unparseable SMILES by returning None.
            raise ValueError(f"invalid SMILES string {smiles!r}")

        predictor = MetabolitePredictor(
            query, query_name=name, biosystem=biosystem, cut_off=cutoff
        )
        predictor.run_prediction()

        output_file = os.path.join(out, f"{name}.tsv")
        predictor.predicted_metabolites.to_csv(output_file, sep='\t', index=False)
    except Exception as e:
        # Batch mode is best-effort: report the failing molecule and let the
        # remaining ones continue instead of aborting the whole pool.
        print(f"Error processing {name}: {e}")
        return False
    return True


def main(argv=None):
    """Command-line entry point for MicrobeRX predictions.

    Runs in single-molecule mode when ``--smiles`` is given, otherwise in
    batch mode when ``--file`` is given. If both are supplied, ``--smiles``
    takes precedence.

    Parameters
    ----------
    argv : list of str, optional
        Argument list to parse. ``None`` (the default) makes argparse read
        ``sys.argv[1:]``, which is the behavior when run as a script.

    Raises
    ------
    SystemExit
        Raised by argparse (exit status 2) on invalid usage: no input given,
        ``--num_cores`` below 1, an unparseable ``--smiles`` string, or an
        input file lacking the ``name``/``smiles`` columns.
    """
    parser = argparse.ArgumentParser(description='MicrobeRX Predictions')
    parser.add_argument('--smiles', type=str, default="", help='Input smiles string for prediction')
    parser.add_argument('--query_name', type=str, default="", help='Query name for the smiles string')
    parser.add_argument('--file', type=str, default="", help='Input file for predictions')
    parser.add_argument('--biosystem', type=str, default='all', help='Biosystem to use for predictions options: all, human, gutmicrobes')
    parser.add_argument('--cutoff', type=float, default=0.6, help='Cutoff for predictions')
    parser.add_argument('--num_cores', type=int, default=4, help='Number of cores to use for multiprocessing')
    parser.add_argument('--out', type=str, default='./', help='Output directory for predictions')

    args = parser.parse_args(argv)

    # Validate everything up front so usage errors surface before any
    # prediction work or file I/O starts.
    if not args.smiles and not args.file:
        parser.error("one of --smiles or --file is required")
    if args.num_cores < 1:
        parser.error("--num_cores must be at least 1")

    # Create the output directory so to_csv does not fail on a fresh path.
    # An empty --out means "current directory" and needs no creation.
    if args.out:
        os.makedirs(args.out, exist_ok=True)

    if args.smiles:
        query = Chem.MolFromSmiles(args.smiles)
        if query is None:
            parser.error(f"could not parse SMILES string {args.smiles!r}")

        predictor = MetabolitePredictor(
            query,
            query_name=args.query_name,
            biosystem=args.biosystem,
            cut_off=args.cutoff,
        )
        predictor.run_prediction()

        # Without a query name the file would be the hidden ".tsv"; fall back
        # to a visible default for the file name only.
        file_stem = args.query_name or "query"
        output_file = os.path.join(args.out, f"{file_stem}.tsv")
        predictor.predicted_metabolites.to_csv(output_file, sep='\t', index=False)
        print(f"Predictions saved to {output_file}")
        return

    mols = pd.read_csv(args.file, sep="\t")
    missing = [column for column in ("name", "smiles") if column not in mols.columns]
    if missing:
        parser.error(
            f"input file {args.file!r} is missing required column(s): {', '.join(missing)}"
        )

    # Plain tuples keep each task picklable for the worker processes.
    tasks = [
        (name, smiles, args.biosystem, args.cutoff, args.out)
        for name, smiles in zip(mols["name"], mols["smiles"])
    ]

    with multiprocessing.Pool(args.num_cores) as pool:
        # Drain the iterator so every task runs and tqdm can show progress.
        for _ in tqdm(pool.imap(_predict_to_file, tasks), total=len(tasks)):
            pass


# The guard keeps worker processes that re-import this module (spawn start
# method) from re-running the CLI.
if __name__ == '__main__':
    main()