molchemaxon2pdb.py

#!/usr/bin/env python3
# author          : Aleksandra Nikonenko
# date            : 17.05.2021
# license         : BSD-3
# ==============================================================================

__author__ = 'aleksandra'

import argparse
from functools import partial
import os
import subprocess
import sys
from io import BytesIO
from multiprocessing import Pool, cpu_count

from rdkit import Chem
from rdkit.Chem import AllChem

import read_input

cmd_run = "cxcalc majormicrospecies -H {pH} -f {out_format} -M -K '{fname}'"


def gen_conf(mol, useRandomCoords, randomSeed):
    params = AllChem.ETKDGv3()
    params.useRandomCoords = useRandomCoords
    params.randomSeed = randomSeed
    conf_stat = AllChem.EmbedMolecule(mol, params)
    return mol, conf_stat


def convert2pdb(mol_iter, outpath, preserve_coord, seed, out_format):
    m, name = mol_iter
    if not m:
        sys.stderr.write('Incorrect molecule: {0}\t{1}\n'.format(Chem.MolToSmiles(m), name))
        sys.stderr.flush()
        return None

    m = Chem.AddHs(m, addCoords=True)
    if not preserve_coord:
        m, conf_stat = gen_conf(m, useRandomCoords=False, randomSeed=seed)
        if conf_stat == -1:
            # if molecule is big enough and rdkit cannot generate a conformation - use params.useRandomCoords = True
            m, conf_stat = gen_conf(m, useRandomCoords=True, randomSeed=seed)
            if conf_stat == -1:
                sys.stderr.write(
                    'Could not get conformation of the molecule: {0}\t{1}\n'.format(Chem.MolToSmiles(m), name))
                sys.stderr.flush()
                return None
        AllChem.UFFOptimizeMolecule(m, maxIters=100)

    if out_format == 'pdb':
        pdb = Chem.MolToPDBBlock(m)
        with open(os.path.join(outpath, name + '.pdb'), 'w') as data:
            data.write(pdb)
    if out_format == 'sdf':
        w = Chem.SDWriter(os.path.join(outpath, name + '.sdf'))
        w.write(m)
        w.close()


def iter_molname(fname, field):
    for _, name in read_input.read_input(fname,
                                         input_format=None,
                                         id_field_name=field,
                                         sanitize=False):
        yield name


def calc(fname, outpath, ncpu, field, pH, preserve_coord, save_sdf, no_protonation, seed, out_format):
    if outpath and not os.path.exists(outpath):
        os.mkdir(outpath)

    if no_protonation:
        mol_gener = read_input.read_input(fname,
                                          input_format=None,
                                          id_field_name=field,
                                          sanitize=True)
    else:
        sdf_pHprotonate = subprocess.check_output(cmd_run.format(fname=fname, out_format='sdf', pH=pH), shell=True)
        if save_sdf is not None:
            with open(save_sdf, 'w') as out_sdf:
                out_sdf.write(sdf_pHprotonate.decode())
        mol_gener = zip(Chem.ForwardSDMolSupplier(BytesIO(sdf_pHprotonate), sanitize=True), iter_molname(fname, field))

    if ncpu > 1:
        p = Pool(max(1, min(ncpu, cpu_count())))
        p.map(partial(convert2pdb, outpath=outpath, preserve_coord=preserve_coord, seed=seed,
                      out_format=out_format), mol_gener)
    else:
        list(map(partial(convert2pdb, outpath=outpath, preserve_coord=preserve_coord, seed=seed,
                         out_format=out_format), mol_gener))


def main():
    parser = argparse.ArgumentParser(description='''Conversion of input file to separate pdb files. 
                                                 Conformer generation is performed by RDKit.
                                                 Major tautomeric form at the given pH is generated by ChemAxon.''')
    parser.add_argument('-i', '--input', metavar='FILENAME', required=True,
                        help='input file with compounds. Supported formats: SMILES (*.smi) or SDF (*.sdf). '
                             'Note: For SDF format always use the neutral form of molecules.')
    parser.add_argument('-f', '--field_name', metavar='STRING', required=False, default=None,
                        help='Is used only if input format is *.sdf.'
                             ' Name of the field containing molecule name, if None molecule title will be taken.')
    parser.add_argument('-o', '--output', metavar='DIRNAME', required=False, default=None,
                        help='output path. If None save to current directory.')
    parser.add_argument('--pH', metavar='FLOAT', required=False, default=7.4,
                        help='To get major tautomer at this pH value.')
    parser.add_argument('--save_sdf', metavar='FILENAME', default=None,
                        help='Output filename to save major tautomer at given pH in SDF format. '
                             'Saving without hydrogens.')
    parser.add_argument('--preserve_coord', action='store_true', default=False,
                        help='If set current coordinates will be saved. Is used only for input SDF format.')
    parser.add_argument('-n', '--ncpu', metavar='INTEGER', required=False, default=1, type=int,
                        help='number of CPUs to use for computation.')
    parser.add_argument('--no_protonation', action='store_true', default=False,
                        help="if set ChemAxon protonation won't be used.")
    parser.add_argument('--seed', metavar='INTEGER', required=False, type=int, default=0,
                        help='seed to make results reproducible.')
    parser.add_argument('--out_format', metavar='SDF or PDB', default='pdb', type=lambda x: x.lower(),
                        help="Choose one of the output formats - SDF or PDB (case insensitive).")
    args = parser.parse_args()

    if args.preserve_coord and args.input.split('.')[-1].lower() != 'sdf':
        raise ValueError('Please use --preserve_coord argument only for input SDF format.')

    calc(args.input, args.output if args.output is not None else os.getcwd(), args.ncpu, args.field_name, args.pH,
         args.preserve_coord, args.save_sdf, args.no_protonation, seed=args.seed, out_format=args.out_format)


if __name__ == '__main__':
    main()