Skip to content

Commit 3016f67

Browse files
Merge branch 'main' into dataset_similarity
2 parents cc7b9cc + cee9288 commit 3016f67

14 files changed

+1379
-143
lines changed

molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py

Lines changed: 168 additions & 74 deletions
Large diffs are not rendered by default.

molpipeline/mol2any/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from molpipeline.mol2any.mol2morgan_fingerprint import MolToMorganFP
99
from molpipeline.mol2any.mol2net_charge import MolToNetCharge
1010
from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP
11+
from molpipeline.mol2any.mol2pharmacophore2d_fingerprint import MolToPharmacophore2DFP
1112
from molpipeline.mol2any.mol2rdkit_phys_chem import MolToRDKitPhysChem
1213
from molpipeline.mol2any.mol2smiles import MolToSmiles
1314

@@ -21,6 +22,7 @@
2122
"MolToMACCSFP",
2223
"MolToMorganFP",
2324
"MolToNetCharge",
25+
"MolToPharmacophore2DFP",
2426
"MolToRDKitPhysChem",
2527
"MolToSmiles",
2628
]

molpipeline/mol2any/mol2maccs_key_fingerprint.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from rdkit.DataStructs import ExplicitBitVect
77

88
from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
9+
FPReturnAsOption,
910
MolToFingerprintPipelineElement,
1011
)
1112
from molpipeline.utils.molpipeline_types import RDKitMol
@@ -21,10 +22,44 @@ class MolToMACCSFP(MolToFingerprintPipelineElement):
2122
"""
2223

2324
_n_bits = 167 # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0)
24-
_feature_names = [f"maccs_{i}" for i in range(_n_bits)]
25+
26+
def __init__(
27+
self,
28+
return_as: FPReturnAsOption = "sparse",
29+
name: str = "MolToMACCSFP",
30+
n_jobs: int = 1,
31+
uuid: str | None = None,
32+
):
33+
"""Initialize MolToMACCSFP.
34+
35+
Parameters
36+
----------
37+
return_as : FPReturnAsOption, default="sparse"
38+
Type of output. When "sparse" the fingerprints will be returned as a
39+
scipy.sparse.csr_matrix holding a sparse representation of the bit vectors.
40+
With "dense" a numpy matrix will be returned. With "rdkit" the fingerprints
41+
will be returned as a list of one of RDKit's ExplicitBitVect,
42+
IntSparseBitVect, UIntSparseBitVect, etc. depending on the fingerprint
43+
and parameters.
44+
name : str, default="MolToMACCSFP"
45+
Name of PipelineElement.
46+
n_jobs : int, default=1
47+
Number of cores to use.
48+
uuid : str | None, optional
49+
UUID of the PipelineElement.
50+
51+
"""
52+
super().__init__( # pylint: disable=R0801
53+
return_as=return_as,
54+
name=name,
55+
n_jobs=n_jobs,
56+
uuid=uuid,
57+
)
58+
self._feature_names = [f"maccs_{i}" for i in range(self._n_bits)]
2559

2660
def pretransform_single(
27-
self, value: RDKitMol
61+
self,
62+
value: RDKitMol,
2863
) -> dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect:
2964
"""Transform a single molecule to MACCS key fingerprint.
3065
@@ -45,7 +80,7 @@ def pretransform_single(
4580
4681
"""
4782
fingerprint = MACCSkeys.GenMACCSKeys(value) # type: ignore[attr-defined]
48-
if self._return_as == "explicit_bit_vect":
83+
if self._return_as == "rdkit":
4984
return fingerprint
5085
if self._return_as == "dense":
5186
return np.array(fingerprint)

molpipeline/mol2any/mol2morgan_fingerprint.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from __future__ import annotations # for all the python 3.8 users out there.
44

5-
from typing import Any, Literal
5+
from typing import TYPE_CHECKING, Any
66

77
try:
88
from typing import Self # type: ignore[attr-defined]
@@ -15,8 +15,11 @@
1515

1616
from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
1717
ABCMorganFingerprintPipelineElement,
18+
FPReturnAsOption,
1819
)
19-
from molpipeline.utils.molpipeline_types import RDKitMol
20+
21+
if TYPE_CHECKING:
22+
from molpipeline.utils.molpipeline_types import RDKitMol
2023

2124

2225
class MolToMorganFP(ABCMorganFingerprintPipelineElement):
@@ -33,7 +36,7 @@ def __init__(
3336
use_features: bool = False,
3437
n_bits: int = 2048,
3538
counted: bool = False,
36-
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
39+
return_as: FPReturnAsOption = "sparse",
3740
name: str = "MolToMorganFP",
3841
n_jobs: int = 1,
3942
uuid: str | None = None,
@@ -52,12 +55,13 @@ def __init__(
5255
counted: bool, default=False
5356
If True, the fingerprint will be counted.
5457
If False, the fingerprint will be binary.
55-
return_as: Literal["sparse", "dense", "explicit_bit_vect"], default="sparse"
58+
return_as: FPReturnAsOption, default="sparse"
5659
Type of output. When "sparse" the fingerprints will be returned as a
5760
scipy.sparse.csr_matrix holding a sparse representation of the bit vectors.
58-
With "dense" a numpy matrix will be returned.
59-
With "explicit_bit_vect" the fingerprints will be returned as a list of
60-
RDKit's rdkit.DataStructs.cDataStructs.ExplicitBitVect.
61+
With "dense" a numpy matrix will be returned. With "rdkit" the fingerprints
62+
will be returned as a list of one of RDKit's ExplicitBitVect,
63+
IntSparseBitVect, UIntSparseBitVect, etc. depending on the fingerprint
64+
and parameters.
6165
name: str, default="MolToMorganFP"
6266
Name of PipelineElement
6367
n_jobs: int, default=1
@@ -88,7 +92,7 @@ def __init__(
8892
)
8993
if not isinstance(n_bits, int) or n_bits < 1:
9094
raise ValueError(
91-
f"Number of bits has to be a integer > 0! (Received: {n_bits})"
95+
f"Number of bits has to be a integer > 0! (Received: {n_bits})",
9296
)
9397
self._n_bits = n_bits
9498
self._feature_names = [f"morgan_{i}" for i in range(self._n_bits)]
@@ -105,6 +109,7 @@ def get_params(self, deep: bool = True) -> dict[str, Any]:
105109
-------
106110
dict[str, Any]
107111
Dictionary of parameters.
112+
108113
"""
109114
parameters = super().get_params(deep)
110115
if deep:
@@ -125,6 +130,7 @@ def set_params(self, **parameters: Any) -> Self:
125130
-------
126131
Self
127132
MolToMorganFP pipeline element with updated parameters.
133+
128134
"""
129135
parameter_copy = dict(parameters)
130136
n_bits = parameter_copy.pop("n_bits", None)
@@ -143,6 +149,7 @@ def _get_fp_generator(
143149
-------
144150
rdFingerprintGenerator.FingerprintGenerator
145151
RDKit fingerprint generator.
152+
146153
"""
147154
return rdFingerprintGenerator.GetMorganGenerator(
148155
radius=self.radius,
@@ -160,12 +167,13 @@ def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]:
160167
Returns
161168
-------
162169
dict[int, list[tuple[int, int]]]
163-
Dictionary with bit position as key and list of tuples with atom index and radius as value.
170+
Dictionary with bit position as key and list of tuples with atom index
171+
and radius as value.
172+
164173
"""
165174
fp_generator = self._get_fp_generator()
166175
additional_output = AllChem.AdditionalOutput()
167176
additional_output.AllocateBitInfoMap()
168177
# using the dense fingerprint here, to get indices after folding
169178
_ = fp_generator.GetFingerprint(mol_obj, additionalOutput=additional_output)
170-
bit_info = additional_output.GetBitInfoMap()
171-
return bit_info
179+
return additional_output.GetBitInfoMap()

molpipeline/mol2any/mol2path_fingerprint.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from __future__ import annotations # for all the python 3.8 users out there.
44

5-
from typing import Any, Literal
5+
from typing import Any
66

77
try:
88
from typing import Self # type: ignore[attr-defined]
@@ -14,6 +14,7 @@
1414
from rdkit.Chem import rdFingerprintGenerator
1515

1616
from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
17+
FPReturnAsOption,
1718
MolToRDKitGenFPElement,
1819
)
1920

@@ -39,7 +40,7 @@ def __init__(
3940
num_bits_per_feature: int = 2,
4041
atom_invariants_generator: Any = None,
4142
counted: bool = False,
42-
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
43+
return_as: FPReturnAsOption = "sparse",
4344
name: str = "Mol2PathFP",
4445
n_jobs: int = 1,
4546
uuid: str | None = None,
@@ -71,12 +72,13 @@ def __init__(
7172
counted: bool, default=False
7273
If True, the fingerprint will be counted.
7374
If False, the fingerprint will be binary.
74-
return_as: Literal["sparse", "dense", "explicit_bit_vect"], default="sparse"
75+
return_as: FPReturnAsOption, default="sparse"
7576
Type of output. When "sparse" the fingerprints will be returned as a
7677
scipy.sparse.csr_matrix holding a sparse representation of the bit vectors.
77-
With "dense" a numpy matrix will be returned.
78-
With "explicit_bit_vect" the fingerprints will be returned as a list of
79-
RDKit's rdkit.DataStructs.cDataStructs.ExplicitBitVect.
78+
With "dense" a numpy matrix will be returned. With "rdkit" the fingerprints
79+
will be returned as a list of one of RDKit's ExplicitBitVect,
80+
IntSparseBitVect, UIntSparseBitVect, etc. depending on the fingerprint
81+
and parameters.
8082
name: str, default="Mol2PathFP"
8183
Name of PipelineElement
8284
n_jobs: int, default=1

0 commit comments

Comments
 (0)