Source code for chemFilters.chem.standardizers

# -*- coding: utf-8 -*-
"""Utility modules with functions for the chemFilters.chem subpackage."""
from functools import partial
from importlib.util import find_spec
from typing import Callable, List, Union

from chembl_structure_pipeline import standardizer as chembl_std
from rdkit import Chem
from tqdm import tqdm

from ..logger import logger
from .interface import MoleculeHandler
from .utils import (
    molToCanon,
    molToConnectivity,
    molToInchi,
    molToInchiKey,
    rdkit_log_controller,
)

if find_spec("papyrus_structure_pipeline"):
    from papyrus_structure_pipeline import standardizer as papyrus_std

if find_spec("molvs"):
    from molvs import Standardizer



[docs]
class ChemStandardizer(MoleculeHandler):
    """A class to standardize molecules/SMILES strings. Initialization allows for the
    selection of the settings of the standardizer. The object can then be called on a
    iterable containing molecules/SMILES strings to apply the standardization."""


[docs]
    def __init__(
        self,
        method: Union[str, Callable] = "chembl",
        n_jobs: int = 5,
        isomeric: bool = True,
        progress: bool = False,
        rdkit_loglevel: str = "warning",
        from_smi: bool = False,
        return_smi: bool = True,
        chunk_size: int = None,
        **kwargs,
    ) -> None:
        """Initializes the ChemStandardizer class.

        Args:
            method: standardization pipeline to use. Current supports "canon",
                "chembl", "papyrus", "molvs", or a callable. If callable, ensure it
                takes rdkit.Mol objects as input. Defaults to "chembl". "canon" is
                rdkit's SMILES canonicalization.
            n_jobs: number of jobs running in parallel. Defaults to 5.
            isomeric: output smiles with isomeric information. Defaults to True.
            progress: display a progress bar with tqdm. Defaults to False.
            rdkit_loglevel: one of `debug, info, warning, error, critical`. Defaults to
                "warning".
            from_smi: if True, the standardizer will expect SMILES strings as input.
                Defaults to False.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.
            kwargs: additional keyword arguments to pass to the standardizer.

        Raises:
            ImportError: if method is "papyrus" but the optional dependency is not
                installed.
            ValueError: when an invalid method is passed.
        """
        if callable(method):
            self.standardizer = method
            logger.info(
                "Custom standardizer detected!! "
                "Ensure it takes Rdkit.Mol objecs as input."
            )
            try:
                import pickle

                _ = pickle.dumps(self.standardizer)
                self._custom_is_pickable = True
            except (pickle.PicklingError, TypeError, AttributeError):
                logger.warning(
                    "Custom standardizer is not pickable. Will use 'n_jobs' "
                    "only to process `SMILES -> Mol` & `Mol -> SMILES` operations"
                )
                self._custom_is_pickable = False
        else:
            self._custom_is_pickable = False
            if method.lower() == "chembl":
                self.standardizer = partial(self.chemblStandardizer, **kwargs)
            elif method.lower() == "canon":
                self.standardizer = partial(molToCanon, isomeric=isomeric, **kwargs)
            elif method.lower() == "papyrus":
                # avoid import since it's not a required dependency
                if find_spec("papyrus_structure_pipeline"):
                    self.standardizer = partial(self.papyrusStandardizer, **kwargs)
                else:
                    raise ImportError(
                        "papyrus_structure_pipeline is required for method='papyrus'. "
                        "Install it with: pip install 'chem-filters[standardizers]'"
                    )
            elif method.lower() == "molvs":
                if find_spec("molvs"):
                    self.standardizer = partial(self.molvsStandardizer, **kwargs)
                else:
                    raise ImportError(
                        "molvs is required for method='molvs'. "
                        "Install it with: pip install 'chem-filters[standardizers]'"
                    )
            else:
                raise ValueError(f"Invalid SMILES standardizing method: {method}")
        super().__init__(from_smi)
        self.n_jobs = n_jobs
        self.progress = progress
        self.rdkit_loglevel = rdkit_loglevel
        self.return_smi = return_smi
        self.method = method
        self.chunk_size = chunk_size
        self.smi_out_func = partial(
            MoleculeHandler(from_smi=False, isomeric=isomeric)._output_smi
        )


    def __call__(self, stdin: List[Union[str, Chem.Mol]]) -> List[str]:
        """Calls the standardizer on a list of SMILES strings / Chem.Mol objects to
        perform the standardization according to the settings set at initialization.

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            A list of standardized SMILES strings.
        """
        if callable(self.method):
            if self.method.__name__ == "<lambda>":
                pickable = False
            else:
                pickable = self._custom_is_pickable
        else:
            pickable = True
        with rdkit_log_controller(self.rdkit_loglevel):
            if self._from_smi and self.method == "canon":
                stdin = (  # Too fast to parallelize; overhead dominates
                    tqdm(stdin, desc="Parsing input SMILES") if self.progress else stdin
                )
                stdin = [self._output_mol(s) for s in stdin]

            method_name = self.method if isinstance(self.method, str) else "custom"

            if self.method == "canon":
                vals = (  # Canon method is too fast to parallelize; overhead dominates
                    tqdm(stdin, desc=f"Standardizing molecules ({method_name})")
                    if self.progress
                    else stdin
                )
                vals = [self.standardizer(mol) for mol in vals]
            else:
                vals = self.pmap(
                    self.n_jobs,
                    self.progress,
                    stdin,
                    self.standardizer,
                    pickable=pickable,
                    custom_desc=f"Standardizing molecules ({method_name})",
                    chunk_size=self.chunk_size,
                )

            if self.return_smi and self.method != "canon":  # canon always returns smis
                vals = (  # Too fast to parallelize; overhead dominates
                    tqdm(vals, desc="Converting mols to SMILES")
                    if self.progress
                    else vals
                )
                vals = [self.smi_out_func(mol) for mol in vals]
        return vals


[docs]
    def papyrusStandardizer(
        self,
        stdin: Union[str, Chem.Mol],
        **kwargs,
    ) -> str:
        """Uses the Papyrus standardizer to standardize a SMILES string. By default,
        this standardization pipeline removes stereocenters, so beware of the isomeric
        flag. Accepts extra keyword arguments that will be passed to the standardizer.

        For more information: https://github.com/OlivierBeq/Papyrus_structure_pipeline

        Args:
            stdin: standard input; single SMILES strings or single rdkit.Chem.Mol object
                depending on the value of self._from_smi.
            isomeric: output isomeric smiles. Defaults to True.
            kwargs: aditional keyword arguments to pass to the standardizer.

        Returns:
            standardized smiles string
        """
        mol = self._output_mol(stdin)
        try:
            standard_mol = papyrus_std.standardize(mol, **kwargs)
        except RuntimeError:
            logger.exception("Error standardizing molecule: ", stdin)
            standard_mol = None
        return standard_mol



[docs]
    def chemblStandardizer(
        self, stdin: Union[str, Chem.Mol], neutralize: bool = True, **kwargs
    ) -> str:
        """Uses the ChEMBL standardizer to standardize a SMILES string. Accepts extra
        keyword arguments that will be passed to the standardizer

        Args:
            stdin: standard input; single SMILES strings or single rdkit.Chem.Mol object
                depending on the value of self._from_smi.
            isomeric: output isomeric smiles. Defaults to True.
            neutralize: configure `get_parent_mol` to neutralize the molecule. Defaults
                to True.
            kwargs: keyword arguments to pass to the `get_parent_mol` and the
                `standardize_mol` functions.

        Returns:
            standardized smiles string
        """
        mol = self._output_mol(stdin)
        if mol is None:
            return None
        try:
            parent_mol = chembl_std.get_parent_mol(
                mol,
                neutralize=neutralize,
                **{
                    "check_exclusion": kwargs.get("check_exclusion", True),
                    "verbose": kwargs.get("verbose", False),
                },
            )[0]
        except Chem.rdchem.AtomValenceException:
            stdin_str = stdin if isinstance(stdin, str) else Chem.MolToSmiles(mol)
            logger.error(
                f"AtomValenceException getting {stdin_str} parent mol. Skipping step."
            )
            parent_mol = mol
        except TypeError:
            stdin_str = stdin if isinstance(stdin, str) else Chem.MolToSmiles(mol)
            logger.exception(
                f"TypeError getting {stdin_str} parent mol. Skipping step."
            )
            parent_mol = mol
        except Exception as e:
            stdin_str = stdin if isinstance(stdin, str) else Chem.MolToSmiles(mol)
            logger.exception(f"{type(e).__name__} getting {stdin_str} parent mol")
            raise e
        try:
            standard_mol = chembl_std.standardize_mol(
                parent_mol,
                sanitize=True,
                **{"check_exclusion": kwargs.get("check_exclusion", True)},
            )
            Chem.SanitizeMol(standard_mol)
        except Chem.rdchem.AtomValenceException:
            parent_mol_str = Chem.MolToSmiles(parent_mol)
            stdin_str = stdin if isinstance(stdin, str) else Chem.MolToSmiles(mol)
            logger.error(
                f"AtomValenceException standardizing parent molecule: {parent_mol_str} "
                f"from input {stdin_str}. Returning None."
            )
            standard_mol = None
        except TypeError as e:
            parent_mol_str = Chem.MolToSmiles(parent_mol)
            logger.exception(f"Error standardizing parent molecule: {parent_mol_str}")
            logger.exception(e)
            standard_mol = None
        return standard_mol



[docs]
    def molvsStandardizer(
        self,
        stdin: Union[str, Chem.Mol],
        **kwargs,
    ) -> str:
        """Uses molvs to standardize a SMILES string. By default, this standardization
        pipeline applies the functions `canonicalize_tautomer` and `standardize`
        implemented in the package.

        For more information, see the docs: https://molvs.readthedocs.io/en/latest/

        Args:
            stdin: standard input; single SMILES strings or single rdkit.Chem.Mol object
                depending on the value of self._from_smi.
            isomeric: output isomeric smiles. Defaults to True.
            kwargs: aditional `molvs.Standardizer` object.

        Returns:
            standardized smiles string
        """
        mol = self._output_mol(stdin)
        molvs_std = Standardizer(**kwargs)
        try:
            tautomer_mol = molvs_std.canonicalize_tautomer(mol)
            standard_mol = molvs_std.standardize(tautomer_mol)
        except RuntimeError:
            logger.exception("Error standardizing molecule: ", stdin)
            standard_mol = None
        return standard_mol





[docs]
class InchiHandling(MoleculeHandler):
    """Obtain a list of inchis, inchikeys or connectivities from a list of smiles.
    Initialization allows for the selection of the settings. The object can then be
    called on a iterable containing SMILES strings to obtain the desired identifier."""


[docs]
    def __init__(
        self,
        convert_to: str,
        n_jobs: int = 5,
        progress: bool = False,
        rdkit_loglevel: str = "warning",
        from_smi: bool = False,
        chunk_size: int = None,
    ) -> None:
        """Initialize the InchiHandling class.

        Args:
            convert_to: what to convert the smiles to. Can be "inchi", "inchikey" or
                "connectivity".
            n_jobs: Number of jobs for processing in parallel. Defaults to 5.
            progress: whether to show the progress bar. Defaults to False.
            rdkit_loglevel: one of `debug, info, warning, error, critical`. Defaults to
                "warning".
            from_smi: if True, the standardizer will expect SMILES strings as input.
                Defaults to False.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.

        Raises:
            ValueError: if the convert_to argument is not one of the three options.
        """
        if convert_to.lower() == "inchi":
            self.converter = molToInchi
        elif convert_to.lower() == "inchikey":
            self.converter = molToInchiKey
        elif convert_to.lower() == "connectivity":
            self.converter = molToConnectivity
        else:
            raise ValueError(f"Invalid convertion method: {self.convert_to}")
        self.n_jobs = n_jobs
        self.progress = progress
        self.rdkit_loglevel = rdkit_loglevel
        self.chunk_size = chunk_size
        super().__init__(from_smi)


    def __call__(self, stdin: list) -> list:
        """Calls the standardizer on a list of SMILES strings / Chem.Mol objects to
        perform the standardization according to the settings set at initialization.

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            A list of standardized SMILES strings.
        """
        # Determine conversion type for progress description
        if self.converter == molToInchi:
            convert_type = "InChI"
        elif self.converter == molToInchiKey:
            convert_type = "InChIKey"
        elif self.converter == molToConnectivity:
            convert_type = "connectivity"
        else:
            convert_type = "identifier"

        with rdkit_log_controller(self.rdkit_loglevel):
            # Too fast to parallelize; overhead dominates
            mols = [self._output_mol(s) for s in stdin]
            vals = self.pmap(
                self.n_jobs,
                self.progress,
                mols,
                self.converter,
                custom_desc=f"Converting to {convert_type}",
                chunk_size=self.chunk_size,
            )
        return vals