Source code for chemFilters.filters.rdkit_filters

# -*- coding: utf-8 -*-
"""A module for filtering molecules using RDKit FilterCatalogs."""

import warnings
from functools import partial
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier

# Workaround: silence the warning whose message mentions both
# ``boost::shared_ptr`` and ``FilterHierarchyMatcher``. The filter is installed
# before the ``rdkit`` imports below, which is why this call sits in the middle of
# the import block (presumably the warning fires while
# ``rdkit.Chem.FilterCatalog`` is being imported -- TODO confirm).
warnings.filterwarnings(
    "ignore", message=".*boost::shared_ptr.*FilterHierarchyMatcher.*"
)

from rdkit import Chem
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

from ..chem.interface import MoleculeHandler
from ..logger import logger
from .utils import get_catalog_match

# Fixed column layout of the flagging dataframe built by
# ``RdkitFilters.get_flagging_df``: the result is re-indexed to exactly these
# columns (in this order) in "string" mode, and one boolean column per entry is
# produced in "bool" mode.
STANDARD_RD_COLS = [
    "NIH",
    "PAINS_A",
    "PAINS_B",
    "PAINS_C",
    "PAINS_any",
    "ZINC",
    "Brenk",
    "ChEMBL23_Dundee",
    "ChEMBL23_BMS",
    "ChEMBL23_MLSMR",
    "ChEMBL23_Inpharmatica",
    "ChEMBL23_SureChEMBL",
    "ChEMBL23_LINT",
    "ChEMBL23_Glaxo",
    "ChEMBL23_any",
]
# Filter types that are collections of several filters rather than a single one.
FILTER_COLLECTIONS = ["PAINS", "CHEMBL", "BRENK", "ALL"]



[docs]
class RdkitFilters(MoleculeHandler):
    """Flag molecules against an RDKit FilterCatalog.

    The catalog is selected by name through ``filter_type`` and applied to a list
    of molecules, optionally in parallel, either as raw matches (``filter_mols``)
    or as a dataframe with one column per standard filter (``get_flagging_df``).
    """

    # Class-level cache shared by every instance: maps a ``filter_type`` name to
    # the FilterCatalog built for it, so ``_get_filter`` constructs each catalog
    # only once.
    _filter_catalog_cache = {}


[docs]
    def __init__(
        self,
        filter_type: str = "ALL",
        n_jobs: int = 1,
        from_smi: bool = False,
        chunk_size: Union[int, None] = None,
    ) -> None:
        """Initialize the RdkitFilters object.

        Args:
            filter_type: name of the filter from RDKit FilterCatalogs; must be one
                of ``available_filters``. Defaults to "ALL".
            n_jobs: number of jobs if wanted to run things in parallel. Defaults to 1.
            from_smi: if True, will do the conversion from SMILES to RDKit Mol
                object. Forwarded to the parent class initializer.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.

        Raises:
            ValueError: if ``filter_type`` is not among ``available_filters``
                (raised by ``_get_filter``).
        """
        # filter_type must be assigned first: _get_filter() reads it.
        self.filter_type = filter_type
        self.filter = self._get_filter()
        self.n_jobs = n_jobs
        self.chunk_size = chunk_size
        super().__init__(from_smi=from_smi)


    @property
    def available_filters(self):
        """Names of the filter catalogs that can be used as ``filter_type``.

        The names are the attributes of ``FilterCatalogParams.FilterCatalogs.ALL``
        that are either entirely upper-case or start with ``CHEMBL_``.
        """
        catalogs = FilterCatalogParams.FilterCatalogs.ALL
        return [
            attr
            for attr in dir(catalogs)
            if attr.isupper() or attr.startswith("CHEMBL_")
        ]

    def _get_filter(self):
        """Return the FilterCatalog for ``self.filter_type``, building it on first use.

        Built catalogs are stored in the class-level ``_filter_catalog_cache`` so
        that the same filter type is not registered more than once.

        Raises:
            ValueError: if ``self.filter_type`` is not in ``available_filters``.
        """
        if self.filter_type not in self.available_filters:
            raise ValueError(f"Filter type {self.filter_type} not available.")

        cache = self._filter_catalog_cache
        key = self.filter_type
        # Cache hit: reuse the catalog built by an earlier call or instance.
        if key in cache:
            return cache[key]

        selected = getattr(FilterCatalogParams.FilterCatalogs, key)
        params = FilterCatalogParams()
        params.AddCatalog(selected)
        built = FilterCatalog(params)
        cache[key] = built
        return built


[docs]
    def filter_mols(
        self,
        stdin: List[Union[Chem.Mol, str]],
        match_type: str = "string",
    ) -> Tuple[List[List[str]], List[List[str]], List[List[Chem.Mol]]]:
        """Filter molecules using RDKit FilterCatalogs.

        The inputs are first passed through ``self._output_mol`` and then matched
        against ``self.filter`` with ``get_catalog_match``; both stages run through
        ``ParallelApplier`` using ``self.n_jobs`` and ``self.chunk_size``.

        Args:
            stdin: list of RDKit Mol objects or SMILES strings if self._from_smi is True
            match_type: values within the flagging dataframe. If `bool`, will spare
                retrieving substructures and descriptions. If `string`, will have the
                description of the filter that was matched. Defaults to `string`.

        Returns:
            filter_names: filter names that were matched, one entry per result.
            descriptions: filter descriptions that were matched, one entry per result.
            substructs: substructures that were matched, one entry per result.
            The three sequences are tuples; all are empty when there are no results.
        """
        # Both parallel stages share the same scheduling options.
        applier_options = {
            "n_jobs": self.n_jobs,
            "show_progress": False,
            "backend": "loky",
            "chunk_size": self.chunk_size,
        }

        to_mol = ParallelApplier(
            self._output_mol,
            stdin,
            custom_desc="Converting to RDKit molecules",
            **applier_options,
        )
        mols = to_mol()

        catalog_func = partial(
            get_catalog_match,
            catalog=self.filter,
            match_type=match_type,
        )
        matcher = ParallelApplier(
            catalog_func,
            mols,
            custom_desc="Applying RDKit filter catalog",
            **applier_options,
        )
        result = matcher()

        # Transpose the per-molecule triples into three parallel sequences.
        # With no results the transpose is empty and cannot be unpacked into
        # three names, so return three empty tuples instead of raising.
        columns = tuple(zip(*result))
        if not columns:
            return (), (), ()
        filter_names, descriptions, substructs = columns
        return filter_names, descriptions, substructs



[docs]
    def get_flagging_df(
        self,
        stdin: List[Union[Chem.Mol, str]],
        match_type: str = "string",
        save_matches: bool = False,
    ) -> pd.DataFrame:
        """Flag molecules using the defined RDKit FilterCatalogs and return a dataframe
        with the standard filters (``STANDARD_RD_COLS``) as columns and the molecules
        as rows, preceded by a ``SMILES`` column.

        In `string` mode each cell holds the description of the matched filter (NaN
        when there is no match); in `bool` mode each cell tells whether the column
        name is among the filter names matched by the molecule. Molecules for which
        no filter names could be obtained get their SMILES set to NaN and are
        reported through a logger warning.

        Args:
            stdin: list of RDKit Mol objects or SMILES strings if self._from_smi is True
            match_type: values within the flagging dataframe. If `bool`, will spare
                retrieving substructures and descriptions. If `string`, will have the
                description of the filter that was matched. Case-insensitive.
                Defaults to `string`.
            save_matches: if True, will save the filter names, descriptions, and
                substructures as attributes. Defaults to False.

        Returns:
            pd.DataFrame: dataframe with columns as filter types and rows as molecules.

        Raises:
            ValueError: if `match_type` is neither 'string' nor 'bool'.
        """
        # Normalise once so that the validation below and the value forwarded to
        # filter_mols agree (the validation has always been case-insensitive).
        match_type = match_type.lower()
        if match_type not in ("string", "bool"):
            raise ValueError("match_type must be either 'string' or 'bool'.")

        filter_names, descriptions, substructs = self.filter_mols(
            stdin, match_type=match_type
        )
        if match_type == "string":

            def flatten_labels(labels):
                """Flatten the labels and filter out any None types"""
                return ";".join(filter(None, labels))

            vectorized_flatten = np.vectorize(flatten_labels)
            # One {filter name: description} mapping per molecule; molecules
            # without results contribute an empty mapping (an all-NaN row).
            # NOTE(review): dict(zip(...)) keeps only the last description when a
            # filter name repeats for one molecule -- confirm this is intended.
            val_dicts = [
                (
                    dict(zip(names, descs))
                    if names is not None and descs is not None
                    else {}
                )
                for names, descs in zip(filter_names, descriptions)
            ]
            final_df = pd.DataFrame(val_dicts)
            # Wrap every cell in a list (empty for missing values) so that the
            # flattening step below can join it into a single string.
            for col in final_df.columns:
                final_df[col] = final_df[col].apply(
                    lambda x: [] if pd.isnull(x) else [x]
                )
            final_df = (
                final_df.apply(vectorized_flatten)
                .replace({"": np.nan})
                .reindex(columns=STANDARD_RD_COLS)
            )
        else:  # match_type == "bool"
            final_df = pd.DataFrame(columns=STANDARD_RD_COLS, index=range(len(stdin)))
            # Built once: it does not depend on the column being filled.
            names_series = pd.Series(
                [names if names is not None else [] for names in filter_names]
            )
            for col in STANDARD_RD_COLS:
                final_df[col] = names_series.apply(lambda x, col: col in x, col=col)
        # Add smiles to the final dataframe
        applier = ParallelApplier(
            self._output_smi,
            stdin,
            n_jobs=self.n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc="Converting filtered results to SMILES",
            chunk_size=self.chunk_size,
        )
        smiles = applier()
        final_df.insert(0, "SMILES", smiles)
        # if there are any errors, set the smiles to NaN
        error_idx = [i for i, x in enumerate(filter_names) if x is None]
        if error_idx:
            final_df.loc[error_idx, "SMILES"] = np.nan
            logger.warning(
                f"Failed to get filter names and descriptions for {len(error_idx)} "
                f"molecules in indexes: {error_idx}. SMILES will be set to NaN."
            )
        if save_matches:
            self.filter_names = filter_names
            self.descriptions = descriptions
            self.substructs = substructs
        return final_df