# Source code for chemFilters.filters.rdkit_filters

# -*- coding: utf-8 -*-
"""A module for filtering molecules using RDKit FilterCatalogs."""

import warnings
from functools import partial
from typing import List, Tuple, Union

import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier

warnings.filterwarnings(  # Ugly workaround to avoid RDKit from complaining
    "ignore", message=".*boost::shared_ptr.*FilterHierarchyMatcher.*"
)

from rdkit import Chem
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams

from ..chem.interface import MoleculeHandler
from ..logger import logger
from .utils import get_catalog_match

# Column names (and their order) that the flagging dataframe is reindexed to.
# The list is assembled from prefix groups purely for readability; the
# resulting value is a single flat list of strings.
STANDARD_RD_COLS = (
    ["NIH"]
    # PAINS-prefixed columns
    + ["PAINS_A", "PAINS_B", "PAINS_C", "PAINS_any"]
    + ["ZINC", "Brenk"]
    # ChEMBL23-prefixed columns
    + [
        "ChEMBL23_Dundee",
        "ChEMBL23_BMS",
        "ChEMBL23_MLSMR",
        "ChEMBL23_Inpharmatica",
        "ChEMBL23_SureChEMBL",
        "ChEMBL23_LINT",
        "ChEMBL23_Glaxo",
        "ChEMBL23_any",
    ]
)

# Are collections of filters
FILTER_COLLECTIONS = [
    "PAINS",
    "CHEMBL",
    "BRENK",
    "ALL",
]


class RdkitFilters(MoleculeHandler):
    """Flag molecules against RDKit ``FilterCatalog`` substructure filters.

    Built catalogs are kept in a class-level cache keyed by filter type, so
    instances that request the same ``filter_type`` share one
    ``FilterCatalog`` object instead of building it again.
    """

    # Shared by every instance on purpose: maps filter_type -> FilterCatalog.
    _filter_catalog_cache = {}

    def __init__(
        self,
        filter_type="ALL",
        n_jobs=1,
        from_smi: bool = False,
        chunk_size: Union[int, None] = None,
    ) -> None:
        """Initialize RdkitFilters object.

        Args:
            filter_type: type of filter from RDKit FilterCatalogs. Defaults to
                "ALL".
            n_jobs: number of jobs if wanted to run things in parallel.
                Defaults to 1.
            from_smi: if True, will do the conversion from SMILES to RDKit Mol
                object.
            chunk_size: size of chunks for ParallelApplier. If None,
                auto-calculated. Defaults to None.

        Raises:
            ValueError: if ``filter_type`` is not in ``available_filters``.
        """
        self.filter_type = filter_type
        self.filter = self._get_filter()
        self.n_jobs = n_jobs
        self.chunk_size = chunk_size
        super().__init__(from_smi=from_smi)

    @property
    def available_filters(self):
        """List of available filters from RDKit FilterCatalogs."""
        all_filters = FilterCatalogParams.FilterCatalogs.ALL
        # Keep the attribute names that are fully upper-case or that start
        # with "CHEMBL_"; everything else exposed by dir() is discarded.
        return [
            name
            for name in dir(all_filters)
            if name.isupper() or name.startswith("CHEMBL_")
        ]

    def _get_filter(self):
        """Get the filter from RDKit FilterCatalogs, using the class cache.

        Returns:
            The ``FilterCatalog`` built for ``self.filter_type``; it is built
            only the first time a given filter type is requested.

        Raises:
            ValueError: if ``self.filter_type`` is not in
                ``available_filters``.
        """
        if self.filter_type not in self.available_filters:
            raise ValueError(f"Filter type {self.filter_type} not available.")

        cache = self._filter_catalog_cache
        if self.filter_type not in cache:
            catalog_id = getattr(FilterCatalogParams.FilterCatalogs, self.filter_type)
            params = FilterCatalogParams()
            params.AddCatalog(catalog_id)
            cache[self.filter_type] = FilterCatalog(params)
        return cache[self.filter_type]

    def _apply_parallel(self, func, items, custom_desc: str):
        """Run ``func`` over ``items`` through ParallelApplier.

        Uses this instance's ``n_jobs`` and ``chunk_size`` together with the
        fixed options shared by every parallel step of this class.
        """
        applier = ParallelApplier(
            func,
            items,
            n_jobs=self.n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc=custom_desc,
            chunk_size=self.chunk_size,
        )
        return applier()

    def filter_mols(
        self,
        stdin: List[Union[Chem.Mol, str]],
        match_type: str = "string",
    ) -> Tuple[List[List[str]], List[List[str]], List[List[Chem.Mol]]]:
        """Filter molecules using RDKit FilterCatalogs.

        Args:
            stdin: list of RDKit Mol objects or SMILES strings if
                self._from_smi is True
            match_type: values within the flagging dataframe. If `bool`, will
                spare retrieving substructures and descriptions. If `string`,
                will have the description of the filter that was matched.
                Defaults to `string`.

        Returns:
            filter_names: filter names that were matched, one entry per
                molecule.
            descriptions: filter descriptions that were matched, one entry per
                molecule.
            substructs: substructures that were matched, one entry per
                molecule.
            The three sequences are tuples (they come from ``zip``); all three
            are empty tuples when no results were produced.
        """
        mols = self._apply_parallel(
            self._output_mol, stdin, "Converting to RDKit molecules"
        )
        catalog_func = partial(
            get_catalog_match,
            catalog=self.filter,
            match_type=match_type,
        )
        result = self._apply_parallel(
            catalog_func, mols, "Applying RDKit filter catalog"
        )
        # zip(*result) yields nothing when there are no results; unpacking
        # that into three names would raise ValueError, so handle it here.
        columns = tuple(zip(*result))
        if not columns:
            return (), (), ()
        filter_names, descriptions, substructs = columns
        return filter_names, descriptions, substructs

    def get_flagging_df(
        self,
        stdin: List[Union[Chem.Mol, str]],
        match_type: str = "string",
        save_matches: bool = False,
    ) -> pd.DataFrame:
        """Flag molecules using the defined RDKit FilterCatalogs.

        Returns a dataframe with the filters in ``STANDARD_RD_COLS`` as
        columns and the molecules as rows. With ``match_type="string"`` the
        items are the description of the molecular filter that was caught;
        with ``match_type="bool"`` they are booleans.

        Args:
            stdin: list of RDKit Mol objects or SMILES strings if
                self._from_smi is True
            match_type: values within the flagging dataframe. If `bool`, will
                spare retrieving substructures and descriptions. If `string`,
                will have the description of the filter that was matched.
                Compared case-insensitively. Defaults to `string`.
            save_matches: if True, will save the filter names, descriptions,
                and substructures as attributes. Defaults to False.

        Returns:
            pd.DataFrame: dataframe with a leading ``SMILES`` column followed
            by the filter types as columns, and molecules as rows.

        Raises:
            ValueError: if ``match_type`` is neither 'string' nor 'bool'.
        """
        # Normalise once so validation, branching and the downstream call all
        # see the same value.
        match_type = match_type.lower()
        if match_type not in ("string", "bool"):
            raise ValueError("match_type must be either 'string' or 'bool'.")

        def flatten_labels(labels):
            """Flatten the labels and filter out any None types"""
            return ";".join(filter(None, labels))

        vectorized_flatten = np.vectorize(flatten_labels)

        filter_names, descriptions, substructs = self.filter_mols(
            stdin, match_type=match_type
        )

        if match_type == "string":
            # NOTE(review): dict(zip(...)) keeps only the last description
            # when a molecule has the same filter name more than once --
            # confirm against get_catalog_match that this is intended.
            val_dicts = [
                dict(zip(names, descs))
                if names is not None and descs is not None
                else {}
                for names, descs in zip(filter_names, descriptions)
            ]
            final_df = pd.DataFrame(val_dicts)
            # Wrap each cell in a list (empty for missing values) so that
            # flatten_labels can join it; missing cells become "" and are
            # then turned into NaN.
            for col in final_df.columns:
                final_df[col] = final_df[col].apply(
                    lambda x: [] if pd.isnull(x) else [x]
                )
            final_df = (
                final_df.apply(vectorized_flatten)
                .replace({"": np.nan})
                .reindex(columns=STANDARD_RD_COLS)
            )
        else:  # match_type == "bool"
            final_df = pd.DataFrame(columns=STANDARD_RD_COLS, index=range(len(stdin)))
            # Built once: it does not depend on the column being filled.
            # Molecules without results (None) count as "no filter matched".
            names_series = pd.Series(
                [names if names is not None else [] for names in filter_names],
                dtype=object,
            )
            for col in STANDARD_RD_COLS:
                final_df[col] = names_series.apply(
                    lambda names, col=col: col in names
                )

        # Add smiles to the final dataframe
        smiles = self._apply_parallel(
            self._output_smi, stdin, "Converting filtered results to SMILES"
        )
        final_df.insert(0, "SMILES", smiles)

        # if there are any errors, set the smiles to NaN
        error_idx = [i for i, x in enumerate(filter_names) if x is None]
        if error_idx:
            final_df.loc[error_idx, "SMILES"] = np.nan
            logger.warning(
                f"Failed to get filter names and descriptions for {len(error_idx)} "
                f"molecules in indexes: {error_idx}. SMILES will be set to NaN."
            )

        if save_matches:
            self.filter_names = filter_names
            self.descriptions = descriptions
            self.substructs = substructs
        return final_df