Source code for chemFilters.filters.silly_filters

# -*- coding: utf-8 -*-
"""Wrapper function for molspotter, a tool based on Pat Water's silly walks (hence the module name)"""  # noqa: E501
from itertools import product
from typing import List, Union

import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier
try:
    from molspotter import SillyMolSpotter
except ImportError:
    raise ImportError(
        "molspotter is required to use SillyMolSpotterFilter. "
        "Install it with: pip install 'chem-filters[allfilters]'"
    )
from rdkit import Chem

from chemFilters.chem.standardizers import ChemStandardizer

from ..chem.interface import MoleculeHandler, mol_from_smi

STANDARD_SILLY_COLS = [
    "chembl",
    "excape",
    "papyrus",
]



[docs]
class SillyMolSpotterFilter(MoleculeHandler):
    """Wrapper class to molspotter, a tool based on Pat Water's silly walks filter.
    It helps finding unusual molecules in a dataset based the detection of unusual bits
    on a hashed ECFP fingerprint. For more information, see the original repo:
    https://github.com/OlivierBeq/molspotter

    Arguments:
        from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
        standardize: whether to standardize `stdin` or not. Defaults to False.
        std_method: SMILES/mol standardization method. Available: `canon`, `chembl`,
            `papyrus`. Defaults to "chembl".
        n_jobs: number of jobs to run in parallel. Defaults to 1.
    """


[docs]
    def __init__(
        self,
        from_smi=False,
        standardize=False,
        std_method="chembl",
        n_jobs=1,
        chunk_size: int = None,
        **kwargs,
    ):
        """Initialize the SillyMolSpotterFilter class.

        Args:
            from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
            standardize: whether to standardize `stdin` or not. Defaults to False.
            std_method: SMILES/mol standardization method. Available: `canon`, `chembl`,
                `papyrus`. Defaults to "chembl".
            n_jobs: number of jobs to run in parallel. Defaults to 1.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.
        """
        self._spotters = self._get_spotters()
        self._standardize = standardize
        self._smiles_standardizer = self._get_standardizer(
            std_method, from_smi=from_smi, n_jobs=n_jobs, **kwargs
        )
        self._std_method = std_method.lower()
        self._kwargs = kwargs
        self._n_jobs = n_jobs
        self._chunk_size = chunk_size
        super().__init__(from_smi)


    def _get_standardizer(self, std_method, from_smi=True, n_jobs=1, **kwargs):
        return ChemStandardizer(
            method=std_method, from_smi=from_smi, n_jobs=n_jobs, **kwargs
        )

    def _get_spotters(self):
        """Available pretrained spotters from molspotter."""
        return {
            "chembl": SillyMolSpotter.from_pretrained("chembl"),
            "excape": SillyMolSpotter.from_pretrained("excape"),
            "papyrus": SillyMolSpotter.from_pretrained("papyrus"),
        }


[docs]
    def score_smi(self, smi: str, spotter_name: str = "chembl"):
        """Score a SMILES string with a pretrained spotter, indicating how silly the
        processed molecule is."""
        spotter = self._spotters[spotter_name]
        try:
            mol = mol_from_smi(smi)
            return spotter.score_mol(mol)
        except Exception as e:
            print(
                f"An error occurred: {str(e)} when scoring "
                f"{smi} with {spotter_name}."
            )
            return None


    def _unpack_score_smi(self, args):
        """Wrapper function to unpack starmap arguments for ParallelApplier."""
        smi, spotter_name = args
        return self.score_smi(smi, spotter_name)


[docs]
    def get_scoring_df(self, stdin: List[Union[str, Chem.Mol]]):
        """Get a dataframe with the scoring results for each spotter.

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            pd.DataFrame: a dataframe with the scoring results for each spotter.
        """
        if self._standardize:
            smiles = self._smiles_standardizer(stdin)
            if smiles == []:
                raise ValueError("No valid SMILES found in the input.")
        else:
            applier = ParallelApplier(
                self._output_smi,
                stdin,
                n_jobs=self._n_jobs,
                show_progress=False,
                backend="loky",
                custom_desc="Converting to SMILES",
                chunk_size=self._chunk_size,
            )
            smiles = applier()

        all_params = list(product(smiles, self._spotters.keys()))
        applier = ParallelApplier(
            self._unpack_score_smi,
            all_params,
            n_jobs=self._n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc="Scoring molecules with SillyMolSpotter",
            chunk_size=self._chunk_size,
        )
        results = applier()

        num_spotters = len(self._spotters)
        df = pd.DataFrame(
            np.array(results).reshape((-1, num_spotters)), columns=self._spotters.keys()
        )
        df.insert(0, "SMILES", smiles)
        return df.replace({None: np.nan})