Source code for chemFilters.filters.bloom_filters

# -*- coding: utf-8 -*-
"""Wrapper class for molbloom. Original repo: https://github.com/whitead/molbloom"""
from itertools import product
from pathlib import Path
from typing import List, Union

import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier
try:
    from molbloom import _DEFAULT_PATH, _load_filter, buy, catalogs
except ImportError:
    raise ImportError(
        "molbloom is required to use MolbloomFilters. "
        "Install it with: pip install 'chem-filters[allfilters]'"
    )
from rdkit import Chem

from ..chem.interface import MoleculeHandler
from ..chem.standardizers import ChemStandardizer

# Catalog names used as the standard bloom-filter flag columns.
# NOTE(review): not referenced in the visible part of this module — presumably
# consumed by callers elsewhere; confirm before changing the list.
STANDARD_BLOOM_COLS = [
    "zinc20",
    "zinc-instock",
    "zinc-instock-mini",
    "surechembl",
]



[docs]
class MolbloomFilters(MoleculeHandler):
    """Wrapper class for molbloom. Requires molbloom to be installed.

    On construction the instance lists the molbloom catalogs and loads every
    catalog whose ``.bloom`` file is missing from molbloom's default path.
    ``get_flagging_df`` then flags each input molecule against every catalog.

    Arguments:
        from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
        standardize: whether to standardize `stdin` or not. Defaults to False.
        std_method: SMILES/mol standardization method. Available: `canon`, `chembl`,
            `papyrus`. Defaults to "chembl".
        n_jobs: number of jobs to run in parallel. Defaults to 1.
        chunk_size: size of chunks for ParallelApplier. Defaults to None.
        **kwargs: extra keyword arguments forwarded to ChemStandardizer.
    """


[docs]
    def __init__(
        self,
        from_smi: bool = False,
        standardize: bool = False,
        std_method: str = "chembl",
        n_jobs=1,
        chunk_size: int = None,
        **kwargs,
    ):
        """Initialize the MolbloomFilters class.

        Lists the molbloom catalogs, makes sure their filter files are present,
        builds the SMILES standardizer and finally initialises the parent class.

        Args:
            from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
            standardize: whether to standardize `stdin` or not. Defaults to False.
            std_method: SMILES/mol standardization method. Available: `canon`, `chembl`,
                `papyrus`. Defaults to "chembl". Stored lower-cased.
            n_jobs: number of jobs to run in parallel. Defaults to 1.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.
            **kwargs: extra keyword arguments forwarded to the standardizer.
        """
        # Plain configuration values; storing them has no side effects.
        self._standardize = standardize
        self._kwargs = kwargs
        self._n_jobs = n_jobs
        self._chunk_size = chunk_size

        # The catalog listing must exist before the download check iterates it.
        self._catalogs = catalogs()
        self._ensure_filters_downloaded()

        # The standardizer receives the method name exactly as given; only the
        # stored copy is lower-cased.
        standardizer = self._get_standardizer(std_method, from_smi, **kwargs)
        self._smiles_standardizer = standardizer
        self._std_method = std_method.lower()

        super().__init__(from_smi)


    def _get_standardizer(self, std_method: str, from_smi: bool, **kwargs):
        """Build the ``ChemStandardizer`` used to standardize the inputs.

        Args:
            std_method: standardization method name, passed as ``method``.
            from_smi: whether the inputs are SMILES strings, passed as ``from_smi``.
            **kwargs: extra keyword arguments forwarded unchanged.

        Returns:
            The newly created ``ChemStandardizer`` instance.
        """
        standardizer = ChemStandardizer(
            method=std_method,
            from_smi=from_smi,
            **kwargs,
        )
        return standardizer


[docs]
    def get_catalogs(self):
        """Return the molbloom catalogs available to this instance.

        Returns:
            The catalogs object stored on the instance at construction time,
            returned as-is (not copied).
        """
        available = self._catalogs
        return available


    def _ensure_filters_downloaded(self):
        """Load every catalog whose bloom file is not yet on disk.

        A catalog counts as present when ``<catalog>.bloom`` exists under
        molbloom's ``_DEFAULT_PATH``. For each missing one a notice is printed
        and ``_load_filter`` is called (presumably fetching the file — see
        molbloom).
        """
        for catalog in self._catalogs:
            bloom_file = Path(_DEFAULT_PATH) / f"{catalog}.bloom"
            if bloom_file.exists():
                # Already available locally; nothing to do for this catalog.
                continue
            print(f"Starting {catalog} download...")
            _load_filter(catalog)


[docs]
    def buy_smi(self, smi: str, catalog: str = "zinc-instock"):
        """Look up one SMILES in one catalog through ``molbloom.buy``.

        Args:
            smi: SMILES string to look up; it is passed on without canonicalization.
            catalog: name of the catalog to query. Defaults to "zinc-instock".

        Returns:
            True if the SMILES is probably in the catalog, False if it is
            definitely not, or None when the lookup raised an exception (the
            error is printed instead of propagated).
        """
        try:
            # canonicalization off as it's handled by the smiles standardizer
            found = buy(smi, catalog=catalog, canonicalize=False)
        except Exception as e:
            # Best effort: report the failure and let the caller see a missing value.
            print(f"An error occurred: {str(e)} on buying {smi} from {catalog}")
            return None
        return found



[docs]
    def get_flagging_df(self, stdin: List[Union[str, Chem.Mol]]):
        """Return a dataframe with the flagging results for each catalog.

        The inputs are first turned into SMILES: with ``self._standardize`` set
        they go through the SMILES standardizer, otherwise they are converted
        with ``self._output_smi`` via ``ParallelApplier``. Every SMILES is then
        looked up in every catalog with ``self.buy_smi`` (a wrapper of
        ``molbloom.buy``): True means the SMILES is probably in the catalog,
        False means it is definitely not, and NaN marks a lookup that failed.
        For more information, see the original repo:
        https://github.com/whitead/molbloom

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on whether the instance was created with ``from_smi``.

        Returns:
            pd.DataFrame: a leading "SMILES" column followed by one flag column
                per catalog, with one row per input molecule.
        """
        if self._standardize:
            smiles = self._smiles_standardizer(stdin)
        else:
            applier = ParallelApplier(
                self._output_smi,
                stdin,
                n_jobs=self._n_jobs,
                show_progress=False,
                backend="loky",
                custom_desc="Converting molecules to SMILES",
                chunk_size=self._chunk_size,
            )
            smiles = applier()

        # Materialise once: the SMILES are walked for the lookups and reused for
        # the "SMILES" column, which would break on a single-pass iterable.
        smiles = list(smiles)
        catalog_names = list(self._catalogs)

        # Row-major order (all catalogs for the first SMILES, then the next one)
        # so the flat list reshapes into one row per molecule.
        flags = [self.buy_smi(smi, cat) for smi in smiles for cat in catalog_names]

        # An explicit row count keeps the reshape valid for empty inputs too.
        flag_matrix = np.array(flags).reshape((len(smiles), len(catalog_names)))
        df = pd.DataFrame(flag_matrix, columns=catalog_names)
        df.insert(0, "SMILES", smiles)
        # buy_smi returns None when a lookup fails; expose those as NaN.
        return df.replace({None: np.nan})