# -*- coding: utf-8 -*-
"""Wrapper class for molbloom. Original repo: https://github.com/whitead/molbloom"""
from itertools import product
from pathlib import Path
from typing import List, Union
import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier
try:
from molbloom import _DEFAULT_PATH, _load_filter, buy, catalogs
except ImportError:
raise ImportError(
"molbloom is required to use MolbloomFilters. "
"Install it with: pip install 'chem-filters[allfilters]'"
)
from rdkit import Chem
from ..chem.interface import MoleculeHandler
from ..chem.standardizers import ChemStandardizer
# Names of the molbloom catalogs treated as the "standard" bloom columns.
# NOTE(review): not referenced in the visible part of this module; presumably
# consumed by callers that select/order the flagging columns -- confirm.
STANDARD_BLOOM_COLS = ["zinc20", "zinc-instock", "zinc-instock-mini", "surechembl"]
[docs]
class MolbloomFilters(MoleculeHandler):
    """Wrapper class for molbloom. Requires molbloom to be installed.

    Original repo: https://github.com/whitead/molbloom

    Arguments:
        from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
        standardize: whether to standardize `stdin` or not. Defaults to False.
        std_method: SMILES/mol standardization method. Available: `canon`, `chembl`,
            `papyrus`. Defaults to "chembl".
        n_jobs: number of jobs to run in parallel. Defaults to 1.
        chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
            Defaults to None.
    """

    def __init__(
        self,
        from_smi: bool = False,
        standardize: bool = False,
        std_method: str = "chembl",
        n_jobs: int = 1,
        chunk_size: Union[int, None] = None,
        **kwargs,
    ):
        """Initialize the MolbloomFilters class.

        Args:
            from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
            standardize: whether to standardize `stdin` or not. Defaults to False.
            std_method: SMILES/mol standardization method. Available: `canon`,
                `chembl`, `papyrus`. Case-insensitive. Defaults to "chembl".
            n_jobs: number of jobs to run in parallel. Defaults to 1.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.
            **kwargs: extra keyword arguments forwarded to ChemStandardizer.
        """
        self._catalogs = catalogs()
        # Make sure every catalog's filter is available before any lookup is done.
        self._ensure_filters_downloaded()
        self._standardize = standardize
        # Normalize the method name once so the standardizer and the stored
        # attribute always agree, whatever casing the caller used.
        self._std_method = std_method.lower()
        self._smiles_standardizer = self._get_standardizer(
            self._std_method, from_smi, **kwargs
        )
        self._kwargs = kwargs
        self._n_jobs = n_jobs
        self._chunk_size = chunk_size
        super().__init__(from_smi)

    def _get_standardizer(self, std_method: str, from_smi: bool, **kwargs):
        """Build the ChemStandardizer used when `standardize` is enabled."""
        return ChemStandardizer(method=std_method, from_smi=from_smi, **kwargs)

    def get_catalogs(self):
        """Available catalogs in molbloom (the value returned by `catalogs()`)."""
        return self._catalogs

    def _ensure_filters_downloaded(self):
        """Ensures that the molbloom filters are downloaded.

        For each catalog, checks for `<catalog>.bloom` under molbloom's default
        path and asks molbloom to load the filter when that file is missing.
        """
        for catalog in self._catalogs:
            filter_path = Path(_DEFAULT_PATH) / f"{catalog}.bloom"
            if not filter_path.exists():
                print(f"Starting {catalog} download...")
                _load_filter(catalog)

    def buy_smi(self, smi: str, catalog: str = "zinc-instock") -> Union[bool, None]:
        """Wrapper of molbloom.buy.

        Args:
            smi: SMILES string to look up.
            catalog: name of the molbloom catalog to query. Defaults to
                "zinc-instock".

        Returns:
            True if the SMILES is probably in the catalog, False if it is
            definitely not, and None if the lookup raised an error.
        """
        try:
            # canonicalization off as it's handled by the smiles standardizer
            return buy(smi, catalog=catalog, canonicalize=False)
        except Exception as e:
            # Deliberate best-effort: one bad entry must not abort the whole
            # batch, so the failure is reported and recorded as a missing value.
            print(f"An error occurred: {str(e)} on buying {smi} from {catalog}")
            return None

    def get_flagging_df(self, stdin: List[Union[str, Chem.Mol]]) -> pd.DataFrame:
        """Returns a dataframe with the flagging results for each catalog.

        Flags will be the results from molbloom.buy, where True means the SMILES
        is probably in the catalog, False means it is definitely not. Lookups
        that raised an error are reported as NaN. For more information, see the
        original repo: https://github.com/whitead/molbloom

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            pd.DataFrame: dataframe with a leading "SMILES" column followed by one
            flag column per catalog.
        """
        if self._standardize:
            smiles = self._smiles_standardizer(stdin)
        else:
            # No standardization requested: only convert the inputs to SMILES.
            # `_output_smi` is not defined in this class; presumably inherited
            # from MoleculeHandler.
            applier = ParallelApplier(
                self._output_smi,
                stdin,
                n_jobs=self._n_jobs,
                show_progress=False,
                backend="loky",
                custom_desc="Converting molecules to SMILES",
                chunk_size=self._chunk_size,
            )
            smiles = applier()

        # Query every (SMILES, catalog) pair; `product` varies the catalog
        # fastest, so the flat result list is row-major (one row per SMILES).
        results = [
            self.buy_smi(smi, cat) for smi, cat in product(smiles, self._catalogs)
        ]
        num_catalogs = len(self._catalogs)
        df = pd.DataFrame(
            np.array(results).reshape((-1, num_catalogs)),
            columns=list(self._catalogs.keys()),
        )
        df.insert(0, "SMILES", smiles)
        # Failed lookups come back as None; expose them as NaN.
        return df.replace({None: np.nan})