# Source code for chemFilters.filters.pep_filters

# -*- coding: utf-8 -*-
from itertools import product
from typing import List, Union

import numpy as np
import pandas as pd
from job_tqdflex import ParallelApplier
try:
    from pepsift import PepSift, SiftLevel
except ImportError:
    raise ImportError(
        "pepsift is required to use PeptideFilters. "
        "Install it with: pip install 'chem-filters[allfilters]'"
    )
from rdkit import Chem

from ..chem.interface import MoleculeHandler
from ..logger import logger

# Names of the five PepSift filter levels, listed in the same order as the level
# numbers 1-5 accepted by `PeptideFilters._get_filter`.
STANDARD_PEP_COLS = [
    "NaturalLAminoAcids",
    "NaturalLDAminoAcids",
    "NaturalAminoAcidDerivatives",
    "NonNaturalAminoAcidDerivatives",
    "AllAmineAndAcid",
]



# (Sphinx viewcode "[docs]" link - page-extraction artifact, not part of the module)
class PeptideFilters(MoleculeHandler):
    """Wrapper class for PepSift, a tool for identifying peptides and their derivatives
    from small molecule datasets. For the original repo, see:
    https://github.com/OlivierBeq/PepSift/tree/master

    Arguments:
        filter_type: filter type to initialize a PepSift object. See available
            filters on `self.available_filters`. Defaults to "all".
        from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
        n_jobs: number of jobs to run in parallel. Defaults to 1.
        chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
            Defaults to None.
    """

    # Accepted spellings for every sift level, as (level value, aliases) pairs.
    # Membership is tested with `in`, i.e. by equality, against each alias tuple.
    _LEVEL_ALIASES = (
        (1, ("NaturalLAminoAcids", 1, "naturall")),
        (2, ("NaturalLDAminoAcids", 2, "naturald")),
        (3, ("NaturalAminoAcidDerivatives", 3, "naturalderivative")),
        (4, ("NonNaturalAminoAcidDerivatives", 4, "nonnaturalderivative")),
        (5, ("AllAmineAndAcid", 5, "all")),
    )

    def __init__(
        self,
        filter_type: Union[str, int] = "all",
        from_smi: bool = False,
        n_jobs: int = 1,
        chunk_size: Union[int, None] = None,
    ) -> None:
        """Initialize the PeptideFilters class.

        Args:
            filter_type: filter type to initialize a PepSift object. See available
                filters on `self.available_filters`. Defaults to "all".
            from_smi: treats standard inputs (stdin) as smiles. Defaults to False.
            n_jobs: number of jobs to run in parallel. Defaults to 1.
            chunk_size: size of chunks for ParallelApplier. If None, auto-calculated.
                Defaults to None.

        Raises:
            ValueError: if `filter_type` is not one of the supported aliases.
        """
        self.filter_type = filter_type
        self.filter = self._get_filter(_type=filter_type)
        self._n_jobs = n_jobs
        self._chunk_size = chunk_size
        super().__init__(from_smi=from_smi)

    @property
    def available_filters(self):
        """Mapping of filter names to the `SiftLevel` members available on pepsift."""
        return dict(SiftLevel.__members__)

    def _get_filter(self, _type: Union[str, int] = None):
        """Helper function to get a PepSift object.

        Args:
            _type: a `SiftLevel` member, or one of the names / integers / short
                aliases listed in `_LEVEL_ALIASES`.

        Returns:
            PepSift: a PepSift object initialized with the requested level.

        Raises:
            ValueError: if `_type` does not match any known level.
        """
        if isinstance(_type, SiftLevel):
            return PepSift(_type)
        for level, aliases in self._LEVEL_ALIASES:
            if _type in aliases:
                return PepSift(SiftLevel(level))
        # Report the value that was actually rejected; `self.filter_type` may be a
        # different value when this helper is called outside of `__init__`.
        raise ValueError(
            f"Filter type {_type} not available. "
            f"Try one of:\n{self.available_filters}"
        )

    def _peptide_filter_func(
        self, stdin: Union[str, Chem.Mol], sift_obj: PepSift = None
    ) -> Union[bool, None]:
        """Function to be used in parallelization. Will return True if the molecule
        is a peptide according to the filtering level used to initialize sift_object,
        False otherwise.

        Args:
            stdin: standard input; single SMILES strings or single rdkit.Chem.Mol object
                depending on the value of self._from_smi.
            sift_obj: pepsift object to filter the molecules if `None`, will use
                `self.filter`. Defaults to None.

        Returns:
            True if the molecule is a peptide, False otherwise. None when
            `self._output_mol` yields no molecule for `stdin` or when
            `PepSift.is_peptide` raises (the error is logged).
        """
        mol = self._output_mol(stdin)
        if mol is None:
            return None
        if sift_obj is None:
            assert isinstance(self.filter, PepSift), "self.filter must be a PepSift obj"
            sift_obj = self.filter
        try:
            return sift_obj.is_peptide(mol)
        except Exception as e:
            # Best effort: one failing molecule must not abort a whole batch.
            logger.error(
                "PepSift.is_peptide() error - returning None for sift level "
                f"{sift_obj.level}. Message:\n {e} for stdin {stdin}",
            )
            return None

    def filter_mols(self, stdin: List[Union[str, Chem.Mol]]):
        """Filter molecules using the pepsift filter stored on `self.filter`.

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            The per-molecule results of `_peptide_filter_func` as returned by
            ParallelApplier: True/False indicating whether the molecule is a peptide
            according to `self.filter`, or None for molecules that could not be
            processed.
        """
        applier = ParallelApplier(
            self._peptide_filter_func,
            stdin,
            n_jobs=self._n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc="Filtering peptides with PepSift",
            chunk_size=self._chunk_size,
        )
        return applier()

    def _unpack_peptide_filter(self, args):
        """Wrapper function to unpack starmap arguments for ParallelApplier.

        Args:
            args: a `(molecule, PepSift object)` pair.
        """
        mol, filter_obj = args
        return self._peptide_filter_func(mol, filter_obj)

    def get_flagging_df(self, stdin: List[Union[str, Chem.Mol]]):
        """Will flag the molecules according to all filter types available in pepsift.

        Args:
            stdin: standard input; a list of SMILES strings or rdkit.Chem.Mol objects
                depending on the value of self._from_smi.

        Returns:
            pd.DataFrame: dataframe with a leading "SMILES" column followed by one
                column of flags per filter type; None results are replaced by NaN.
        """
        # `stdin` is iterated twice (filter grid + SMILES conversion). A one-shot
        # iterable such as a generator would be exhausted by the first pass, so
        # materialize anything that is not already a sized container.
        if not hasattr(stdin, "__len__"):
            stdin = list(stdin)

        all_filters = [self._get_filter(idx) for idx in range(1, 6)]
        # Molecule-major ordering: all levels for the first molecule, then all levels
        # for the second, ... which is what the row-wise reshape below relies on.
        all_params = list(product(stdin, all_filters))

        applier = ParallelApplier(
            self._unpack_peptide_filter,
            all_params,
            n_jobs=self._n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc="Testing peptide filters (all levels)",
            chunk_size=self._chunk_size,
        )
        results = applier()

        applier = ParallelApplier(
            self._output_smi,
            stdin,
            n_jobs=self._n_jobs,
            show_progress=False,
            backend="loky",
            custom_desc="Converting to SMILES",
            chunk_size=self._chunk_size,
        )
        smiles = applier()

        df = pd.DataFrame(
            np.array(results).reshape(-1, len(all_filters)),
            columns=self.available_filters.keys(),
        )
        df.insert(0, "SMILES", smiles)
        return df.replace({None: np.nan})