Skip to content

Compound Filtering

This module provides functions for filtering compounds based on precomputed molecular properties (number of heavy atoms, number of rings, and pattern fingerprints).

Example Use

The precompute_properties function computes properties for a large set of compounds and saves them to a JSON file. The example below shows how to use it with a subset of the n1-stock.txt file.

from pathlib import Path
from fragmentretro.utils.filter_compound import precompute_properties

# Every location below is resolved relative to the directory two levels
# above this file.
DATA_PATH = Path(__file__).parent.parent / "data"
PAROUTES_PATH = DATA_PATH / "paroutes"
PRECOMPUTE_PATH = DATA_PATH / "precompute"
MOL_PROPERTIES_PATH = PRECOMPUTE_PATH / "n1_stock_properties_subset.json"

# The output directory has to exist before the JSON file is written into it.
PRECOMPUTE_PATH.mkdir(parents=True, exist_ok=True)

# Read the stock file line by line, dropping surrounding whitespace.
n1_stock = []
with open(PAROUTES_PATH / "n1-stock.txt") as f:
    for line in f:
        n1_stock.append(line.strip())

# Only the first 500 entries are precomputed in this example.
n1_stock_subset = n1_stock[:500]
precompute_properties(n1_stock_subset, MOL_PROPERTIES_PATH, fpSize=2048)

The CompoundFilter class is used to initialize a filter, and the get_filtered_BBs method retrieves the screened building blocks. The following example demonstrates filtering using a specific fragment SMILES string.

from fragmentretro.utils.filter_compound import CompoundFilter
from fragmentretro.utils.helpers import replace_dummy_atoms_regex

# Example fragments; the "*" entries are dummy atoms.
fragment_smiles_list = [
    "[5*]N1CCC[C@@]1([13*])C",
    "[4*]CCN[5*]",
    "[4*]C[8*]",
    "[*]C[*]",
    "[3*]O[3*]",
]
# Use the second fragment as the query.
fragment_smiles = fragment_smiles_list[1]

# The fingerprint size must match the one used when the properties were precomputed.
compound_filter = CompoundFilter(MOL_PROPERTIES_PATH, fpSize=2048)
filtered_indices, filtered_BBs = compound_filter.get_filtered_BBs(fragment_smiles)

Source Code

fragmentretro.utils.filter_compound

CompoundFilter

A class for filtering compounds based on precomputed molecular properties.

Source code in src/fragmentretro/utils/filter_compound.py
class CompoundFilter:
    """A class for filtering compounds based on precomputed molecular properties.

    The properties (canonical SMILES, heavy-atom count, ring count and the
    on-bits of the pattern fingerprint) are read once from a JSON file and
    mirrored into NumPy arrays, so each query is answered with vectorised
    comparisons.
    """

    def __init__(self, mol_properties_path: Path, fpSize: int = 2048):
        """
        Initializes the CompoundFilter with molecular properties loaded from a JSON file.

        Args:
            mol_properties_path: Path to the JSON file containing molecular properties.
            fpSize: Width of the pattern-fingerprint bit arrays. The same value
                is used for the stored compounds and for every query, so each
                on-bit index in the JSON file must be a valid index into an
                array of this width.
        """
        self.mol_properties_path = mol_properties_path
        self.fpSize = fpSize

        # Per-compound property columns; filled by _load_mol_properties().
        self.cano_smiles_list: list[str] = []
        self.num_heavy_atoms_list: list[int] = []
        self.num_rings_list: list[int] = []
        self.pfp_len_list: list[int] = []
        self.pfp_list: list[list[int]] = []

        self._load_mol_properties()
        self._create_numpy_arrays()

    def _load_mol_properties(self) -> None:
        """Loads molecular properties from the JSON file.

        The file is expected to hold a list of objects with the keys
        'cano_smiles', 'num_heavy_atoms', 'num_rings' and 'pfp'.
        """

        logger.debug("[CompoundFilter] Loading mol properties")
        # JSON is read as UTF-8 explicitly instead of relying on the locale default.
        with open(self.mol_properties_path, encoding="utf-8") as f:
            mol_properties_list = json.load(f)

        self.len_BBs = len(mol_properties_list)
        self.cano_smiles_list = [props["cano_smiles"] for props in mol_properties_list]
        self.num_heavy_atoms_list = [props["num_heavy_atoms"] for props in mol_properties_list]
        self.num_rings_list = [props["num_rings"] for props in mol_properties_list]
        self.pfp_len_list = [len(props["pfp"]) for props in mol_properties_list]
        self.pfp_list = [props["pfp"] for props in mol_properties_list]
        logger.debug("[CompoundFilter] Finished loading mol properties")

    def _create_numpy_arrays(self) -> None:
        """Creates NumPy arrays for faster filtering."""
        self.num_heavy_atoms_array = np.array(self.num_heavy_atoms_list)
        self.num_rings_array = np.array(self.num_rings_list)
        self.pfp_len_array = np.array(self.pfp_len_list)

        # One boolean row per compound; column j is True when bit j is set
        # in that compound's pattern fingerprint.
        self.pfp_bit_array = np.zeros((len(self.pfp_list), self.fpSize), dtype=bool)
        for i, pfp in enumerate(self.pfp_list):
            self.pfp_bit_array[i, pfp] = True

    def filter_compounds(self, smiles: str, prefiltered_indices: FilterIndicesType | None = None) -> FilterIndicesType:
        """Filters compounds based on a query SMILES string and prefiltered indices.
        Note that dummy atoms have to be replaced by hydrogen atoms so that we can get
        the minimal format for pattern fingerprint processing.

        A compound passes when it has at least as many heavy atoms, rings and
        fingerprint on-bits as the query, and every on-bit of the query is
        also set in the compound's fingerprint.

        Args:
            smiles: The query SMILES string.
            prefiltered_indices: A list of prefiltered indices. When given,
                only these compounds are considered.

        Returns:
            A list of indices of the compounds that pass the filter. An empty
            list is returned when the query SMILES is rejected as invalid.
        """
        no_dummy_smiles = replace_dummy_atoms_regex(smiles)
        try:
            mol_properties = get_mol_properties(no_dummy_smiles, fpSize=self.fpSize)
        except ValueError as e:
            # Report through the module logger, like the rest of this module,
            # instead of writing to stdout.
            logger.error(f"[CompoundFilter] Invalid SMILES: {e}")
            return []

        logger.debug(f"[CompoundFilter] Filtering BBs for {no_dummy_smiles} ( {smiles} )")

        num_heavy_atoms = mol_properties["num_heavy_atoms"]
        num_rings = mol_properties["num_rings"]
        pfp = mol_properties["pfp"]
        pfp_len = len(pfp)

        # Boolean mask over fingerprint positions that are set in the query.
        query_pfp_bit_array = np.zeros(self.fpSize, dtype=bool)
        query_pfp_bit_array[pfp] = True

        # Cheap scalar comparisons first, to shrink the candidate set before
        # the per-bit subset check.
        indices_array = np.where(
            (self.num_heavy_atoms_array >= num_heavy_atoms)
            & (self.num_rings_array >= num_rings)
            & (self.pfp_len_array >= pfp_len)
        )[0]
        if prefiltered_indices is not None:
            indices_array = np.intersect1d(indices_array, prefiltered_indices)

        # check pfp of query is a subset of pfp of filtered compounds
        if indices_array.size == 0:
            filtered_indices = []
        else:
            filtered_indices = indices_array[
                np.all(self.pfp_bit_array[indices_array][:, query_pfp_bit_array], axis=1)
            ].tolist()

        if prefiltered_indices is None:
            logger.debug(f"[CompoundFilter] Originally {self.len_BBs} BBs, filtered down to {len(filtered_indices)}")
        else:
            logger.debug(
                f"[CompoundFilter] Originally {len(prefiltered_indices)} BBs, filtered down to {len(filtered_indices)}"
            )

        return filtered_indices

    def get_filtered_BBs(
        self, smiles: str, prefiltered_indices: FilterIndicesType | None = None
    ) -> tuple[FilterIndicesType, BBsType]:
        """Filters building blocks based on a query SMILES string and prefiltered indices.

        This method filters the building blocks based on the properties
        of the provided SMILES string and a list of prefiltered indices.
        It uses the `filter_compounds` method to get a list of indices
        that pass the filter, and then returns a set of the corresponding
        canonical SMILES strings.

        Args:
            smiles: The query SMILES string.
            prefiltered_indices: A list of prefiltered indices.

        Returns:
            A tuple containing a list of indices of the compounds that pass
            the filter and a set of canonical SMILES strings of the building
            blocks that pass the filter.
        """
        filtered_indices = self.filter_compounds(smiles, prefiltered_indices)
        return filtered_indices, {self.cano_smiles_list[i] for i in filtered_indices}

__init__(mol_properties_path, fpSize=2048)

Initializes the CompoundFilter with molecular properties loaded from a JSON file.

Parameters:

Name Type Description Default
mol_properties_path Path

Path to the JSON file containing molecular properties.

required
Source code in src/fragmentretro/utils/filter_compound.py
def __init__(self, mol_properties_path: Path, fpSize: int = 2048):
    """
    Sets up the filter and loads the molecular properties from a JSON file.

    Args:
        mol_properties_path: Path to the JSON file containing molecular properties.
        fpSize: Fingerprint size stored on the instance as ``self.fpSize``.
    """
    # Configuration supplied by the caller.
    self.fpSize = fpSize
    self.mol_properties_path = mol_properties_path

    # Per-compound property columns, initialised empty here.
    self.pfp_list: list[list[int]] = []
    self.pfp_len_list: list[int] = []
    self.num_rings_list: list[int] = []
    self.num_heavy_atoms_list: list[int] = []
    self.cano_smiles_list: list[str] = []

    # Load the properties first, then build the NumPy arrays.
    self._load_mol_properties()
    self._create_numpy_arrays()

filter_compounds(smiles, prefiltered_indices=None)

Filters compounds based on a query SMILES string and prefiltered indices. Note that dummy atoms have to be replaced by hydrogen atoms so that we can get the minimal format for pattern fingerprint processing.

Parameters:

Name Type Description Default
smiles str

The query SMILES string.

required
prefiltered_indices FilterIndicesType | None

A list of prefiltered indices.

None

Returns:

Type Description
FilterIndicesType

A list of indices of the compounds that pass the filter.

Source code in src/fragmentretro/utils/filter_compound.py
def filter_compounds(self, smiles: str, prefiltered_indices: FilterIndicesType | None = None) -> FilterIndicesType:
    """Filters compounds based on a query SMILES string and prefiltered indices.
    Note that dummy atoms have to be replaced by hydrogen atoms so that we can get
    the minimal format for pattern fingerprint processing.

    A compound passes when it has at least as many heavy atoms, rings and
    fingerprint on-bits as the query, and every on-bit of the query is
    also set in the compound's fingerprint.

    Args:
        smiles: The query SMILES string.
        prefiltered_indices: A list of prefiltered indices. When given,
            only these compounds are considered.

    Returns:
        A list of indices of the compounds that pass the filter. An empty
        list is returned when the query SMILES is rejected as invalid.
    """
    no_dummy_smiles = replace_dummy_atoms_regex(smiles)
    try:
        mol_properties = get_mol_properties(no_dummy_smiles, fpSize=self.fpSize)
    except ValueError as e:
        # Report through the module logger, like the rest of this module,
        # instead of writing to stdout.
        logger.error(f"[CompoundFilter] Invalid SMILES: {e}")
        return []

    logger.debug(f"[CompoundFilter] Filtering BBs for {no_dummy_smiles} ( {smiles} )")

    num_heavy_atoms = mol_properties["num_heavy_atoms"]
    num_rings = mol_properties["num_rings"]
    pfp = mol_properties["pfp"]
    pfp_len = len(pfp)

    # Boolean mask over fingerprint positions that are set in the query.
    query_pfp_bit_array = np.zeros(self.fpSize, dtype=bool)
    query_pfp_bit_array[pfp] = True

    # Cheap scalar comparisons first, to shrink the candidate set before
    # the per-bit subset check.
    indices_array = np.where(
        (self.num_heavy_atoms_array >= num_heavy_atoms)
        & (self.num_rings_array >= num_rings)
        & (self.pfp_len_array >= pfp_len)
    )[0]
    if prefiltered_indices is not None:
        indices_array = np.intersect1d(indices_array, prefiltered_indices)

    # check pfp of query is a subset of pfp of filtered compounds
    if indices_array.size == 0:
        filtered_indices = []
    else:
        filtered_indices = indices_array[
            np.all(self.pfp_bit_array[indices_array][:, query_pfp_bit_array], axis=1)
        ].tolist()

    if prefiltered_indices is None:
        logger.debug(f"[CompoundFilter] Originally {self.len_BBs} BBs, filtered down to {len(filtered_indices)}")
    else:
        logger.debug(
            f"[CompoundFilter] Originally {len(prefiltered_indices)} BBs, filtered down to {len(filtered_indices)}"
        )

    return filtered_indices

get_filtered_BBs(smiles, prefiltered_indices=None)

Filters building blocks based on a query SMILES string and prefiltered indices.

This method filters the building blocks based on the properties of the provided SMILES string and a list of prefiltered indices. It uses the filter_compounds method to get a list of indices that pass the filter, and then returns a set of the corresponding canonical SMILES strings.

Parameters:

Name Type Description Default
smiles str

The query SMILES string.

required
prefiltered_indices FilterIndicesType | None

A list of prefiltered indices.

None

Returns:

Type Description
tuple[FilterIndicesType, BBsType]

A tuple containing a list of indices of the compounds that pass the filter and a set of canonical SMILES strings of the building blocks that pass the filter.

Source code in src/fragmentretro/utils/filter_compound.py
def get_filtered_BBs(
    self, smiles: str, prefiltered_indices: FilterIndicesType | None = None
) -> tuple[FilterIndicesType, BBsType]:
    """Returns the indices and canonical SMILES of building blocks matching a query.

    The work is delegated to `filter_compounds`, which yields the indices
    of the compounds that pass the filter for the given SMILES string
    (optionally restricted to a list of prefiltered indices). Those
    indices are then looked up in the stored canonical SMILES list and
    collected into a set.

    Args:
        smiles: The query SMILES string.
        prefiltered_indices: A list of prefiltered indices.

    Returns:
        A tuple of (indices of the compounds that pass the filter, set of
        canonical SMILES strings of the building blocks that pass the filter).
    """
    passing_indices = self.filter_compounds(smiles, prefiltered_indices)
    # A set is used, so duplicate canonical SMILES are collapsed.
    passing_smiles = {self.cano_smiles_list[idx] for idx in passing_indices}
    return passing_indices, passing_smiles

get_mol_properties(smiles, fpSize=2048)

Given a SMILES string, returns a dictionary containing molecular properties.

Parameters:

Name Type Description Default
smiles str

The SMILES string.

required

Returns:

Type Description
MolProperties

A dictionary containing the following keys: - 'cano_smiles': Canonical SMILES string - 'num_heavy_atoms': Number of heavy atoms - 'num_rings': Number of rings - 'pfp': Pattern fingerprint bits

Raises:

Type Description
ValueError

If the SMILES string is invalid and cannot be converted to an RDKit molecule.

Source code in src/fragmentretro/utils/filter_compound.py
def get_mol_properties(smiles: str, fpSize: int = 2048) -> MolProperties:
    """Given a SMILES string, returns a dictionary containing molecular properties.

    Args:
        smiles: The SMILES string. It must not contain the dummy atom '*'.
        fpSize: Size of the pattern fingerprint, passed through to
            ``PatternFingerprint``.

    Returns:
        A dictionary containing the following keys:
            - 'cano_smiles': Canonical SMILES string
            - 'num_heavy_atoms': Number of heavy atoms
            - 'num_rings': Number of rings
            - 'pfp': Pattern fingerprint bits (indices of the on-bits)

    Raises:
        ValueError: If the SMILES string contains a dummy atom, or is invalid
            and cannot be converted to an RDKit molecule.
    """
    if "*" in smiles:
        logger.error(f"Invalid smiles string: {smiles}, contains dummy atom '*' during get_mol_properties")
        raise ValueError(f"Invalid smiles string: {smiles}, contains dummy atom '*'")

    cano_smiles = canonicalize_smiles(smiles)
    mol = Chem.MolFromSmiles(cano_smiles)
    if mol is None:
        # RDKit reports a parse failure by returning None instead of raising.
        # Convert that into the documented ValueError so callers catching
        # ValueError are not hit by an AttributeError on the next line.
        logger.error(f"Invalid smiles string: {smiles}, cannot be parsed during get_mol_properties")
        raise ValueError(f"Invalid smiles string: {smiles}, cannot be converted to an RDKit molecule")

    # solve C++ signature problems?
    # NOTE(review): presumably these two calls make sure valence and ring
    # information are initialised before the descriptor calls below - confirm.
    mol.UpdatePropertyCache()
    Chem.GetSymmSSSR(mol)

    pfp = list(Chem.rdmolops.PatternFingerprint(mol, fpSize=fpSize).GetOnBits())

    return {
        "cano_smiles": cano_smiles,
        "num_heavy_atoms": mol.GetNumHeavyAtoms(),
        "num_rings": rdMolDescriptors.CalcNumRings(mol),
        "pfp": pfp,
    }

precompute_properties(smiles_list, output_path, fpSize=2048)

Calculates molecular properties for a list of SMILES strings and saves them to a JSON file.

Parameters:

Name Type Description Default
smiles_list list[str]

A list of SMILES strings.

required
output_path Path

The path to the output JSON file.

required
Source code in src/fragmentretro/utils/filter_compound.py
def precompute_properties(smiles_list: list[str], output_path: Path, fpSize: int = 2048) -> None:
    """Computes molecular properties for each SMILES string and writes them to a JSON file.

    SMILES strings for which `get_mol_properties` raises ValueError are
    logged and left out of the output.

    Args:
        smiles_list: A list of SMILES strings.
        output_path: The path to the output JSON file.
        fpSize: Fingerprint size forwarded to `get_mol_properties`.
    """
    computed = []
    progress = tqdm(smiles_list, desc="Precomputing molecular properties")
    for smiles in progress:
        try:
            entry = get_mol_properties(smiles, fpSize=fpSize)
        except ValueError as e:
            # A bad entry is reported and skipped; the loop carries on.
            logger.error(f"Error processing SMILES '{smiles}': {e} during precompute_properties")
        else:
            computed.append(entry)

    with open(output_path, "w") as f:
        json.dump(computed, f, indent=4)