Compound Filtering

This module provides functions for filtering compounds based on precomputed molecular properties (number of heavy atoms, number of rings, and pattern fingerprints).

Example Use

The precompute_properties function computes properties for a large set of compounds and saves them to a JSON file. The example below shows how to use it with a subset of the n1-stock.txt file.

from pathlib import Path
from fragmentretro.utils.filter_compound import precompute_properties

# Directory layout used by this example, resolved relative to this file.
DATA_PATH = Path(__file__).parent.parent / "data"
PAROUTES_PATH = DATA_PATH / "paroutes"
PRECOMPUTE_PATH = DATA_PATH / "precompute"
MOL_PROPERTIES_PATH = PRECOMPUTE_PATH / "n1_stock_properties_subset.json"

# Create the output directory if it doesn't exist
PRECOMPUTE_PATH.mkdir(parents=True, exist_ok=True)

# One SMILES per line; blank lines are skipped so that no empty SMILES
# string is handed to precompute_properties.
with open(PAROUTES_PATH / "n1-stock.txt", encoding="utf-8") as f:
    n1_stock = [stripped for line in f if (stripped := line.strip())]

# Only the first 500 compounds are precomputed to keep the example fast.
n1_stock_subset = n1_stock[:500]
precompute_properties(n1_stock_subset, MOL_PROPERTIES_PATH, fpSize=2048)

The CompoundFilter class is used to initialize a filter, and the get_filtered_BBs method retrieves the screened building blocks. The following example demonstrates filtering using a specific fragment SMILES string.

from fragmentretro.utils.filter_compound import CompoundFilter
from fragmentretro.utils.helpers import replace_dummy_atoms_regex

# Example fragments; the bracketed "*" entries are dummy atoms.
fragment_smiles_list = [
    "[5*]N1CCC[C@@]1([13*])C",
    "[4*]CCN[5*]",
    "[4*]C[8*]",
    "[*]C[*]",
    "[3*]O[3*]",
]
# Use the second fragment as the query.
fragment_smiles = fragment_smiles_list[1]

# The fpSize passed here is the same value that was used for precompute_properties.
compound_filter = CompoundFilter(MOL_PROPERTIES_PATH, fpSize=2048)
# Returns the passing indices together with the matching canonical SMILES.
filtered_indices, filtered_BBs = compound_filter.get_filtered_BBs(fragment_smiles)

Source Code

`fragmentretro.utils.filter_compound`

`CompoundFilter`

A class for filtering compounds based on precomputed molecular properties.

Source code in src/fragmentretro/utils/filter_compound.py

class CompoundFilter:
    """A class for filtering compounds based on precomputed molecular properties.

    The properties (canonical SMILES, heavy-atom count, ring count and the
    on-bits of the pattern fingerprint) are read once from a JSON file and
    mirrored into NumPy arrays, so that each query is answered with
    vectorized comparisons.
    """

    def __init__(self, mol_properties_path: Path, fpSize: int = 2048):
        """
        Initializes the CompoundFilter with molecular properties loaded from a JSON file.

        Args:
            mol_properties_path: Path to the JSON file containing molecular properties.
            fpSize: Number of bits of the pattern fingerprint. It is the width of
                the fingerprint bit arrays built here and is forwarded to
                `get_mol_properties` for every query. The stored on-bit positions
                index into an array of this width, so it should match the value
                used when the JSON file was produced.
        """
        self.mol_properties_path = mol_properties_path
        self.cano_smiles_list: list[str] = []
        self.num_heavy_atoms_list: list[int] = []
        self.num_rings_list: list[int] = []
        self.pfp_len_list: list[int] = []
        self.pfp_list: list[list[int]] = []
        self.fpSize = fpSize

        self._load_mol_properties()
        self._create_numpy_arrays()

    def _load_mol_properties(self) -> None:
        """Loads molecular properties from the JSON file.

        Each record in the file is read through the keys ``cano_smiles``,
        ``num_heavy_atoms``, ``num_rings`` and ``pfp``; the values are stored
        column-wise in the corresponding ``*_list`` attributes.
        """

        logger.debug("[CompoundFilter] Loading mol properties")
        with open(self.mol_properties_path, encoding="utf-8") as f:
            mol_properties_list = json.load(f)

        self.len_BBs = len(mol_properties_list)
        self.cano_smiles_list = [props["cano_smiles"] for props in mol_properties_list]
        self.num_heavy_atoms_list = [props["num_heavy_atoms"] for props in mol_properties_list]
        self.num_rings_list = [props["num_rings"] for props in mol_properties_list]
        self.pfp_len_list = [len(props["pfp"]) for props in mol_properties_list]
        self.pfp_list = [props["pfp"] for props in mol_properties_list]
        logger.debug("[CompoundFilter] Finished loading mol properties")

    def _create_numpy_arrays(self) -> None:
        """Creates NumPy arrays for faster filtering."""
        self.num_heavy_atoms_array = np.array(self.num_heavy_atoms_list)
        self.num_rings_array = np.array(self.num_rings_list)
        self.pfp_len_array = np.array(self.pfp_len_list)

        # One row per compound, one column per fingerprint bit; a cell is True
        # when that bit position is listed in the compound's stored on-bits.
        self.pfp_bit_array = np.zeros((len(self.pfp_list), self.fpSize), dtype=bool)
        for i, pfp in enumerate(self.pfp_list):
            self.pfp_bit_array[i, pfp] = True

    def filter_compounds(self, smiles: str, prefiltered_indices: FilterIndicesType | None = None) -> FilterIndicesType:
        """Filters compounds based on a query SMILES string and prefiltered indices.
        Note that dummy atoms have to be replaced by hydrogen atoms so that we can get
        the minimal format for pattern fingerprint processing.

        A compound passes when its heavy-atom count, ring count and number of
        fingerprint on-bits are all at least those of the query, and every
        on-bit of the query is also set for the compound.

        Args:
            smiles: The query SMILES string.
            prefiltered_indices: A list of prefiltered indices. When given, only
                these indices can appear in the result.

        Returns:
            A list of indices of the compounds that pass the filter. An empty
            list is returned when `get_mol_properties` rejects the query with
            a ValueError.
        """
        no_dummy_smiles = replace_dummy_atoms_regex(smiles)
        try:
            mol_properties = get_mol_properties(no_dummy_smiles, fpSize=self.fpSize)
        except ValueError as e:
            # Reported through the module logger, like every other message here.
            logger.warning(f"[CompoundFilter] Invalid SMILES: {e}")
            return []

        logger.debug(f"[CompoundFilter] Filtering BBs for {no_dummy_smiles} ( {smiles} )")

        num_heavy_atoms = mol_properties["num_heavy_atoms"]
        num_rings = mol_properties["num_rings"]
        pfp = mol_properties["pfp"]
        pfp_len = len(pfp)

        # Boolean mask over bit positions: True where the query has an on-bit.
        query_pfp_bit_array = np.zeros(self.fpSize, dtype=bool)
        query_pfp_bit_array[pfp] = True

        # Cheap scalar comparisons first, to shrink the candidate set.
        indices_array = np.where(
            (self.num_heavy_atoms_array >= num_heavy_atoms)
            & (self.num_rings_array >= num_rings)
            & (self.pfp_len_array >= pfp_len)
        )[0]
        if prefiltered_indices is not None:
            indices_array = np.intersect1d(indices_array, prefiltered_indices)

        # check pfp of query is a subset of pfp of filtered compounds
        if indices_array.size == 0:
            filtered_indices = []
        else:
            # Keep the candidates' rows and the query's on-bit columns; a row
            # passes when all of those columns are True.
            filtered_indices = indices_array[
                np.all(self.pfp_bit_array[indices_array][:, query_pfp_bit_array], axis=1)
            ].tolist()

        if prefiltered_indices is None:
            logger.debug(f"[CompoundFilter] Originally {self.len_BBs} BBs, filtered down to {len(filtered_indices)}")
        else:
            logger.debug(
                f"[CompoundFilter] Originally {len(prefiltered_indices)} BBs, filtered down to {len(filtered_indices)}"
            )

        return filtered_indices

    def get_filtered_BBs(
        self, smiles: str, prefiltered_indices: FilterIndicesType | None = None
    ) -> tuple[FilterIndicesType, BBsType]:
        """Filters building blocks based on a query SMILES string and prefiltered indices.

        This method delegates to `filter_compounds` to get the indices that
        pass the filter, and then collects the canonical SMILES strings stored
        at those indices into a set.

        Args:
            smiles: The query SMILES string.
            prefiltered_indices: A list of prefiltered indices.

        Returns:
            A tuple containing the list of indices of the compounds that pass
            the filter and the set of canonical SMILES strings of the building
            blocks at those indices.
        """
        filtered_indices = self.filter_compounds(smiles, prefiltered_indices)
        return filtered_indices, {self.cano_smiles_list[i] for i in filtered_indices}

`__init__(mol_properties_path, fpSize=2048)`

Initializes the CompoundFilter with molecular properties loaded from a JSON file.

Parameters:

Name	Type	Description	Default
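`mol_properties_path`	`Path`	Path to the JSON file containing molecular properties.	required
`fpSize`	`int`	Number of bits of the pattern fingerprint; used as the width of the fingerprint bit arrays and for the query fingerprint.	`2048`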

Source code in src/fragmentretro/utils/filter_compound.py

def __init__(self, mol_properties_path: Path, fpSize: int = 2048):
    """Set up the filter and load the precomputed properties from disk.

    Args:
        mol_properties_path: Path to the JSON file containing molecular properties.
        fpSize: Number of bits of the pattern fingerprint; stored on the
            instance for the fingerprint bit arrays and the query fingerprint.
    """
    # Configuration supplied by the caller.
    self.mol_properties_path = mol_properties_path
    self.fpSize = fpSize

    # Column-wise property storage; starts empty and is filled by
    # _load_mol_properties below.
    self.cano_smiles_list: list[str] = []
    self.num_heavy_atoms_list: list[int] = []
    self.num_rings_list: list[int] = []
    self.pfp_list: list[list[int]] = []
    self.pfp_len_list: list[int] = []

    # Read the JSON file, then build the NumPy arrays used for filtering.
    self._load_mol_properties()
    self._create_numpy_arrays()

`filter_compounds(smiles, prefiltered_indices=None)`

Filters compounds based on a query SMILES string and prefiltered indices. Note that dummy atoms have to be replaced by hydrogen atoms so that we can get the minimal format for pattern fingerprint processing.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	The query SMILES string.	required
`prefiltered_indices`	`FilterIndicesType \| None`	A list of prefiltered indices.	`None`

Returns:

Type	Description
`FilterIndicesType`	A list of indices of the compounds that pass the filter.

Source code in src/fragmentretro/utils/filter_compound.py

def filter_compounds(self, smiles: str, prefiltered_indices: FilterIndicesType | None = None) -> FilterIndicesType:
    """Filters compounds based on a query SMILES string and prefiltered indices.
    Note that dummy atoms have to be replaced by hydrogen atoms so that we can get
    the minimal format for pattern fingerprint processing.

    A compound passes when its heavy-atom count, ring count and number of
    fingerprint on-bits are all at least those of the query, and every
    on-bit of the query is also set for the compound.

    Args:
        smiles: The query SMILES string.
        prefiltered_indices: A list of prefiltered indices. When given, only
            these indices can appear in the result.

    Returns:
        A list of indices of the compounds that pass the filter. An empty
        list is returned when `get_mol_properties` rejects the query with
        a ValueError.
    """
    no_dummy_smiles = replace_dummy_atoms_regex(smiles)
    try:
        mol_properties = get_mol_properties(no_dummy_smiles, fpSize=self.fpSize)
    except ValueError as e:
        # Reported through the module logger, like every other message here.
        logger.warning(f"[CompoundFilter] Invalid SMILES: {e}")
        return []

    logger.debug(f"[CompoundFilter] Filtering BBs for {no_dummy_smiles} ( {smiles} )")

    num_heavy_atoms = mol_properties["num_heavy_atoms"]
    num_rings = mol_properties["num_rings"]
    pfp = mol_properties["pfp"]
    pfp_len = len(pfp)

    # Boolean mask over bit positions: True where the query has an on-bit.
    query_pfp_bit_array = np.zeros(self.fpSize, dtype=bool)
    query_pfp_bit_array[pfp] = True

    # Cheap scalar comparisons first, to shrink the candidate set.
    indices_array = np.where(
        (self.num_heavy_atoms_array >= num_heavy_atoms)
        & (self.num_rings_array >= num_rings)
        & (self.pfp_len_array >= pfp_len)
    )[0]
    if prefiltered_indices is not None:
        indices_array = np.intersect1d(indices_array, prefiltered_indices)

    # check pfp of query is a subset of pfp of filtered compounds
    if indices_array.size == 0:
        filtered_indices = []
    else:
        # Keep the candidates' rows and the query's on-bit columns; a row
        # passes when all of those columns are True.
        filtered_indices = indices_array[
            np.all(self.pfp_bit_array[indices_array][:, query_pfp_bit_array], axis=1)
        ].tolist()

    if prefiltered_indices is None:
        logger.debug(f"[CompoundFilter] Originally {self.len_BBs} BBs, filtered down to {len(filtered_indices)}")
    else:
        logger.debug(
            f"[CompoundFilter] Originally {len(prefiltered_indices)} BBs, filtered down to {len(filtered_indices)}"
        )

    return filtered_indices

`get_filtered_BBs(smiles, prefiltered_indices=None)`

Filters building blocks based on a query SMILES string and prefiltered indices.

This method filters the building blocks based on the properties of the provided SMILES string and a list of prefiltered indices. It uses the filter_compounds method to get a list of indices that pass the filter, and then returns a set of the corresponding canonical SMILES strings.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	The query SMILES string.	required
`prefiltered_indices`	`FilterIndicesType \| None`	A list of prefiltered indices.	`None`

Returns:

Type	Description
`tuple[FilterIndicesType, BBsType]`	A tuple containing a list of indices of the compounds that pass the filter and a set of canonical SMILES strings of the building blocks that pass the filter.

Source code in src/fragmentretro/utils/filter_compound.py

def get_filtered_BBs(
    self, smiles: str, prefiltered_indices: FilterIndicesType | None = None
) -> tuple[FilterIndicesType, BBsType]:
    """Return the passing indices together with their canonical SMILES.

    The actual screening is delegated to `filter_compounds`; this method
    then looks up the canonical SMILES stored at each passing index and
    collects them into a set.

    Args:
        smiles: The query SMILES string.
        prefiltered_indices: A list of prefiltered indices.

    Returns:
        A tuple of (indices of the compounds that pass the filter, set of
        canonical SMILES strings of the building blocks at those indices).
    """
    passing = self.filter_compounds(smiles, prefiltered_indices)
    # A set is returned, so repeated canonical SMILES collapse into one entry.
    matched_smiles = {self.cano_smiles_list[idx] for idx in passing}
    return passing, matched_smiles

`get_mol_properties(smiles, fpSize=2048)`

Given a SMILES string, returns a dictionary containing molecular properties.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	The SMILES string.	required
`fpSize`	`int`	Number of bits of the pattern fingerprint.	`2048`

Returns:

Type	Description
`MolProperties`	A dictionary containing the following keys: - 'cano_smiles': Canonical SMILES string - 'num_heavy_atoms': Number of heavy atoms - 'num_rings': Number of rings - 'pfp': Pattern fingerprint bits

Raises:

Type	Description
`ValueError`	If the SMILES string is invalid and cannot be converted to an RDKit molecule.

Source code in src/fragmentretro/utils/filter_compound.py

def get_mol_properties(smiles: str, fpSize: int = 2048) -> MolProperties:
    """Given a SMILES string, returns a dictionary containing molecular properties.

    Args:
        smiles: The SMILES string. It must not contain the dummy atom '*'.
        fpSize: Number of bits of the pattern fingerprint, passed on to
            `Chem.rdmolops.PatternFingerprint`.

    Returns:
        A dictionary containing the following keys:
            - 'cano_smiles': Canonical SMILES string
            - 'num_heavy_atoms': Number of heavy atoms
            - 'num_rings': Number of rings
            - 'pfp': Pattern fingerprint bits (positions of the on-bits)

    Raises:
        ValueError: If the SMILES string contains a dummy atom, or is invalid
            and cannot be converted to an RDKit molecule.
    """
    if "*" in smiles:
        logger.error(f"Invalid smiles string: {smiles}, contains dummy atom '*' during get_mol_properties")
        raise ValueError(f"Invalid smiles string: {smiles}, contains dummy atom '*'")

    cano_smiles = canonicalize_smiles(smiles)
    mol = Chem.MolFromSmiles(cano_smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; raise the
        # documented ValueError instead of failing later with AttributeError.
        logger.error(f"Invalid smiles string: {smiles}, cannot be parsed during get_mol_properties")
        raise ValueError(f"Invalid smiles string: {smiles}, cannot be converted to an RDKit molecule")

    # NOTE(review): the original comment reads "solve C++ signature problems?";
    # presumably these two calls make sure the property cache and ring info
    # are populated before the fingerprint/descriptor calls -- confirm.
    mol.UpdatePropertyCache()
    Chem.GetSymmSSSR(mol)

    pfp = list(Chem.rdmolops.PatternFingerprint(mol, fpSize=fpSize).GetOnBits())

    return {
        "cano_smiles": cano_smiles,
        "num_heavy_atoms": mol.GetNumHeavyAtoms(),
        "num_rings": rdMolDescriptors.CalcNumRings(mol),
        "pfp": pfp,
    }

`precompute_properties(smiles_list, output_path, fpSize=2048)`

Calculates molecular properties for a list of SMILES strings and saves them to a JSON file.

Parameters:

Name	Type	Description	Default
`smiles_list`	`list[str]`	A list of SMILES strings.	required
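`output_path`	`Path`	The path to the output JSON file.	required
`fpSize`	`int`	Number of bits of the pattern fingerprint, forwarded to `get_mol_properties`.	`2048`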

Source code in src/fragmentretro/utils/filter_compound.py

def precompute_properties(smiles_list: list[str], output_path: Path, fpSize: int = 2048) -> None:
    """Compute the properties of every SMILES string and dump them to a JSON file.

    SMILES strings for which `get_mol_properties` raises ValueError are
    logged and left out of the output.

    Args:
        smiles_list: A list of SMILES strings.
        output_path: The path to the output JSON file.
        fpSize: Number of bits of the pattern fingerprint, forwarded to
            `get_mol_properties`.
    """
    computed = []
    progress = tqdm(smiles_list, desc="Precomputing molecular properties")
    for smiles in progress:
        try:
            props = get_mol_properties(smiles, fpSize=fpSize)
        except ValueError as e:
            # Skip this entry and carry on with the remaining SMILES.
            logger.error(f"Error processing SMILES '{smiles}': {e} during precompute_properties")
        else:
            computed.append(props)

    # Write all successfully computed records in one go.
    with open(output_path, "w") as out_file:
        json.dump(computed, out_file, indent=4)

Compound Filtering

Example Use

Source Code

fragmentretro.utils.filter_compound

CompoundFilter

__init__(mol_properties_path, fpSize=2048)

filter_compounds(smiles, prefiltered_indices=None)

get_filtered_BBs(smiles, prefiltered_indices=None)

get_mol_properties(smiles, fpSize=2048)

precompute_properties(smiles_list, output_path, fpSize=2048)

`fragmentretro.utils.filter_compound`

`CompoundFilter`

`__init__(mol_properties_path, fpSize=2048)`

`filter_compounds(smiles, prefiltered_indices=None)`

`get_filtered_BBs(smiles, prefiltered_indices=None)`

`get_mol_properties(smiles, fpSize=2048)`

`precompute_properties(smiles_list, output_path, fpSize=2048)`