Skip to content

Helper Functions

This module provides helper functions for working with SMILES strings.

Example Use

canonicalize_smiles is for canonicalizing SMILES strings.

# Example: canonicalize a SMILES string and print the input next to the result.
from fragmentretro.utils.helpers import canonicalize_smiles

smiles = 'C[C@@H](O)C(=O)O'
canonical_smiles = canonicalize_smiles(smiles)
print(f"Original SMILES: {smiles}")
print(f"Canonical SMILES: {canonical_smiles}")

replace_dummy_atoms_regex is for replacing dummy atoms with hydrogen atoms for pattern fingerprint screening. See how it's used in the filter_compounds function in the CompoundFilter class.

# Example: swap the indexed dummy atoms ([5*], [13*]) for hydrogens and print
# the fragment before and after. The returned string is canonicalized.
from fragmentretro.utils.helpers import replace_dummy_atoms_regex

smiles_with_dummy = '[5*]N1CCC[C@@]1([13*])C'
smiles_without_dummy = replace_dummy_atoms_regex(smiles_with_dummy)
print(f"SMILES with dummy atoms: {smiles_with_dummy}")
print(f"SMILES without dummy atoms: {smiles_without_dummy}")

remove_indices_before_dummy is for removing the indices before dummy atoms, so that processed fragment SMILES strings are recorded in their most general format. See how it's used in the Retrosynthesis class as well.

# Example: strip the numeric indices from the dummy atoms ([5*] -> [*],
# [13*] -> [*]) and print the fragment before and after.
from fragmentretro.utils.helpers import remove_indices_before_dummy

smiles_with_indices = '[5*]N1CCC[C@@]1([13*])C'
smiles_without_indices = remove_indices_before_dummy(smiles_with_indices)
print(f"SMILES with indices: {smiles_with_indices}")
print(f"SMILES without indices: {smiles_without_indices}")

Source Code

fragmentretro.utils.helpers

canonicalize_smiles(smiles)

Canonicalizes a SMILES string using RDKit.

Parameters:

Name Type Description Default
smiles str

The SMILES string to canonicalize.

required

Returns:

Type Description
str

The canonicalized SMILES string.

Raises:

Type Description
ValueError

If the SMILES string cannot be parsed by RDKit.

Source code in src/fragmentretro/utils/helpers.py
def canonicalize_smiles(smiles: str) -> str:
    """Return the canonical SMILES that RDKit generates for ``smiles``.

    The input is parsed into an RDKit molecule and written back out with
    ``Chem.MolToSmiles``.

    Args:
        smiles: SMILES string to normalize.

    Returns:
        The SMILES string RDKit emits for the parsed molecule.

    Raises:
        ValueError: If RDKit returns no molecule for ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return cast(str, Chem.MolToSmiles(parsed))
    # MolFromSmiles signals a parse failure by returning None, not by raising.
    raise ValueError(f"Failed to parse SMILES: {smiles}")

count_heavy_atoms(smiles)

Counts the number of heavy atoms in a SMILES string.

Parameters:

Name Type Description Default
smiles str

The SMILES string representing the chemical structure.

required

Returns:

Type Description
int

The number of heavy atoms in the molecule.

Raises:

Type Description
ValueError

If the SMILES string is invalid and cannot be converted to an RDKit molecule.

Source code in src/fragmentretro/utils/helpers.py
def count_heavy_atoms(smiles: str) -> int:
    """Return how many heavy atoms the molecule described by ``smiles`` has.

    Args:
        smiles: SMILES string describing the chemical structure.

    Returns:
        The heavy-atom count reported by RDKit for the parsed molecule.

    Raises:
        ValueError: If RDKit cannot build a molecule from ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return cast(int, parsed.GetNumHeavyAtoms())
    # MolFromSmiles signals a parse failure by returning None, not by raising.
    raise ValueError(f"Invalid SMILES string: {smiles}")

sort_by_heavy_atoms(smiles_list)

Sorts a list of SMILES strings by the number of heavy atoms.

The sorting is done in ascending order, i.e., the molecule with the fewest heavy atoms will come first.

Parameters:

Name Type Description Default
smiles_list list[str]

The list of SMILES strings to be sorted.

required

Returns:

Type Description
list[str]

The sorted list of SMILES strings.

Source code in src/fragmentretro/utils/helpers.py
def sort_by_heavy_atoms(smiles_list: list[str]) -> list[str]:
    """Return a copy of ``smiles_list`` ordered by ascending heavy-atom count.

    The molecule with the fewest heavy atoms comes first. The input list is
    left untouched; a new list is returned.

    Args:
        smiles_list: SMILES strings to order.

    Returns:
        A new list holding the same SMILES strings, smallest molecule first.
    """
    # Sort a shallow copy so the caller's list is never mutated.
    ordered = list(smiles_list)
    ordered.sort(key=count_heavy_atoms)
    return ordered

replace_dummy_atoms_regex(smiles)

Replaces dummy atoms ('*') in a SMILES string with explicit hydrogen ('H') using regex.

Parameters:

Name Type Description Default
smiles str

The SMILES string containing dummy atoms.

required

Returns:

Type Description
str

A SMILES string where '[{int}*]' is replaced with '[H]'.

Source code in src/fragmentretro/utils/helpers.py
def replace_dummy_atoms_regex(smiles: str) -> str:
    """Swap bracketed dummy atoms for hydrogens and canonicalize the result.

    Every ``[*]`` or ``[<digits>*]`` token is textually replaced with ``[H]``,
    and the edited string is then passed through ``canonicalize_smiles``.

    Args:
        smiles: The SMILES string containing dummy atoms.

    Returns:
        The canonical SMILES of the string obtained after the substitution.
        NOTE(review): because the output is re-generated by RDKit, the literal
        ``[H]`` tokens may not appear verbatim in it -- confirm against callers.

    Raises:
        ValueError: Propagated from ``canonicalize_smiles`` when the
            substituted string cannot be parsed.
    """
    hydrogen_capped = re.sub(r"\[\d*\*\]", "[H]", smiles)
    return canonicalize_smiles(hydrogen_capped)

remove_indices_before_dummy(smiles)

Removes indices before asterisks (*) in a SMILES string using regex.

Parameters:

Name Type Description Default
smiles str

The SMILES string containing indices before asterisks.

required

Returns:

Type Description
str

The SMILES string with indices before asterisks removed.

Source code in src/fragmentretro/utils/helpers.py
def remove_indices_before_dummy(smiles: str) -> str:
    """Strip the numeric index from every bracketed dummy atom.

    Tokens of the form ``[<digits>*]`` become ``[*]``; everything else in the
    string is returned unchanged. This is a purely textual rewrite, so the
    input is not parsed or validated.

    Args:
        smiles: The SMILES string containing indices before asterisks.

    Returns:
        The SMILES string with indices before asterisks removed.
    """
    indexed_dummy = re.compile(r"\[\d*\*\]")
    return indexed_dummy.sub("[*]", smiles)