from base64 import b64decode, b64encode
from functools import reduce
from pathlib import Path
from typing import Any, Dict, List, Optional, Union # noqa: F401
from warnings import warn
from drugforge.data.schema.schema_base import MoleculeFilter
from openeye import ( # noqa: F401
oechem,
oedepict,
oedocking,
oeff,
oegraphsim,
oegrid,
oeomega,
oequacpac,
oeshape,
oespruce,
oeszybki,
)
# exec on module import
if not oechem.OEChemIsLicensed("python"):
warn("OpenEye license required to use drugforge openeye module")
[docs]
def combine_protein_ligand(
prot: oechem.OEMol,
lig: oechem.OEMol,
lig_name: str = "LIG",
lig_chain: Optional[str] = "",
resid: Optional[int] = None,
start_atom_id: Optional[int] = None,
) -> oechem.OEMol:
"""
Combine a protein OEMol and ligand OEMol into one, handling residue/atom
numbering, and HetAtom status.
Parameters
----------
prot : oechem.OEMol
OEMol with the protein atoms. This should have perceived resiudes
lig : oechem.OEMol
OEMol with the ligand atoms
lig_name : str, default="LIG"
Residue name to give to the ligand atoms
resid : int, optional
Which residue number to assign to the ligand. If not given, the largest existing
residue number in `prot` will be found, and the ligand will be assigned the next
number
start_atom_id : int, optional
Which atom number to assign to the first atom in the ligand. If not given, the
next available atom number will be calculated and assigned
Returns
-------
oechem.OEMol
Combined molecule, with the appropriate biopolymer field set for the lig atoms
"""
# Calculate residue number if necessary
if resid is None:
# Find max resid for numbering the ligand residue
# Add 1 so we start numbering at the next residue id
resid = max([r.GetResidueNumber() for r in oechem.OEGetResidues(prot)]) + 1
# Calculate atom number if necessary
if start_atom_id is None:
# Same with atom numbering
start_atom_id = (
max([oechem.OEAtomGetResidue(a).GetSerialNumber() for a in prot.GetAtoms()])
+ 1
)
# Make copies so we don't modify the original molecules
prot = prot.CreateCopy()
lig = lig.CreateCopy()
# Keep track of how many times each element has been seen in the ligand
# Each atom in a residue needs a unique name, so just append this number to the
# element
num_elem_atoms = {}
# Adjust molecule residue properties
for a in lig.GetAtoms():
# Set atom name
cur_name = oechem.OEGetAtomicSymbol(a.GetAtomicNum())
try:
new_name = f"{cur_name}{num_elem_atoms[cur_name]}"
num_elem_atoms[cur_name] += 1
except KeyError:
new_name = cur_name
num_elem_atoms[cur_name] = 1
a.SetName(new_name)
# Set residue level properties
res = oechem.OEAtomGetResidue(a)
res.SetName(lig_name.upper())
res.SetResidueNumber(resid)
res.SetSerialNumber(start_atom_id)
if lig_chain:
res.SetChainID(lig_chain)
start_atom_id += 1
res.SetHetAtom(True)
oechem.OEAtomSetResidue(a, res)
# Combine the mols
oechem.OEAddMols(prot, lig)
return prot
[docs]
def load_openeye_pdb(
pdb_fn: Union[str, Path], alt_loc: bool = False
) -> oechem.OEGraphMol:
"""
Load an OpenEye OEGraphMol from a PDB file.
Parameters
----------
pdb_fn : Union[str, Path]
The path to the input PDB file.
alt_loc : bool, optional
Whether to keep track of alternate locations, by default False.
Returns
-------
oechem.OEGraphMol
The OEGraphMol loaded from the input file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the CIF file cannot be opened.
"""
if not Path(pdb_fn).exists():
raise FileNotFoundError(f"{pdb_fn} does not exist!")
ifs = oechem.oemolistream()
ifs_flavor = oechem.OEIFlavor_PDB_Default | oechem.OEIFlavor_PDB_DATA
# Add option for keeping track of alternat locations in PDB file
if alt_loc:
ifs_flavor |= oechem.OEIFlavor_PDB_ALTLOC
ifs.SetFlavor(
oechem.OEFormat_PDB,
ifs_flavor,
)
if ifs.open(str(pdb_fn)):
in_mol = oechem.OEGraphMol()
oechem.OEReadMolecule(ifs, in_mol)
ifs.close()
return in_mol
else:
oechem.OEThrow.Fatal(f"Unable to open {pdb_fn}")
[docs]
def load_openeye_smi(smi_fn: Union[str, Path]) -> list[oechem.OEGraphMol]:
"""
Load an OpenEye SMILES file containing a set of molecules and return them as
OpenEye OEGraphMol objects.
Parameters
----------
smi_fn : Union[str, Path]
Path to the SMILES file to load.
Returns
-------
list[oechem.OEGraphMol]
A list of OpenEye OEGraphMol objects corresponding to the data from the SMI file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the SMI file cannot be opened.
"""
# convert to path to make consistent
smi_fn = Path(smi_fn)
if not smi_fn.exists():
raise FileNotFoundError(f"{str(smi_fn)} does not exist!")
ifs = oechem.oemolistream(smi_fn.as_posix())
ifs.SetFlavor(oechem.OEFormat_SMI, oechem.OEIFlavor_SMI_DEFAULT)
molecules = []
for mol in ifs.GetOEGraphMols():
molecules.append(oechem.OEGraphMol(mol))
return molecules
[docs]
def load_openeye_cif1(cif1_fn: Union[str, Path]) -> oechem.OEGraphMol:
"""
Loads a biological assembly file into an OEGraphMol object.
Current version requires going through an OpenMM intermediate.
Parameters
----------
cif1_fn : Union[str, Path]
The path to the input CIF1 file.
Returns
-------
oechem.OEGraphMol
oechem.OEGraphMol: the biological assembly as an OEGraphMol object.
"""
from tempfile import NamedTemporaryFile
from openmm.app import PDBFile, PDBxFile
if not Path(cif1_fn).exists():
raise FileNotFoundError(f"{cif1_fn} does not exist!")
cif = PDBxFile(str(cif1_fn))
# the keep ids flag is critical to make sure the residue numbers are correct
with NamedTemporaryFile("w", suffix=".pdb") as f:
PDBFile.writeFile(cif.topology, cif.positions, f, keepIds=True)
prot = load_openeye_pdb(f.name)
return prot
[docs]
def load_openeye_cif(
cif_fn: Union[str, Path], alt_loc: bool = False
) -> oechem.OEGraphMol:
"""
Load an OpenEye OEGraphMol object from a CIF file.
Parameters
----------
cif_fn : Union[str, Path]
The path of the CIF file to read.
alt_loc : bool, optional
If True, include alternative locations for atoms in the resulting OEGraphMol
object. Default is False.
Returns
-------
oechem.OEGraphMol
The OEGraphMol object read from the CIF file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the CIF file cannot be opened.
Notes
-----
This function will raise an exception if the specified file does not exist or
if the CIF file cannot be opened. If `alt_loc` is True, the resulting OEGraphMol
object will include alternative locations for atoms.
"""
if not Path(cif_fn).exists():
raise FileNotFoundError(f"{cif_fn} does not exist!")
ifs = oechem.oemolistream()
ifs_flavor = oechem.OEIFlavor_MMCIF_DEFAULT
# Add option for keeping track of alternat locations in PDB file
# TODO: check if this is a thing in mmcif
# TODO: Currently this actually fails on biological assemblies so I'm not using it but it *should* work
if not alt_loc:
ifs_flavor |= oechem.OEIFlavor_MMCIF_NoAltLoc
ifs.SetFlavor(
oechem.OEFormat_MMCIF,
ifs_flavor,
)
if ifs.open(str(cif_fn)):
in_mol = oechem.OEGraphMol()
oechem.OEReadMolecule(ifs, in_mol)
ifs.close()
return in_mol
else:
oechem.OEThrow.Fatal(f"Unable to open {cif_fn}")
[docs]
def load_openeye_mol2(mol2_fn: Union[str, Path]) -> oechem.OEMol:
"""
Load an OpenEye MOL2 file and return it as an OpenEye OEMol object.
Reads multiple conformers into the OEMol object but if the MOL2 file contains
multiple molecules, it will only return the first one.
Parameters
----------
MOL2_fn : Union[str, Path]
Path to the MOL2 file to load.
Returns
-------
oechem.OEMol
An OpenEye OEMol object containing the molecule data from the MOL2 file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the MOL2 file cannot be opened.
Notes
-----
This function assumes that the MOL2 file contains a single molecule. If the
file contains more than one molecule, only the first molecule will be loaded.
"""
if not Path(mol2_fn).exists():
raise FileNotFoundError(f"{mol2_fn} does not exist!")
ifs = oechem.oemolistream()
ifs.SetFlavor(
oechem.OEFormat_MOL2,
oechem.OEIFlavor_MOL2_Default,
)
ifs.SetConfTest(oechem.OEOmegaConfTest())
if ifs.open(str(mol2_fn)):
for mol in ifs.GetOEMols():
ifs.close()
return mol
else:
oechem.OEThrow.Fatal(f"Unable to open {mol2_fn}")
[docs]
def load_openeye_sdf(sdf_fn: Union[str, Path]) -> oechem.OEMol:
"""
Load an OpenEye SDF file and return it as an OpenEye OEMol object.
Reads multiple conformers into the OEMol object but if the sdf file contains
multiple molecules, it will only return the first one.
Parameters
----------
sdf_fn : Union[str, Path]
Path to the SDF file to load.
Returns
-------
oechem.OEMol
An OpenEye OEMol object containing the molecule data from the SDF file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the SDF file cannot be opened.
Notes
-----
This function assumes that the SDF file contains a single molecule. If the
file contains more than one molecule, only the first molecule will be loaded.
"""
if not Path(sdf_fn).exists():
raise FileNotFoundError(f"{sdf_fn} does not exist!")
ifs = oechem.oemolistream()
ifs.SetFlavor(
oechem.OEFormat_SDF,
oechem.OEIFlavor_SDF_Default,
)
ifs.SetConfTest(oechem.OEOmegaConfTest())
if ifs.open(str(sdf_fn)):
for mol in ifs.GetOEMols():
ifs.close()
return mol
else:
oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
[docs]
def load_openeye_sdfs(sdf_fn: Union[str, Path]) -> list[oechem.OEGraphMol]:
"""
Load a list of OpenEye OEGraphMol objects from an SDF file.
Parameters
----------
sdf_fn : Union[str, Path]
The path of the SDF file to read.
Returns
-------
list of oechem.OEGraphMol
A list of the OEGraphMol objects read from the SDF file.
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the CIF file cannot be opened.
Notes
-----
This function will raise an exception if the specified file does not exist or
if the SDF file cannot be opened.
"""
if not Path(sdf_fn).exists():
raise FileNotFoundError(f"{sdf_fn} does not exist!")
ifs = oechem.oemolistream()
ifs.SetFlavor(
oechem.OEFormat_SDF,
oechem.OEIFlavor_SDF_Default,
)
cmpd_list = []
if ifs.open(str(sdf_fn)):
for mol in ifs.GetOEGraphMols():
cmpd_list.append(mol.CreateCopy())
ifs.close()
return cmpd_list
else:
oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
[docs]
def load_openeye_design_unit(du_fn: Union[str, Path]) -> oechem.OEDesignUnit:
"""
Load an OpenEye design unit from a file
Parameters
----------
du_fn : Union[str, Path]
The path of the DesignUnit file to read.
Returns
-------
oechem.OEDesignUnit
OpenEye DesignUnit
Raises
------
FileNotFoundError
If the specified file does not exist.
oechem.OEError
If the CIF file cannot be opened.
"""
if not Path(du_fn).exists():
raise FileNotFoundError(f"{du_fn} does not exist!")
du = oechem.OEDesignUnit()
retcode = oechem.OEReadDesignUnit(str(du_fn), du)
if not retcode:
oechem.OEThrow.Fatal(f"Unable to open {du_fn}")
return du
[docs]
def save_openeye_pdb(mol, pdb_fn: Union[str, Path]) -> Path:
"""
Write an OpenEye OEGraphMol object to a PDB file.
Parameters
----------
mol : oechem.OEGraphMol
The OEGraphMol object to write to the PDB file.
pdb_fn : Union[str, Path]
The path of the PDB file to create or overwrite.
Returns
-------
Path
The path of the PDB file that was written.
Notes
-----
This function will overwrite any existing file with the same name as `pdb_fn`.
"""
ofs = oechem.oemolostream()
ofs.SetFlavor(oechem.OEFormat_PDB, oechem.OEOFlavor_PDB_Default)
if ofs.open(str(pdb_fn)):
oechem.OEWriteMolecule(ofs, mol)
else:
oechem.OEThrow.Fatal(f"Unable to open {pdb_fn}")
ofs.close()
return Path(pdb_fn)
[docs]
def save_openeye_sdf(mol, sdf_fn: Union[str, Path]) -> Path:
"""
Write an OpenEye OEGraphMol object to an SDF file.
Parameters
----------
mol : oechem.OEGraphMol
The OEGraphMol object to write to the SDF file.
sdf_fn : Union[str, Path]
The path of the SDF file to create or overwrite.
Returns
-------
Path
The path of the SDF file that was written.
Notes
-----
This function will overwrite any existing file with the same name as `sdf_fn`.
"""
ofs = oechem.oemolostream()
ofs.SetFlavor(oechem.OEFormat_SDF, oechem.OEOFlavor_SDF_Default)
if ofs.open(str(sdf_fn)):
oechem.OEWriteMolecule(ofs, mol)
else:
oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
ofs.close()
return Path(sdf_fn)
[docs]
def save_openeye_sdfs(mols, sdf_fn: Union[str, Path]) -> Path:
"""
Write a list of OpenEye OEGraphMol objects to a single SDF file.
Parameters
----------
mols : list of oechem.OEGraphMol
The list of OEGraphMol objects to write to the SDF file.
sdf_fn : Union[str, Path]
The path of the SDF file to create or overwrite.
Returns
-------
Path
The path of the SDF file that was written.
Raises
------
oechem.OEError
If the SDF file cannot be opened.
Notes
-----
This function will overwrite any existing file with the same name as `sdf_fn`.
"""
ofs = oechem.oemolostream()
ofs.SetFlavor(
oechem.OEFormat_SDF,
oechem.OEOFlavor_SDF_Default,
)
if ofs.open(str(sdf_fn)):
for mol in mols:
oechem.OEWriteMolecule(ofs, mol)
ofs.close()
else:
oechem.OEThrow.Fatal(f"Unable to open {sdf_fn}")
return Path(sdf_fn)
[docs]
def save_openeye_design_unit(du: oechem.OEDesignUnit, du_fn: Union[str, Path]) -> Path:
"""
Write an OpenEye design unit to a file
Parameters
----------
du : oechem.OEDesignUnit
The OpenEye DesignUnit to write to the file.
du_fn : Union[str, Path]
The path of the DesignUnit file to create or overwrite.
Returns
-------
Path
The path of the DesignUnit file that was written.
Raises
------
oechem.OEError
If the DesignUnit file cannot be opened.
Notes
-----
This function will overwrite any existing file with the same name as `du_fn`.
"""
retcode = oechem.OEWriteDesignUnit(str(du_fn), du)
if not retcode:
oechem.OEThrow.Fatal(f"Unable to open {du_fn}")
return Path(du_fn)
[docs]
def openeye_perceive_residues(
prot: oechem.OEGraphMol, preserve_all: bool = False
) -> oechem.OEGraphMol:
"""
Re-perceive the residues of a protein molecule using OpenEye's OEPerceiveResidues function,
which is necessary when changes are made to the protein to ensure correct atom ordering and CONECT record creation.
Parameters
----------
prot : oechem.OEGraphMol
The input protein molecule to be processed.
preserve_all : bool, optional, default=False
If True, preserve all residue information, including chain ID, residue number, and residue name.
Returns
-------
oechem.OEGraphMol
The processed protein molecule with re-perceived residue information.
"""
# Clean up PDB info by re-perceiving, perserving chain ID, residue number, and residue name
if preserve_all:
preserve = oechem.OEPreserveResInfo_All
else:
preserve = (
oechem.OEPreserveResInfo_ChainID
| oechem.OEPreserveResInfo_ResidueNumber
| oechem.OEPreserveResInfo_ResidueName
)
oechem.OEPerceiveResidues(prot, preserve)
return prot
[docs]
def save_receptor_grid(du_fn: Union[str, Path], out_fn: Union[str, Path]) -> Path:
"""
Load in a design unit from a file and write out the receptor grid as a .ccp4 grid file.
Parameters
----------
du_fn: Union[str, Path]
File name/path with the design units.
out_fn: Union[str, Path]
Works with a .ccp4 extension
Returns
-------
Path
Path to the receptor grid file
"""
du = oechem.OEDesignUnit()
oechem.OEReadDesignUnit(str(du_fn), du)
# oedocking.OEMakeReceptor(du)
oegrid.OEWriteGrid(
str(out_fn),
oegrid.OEScalarGrid(du.GetReceptor().GetNegativeImageGrid()),
)
return Path(out_fn)
[docs]
def openeye_copy_pdb_data(
source: oechem.OEGraphMol, destination: oechem.OEGraphMol, tag: str
) -> None:
"""
Copy over the PDB data from one object to another. Tag examples include "SEQRES"
Parameters
----------
source: oechem.OEGraphMol
Source molecule to copy the data from
destination: oechem.OEGraphMol
Destination molecule to copy the data
tag: str
Tag identifier for the data/metadata.
Returns
-------
"""
# first, delete data with that tag
oechem.OEDeletePDBData(destination, tag)
# now, add over all the data with the tag
for data_pair in oechem.OEGetPDBDataPairs(source):
if data_pair.GetTag() == tag:
oechem.OEAddPDBData(destination, data_pair)
[docs]
def oemol_to_sdf_string(mol: oechem.OEMol) -> str:
"""
Dumps an OpenEye OEMol to an SDF string
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
str
SDF string representation of the input OEMol
"""
oms = oechem.oemolostream()
oms.SetFormat(oechem.OEFormat_SDF)
oms.openstring()
oechem.OEWriteMolecule(oms, mol)
molstring = oms.GetString().decode("UTF-8")
return molstring
[docs]
def sdf_string_to_oemol(sdf_str: str) -> oechem.OEMol:
"""
Loads an SDF string into an openeye molecule.
Enables multiple conformers but only returns the first molecule in a multi molecule sdf
Parameters
----------
sdf_str: str
The string representation of an SDF file
Returns
-------
oechem.OEMol:
resulting OpenEye OEMol
"""
ims = oechem.oemolistream()
ims.SetFormat(oechem.OEFormat_SDF)
ims.SetFlavor(
oechem.OEFormat_SDF,
oechem.OEIFlavor_SDF_Default,
)
rmol = oechem.OEMol()
ims.SetConfTest(oechem.OEOmegaConfTest())
if ims.openstring(sdf_str):
for mol in ims.GetOEMols():
rmol = mol.CreateCopy()
break # only return the first molecule
return rmol
[docs]
def smiles_to_oemol(smiles: str) -> oechem.OEMol:
"""
Loads a smiles string into an openeye molecule
Parameters
----------
smiles: str
SMILES string
Returns
-------
oechem.OEGraphMol
resulting OpenEye OEGraphMol
"""
mol = oechem.OEMol()
oechem.OESmilesToMol(mol, smiles)
return mol
[docs]
def oemol_to_smiles(mol: oechem.OEMol, isomeric=True) -> str:
"""
Canonical SMILES string of an OpenEye OEMol
Paramers
--------
mol: oechem.OEMol
OpenEye OEMol
isomeric: bool, optional, default=True
If True, generate canonical isomeric SMILES (including stereochem)
If False, generate canonical SMILES without stereochem
Returns
-------
str
SMILES string of molecule
"""
# By default, OEMolToSmiles generates isomeric SMILES, which includes stereochemistry
if isomeric:
return oechem.OEMolToSmiles(mol)
# However, if we want to treat two stereoisomers as the same molecule,
# we can generate canonical SMILES that don't include stereo info
else:
return oechem.OECreateCanSmiString(mol)
[docs]
def oe_smiles_roundtrip(smiles: str) -> str:
"""
Canonical SMILES string of an OpenEye OEMol
Paramers
--------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
str
SMILES string of molecule
"""
mol = smiles_to_oemol(smiles)
return oemol_to_smiles(mol)
[docs]
def oemol_to_inchi(mol: oechem.OEMol, fixed_hydrogens: bool = False) -> str:
"""
InChI string of an OpenEye OEMol
Paramers
--------
mol: oechem.OEMol
OpenEye OEMol
fixed_hydrogens: bool
If a fixed hydrogen layer should be added to the InChI, if `True` this will result in a non-standard inchi
which can distinguish tautomers.
Returns
-------
str
InChI string of molecule
"""
if fixed_hydrogens:
inchi_opts = oechem.OEInChIOptions()
inchi_opts.SetFixedHLayer(True)
inchi = oechem.OEMolToInChI(mol)
else:
inchi = oechem.OEMolToSTDInChI(mol)
return inchi
[docs]
def oemol_to_inchikey(mol: oechem.OEMol, fixed_hydrogens: bool = False) -> str:
"""
InChI key string of an OpenEye OEMol
Paramers
--------
mol: oechem.OEMol
OpenEye OEMol
fixed_hydrogens: bool
If a fixed hydrogen layer should be added to the InChI, if `True` this will result in a non-standard inchi
which can distinguish tautomers.
Returns
-------
str
InChI key string of molecule
"""
if fixed_hydrogens:
inchi_opts = oechem.OEInChIOptions()
inchi_opts.SetFixedHLayer(True)
inchi_key = oechem.OEMolToInChIKey(mol)
else:
inchi_key = oechem.OEMolToSTDInChIKey(mol)
return inchi_key
def _set_SD_data(mol: oechem.OEMolBase, data: dict[str, str]) -> oechem.OEMolBase:
"""
Set SD data on an OpenEye OEMolBase object.
Since this function works on OEMol, OEGraphMol, OEConfBase objects, it is worth repurposing.
But it is not recommended to use this function directly for multi-conformer molecules.
Parameters
----------
mol: oechem.OEMolBase
OpenEye OEMolBase
Returns
-------
oechem.OEMolBase
OpenEye OEMolBase with SD data set
"""
for key, value in data.items():
oechem.OESetSDData(mol, key, str(value))
return mol
[docs]
def set_SD_data(mol: oechem.OEMol, data: dict[str, str | list]) -> oechem.OEMol:
"""
Set the SD data on an OpenEye OEMol, overwriting any existing data with the same tag
If a str or a single-length list is passed as the values of the dictionary, the data will be set to all conformers.
If a list is provided, the data will be set to the conformers in the order provided.
If the list is not the same length as the number of conformers, an error will be raised.
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
data: dict[str, str | list]
Dictionary of SD data to set.
Returns
-------
oechem.OEMol
OpenEye OEMol with SD data set
"""
from drugforge.data.util.data_conversion import get_first_value_of_dict_of_lists
# convert to dict of lists first
data = {k: v if isinstance(v, list) else [v] for k, v in data.items()}
# If the object is an OEMol, we will set the SD data to all the conformers
if isinstance(mol, oechem.OEMol):
for key, value_list in data.items():
# if list is len 1, generate a list of len N, where N is the number of conformers
if len(value_list) == 1:
value_list = value_list * mol.NumConfs()
if not len(value_list) == mol.NumConfs():
raise ValueError(
f"Length of data for tag '{key}' does not match number of conformers ({mol.NumConfs()}). "
f"Expected {mol.NumConfs()} but got {len(value_list)} elements."
)
for i, conf in enumerate(mol.GetConfs()):
oechem.OESetSDData(conf, key, str(value_list[i]))
return mol
elif isinstance(mol, oechem.OEMolBase):
return _set_SD_data(mol, get_first_value_of_dict_of_lists(data))
else:
raise TypeError(
f"Expected an OpenEye OEMol, OEGraphMol, or OEConf, but got {type(mol)}"
)
def _get_SD_data(mol: oechem.OEMolBase) -> dict[str, str]:
"""
Get SD data from an OpenEye OEMolBase object.
Since this function works on OEMol, OEGraphMol, OEConfBase objects, it is worth repurposing.
But it is not recommended to use this function directly for multi-conformer molecules.
Parameters
----------
mol: oechem.OEMolBase
OpenEye OEMolBase
Returns
-------
Dict[str, str]
Dictionary of SD data
"""
return {dp.GetTag(): dp.GetValue() for dp in oechem.OEGetSDDataPairs(mol)}
[docs]
def get_SD_data(mol: oechem.OEMolBase) -> dict[str, list]:
"""
Get all SD data on an OpenEye OEMol, OEGraphMol, or OEConfBase object.
If multiple conformers are found, the SD tags from the conformers will be used, and any properties at the highest
level will be ignored.
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
dict[str, list]
Dictionary of SD data
Raises
------
TypeError
If mol is a type that cant be converted to an OEMol, OEGraphMol, or OEConfBase
"""
from drugforge.data.util.data_conversion import (
get_dict_of_lists_from_dict_of_str,
get_dict_of_lists_from_list_of_dicts,
)
# If the object is an OEMol, we have to pull from the conformers, because even if there is only one conformer
# the data is stored at the conformer level if you generate an oemol from an sdf file
# However, if you've manually fiddled with the tags, the data might be stored at the molecule level.
# In order to resolve this, if there is data at the high level that is not repeated at the conformer level,
# we'll add to all the conformers and return that dict of lists.
if isinstance(mol, oechem.OEMol):
# Get the data from the molecule
molecule_tags = _get_SD_data(mol)
# get the data from the conformers
conformer_tags = get_dict_of_lists_from_list_of_dicts(
[_get_SD_data(conf) for conf in mol.GetConfs()]
)
for k, v in molecule_tags.items():
if k not in conformer_tags:
conformer_tags[k] = [v] * mol.NumConfs()
return conformer_tags
elif isinstance(mol, oechem.OEMolBase):
return get_dict_of_lists_from_dict_of_str(_get_SD_data(mol))
else:
raise TypeError(
f"Expected an OpenEye OEMol, OEGraphMol, or OEConf, but got {type(mol)}"
)
def _get_SD_data_to_object(mol: oechem.OEMol) -> dict[str, Any]:
"""
Get all SD data on an OpenEye OEMol, converting to Python objects
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
Dict[str, Any]
Dictionary of SD data
"""
import ast
sd_data = get_SD_data(mol)
for key, value in sd_data.items():
sd_data[key] = ast.literal_eval(value)
return sd_data
[docs]
def print_SD_data(mol: oechem.OEMol) -> None:
print("SD data of", mol.GetTitle())
# loop over SD data
for dp in oechem.OEGetSDDataPairs(mol):
print(dp.GetTag(), ":", dp.GetValue())
[docs]
def clear_SD_data(mol: oechem.OEMolBase) -> oechem.OEMol:
"""
Clear all SD data on an OpenEye OEMol
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
oechem.OEMol
OpenEye OEMol with SD data cleared
"""
oechem.OEClearSDData(mol)
mol.SetTitle("")
if isinstance(mol, oechem.OEMCMolBase):
for conf in mol.GetConfs():
oechem.OEClearSDData(conf)
return mol
[docs]
def oemol_to_pdb_string(mol: oechem.OEMol) -> str:
"""
Dumps an OpenEye OEMol to a PDB string
Parameters
----------
mol: oechem.OEMol
OpenEye OEMol
Returns
-------
str
PDB string representation of the input OEMol
"""
oms = oechem.oemolostream()
oms.SetFormat(oechem.OEFormat_PDB)
oms.SetFlavor(oechem.OEFormat_PDB, oechem.OEOFlavor_PDB_Default)
oms.openstring()
oechem.OEWriteMolecule(oms, mol)
molstring = oms.GetString().decode("UTF-8")
return molstring
[docs]
def pdb_string_to_oemol(pdb_str: str) -> oechem.OEGraphMol:
"""
Loads a PDB string into an OpenEye OEGraphMol
Parameters
----------
pdb_str: str
The string representation of a PDB file
Returns
-------
oechem.OEMol
resulting OpenEye OEMol
"""
ifs = oechem.oemolistream()
ifs.SetFormat(oechem.OEFormat_PDB)
ifs.SetFlavor(
oechem.OEFormat_PDB,
oechem.OEIFlavor_PDB_Default
| oechem.OEIFlavor_PDB_DATA
| oechem.OEIFlavor_PDB_ALTLOC,
) # noqa
ifs.openstring(pdb_str)
mol = oechem.OEGraphMol()
if not oechem.OEReadMolecule(ifs, mol):
oechem.OEThrow.Fatal("Cannot read molecule")
return mol
[docs]
def oedu_to_bytes64(oedu: oechem.OEDesignUnit) -> bytes:
"""
Convert an OpenEye DesignUnit to bytes
Parameters
----------
oedu: oechem.OEDesignUnit
OpenEye DesignUnit
Returns
-------
bytes
bytes representation of the input DesignUnit encoded in base64
"""
oedu_bytes = oechem.OEWriteDesignUnitToBytes(oedu)
# convert to base64
return b64encode(oedu_bytes)
[docs]
def bytes64_to_oedu(bytes: bytes) -> oechem.OEDesignUnit:
"""
Convert bytes to an OpenEye DesignUnit
Parameters
----------
bytes: bytes
bytes representation of a DesignUnit encoded in base64
Returns
-------
oechem.OEDesignUnit
resulting OpenEye DesignUnit
"""
# convert from base64
bytes = b64decode(bytes)
du = oechem.OEDesignUnit()
retcode = oechem.OEReadDesignUnitFromBytes(du, bytes)
if not retcode:
oechem.OEThrow.Fatal("Cannot read DesignUnit from bytes")
return du
[docs]
def split_openeye_mol(
complex_mol,
molecule_filter: Optional[Union[str, list[str], MoleculeFilter]] = None,
prot_cutoff_len=10,
keep_one_lig=True,
) -> dict:
"""
Split an OpenEye-loaded molecule into protein, ligand, etc.
Uses the OpenEye OESplitMolComplex function, which automatically splits out
only the first ligand binding site it sees.
Parameters
----------
complex_mol : oechem.OEMolBase
Complex molecule to split.
molecule_filter : MoleculeFilter
Molecule filter object that contains the filter criteria.
prot_cutoff_len : int, default=10
Minimum number of residues in a protein chain required in order to keep
keep_one_lig : bool, default=True
Only keep one copy of the ligand in lig_mol
Returns
-------
dict
A dict containing the protein ("prot"), ligand ("lig"), water atoms ("wat"),
and all other atoms ("oth")
"""
# These are the objects that will be returned at the end
lig_mol = oechem.OEGraphMol()
prot_mol = oechem.OEGraphMol()
water_mol = oechem.OEGraphMol()
oth_mol = oechem.OEGraphMol()
if molecule_filter is None:
molecule_filter = MoleculeFilter()
elif type(molecule_filter) is str:
molecule_filter = MoleculeFilter(components_to_keep=[molecule_filter])
elif type(molecule_filter) is list:
molecule_filter = MoleculeFilter(components_to_keep=molecule_filter)
else:
molecule_filter = molecule_filter
# Make splitting split out covalent ligands possible
# TODO: look into different covalent-related options here
opts = oechem.OESplitMolComplexOptions()
opts.SetSplitCovalent(True)
opts.SetSplitCovalentCofactors(True)
# Protein splitting options
# Default of all atoms identified as protein + complex
prot_only = oechem.OEMolComplexFilterFactory(
oechem.OEMolComplexFilterCategory_ProtComplex
)
# add in peptides as well, sometimes proteins are misidentified as peptides if bound or short
peptide = oechem.OEMolComplexFilterFactory(
oechem.OEMolComplexFilterCategory_Peptide
)
# combine protein and peptide filters
prot_only = oechem.OEOrRoleSet(prot_only, peptide)
# If protein_chains are specified, only take protein atoms from those chains
if len(molecule_filter.protein_chains) > 0:
chain_filters = [
oechem.OERoleMolComplexFilterFactory(
oechem.OEMolComplexChainRoleFactory(chain)
)
for chain in molecule_filter.protein_chains
]
chain_filter = reduce(oechem.OEOrRoleSet, chain_filters)
prot_filter = oechem.OEAndRoleSet(prot_only, chain_filter)
else:
prot_filter = prot_only
opts.SetProteinFilter(prot_filter)
# Ligand splitting options
# Default of all atoms identified as ligand
lig_only = oechem.OEMolComplexFilterFactory(
oechem.OEMolComplexFilterCategory_Ligand
)
# If ligand_chain is specified, only take protein atoms from that chains
if molecule_filter.ligand_chain:
lig_chain = oechem.OERoleMolComplexFilterFactory(
oechem.OEMolComplexChainRoleFactory(molecule_filter.ligand_chain)
)
lig_filter = oechem.OEAndRoleSet(lig_only, lig_chain)
else:
lig_filter = lig_only
# combine with NOT peptide filter
lig_filter = oechem.OEAndRoleSet(lig_filter, oechem.OENotRoleSet(peptide))
opts.SetLigandFilter(lig_filter)
# If water_chains are specified, set up filter for them
if len(molecule_filter.water_chains) > 0:
water_only = oechem.OEMolComplexFilterFactory(
oechem.OEMolComplexFilterCategory_Water
)
chain_filters = [
oechem.OERoleMolComplexFilterFactory(
oechem.OEMolComplexChainRoleFactory(chain)
)
for chain in molecule_filter.water_chains
]
chain_filter = reduce(oechem.OEOrRoleSet, chain_filters)
wat_filter = oechem.OEAndRoleSet(water_only, chain_filter)
# combine with NOT peptide filter
wat_filter = oechem.OEAndRoleSet(wat_filter, oechem.OENotRoleSet(peptide))
opts.SetWaterFilter(wat_filter)
# Use python 'reduce' to combine all the filters into one, otherwise OpenEye will throw an error
oechem.OESplitMolComplex(
lig_mol,
prot_mol,
water_mol,
oth_mol,
complex_mol,
opts,
)
# TODO: make this nicer?
# Get rid of any straggling extra copies of the ligand
prot_mol = trim_small_chains(prot_mol, prot_cutoff_len)
if molecule_filter.ligand_chain:
keep_lig_chain = molecule_filter.ligand_chain
else:
keep_lig_chain = None
# Get rid of extra copies of the ligand
if keep_one_lig:
all_lig_chains = [res.GetChainID() for res in oechem.OEGetResidues(lig_mol)]
# Handle the case where the input has no ligand, otherwise throws an IndexError
if len(all_lig_chains) > 0:
# Keep first alphabetically, for reproducibility
if not keep_lig_chain:
keep_lig_chain = sorted(all_lig_chains)[0]
for a in lig_mol.GetAtoms():
# Delete all atoms that don't match
if oechem.OEAtomGetResidue(a).GetChainID() != keep_lig_chain:
lig_mol.DeleteAtom(a)
return {
"prot": prot_mol,
"lig": lig_mol,
"wat": water_mol,
"oth": oth_mol,
"keep_lig_chain": keep_lig_chain,
}
[docs]
def split_openeye_design_unit(du, lig=None, lig_title=None):
"""
Parameters
----------
du : oechem.OEDesignUnit
Design Unit to be saved
lig : oechem.OEGraphMol, optional
lig_title : str, optional
ID of Ligand to be saved to the Title tag in the SDF, by default None
include_solvent : bool, optional
Whether to include solvent in the complex, by default True
Returns
-------
lig : oechem.OEGraphMol
OE object containing ligand
protein : oechem.OEGraphMol
OE object containing only protein
complex : oechem.OEGraphMol
OE object containing ligand + protein
"""
prot = oechem.OEGraphMol()
complex_ = oechem.OEGraphMol()
du.GetProtein(prot)
# if no ligand, return protein and complex with no ligand
if not du.HasLigand():
return None, prot, prot
if not lig:
lig = oechem.OEGraphMol()
du.GetLigand(lig)
# Set ligand title, useful for the combined sdf file
if lig_title:
lig.SetTitle(f"{lig_title}")
# Give ligand atoms their own chain "L" and set the resname to "LIG"
residue = oechem.OEAtomGetResidue(next(iter(lig.GetAtoms())))
residue.SetChainID("L")
residue.SetName("LIG")
residue.SetHetAtom(True)
for atom in list(lig.GetAtoms()):
oechem.OEAtomSetResidue(atom, residue)
# Combine protein and ligand and save
# TODO: consider saving water as well
oechem.OEAddMols(complex_, prot)
oechem.OEAddMols(complex_, lig)
# Clean up PDB info by re-perceiving, perserving chain ID,
# residue number, and residue name
openeye_perceive_residues(prot)
openeye_perceive_residues(complex_)
return lig, prot, complex_
[docs]
def trim_small_chains(input_mol, cutoff_len=10):
"""
Remove short chains from a protein molecule object. The goal is to get rid
of any peptide ligands that were mistakenly collected by OESplitMolComplex.
Parameters
----------
input_mol : oechem.OEGraphMol
OEGraphMol object containing the protein to trim
cutoff_len : int, default=10
The cutoff length for peptide chains (chains must have more than this
many residues to be included)
Returns
-------
oechem.OEGraphMol
Trimmed molecule
"""
# Copy the molecule
mol_copy = input_mol.CreateCopy()
# Remove chains from mol_copy that are too short (possibly a better way of
# doing this with OpenEye functions)
# Get number of residues per chain
chain_len_dict = {}
hv = oechem.OEHierView(mol_copy)
for chain in hv.GetChains():
chain_id = chain.GetChainID()
for frag in chain.GetFragments():
frag_len = len(list(frag.GetResidues()))
try:
chain_len_dict[chain_id] += frag_len
except KeyError:
chain_len_dict[chain_id] = frag_len
# Remove short chains atom by atom
for a in mol_copy.GetAtoms():
chain_id = oechem.OEAtomGetResidue(a).GetExtChainID()
if (chain_id not in chain_len_dict) or (chain_len_dict[chain_id] <= cutoff_len):
mol_copy.DeleteAtom(a)
return mol_copy