Source code for drugforge.modeling.modeling

import warnings
from pathlib import Path

from drugforge.data.backend.openeye import (
    oechem,
    oedocking,
    oegrid,
    oespruce,
    openeye_perceive_residues,
)
from drugforge.data.schema.schema_base import MoleculeComponent



[docs]
def add_seqres_to_openeye_protein(
    prot: oechem.OEGraphMol, seqres: str = None
) -> oechem.OEGraphMol:
    """
    Adds the SEQRES metadata to the given protein structure.

    Args:
    - prot (oechem.OEDesignUnit): the protein structure to add the SEQRES metadata to.
    """
    # Add SEQRES entries if they're not present
    if (not oechem.OEHasPDBData(prot, "SEQRES")) and seqres:
        for seqres_line in seqres.split("\n"):
            if seqres_line != "":
                oechem.OEAddPDBData(prot, "SEQRES", seqres_line[6:])
    return prot




[docs]
def get_oe_prep_opts():
    """
    These are the default options we've been using for OESpruce. They are based on John's function.
    Returns
    -------
    oespruce.OEMakeDesignUnitOptions

    """
    # Set up DU building options
    opts = oespruce.OEMakeDesignUnitOptions()
    opts.SetSuperpose(False)
    # Options set from John's function ########################################
    # (https://github.com/FoldingAtHome/covid-moonshot/blob/454098f4255467f4655102e0330ebf9da0d09ccb/synthetic-enumeration/sprint-14-quinolones/00-prep-receptor.py)
    opts.GetPrepOptions().SetStrictProtonationMode(True)
    # set minimal number of ligand atoms to 5, e.g. a 5-membered ring fragment\
    opts.GetSplitOptions().SetMinLigAtoms(5)

    # also consider alternate locations outside binding pocket, important for later filtering
    opts.GetPrepOptions().GetEnumerateSitesOptions().SetCollapseNonSiteAlts(True)

    # Both N- and C-termini should be zwitterionic
    # Mpro cleaves its own N- and C-termini
    # See https://www.pnas.org/content/113/46/12997
    opts.GetPrepOptions().GetBuildOptions().SetCapNTermini(False)
    opts.GetPrepOptions().GetBuildOptions().SetCapCTermini(False)
    # Don't allow truncation of termini, since force fields don't have
    #  parameters for this
    opts.GetPrepOptions().GetBuildOptions().GetCapBuilderOptions().SetAllowTruncate(
        False
    )
    return opts




[docs]
def get_oe_structure_metadata_from_sequence(initial_prot, protein_sequence):
    """
    Add sequence to structure metadata

    Parameters
    ----------
    initial_prot : oechem.OEGraphMol
        The input protein structure to be prepared.
    protein_sequence : str
        The protein sequence to be added to the metadata

    Returns
    -------
    metadata : oespruce.OEStructureMetadata
        The metadata object with the sequence added.
    """
    # Structure metadata object
    metadata = oespruce.OEStructureMetadata()

    # Add SEQRES metadata
    all_prot_chains = find_component_chains(initial_prot, "protein")
    for chain in all_prot_chains:
        seq_metadata = oespruce.OESequenceMetadata()
        seq_metadata.SetChainID(chain)
        seq_metadata.SetSequence(protein_sequence)
        metadata.AddSequenceMetadata(seq_metadata)
    return metadata




[docs]
def spruce_protein(
    initial_prot: oechem.OEGraphMol,
    protein_sequence: str = None,
    loop_db: Path = None,
) -> oechem.OEDesignUnit or oechem.OEGraphMol:
    """
    Applies the OESpruce protein preparation pipeline to the given protein structure.

    Parameters
    ----------
    initial_prot : oechem.OEMol
        The input protein structure to be prepared.

    protein_sequence : str, optional
        The sequence of the protein for a single change. If provided, this will be added to the Structure Metadata before applying the OESpruce pipeline.
        Default is None.

    loop_db : str, optional
        The filename of the loop database to be used by the OESpruce pipeline. If provided, the pipeline will include the loop building step.
        Default is None.

    Returns
    -------
    (success: bool, spruce_error_msg: str, initial_prot: oechem.OEMol)
        Returns a tuple of:
        a boolean for whether sprucing was successful
        a string of the error message if sprucing failed
        the prepared protein structure.
    """

    # Add Hs to prep protein and ligand
    oechem.OEAddExplicitHydrogens(initial_prot)

    opts = get_oe_prep_opts()

    # Set Build Loop and Sidechain Opts
    sc_opts = oespruce.OESidechainBuilderOptions()

    loop_opts = oespruce.OELoopBuilderOptions()
    loop_opts.SetSeqAlignMethod(oechem.OESeqAlignmentMethod_Identity)
    loop_opts.SetSeqAlignGapPenalty(-1)
    loop_opts.SetSeqAlignExtendPenalty(0)

    # Allow for adding residues at the beginning/end if they're missing
    loop_opts.SetBuildTails(True)

    if loop_db is not None:
        print("Adding loop db")
        loop_opts.SetLoopDBFilename(str(loop_db))

    # Construct spruce filter
    spruce_opts = oespruce.OESpruceFilterOptions()
    spruce = oespruce.OESpruceFilter(spruce_opts, opts)

    # Spruce!

    # These objects is for some reason needed in order to run spruce
    grid = oegrid.OESkewGrid()
    if protein_sequence:
        metadata = get_oe_structure_metadata_from_sequence(
            initial_prot, protein_sequence
        )
    else:
        metadata = oespruce.OEStructureMetadata()

    # Building the loops actually does use the sequence metadata
    build_loops_success = oespruce.OEBuildLoops(
        initial_prot, metadata, sc_opts, loop_opts
    )
    build_sidechains_success = oespruce.OEBuildSidechains(initial_prot, sc_opts)
    place_hydrogens_success = oechem.OEPlaceHydrogens(initial_prot)
    spruce_error_code = spruce.StandardizeAndFilter(initial_prot, grid, metadata)
    spruce_error_msg = spruce.GetMessages(spruce_error_code)
    success = (
        build_loops_success and build_sidechains_success and place_hydrogens_success
    )
    # Re-percieve residues so that atom number and connect records dont get screwed up
    initial_prot = openeye_perceive_residues(initial_prot, preserve_all=False)
    return success, spruce_error_msg, initial_prot




[docs]
def make_design_unit(
    initial_prot: oechem.OEGraphMol,
    site_residue: str = None,
    protein_sequence: str = None,
):
    """

    Parameters
    ----------
    initial_prot: oechem.OEGraphMol
    site_residue: str, optional
        The site residue used to define the binding site of the protein. i.e. "HIS:41: :A".
        This is necessary when there is no ligand in the input structure, otherwise OpenEye will not know where to put the binding site.
    protein_sequence: str, optional

    Returns
    -------

    """
    opts = get_oe_prep_opts()
    if protein_sequence:
        metadata = get_oe_structure_metadata_from_sequence(
            initial_prot, protein_sequence
        )
    else:
        metadata = oespruce.OEStructureMetadata()

    if site_residue:
        dus = list(
            oespruce.OEMakeDesignUnits(initial_prot, metadata, opts, site_residue)
        )
    else:
        dus = list(oespruce.OEMakeDesignUnits(initial_prot, metadata, opts))
    try:
        du = dus[0]
        if not du.HasProtein():
            raise ValueError(f"Resulting design unit '{du.GetTitle()}' has no protein.")
        if not site_residue and not du.HasLigand():
            raise ValueError(f"Resulting design unit '{du.GetTitle()}' has no ligand.")

        # Generate docking receptor for each DU
        oedocking.OEMakeReceptor(du)
        success = True
    except IndexError:
        success = False
        du = None
    return success, du




[docs]
def make_du_from_new_lig(
    protein: oechem.OEGraphMol,
    lig: oechem.OEGraphMol,
    opts: oespruce.OEMakeDesignUnitOptions = None,
):
    """
    Make a design unit from a protein and ligand. Does not resolve clashes,
    and should really only be used to guide docking.

    Parameters
    ----------
    protein : oechem.OEGraphMol
        Protein molecule
    lig : oechem.OEGraphMol
        Ligand molecule
    opts : oechem.OEMakeDesignUnitOptions, optional
        Options for making the design unit, by default the options from `get_oe_prep_opts` will be used.
    """
    if not opts:
        opts = get_oe_prep_opts()
    du = oechem.OEDesignUnit()
    success = oespruce.OEMakeDesignUnit(du, protein, lig, opts)
    return success, du




[docs]
def superpose_molecule(ref_mol, mobile_mol, ref_chain="A", mobile_chain="A"):
    """
    Superpose `mobile_mol` onto `ref_mol`.

    Parameters
    ----------
    ref_mol : oechem.OEGraphMol
        Reference molecule to align to.
    mobile_mol : oechem.OEGraphMol
        Molecule to align.
    ref_chain : Reference chain to align to
    mobile_chain : Mobile chain to use for alignment (the whole molecule will move as well though)

    Returns
    -------
    oechem.OEGraphMol
        New aligned molecule.
    float
        RMSD between `ref_mol` and `mobile_mol` after alignment.
    """
    chains_in_ref = find_component_chains(ref_mol, "protein", sort_by="size")
    if ref_chain not in chains_in_ref or ref_chain is None:
        warnings.warn(
            f"Chain {ref_chain} not found in reference molecule: chains {chains_in_ref}, using largest chain as reference {chains_in_ref[0]}"
        )
        ref_chain = chains_in_ref[0]

    chains_in_mobile = find_component_chains(mobile_mol, "protein", sort_by="size")
    if mobile_chain not in chains_in_mobile or mobile_chain is None:
        warnings.warn(
            f"Chain {mobile_chain} not found in mobile molecule: chains {chains_in_mobile}, using largest chain {chains_in_mobile[0]}"
        )
        mobile_chain = chains_in_mobile[0]

    if ref_chain != mobile_chain:
        warnings.warn(
            f"Chains {ref_chain} and {mobile_chain} are not the same, this may not be what you want"
        )
    ref_pred = oechem.OEHasChainID(ref_chain)
    mobile_pred = oechem.OEHasChainID(mobile_chain)

    # Create object to store results
    aln_res = oespruce.OESuperposeResults()

    # Set up superposing object and set reference molecule
    superpos = oespruce.OESuperpose()
    superpos.SetupRef(ref_mol, ref_pred)

    # Perform superposing
    superpos.Superpose(aln_res, mobile_mol, mobile_pred)
    # print(f"RMSD: {aln_res.GetRMSD()}")

    # Create copy of molecule and transform it to the aligned position
    mobile_mol_aligned = mobile_mol.CreateCopy()
    aln_res.Transform(mobile_mol_aligned)
    return mobile_mol_aligned, aln_res.GetRMSD()




[docs]
def mutate_residues(input_mol, res_list, protein_chains=None, place_h=True):
    """
    Mutate residues in the input molecule using OpenEye.
    TODO: Make this more robust using some kind of sequence alignment.

    Parameters
    ----------
    input_mol : oechem.OEGraphMol
        Input OpenEye molecule
    res_list : List[str]
        List of 3 letter codes for the full sequence of `input_mol`. Must have
        exactly the same number of residues or this will fail
    place_h : bool, default=True
        Whether to place hydrogen atoms

    Returns
    -------
    oechem.OEGraphMol
        Newly mutated molecule
    """
    # Create a copy of the molecule to avoid modifying original molecule
    mut_prot = input_mol.CreateCopy()
    # Get sequence of input protein
    input_mol_chain = [r.GetExtChainID() for r in oechem.OEGetResidues(input_mol)]
    input_mol_seq = [r.GetName() for r in oechem.OEGetResidues(input_mol)]
    input_mol_num = [r.GetResidueNumber() for r in oechem.OEGetResidues(input_mol)]

    # Build mutation map from OEResidue to new res name by indexing from res num
    mut_map = {}
    for old_res_name, res_num, chain, r in zip(
        input_mol_seq,
        input_mol_num,
        input_mol_chain,
        oechem.OEGetResidues(mut_prot),
    ):
        # Skip if not in identified protein chains
        if protein_chains:
            if chain not in protein_chains:
                continue
        # Skip if we're looking at a water
        if old_res_name == "HOH":
            continue
        try:
            new_res = res_list[res_num - 1]
        except IndexError:
            # If the residue number is out of range (because its a water or something
            # weird) then we can skip right on by it
            continue
        if new_res != old_res_name:
            print(res_num, old_res_name, new_res)
            mut_map[r] = new_res

    # Return early if no mutations found
    if len(mut_map) == 0:
        print("No mutations found", flush=True)
        return input_mol

    # Mutate and build sidechains
    oespruce.OEMutateResidues(mut_prot, mut_map)

    # Place hydrogens
    if place_h:
        oechem.OEPlaceHydrogens(mut_prot)

    # Re-percieve residues so that atom number and connect records dont get screwed up
    openeye_perceive_residues(mut_prot, preserve_all=True)

    return mut_prot




[docs]
def build_dimer_from_monomer(prot):
    # Build monomer into dimer as necessary (will need to handle
    #  re-labeling chains since the monomer seems to get the chainID C)
    # Shouldn't affect the protein if the dimer has already been built
    bus = list(oespruce.OEExtractBioUnits(prot))

    # Need to cast to OEGraphMol bc returned type is OEMolBase, which
    #  doesn't pickle
    prot = oechem.OEGraphMol(bus[0])

    # Keep track of chain IDs
    all_chain_ids = {
        r.GetExtChainID()
        for r in oechem.OEGetResidues(prot)
        if all([not oechem.OEIsWater()(a) for a in oechem.OEGetResidueAtoms(prot, r)])
    }
    if len(all_chain_ids) != 2:
        raise AssertionError(f"Chains: {all_chain_ids}")

    print(all_chain_ids)
    return prot




[docs]
def find_component_chains(
    mol: oechem.OEMolBase, component: str, res_name=None, sort_by="alphabetical"
):

    if sort_by not in ["alphabetical", "size"]:
        raise ValueError("sort_by must be either 'alphabetical' or 'size'")

    molcomp = MoleculeComponent(component)

    if res_name:
        # find all chains and their lengths in residues
        chain_lengths = {}
        for res in oechem.OEGetResidues(mol):
            if res.GetName() == res_name:
                chain_id = res.GetChainID()
                if chain_id not in chain_lengths:
                    chain_lengths[chain_id] = 0
                chain_lengths[chain_id] += 1

    elif molcomp.name == MoleculeComponent.PROTEIN.name:
        chain_lengths = {}
        for res in oechem.OEGetResidues(mol):
            if oechem.OEIsStandardProteinResidue(res):
                chain_id = res.GetChainID()
                if chain_id not in chain_lengths:
                    chain_lengths[chain_id] = 0
                chain_lengths[chain_id] += 1

    elif molcomp.name == MoleculeComponent.LIGAND.name:
        chain_lengths = {}
        for res in oechem.OEGetResidues(mol):
            if res.IsHetAtom() and not res.GetName() == "HOH":
                chain_id = res.GetChainID()
                if chain_id not in chain_lengths:
                    chain_lengths[chain_id] = 0
                chain_lengths[chain_id] += 1

    if sort_by == "size":
        chainids = sorted(
            chain_lengths.keys(), key=lambda x: chain_lengths[x], reverse=True
        )
    elif sort_by == "alphabetical":
        chainids = sorted(chain_lengths.keys())

    return chainids




[docs]
def du_to_complex(du, include_solvent=False):
    """
    Convert OEDesignUnit to OEGraphMol containing the protein and ligand from
    `du`.

    Parameters
    ----------
    du : oechem.OEDesignUnit
        OEDesignUnit object to extract from.
    include_solvent : bool, default=False
        Whether to include solvent molecules.

    Returns
    -------
    oechem.OEGraphMol
        Molecule with protein and ligand from `du`
    """
    complex_mol = oechem.OEGraphMol()
    comp_tag = (
        oechem.OEDesignUnitComponents_Protein | oechem.OEDesignUnitComponents_Ligand
    )
    if include_solvent:
        comp_tag |= oechem.OEDesignUnitComponents_Solvent
    du.GetComponents(complex_mol, comp_tag)

    complex_mol = openeye_perceive_residues(complex_mol)

    return complex_mol