Source code for drugforge.dataviz.plip

import logging  # noqa: F401
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import xmltodict
import yaml
from drugforge.data.backend.openeye import (
    combine_protein_ligand,
    oechem,
    save_openeye_pdb,
)
from drugforge.data.metadata.resources import FINTSCORE_PARAMETERS
from drugforge.data.services.postera.manifold_data_validation import TargetTags
from drugforge.dataviz._gif_blocks import GIFBlockData
from drugforge.spectrum.fitness import parse_fitness_json, target_has_fitness_data

logger = logging.getLogger(__name__)



[docs]
def make_color_res_subpockets(protein, target) -> dict[str, str]:
    """
    Based on subpocket coloring, creates a dict where keys are colors, values are residue numbers.
    """
    # get a list of all residue numbers of the protein.
    protein_residues = [
        oechem.OEAtomGetResidue(atom).GetResidueNumber() for atom in protein.GetAtoms()
    ]

    # build a dict with all specified residue colorings.
    color_res_dict = {}
    binding_pocket_chainID = GIFBlockData.pocket_dict_chains_per_target[target]

    for subpocket, color in GIFBlockData.get_color_dict(target).items():
        subpocket_residues = GIFBlockData.get_pocket_dict(target)[subpocket].split("+")
        color_res_dict[color] = [
            f"{res}_{binding_pocket_chainID}" for res in subpocket_residues
        ]

    # set any non-specified residues to white.
    treated_res_nums = [
        f"{res}_{binding_pocket_chainID}"
        for sublist in color_res_dict.values()
        for res in sublist
    ]
    non_treated_res_nums = [
        f"{res}_{binding_pocket_chainID}"
        for res in set(protein_residues)
        if res not in treated_res_nums
    ]
    color_res_dict["white"] = non_treated_res_nums

    return color_res_dict




[docs]
def make_color_res_fitness(protein, target) -> dict[str, str]:
    """
    Based on fitness coloring, creates a dict where keys are colors, values are residue numbers.
    """

    # get a list of all residue numbers of the protein.
    protein_residues = [
        oechem.OEAtomGetResidue(atom).GetResidueNumber() for atom in protein.GetAtoms()
    ]
    protein_chainIDs = [
        oechem.OEAtomGetResidue(atom).GetChainID() for atom in protein.GetAtoms()
    ]

    hex_color_codes = [
        "#ffffff",
        "#ff9e83",
        "#ff8a6c",
        "#ff7454",
        "#ff5c3d",
        "#ff3f25",
        "#ff0707",
    ]

    color_res_dict = {}
    json_data = parse_fitness_json(target)
    for res_num, chain in set(zip(protein_residues, protein_chainIDs)):
        try:
            # color residue white->red depending on fitness value.
            color_index_to_grab = json_data[f"{res_num}_{chain}"]
            try:
                color = hex_color_codes[color_index_to_grab]
            except IndexError:
                # insane residue that has tons of fit mutants; just assign the darkest red.
                color = hex_color_codes[-1]
            if color not in color_res_dict:
                color_res_dict[color] = [f"{res_num}_{chain}"]
            else:
                color_res_dict[color].append(f"{res_num}_{chain}")
        except KeyError:
            # fitness data is missing for this residue, color blue instead.
            color = "#642df0"
            if color not in color_res_dict:
                color_res_dict[color] = [f"{res_num}_{chain}"]
            else:
                color_res_dict[color].append(f"{res_num}_{chain}")

    return color_res_dict




[docs]
def get_interaction_color(intn_type) -> str:
    """
    Generated using PLIP docs; colors match PyMol interaction colors. See
    https://github.com/pharmai/plip/blob/master/DOCUMENTATION.md
    converted RGB to HEX using https://www.rgbtohex.net/
    """
    if intn_type == "hydrophobic_interaction":
        return "#808080"
    elif intn_type == "hydrogen_bond":
        return "#0000FF"
    elif intn_type == "water_bridge":
        return "#BFBFFF"
    elif intn_type == "salt_bridge":
        return "#FFFF00"
    elif intn_type == "pi_stack":
        return "#00FF00"
    elif intn_type == "pi_cation_interaction":
        return "#FF8000"
    elif intn_type == "halogen_bond":
        return "#36FFBF"
    elif intn_type == "metal_complex":
        return "#8C4099"
    else:
        raise ValueError(f"Interaction type {intn_type} not recognized.")




[docs]
def is_backbone_residue(protein, x, y, z) -> bool:
    """
    Given xyz coordinates, find the atom in the protein and return whether
    it is a backbone atom. This would be much easier if PLIP would return
    the atom idx of the protein, currently all we have are the coordinates.
    """
    # make a list with this protein's backbone atom indices. Could do higher up,
    # but this is very fast so ok to repeat.
    backbone_atoms = [at.GetIdx() for at in protein.GetAtoms(oechem.OEIsBackboneAtom())]

    # with oe, iterate over atoms until this one's found. then use oechem.OEIsBackboneAtom
    is_backbone = False
    for idx, res_coords in protein.GetCoords().items():
        # round to 3 because OE pointlessly extends the coordinates float.
        if (
            float(x) == round(res_coords[0], 3)
            and float(y) == round(res_coords[1], 3)
            and float(z) == round(res_coords[2], 3)
        ):
            is_backbone = True if idx in backbone_atoms else False

    if is_backbone:
        return True
    else:
        # this also catches pi-pi stack where protein coordinates are centered to a ring (e.g. Phe),
        # in which case the above coordinate matching doesn't find any atoms. pi-pi of this form
        # can never be on backbone anyway, so this works.
        return False




[docs]
def get_interaction_fitness_color(plip_xml_dict, protein, target) -> str:
    """
    Get fitness color for a residue. If the interaction is with a backbone atom on
    the residue, color it green.
    """
    # first get the fitness color of the residue the interaction hits, this
    # can be white->red or blue if fitness data is missing.
    intn_color = None
    for fitness_color, res_ids in make_color_res_fitness(protein, target).items():
        if f"{plip_xml_dict['resnr']}_{plip_xml_dict['reschain']}" in res_ids:
            intn_color = fitness_color
            break

    # overwrite the interaction as green if it hits a backbone atom.
    if is_backbone_residue(
        protein,
        plip_xml_dict["protcoo"]["x"],
        plip_xml_dict["protcoo"]["y"],
        plip_xml_dict["protcoo"]["z"],
    ):
        intn_color = "#008000"

    return intn_color




[docs]
def build_interaction_dict(
    plip_xml_dict, intn_counter, intn_type, color_method, protein, target
):
    """
    Parses a PLIP interaction dict and builds the dict key values needed for 3DMol.
    """
    k = f"{intn_counter}_{plip_xml_dict['restype']}{plip_xml_dict['resnr']}.{plip_xml_dict['reschain']}"

    if color_method == "fitness":
        intn_color = get_interaction_fitness_color(plip_xml_dict, protein, target)
    else:
        intn_color = get_interaction_color(intn_type)
    v = {
        "lig_at_x": plip_xml_dict["ligcoo"]["x"],
        "lig_at_y": plip_xml_dict["ligcoo"]["y"],
        "lig_at_z": plip_xml_dict["ligcoo"]["z"],
        "prot_at_x": plip_xml_dict["protcoo"]["x"],
        "prot_at_y": plip_xml_dict["protcoo"]["y"],
        "prot_at_z": plip_xml_dict["protcoo"]["z"],
        "type": intn_type,
        "color": intn_color,
    }
    return k, v




[docs]
def get_interactions_plip(protein, pose, color_method, target) -> dict:
    """
    Get protein-ligand interactions according to PLIP.

    TODO:
    currently this uses a tmp PDB file, uses PLIP CLI (python package throws
    ```
    libc++abi: terminating with uncaught exception of type swig::stop_iteration
    Abort trap: 6
    ```
    ), then parses XML to get interactions. This is a bit convoluted, we could refactor
    this to use OE's InteractionHints instead? ProLIF struggles to detect incorrections
    because it's v sensitive to protonation. PLIP does protonation itself.
    """
    # create the complex PDB file.
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_pdb = os.path.join(tmpdirname, "tmp_complex.pdb")

        save_openeye_pdb(combine_protein_ligand(protein, pose), tmp_pdb)

        # run the PLIP CLI.
        subprocess.run(["plip", "-f", tmp_pdb, "-x", "-o", tmpdirname])

        # load the XML produced by PLIP that contains all the interaction data.
        intn_dict_xml = xmltodict.parse(
            ET.tostring(
                ET.parse(os.path.join(tmpdirname, "tmp_complex_report.xml")).getroot()
            )
        )

    intn_dict = {}
    intn_counter = 0
    # wrangle all interactions into a dict that can be read directly by 3DMol.

    # if there is only one site we get a dict, otherwise a list of dicts.
    sites = intn_dict_xml["report"]["bindingsite"]
    if isinstance(sites, dict):
        sites = [sites]

    for bs in sites:
        for _, data in bs["interactions"].items():
            if data:
                # we build keys for the dict to be unique, so no interactions are overwritten
                for intn_type, intn_data in data.items():
                    if isinstance(
                        intn_data, list
                    ):  # multiple interactions of this type
                        for intn_data_i in intn_data:
                            k, v = build_interaction_dict(
                                intn_data_i,
                                intn_counter,
                                intn_type,
                                color_method,
                                protein,
                                target,
                            )
                            intn_dict[k] = v
                            intn_counter += 1

                    elif isinstance(intn_data, dict):  # single interaction of this type
                        k, v = build_interaction_dict(
                            intn_data,
                            intn_counter,
                            intn_type,
                            color_method,
                            protein,
                            target,
                        )
                        intn_dict[k] = v
                        intn_counter += 1

    return intn_dict



# this should be placed around that area as well, but FINTscore should be added to docking scores by default

[docs]
def compute_fint_score(
    protein: oechem.OEMol, pose: oechem.OEMol, target: TargetTags
) -> tuple[float, float]:
    """
    Compute the Fitness Interaction Score (FINTscore) given a dict with interactions generated by PLIP.

    Parameters
    ----------
    protein: oechem.OEMol
        Protein molecule
    pose: oechem.OEMol
        Pose molecule
    target: str
        Target name


    Returns
    ----------
    intn_score: float
        Score based purely on interactions, without penalties applied
    fint_score: float
        FINTscore which is computed as the `intn_score` with penalties applied
    """
    if not target_has_fitness_data(target):
        raise ValueError(
            f"Target {target} does not have fitness data, cannot compute FINTscore."
        )

    # read YAML file that contains settings for rewards/penalties of interaction types.
    fintscore_parameters = yaml.safe_load(Path(FINTSCORE_PARAMETERS).read_text())

    # set empty parameters to add to when iterating over interactions.
    penalty_multipliers = 1
    reward_multipliers = 1
    intn_score_bucket = []

    # iterate over each interaction that was found.
    intn_dict = get_interactions_plip(protein, pose, "fitness", target)

    for _, data in intn_dict.items():
        # if the interaction is with backbone, add a reward to the score.
        if data["color"] == "#008000":
            reward_multipliers += 1

        # if the interaction is with a residue that is shown to be able to mutate, add a penalty to the score.
        if data["color"] in fintscore_parameters.keys():
            penalty_multipliers += fintscore_parameters[data["color"]]

        # compute this interaction's score (set in metadata yaml).
        intn_score_bucket.append(fintscore_parameters[data["type"]])

    # simply compute the mean score, will end up being between 0.5 and 1.0, typically.
    # we need to take the mean because we don't want a compound with many interactions being favored by this score.
    intn_score = np.mean(intn_score_bucket)

    # now compute the FINTscore by applying the reward/penalty correction terms.
    # this follows FINT_{score} = INT_{score} * (REWARD*N_{backbone}) * PENALTY^{N_{mutable}},
    # where PENALTY <= 1.0 <= REWARD.
    fint_score = (
        intn_score
        * reward_multipliers
        * fintscore_parameters["backbone_reward_multiplier"]
        * fintscore_parameters["mutating_intn_penalty_multiplier"]
        ** penalty_multipliers
    )

    # finally, in some cases with lots of reward the FINTscore can shoot over 1.0; then just set as 1.0.
    if fint_score > 1.0:
        fint_score = 1.0

    return intn_score, fint_score