Source code for drugforge.dataviz.plip

import logging  # noqa: F401
import os
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import xmltodict
import yaml
from drugforge.data.backend.openeye import (
    combine_protein_ligand,
    oechem,
    save_openeye_pdb,
)
from drugforge.data.metadata.resources import FINTSCORE_PARAMETERS
from drugforge.data.services.postera.manifold_data_validation import TargetTags
from drugforge.dataviz._gif_blocks import GIFBlockData
from drugforge.spectrum.fitness import parse_fitness_json, target_has_fitness_data

logger = logging.getLogger(__name__)


[docs] def make_color_res_subpockets(protein, target) -> dict[str, str]: """ Based on subpocket coloring, creates a dict where keys are colors, values are residue numbers. """ # get a list of all residue numbers of the protein. protein_residues = [ oechem.OEAtomGetResidue(atom).GetResidueNumber() for atom in protein.GetAtoms() ] # build a dict with all specified residue colorings. color_res_dict = {} binding_pocket_chainID = GIFBlockData.pocket_dict_chains_per_target[target] for subpocket, color in GIFBlockData.get_color_dict(target).items(): subpocket_residues = GIFBlockData.get_pocket_dict(target)[subpocket].split("+") color_res_dict[color] = [ f"{res}_{binding_pocket_chainID}" for res in subpocket_residues ] # set any non-specified residues to white. treated_res_nums = [ f"{res}_{binding_pocket_chainID}" for sublist in color_res_dict.values() for res in sublist ] non_treated_res_nums = [ f"{res}_{binding_pocket_chainID}" for res in set(protein_residues) if res not in treated_res_nums ] color_res_dict["white"] = non_treated_res_nums return color_res_dict
[docs] def make_color_res_fitness(protein, target) -> dict[str, str]: """ Based on fitness coloring, creates a dict where keys are colors, values are residue numbers. """ # get a list of all residue numbers of the protein. protein_residues = [ oechem.OEAtomGetResidue(atom).GetResidueNumber() for atom in protein.GetAtoms() ] protein_chainIDs = [ oechem.OEAtomGetResidue(atom).GetChainID() for atom in protein.GetAtoms() ] hex_color_codes = [ "#ffffff", "#ff9e83", "#ff8a6c", "#ff7454", "#ff5c3d", "#ff3f25", "#ff0707", ] color_res_dict = {} json_data = parse_fitness_json(target) for res_num, chain in set(zip(protein_residues, protein_chainIDs)): try: # color residue white->red depending on fitness value. color_index_to_grab = json_data[f"{res_num}_{chain}"] try: color = hex_color_codes[color_index_to_grab] except IndexError: # insane residue that has tons of fit mutants; just assign the darkest red. color = hex_color_codes[-1] if color not in color_res_dict: color_res_dict[color] = [f"{res_num}_{chain}"] else: color_res_dict[color].append(f"{res_num}_{chain}") except KeyError: # fitness data is missing for this residue, color blue instead. color = "#642df0" if color not in color_res_dict: color_res_dict[color] = [f"{res_num}_{chain}"] else: color_res_dict[color].append(f"{res_num}_{chain}") return color_res_dict
[docs] def get_interaction_color(intn_type) -> str: """ Generated using PLIP docs; colors match PyMol interaction colors. See https://github.com/pharmai/plip/blob/master/DOCUMENTATION.md converted RGB to HEX using https://www.rgbtohex.net/ """ if intn_type == "hydrophobic_interaction": return "#808080" elif intn_type == "hydrogen_bond": return "#0000FF" elif intn_type == "water_bridge": return "#BFBFFF" elif intn_type == "salt_bridge": return "#FFFF00" elif intn_type == "pi_stack": return "#00FF00" elif intn_type == "pi_cation_interaction": return "#FF8000" elif intn_type == "halogen_bond": return "#36FFBF" elif intn_type == "metal_complex": return "#8C4099" else: raise ValueError(f"Interaction type {intn_type} not recognized.")
[docs] def is_backbone_residue(protein, x, y, z) -> bool: """ Given xyz coordinates, find the atom in the protein and return whether it is a backbone atom. This would be much easier if PLIP would return the atom idx of the protein, currently all we have are the coordinates. """ # make a list with this protein's backbone atom indices. Could do higher up, # but this is very fast so ok to repeat. backbone_atoms = [at.GetIdx() for at in protein.GetAtoms(oechem.OEIsBackboneAtom())] # with oe, iterate over atoms until this one's found. then use oechem.OEIsBackboneAtom is_backbone = False for idx, res_coords in protein.GetCoords().items(): # round to 3 because OE pointlessly extends the coordinates float. if ( float(x) == round(res_coords[0], 3) and float(y) == round(res_coords[1], 3) and float(z) == round(res_coords[2], 3) ): is_backbone = True if idx in backbone_atoms else False if is_backbone: return True else: # this also catches pi-pi stack where protein coordinates are centered to a ring (e.g. Phe), # in which case the above coordinate matching doesn't find any atoms. pi-pi of this form # can never be on backbone anyway, so this works. return False
[docs] def get_interaction_fitness_color(plip_xml_dict, protein, target) -> str: """ Get fitness color for a residue. If the interaction is with a backbone atom on the residue, color it green. """ # first get the fitness color of the residue the interaction hits, this # can be white->red or blue if fitness data is missing. intn_color = None for fitness_color, res_ids in make_color_res_fitness(protein, target).items(): if f"{plip_xml_dict['resnr']}_{plip_xml_dict['reschain']}" in res_ids: intn_color = fitness_color break # overwrite the interaction as green if it hits a backbone atom. if is_backbone_residue( protein, plip_xml_dict["protcoo"]["x"], plip_xml_dict["protcoo"]["y"], plip_xml_dict["protcoo"]["z"], ): intn_color = "#008000" return intn_color
[docs] def build_interaction_dict( plip_xml_dict, intn_counter, intn_type, color_method, protein, target ): """ Parses a PLIP interaction dict and builds the dict key values needed for 3DMol. """ k = f"{intn_counter}_{plip_xml_dict['restype']}{plip_xml_dict['resnr']}.{plip_xml_dict['reschain']}" if color_method == "fitness": intn_color = get_interaction_fitness_color(plip_xml_dict, protein, target) else: intn_color = get_interaction_color(intn_type) v = { "lig_at_x": plip_xml_dict["ligcoo"]["x"], "lig_at_y": plip_xml_dict["ligcoo"]["y"], "lig_at_z": plip_xml_dict["ligcoo"]["z"], "prot_at_x": plip_xml_dict["protcoo"]["x"], "prot_at_y": plip_xml_dict["protcoo"]["y"], "prot_at_z": plip_xml_dict["protcoo"]["z"], "type": intn_type, "color": intn_color, } return k, v
[docs] def get_interactions_plip(protein, pose, color_method, target) -> dict: """ Get protein-ligand interactions according to PLIP. TODO: currently this uses a tmp PDB file, uses PLIP CLI (python package throws ``` libc++abi: terminating with uncaught exception of type swig::stop_iteration Abort trap: 6 ``` ), then parses XML to get interactions. This is a bit convoluted, we could refactor this to use OE's InteractionHints instead? ProLIF struggles to detect incorrections because it's v sensitive to protonation. PLIP does protonation itself. """ # create the complex PDB file. with tempfile.TemporaryDirectory() as tmpdirname: tmp_pdb = os.path.join(tmpdirname, "tmp_complex.pdb") save_openeye_pdb(combine_protein_ligand(protein, pose), tmp_pdb) # run the PLIP CLI. subprocess.run(["plip", "-f", tmp_pdb, "-x", "-o", tmpdirname]) # load the XML produced by PLIP that contains all the interaction data. intn_dict_xml = xmltodict.parse( ET.tostring( ET.parse(os.path.join(tmpdirname, "tmp_complex_report.xml")).getroot() ) ) intn_dict = {} intn_counter = 0 # wrangle all interactions into a dict that can be read directly by 3DMol. # if there is only one site we get a dict, otherwise a list of dicts. sites = intn_dict_xml["report"]["bindingsite"] if isinstance(sites, dict): sites = [sites] for bs in sites: for _, data in bs["interactions"].items(): if data: # we build keys for the dict to be unique, so no interactions are overwritten for intn_type, intn_data in data.items(): if isinstance( intn_data, list ): # multiple interactions of this type for intn_data_i in intn_data: k, v = build_interaction_dict( intn_data_i, intn_counter, intn_type, color_method, protein, target, ) intn_dict[k] = v intn_counter += 1 elif isinstance(intn_data, dict): # single interaction of this type k, v = build_interaction_dict( intn_data, intn_counter, intn_type, color_method, protein, target, ) intn_dict[k] = v intn_counter += 1 return intn_dict
# this should be placed around that area as well, but FINTscore should be added to docking scores by default
[docs] def compute_fint_score( protein: oechem.OEMol, pose: oechem.OEMol, target: TargetTags ) -> tuple[float, float]: """ Compute the Fitness Interaction Score (FINTscore) given a dict with interactions generated by PLIP. Parameters ---------- protein: oechem.OEMol Protein molecule pose: oechem.OEMol Pose molecule target: str Target name Returns ---------- intn_score: float Score based purely on interactions, without penalties applied fint_score: float FINTscore which is computed as the `intn_score` with penalties applied """ if not target_has_fitness_data(target): raise ValueError( f"Target {target} does not have fitness data, cannot compute FINTscore." ) # read YAML file that contains settings for rewards/penalties of interaction types. fintscore_parameters = yaml.safe_load(Path(FINTSCORE_PARAMETERS).read_text()) # set empty parameters to add to when iterating over interactions. penalty_multipliers = 1 reward_multipliers = 1 intn_score_bucket = [] # iterate over each interaction that was found. intn_dict = get_interactions_plip(protein, pose, "fitness", target) for _, data in intn_dict.items(): # if the interaction is with backbone, add a reward to the score. if data["color"] == "#008000": reward_multipliers += 1 # if the interaction is with a residue that is shown to be able to mutate, add a penalty to the score. if data["color"] in fintscore_parameters.keys(): penalty_multipliers += fintscore_parameters[data["color"]] # compute this interaction's score (set in metadata yaml). intn_score_bucket.append(fintscore_parameters[data["type"]]) # simply compute the mean score, will end up being between 0.5 and 1.0, typically. # we need to take the mean because we don't want a compound with many interactions being favored by this score. intn_score = np.mean(intn_score_bucket) # now compute the FINTscore by applying the reward/penalty correction terms. # this follows FINT_{score} = INT_{score} * (REWARD*N_{backbone}) * PENALTY^{N_{mutable}}, # where PENALTY <= 1.0 <= REWARD. fint_score = ( intn_score * reward_multipliers * fintscore_parameters["backbone_reward_multiplier"] * fintscore_parameters["mutating_intn_penalty_multiplier"] ** penalty_multipliers ) # finally, in some cases with lots of reward the FINTscore can shoot over 1.0; then just set as 1.0. if fint_score > 1.0: fint_score = 1.0 return intn_score, fint_score