Source code for drugforge.data.services.cdd.cdd_api

import json
import time
from typing import Optional

import pandas
from drugforge.data.services.services_config import CDDSettings
from drugforge.data.services.web_utils import _BaseWebAPI



[docs]
class CDDAPI(_BaseWebAPI):
    """
    An interface to the CDD JSON API which allows you to search for molecules, protocols and readouts like IC50.
    """


[docs]
    def __init__(self, url: str, api_version: str, api_key: str, vault: str):
        """
        Initialise the base web API and point the API url at a single vault.

        Args:
            url: Forwarded unchanged to the base class initialiser.
            api_version: Forwarded unchanged to the base class initialiser.
            api_key: Forwarded unchanged to the base class initialiser.
            vault: The vault identifier appended to the API url as ``/vaults/{vault}/``.
        """
        super().__init__(url=url, api_version=api_version, api_key=api_key)
        # every endpoint used by this class is addressed relative to the vault path
        vault_path = f"/vaults/{vault}/"
        self.api_url = self.api_url + vault_path



[docs]
    @classmethod
    def token_name(cls) -> str:
        return "X-CDD-token"


    @classmethod
    def from_settings(cls, settings: CDDSettings):
        return cls(
            url=settings.CDD_API_URL,
            api_version=settings.CDD_API_VERSION,
            api_key=settings.CDD_API_KEY,
            vault=settings.CDD_VAULT_NUMBER,
        )


[docs]
    def get_molecules(
        self,
        smiles: Optional[str] = None,
        names: Optional[list[str]] = None,
        compound_ids: Optional[list[int]] = None,
    ) -> Optional[list[dict]]:
        """
        Search for molecules in the CDD vault.

        Notes:
            CDD only allows for a single structure searches via smiles, multiple molecules can be downloaded when using
            names or compound_ids.
            If molecule ids are missing in CDD we only return the subset that can be found

        Args:
            smiles: The smiles of the molecule to search for.
            names: The list of names of molecules which should be searched in the CDD.
            compound_ids: The list of CDD compound ids of molecules we wish to search for.

        Returns:
            A list of molecules found in the CDD.

        """
        if len([i for i in [smiles, names, compound_ids] if i is not None]) > 1:
            raise ValueError(
                "The arguments `smiles`, `names` and `compound_ids` are mutually exclusive provide only one."
            )

        mol_data = {"only_batch_ids": "true"}
        if smiles is not None:
            mol_data["structure"] = smiles
            mol_data["no_structures"] = "true"
            mol_data["structure_search_type"] = "exact"
        elif names is not None:
            mol_data["names"] = names
            mol_data["async"] = "true"
        else:
            mol_data["molecules"] = compound_ids
            mol_data["async"] = "true"
        result = json.loads(
            self._session.get(
                url=self.api_url + "molecules/", json=mol_data
            ).content.decode()
        )
        # handle missing molecules, originally found when searching moonshot data
        if "error" in result:
            import re

            # extract the list of missing molecule ids
            missing_mols = []
            for match in re.finditer("[0-9]+", result["error"]):
                missing_mols.append(int(match.group()))
            to_find = [mol for mol in compound_ids if mol not in missing_mols]
            mol_data["molecules"] = to_find
            # run the search again
            result = json.loads(
                self._session.get(
                    url=self.api_url + "molecules/", json=mol_data
                ).content.decode()
            )
        if "async" in mol_data:
            result = self.get_async_export(job_id=result["id"])
        if result["count"] == 0:
            return None
        else:
            return result["objects"]



[docs]
    def get_protocols(
        self,
        protocol_names: Optional[list[str]] = None,
    ) -> list[dict]:
        """
        Search for a specific protocol.

        Args:
            protocol_names: The list of protocol names to search for, if not provided all protocols will be pulled.

        Returns:
            A list of protocols associated with the given name
        """
        protocol_data = {}
        if protocol_names is not None:
            protocol_data["names"] = protocol_names
        result = self._session.get(url=self.api_url + "protocols", json=protocol_data)
        result_data = json.loads(result.content.decode())
        return result_data["objects"]



[docs]
    def get_readout_rows(
        self,
        protocol: int,
        molecule_ids: Optional[list[int]] = None,
        types: Optional[list[str]] = None,
    ) -> Optional[list[dict]]:
        """
        Get the readout data for a specific protocol performed on a set of molecules.

        Args:
            molecule_ids: The CDD ids of the molecules to get the values for if None all molecules under this protocol will be downloaded.
            protocol: The id of the protocol to use in the search.
            types: A list of readout types to pull the results for.

        Returns:
            A dictionary of the readout data matching the search. The actual values are stored under `readouts`.
        """
        readout_data = {
            "protocols": [protocol],
            "async": "true",  # use async as we may have many results
        }
        if types is not None:
            readout_data["type"] = types
        if molecule_ids is not None:
            readout_data["molecules"] = molecule_ids
        result = self._session.get(url=self.api_url + "readout_rows", json=readout_data)
        request_id = json.loads(result.content.decode())["id"]
        result_data = self.get_async_export(job_id=request_id)
        if result_data["count"] == 0:
            return None
        else:
            return result_data["objects"]



[docs]
    def get_async_export(self, job_id: int) -> dict:
        """
        A helper method to gather async request results.

        Args:
            job_id: The id of the request we want the results for.

        Notes:
            This function waits till the request is complete before returning the results.

        Returns:
            The finished request.
        """
        done = False
        while not done:
            result = json.loads(
                self._session.get(
                    url=self.api_url + f"exports/{job_id}"
                ).content.decode()
            )
            if "objects" not in result:
                time.sleep(1)
            else:
                return result



[docs]
    def get_ic50_data(
        self, protocol_name: str
    ) -> Optional[
        pandas.DataFrame
    ]:  # TODO: remove duplication with the below readout method
        """
        A convenience method which wraps the required function calls to gather the raw ic50 data from the CDD for the
        calculated as part of the named protocol.

        Args:
            protocol_name: The name of the protocol we want all IC50 result for.

        Returns:
            A list of dictionaries containing the IC50 values along with upper and lower CI and curve class for each
            batch measurement on the molecules performed as part of the given protocol.

        """
        # get the id of the protocol we want the readout for
        protocols = self.get_protocols(protocol_names=[protocol_name])
        if protocols:
            protocol = protocols[0]
        else:
            return None
        # define the readouts we want to find and get the ids
        required_data = {
            "IC50": None,
            "IC50 CI (Lower)": None,
            "IC50 CI (Upper)": None,
            "Curve class": None,
        }
        for readout_def in protocol["readout_definitions"]:
            if (readout_name := readout_def["name"]) in required_data:
                # gather the id of result for this readout
                required_data[readout_name] = readout_def["id"]
        # if any of the data is missing return
        if None in required_data:
            return None

        # pull down all batch readouts for this protocol and extract the data
        readout_data = self.get_readout_rows(
            protocol=protocol["id"], types=["batch_run_aggregate_row"]
        )
        # make a list of molecules we want to pull from the CDD
        compound_ids = set()
        # extract the results linking the molecules to the extracted data
        ic50_data = []
        for readout in readout_data:
            try:
                batch_data = {
                    f"{protocol_name}: {key}{' (µM)' if 'IC50' in key else ''}": readout[
                        "readouts"
                    ][
                        str(value)
                    ][
                        "value"
                    ]
                    for key, value in required_data.items()
                }
                # add a placeholder for the molecule data to be added later
                batch_data["name"] = readout["molecule"]
                batch_data["modified_at"] = readout["modified_at"]
                compound_ids.add(readout["molecule"])
                ic50_data.append(batch_data)
            except KeyError:
                # This is triggered if the upper and lower CI values are missing
                # This means the values falls outside the does series
                continue
        # gather the molecules
        molecule_data = self.get_molecules(compound_ids=list(compound_ids))
        compounds_by_id = {molecule["id"]: molecule for molecule in molecule_data}
        # loop over the list again and update the molecule info
        final_data = []
        for compound_data in ic50_data:
            try:
                mol_data = compounds_by_id[compound_data["name"]]
                compound_data["Smiles"] = mol_data["smiles"]
                compound_data["Inchi"] = mol_data["inchi"]
                compound_data["Inchi Key"] = mol_data["inchi_key"]
                compound_data["Molecule Name"] = mol_data["name"]
                compound_data["CXSmiles"] = mol_data["cxsmiles"]

                final_data.append(compound_data)
            except KeyError:
                continue

        return pandas.DataFrame(final_data)


    def get_readout(
        self, protocol_name: str, readout: str
    ) -> Optional[pandas.DataFrame]:
        # get the id of the protocol we want the readout for
        protocols = self.get_protocols(protocol_names=[protocol_name])
        if protocols:
            protocol = protocols[0]
        else:
            return None

        readout_ids = {}
        for readout_def in protocol["readout_definitions"]:
            readout_ids[readout_def["name"]] = readout_def["id"]

        if readout not in readout_ids:
            raise ValueError(
                f"Column {readout} not found in protocol {protocol_name}, available columns: {set(readout_ids.keys())}"
            )

        readout_data = self.get_readout_rows(protocol=protocol["id"])
        compound_ids = set()

        coldata = []
        for readout_elem in readout_data:
            try:
                batch_data = {}
                batch_data[readout] = readout_elem["readouts"][
                    str(readout_ids[readout])
                ]["value"]
                batch_data["name"] = readout_elem["molecule"]
                batch_data["modified_at"] = readout_elem["modified_at"]
                compound_ids.add(readout_elem["molecule"])
                coldata.append(batch_data)
            except KeyError:
                continue

        molecule_data = self.get_molecules(compound_ids=list(compound_ids))
        compounds_by_id = {molecule["id"]: molecule for molecule in molecule_data}
        final_data = []
        for compound_data in coldata:
            try:
                mol_data = compounds_by_id[compound_data["name"]]
                compound_data["Smiles"] = mol_data["smiles"]
                compound_data["Inchi"] = mol_data["inchi"]
                compound_data["Inchi Key"] = mol_data["inchi_key"]
                compound_data["Molecule Name"] = mol_data["name"]
                compound_data["CXSmiles"] = mol_data["cxsmiles"]

                final_data.append(compound_data)
            except KeyError:
                continue

        return pandas.DataFrame(final_data)