Source code for drugforge.data.services.fragalysis.fragalysis_download

import os
from pathlib import Path
from zipfile import ZipFile

import requests
from drugforge.data.schema.legacy import CrystalCompoundData
from drugforge.data.util.stringenum import StringEnum

# Endpoint of the Fragalysis structure-download API. This is the default
# `base_url` of `download`, which POSTs the request payload here and then GETs
# the prepared archive from the same URL.
BASE_URL = "https://fragalysis.diamond.ac.uk/api/download_structures/"
# Same API path on the fragalysis-legacy host.
# NOTE(review): presumably intended to be passed as `base_url` to `download`
# together with API_CALL_BASE_LEGACY -- confirm against callers.
BASE_URL_LEGACY = (
    "https://fragalysis-legacy.xchem.diamond.ac.uk/api/download_structures/"
)

# Template of the JSON payload for the POST call made by `download`.
# The string fields ("target_name", "proteins", "file_url") are left empty here;
# presumably callers copy this dict and fill in at least "target_name" before
# passing it as `api_call` -- confirm against callers.
# NOTE(review): the boolean flags are defined by the Fragalysis API; they look
# like switches selecting which kinds of files go into the archive (and
# "NAN"/"NAN2" look like placeholder fields) -- confirm against the API docs.
API_CALL_BASE = {
    "target_name": "",
    "proteins": "",
    "event_info": False,
    "sigmaa_info": False,
    "diff_info": False,
    "trans_matrix_info": False,
    "NAN": False,
    "mtz_info": False,
    "cif_info": False,
    "NAN2": False,
    "map_info": False,
    "single_sdf_file": True,
    "sdf_info": True,
    "pdb_info": False,
    "bound_info": True,
    "metadata_info": True,
    "smiles_info": True,
    "static_link": False,
    "file_url": "",
}


# Reduced payload template containing only the string fields, all left empty.
# NOTE(review): presumably the counterpart of API_CALL_BASE for requests sent
# to BASE_URL_LEGACY -- confirm against callers.
API_CALL_BASE_LEGACY = {
    "target_name": "",
    "file_url": "",
    "proteins": "",
}



[docs]
class FragalysisTargets(StringEnum):
    """
    Enumeration of known Fragalysis targets.

    Each member's value is a target name string.
    NOTE(review): presumably these values are what goes into the
    "target_name" field of the download API payload -- confirm against callers.
    """

    SARS = "Mpro"
    MAC1 = "Mac1"
    D68EV3CPRO = "D68EV3CPROA"
    NPROT = "Nprot"
    nsp13 = "nsp13"
    XX01ZVNS2B = "XX01ZVNS2B"



[docs]
def download(out_fn, api_call, extract=True, base_url=BASE_URL):
    """
    Download target structures from fragalysis.

    A POST request with `api_call` asks the server to prepare the archive and
    returns its "file_url"; a GET request to the same endpoint with that
    "file_url" then retrieves the zip archive, which is written to `out_fn`.

    Parameters
    ----------
    out_fn : Union[str, Path]
        Where to save the downloaded zip file
    api_call : dict
        Dictionary containing args for the POST request. Target is specified here.
    extract : bool, default=True
        Whether to extract the zip file after downloading. Extracts to the
        directory given by `dirname(out_fn)`
    base_url : str, default=BASE_URL
        URL of the download API endpoint used for both the POST and GET requests

    Raises
    ------
    requests.HTTPError
        If either the POST request or the GET request returns an error status
    """
    # First send POST request to prepare the download file and get its URL
    r = requests.post(base_url, json=api_call)
    if not r.ok:
        raise requests.HTTPError(
            f"Post request to {base_url} failed with {r.status_code} error code, "
            f"using the following API call {api_call}."
        )
    url_dl = r.json()["file_url"]
    print("Downloading archive", flush=True)
    # Send GET request for the zip archive
    r_dl = requests.get(base_url, params={"file_url": url_dl})
    # Fail loudly instead of writing an error page to disk as if it were the zip
    if not r_dl.ok:
        raise requests.HTTPError(
            f"Get request to {base_url} for file {url_dl} failed with "
            f"{r_dl.status_code} error code."
        )
    # Full archive stored in r_dl.content, so write to zip file
    with open(out_fn, "wb") as fp:
        fp.write(r_dl.content)

    # Extract files if requested
    if extract:
        extract_zip(out_fn)



# TODO: move this function to utils or similar, if we end up needing it somewhere else

[docs]
def extract_zip(out_fn):
    """Extracts contents of zip file

    All members are extracted into the directory given by
    `os.path.dirname(out_fn)`.

    Parameters
    ----------
    out_fn: str or Path
        Zip file path to extract
    """
    print("Extracting files", flush=True)
    # Use a context manager so the archive handle is closed even if extraction fails
    with ZipFile(out_fn) as zf:
        zf.extractall(path=os.path.dirname(out_fn))




[docs]
def parse_xtal(x_fn, x_dir, p_only=True):
    """
    Load all crystal structures into schema.CrystalCompoundData objects.

    For every retained CSV row, the structure file is the first existing path of
    the form ``<x_dir>/<dataset>_0<chain>/<dataset>_0<chain>_<suffix>.pdb``,
    trying suffix "seqres" before "bound" and, within each suffix, chain "A"
    before "B". Rows without any such file are reported on stdout and skipped.

    Parameters
    ----------
    x_fn : str
        CSV file giving information on each crystal structure. Must contain
        the columns "SMILES", "Dataset" and "Compound ID"
    x_dir : str
        Path to directory containing directories with crystal structure PDB
        files
    p_only : bool, default=True
        Whether to filter to only include rows whose Dataset value contains
        "-P" (e.g. fragalysis structures of the format Mpro-P*)

    Returns
    -------
    List[schema.CrystalCompoundData]
        List of parsed crystal structures

    Raises
    ------
    AssertionError
        If no structure file was found for any row
    """
    import pandas

    df = pandas.read_csv(x_fn)

    # Non-string Dataset entries (e.g. empty cells) are always dropped
    if p_only:
        # Find all P-files
        idx = [isinstance(d, str) and ("-P" in d) for d in df["Dataset"]]
    else:
        idx = [isinstance(d, str) for d in df["Dataset"]]

    # Build argument dicts for the CrystalCompoundData objects
    xtal_dicts = [
        dict(zip(("smiles", "dataset", "compound_id"), row.values))
        for _, row in df.loc[idx, ["SMILES", "Dataset", "Compound ID"]].iterrows()
    ]

    # Add structure filename information and filter if not found
    filtered_xtal_dicts = []
    for d in xtal_dicts:
        dataset = d["dataset"]
        # Candidate paths in priority order. They are built directly rather than
        # through str.format so braces in x_dir or the dataset name are harmless.
        candidates = (
            f"{x_dir}/{dataset}_0{chain}/{dataset}_0{chain}_{suf}.pdb"
            for suf in ("seqres", "bound")
            for chain in ("A", "B")
        )
        str_fn = next((fn for fn in candidates if os.path.isfile(fn)), None)
        if str_fn is None:
            print(f"No structure found for {dataset}.")
            continue
        d["str_fn"] = str_fn
        filtered_xtal_dicts.append(d)

    # Raised explicitly (not via `assert`) so the check survives `python -O`
    if not filtered_xtal_dicts:
        raise AssertionError("No structure filenames were found by parse_xtal")

    # Build CrystalCompoundData objects for each row
    print(f"Loading {len(filtered_xtal_dicts)} structures")
    xtal_compounds = [CrystalCompoundData(**d) for d in filtered_xtal_dicts]

    return xtal_compounds




[docs]
def parse_fragalysis(
    x_fn,
    x_dir,
    name_filter=None,
    name_filter_column="crystal_name",
    drop_duplicate_datasets=False,
):
    """
    Load all crystal structures into schema.CrystalCompoundData objects.

    One object is built for every PDB file matching
    ``<x_dir>/<crystal_name>*/*.pdb``, so a row of the metadata file can yield
    several objects (or none, if no file matches).

    Parameters
    ----------
    x_fn : str or Path
        metadata.CSV file giving information on each crystal structure
    x_dir : str or Path
        Path to directory containing directories with crystal structure PDB
        files
    name_filter : str or list
        String or list of strings that are required to be in the name_filter_column.
        When several strings are given, a row must contain all of them.
    name_filter_column : str
        Name of column in the metadata.csv that will be used to filter the dataframe
    drop_duplicate_datasets : bool
        If true, will drop the _1A, _0B, etc duplicate datasets for a given crystal structure.

    Returns
    -------
    List[schema.CrystalCompoundData]
        List of parsed crystal structures

    Raises
    ------
    KeyError
        If the CSV lacks one of the columns 'smiles', 'crystal_name' or
        'alternate_name' (or a column needed for filtering / de-duplication)
    AssertionError
        If no structure file was found for any row
    """
    import pandas
    from tqdm import tqdm

    x_dir = Path(x_dir)

    df = pandas.read_csv(x_fn)

    # Only keep rows of dataframe where the name_filter_column includes the name_filter string(s)
    if name_filter:
        substrings = (
            [name_filter] if isinstance(name_filter, str) else list(name_filter)
        )
        for substring in substrings:
            # Non-string cells (e.g. empty ones) never match. astype(bool) keeps the
            # mask a boolean indexer even when the frame is already empty.
            idx = (
                df[name_filter_column]
                .apply(lambda x, s=substring: isinstance(x, str) and s in x)
                .astype(bool)
            )
            df = df[idx]
    # Drop duplicates, keeping only the first one.
    if drop_duplicate_datasets:
        df = df.drop_duplicates("RealCrystalName")

    # Select the relevant columns; a missing column is reported with a hint
    try:
        records = df.loc[:, ["smiles", "crystal_name", "alternate_name"]].copy()
    except KeyError as e:
        raise KeyError(
            "Did you use 'Mpro_compound_tracker_csv.csv'? Use 'metadata.csv' instead. "
            "This CSV is expected to contain columns 'smiles', 'crystal_name', and 'alternate_name', which correspond "
            "to the SD tags 'smiles', 'dataset', and 'compound_id' respectively."
        ) from e

    # Remove whitespace from the relevant columns. The stripped values must be
    # assigned back, since the string operations return new objects.
    for column in records.columns:
        records[column] = records[column].map(
            lambda v: v.strip() if isinstance(v, str) else v
        )

    # Build argument dicts for the CrystalCompoundData objects
    xtal_dicts = [
        dict(zip(("smiles", "dataset", "compound_id"), row.values))
        for _, row in records.iterrows()
    ]

    # Add structure filename information and filter if not found
    filtered_xtal_dicts = []
    for d in tqdm(xtal_dicts):
        for fn in x_dir.glob(f"{d['dataset']}*/*.pdb"):
            # This should basically always be true since we're getting the filenames from glob but just in case.
            if fn.is_file():
                # Store a fresh dict per file; reusing `d` would make every entry
                # of this dataset point at the last file found.
                filtered_xtal_dicts.append({**d, "str_fn": str(fn)})

    # Raised explicitly (not via `assert`) so the check survives `python -O`
    if not filtered_xtal_dicts:
        raise AssertionError("No structure filenames were found by parse_fragalysis")

    # Build CrystalCompoundData objects for each row
    print(f"Loading {len(filtered_xtal_dicts)} structures")
    xtal_compounds = [CrystalCompoundData(**d) for d in filtered_xtal_dicts]

    return xtal_compounds