import itertools
import logging
import warnings
from collections import defaultdict
from collections.abc import Iterable
from enum import Enum
from pathlib import Path
from typing import List, Optional, Tuple, Union # noqa: F401
import pandas as pd
import yaml
from drugforge.data.services import postera
from drugforge.data.util.stringenum import StringEnum
# get_path_string uses importlib.resources to resolve the path of a package;
# it is used below to build the static path to the YAML tag spec
from drugforge.data.util.utils import get_path_string
logger = logging.getLogger(__name__)
# util function to open a yaml file and return the data
def load_yaml(yaml_path: Union[str, Path]) -> dict:
    """
    Open a YAML file and return its parsed contents.

    Parameters
    ----------
    yaml_path : Union[str, Path]
        Path to the YAML file to read.

    Returns
    -------
    dict
        Whatever ``yaml.safe_load`` produces for the file. Callers in this
        module expect a mapping; note that an empty file parses to ``None``.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding (which is what open() falls back to by default).
    with open(yaml_path, encoding="utf-8") as f:
        return yaml.safe_load(f)
# we define a new Enum class with some handy methods
class TagEnumBase(StringEnum):
    """
    Enum base class with helper methods for validating tags and filtering
    dataframe columns against the enum's values.

    The list of values is obtained from ``cls.get_values()``, which is
    inherited from ``StringEnum``.
    """

    @classmethod
    def is_in_values(cls, tag: str) -> bool:
        """Return True if ``tag`` is one of the values of this enum."""
        return tag in cls.get_values()

    @classmethod
    def all_in_values(
        cls, query: Iterable[str], allow: Optional[list[str]] = None
    ) -> bool:
        """
        Return True if every entry of ``query`` is a value of this enum.

        Parameters
        ----------
        query : Iterable[str]
            Tags to check.
        allow : list[str], optional
            Tags that are exempt from the check.

        Returns
        -------
        bool
            True if all non-exempt tags are valid (also True for an empty query).
        """
        # None replaces the previous shared mutable default ([]).
        exempt = () if allow is None else allow
        return all(cls.is_in_values(q) for q in query if q not in exempt)

    @classmethod
    def from_iterable(cls, name: str, iter: Iterable) -> Enum:  # noqa: A002
        """
        Create a new Enum class from a set of tags

        Parameters
        ----------
        name : str
            Name of the new Enum class.
        iter : Iterable
            Tags; each tag is used as both the member name and its value.
            (The parameter name shadows the builtin but is kept so that
            keyword callers continue to work.)

        Returns
        -------
        Enum
            The newly created Enum class, derived from ``cls``.
        """
        enum_data = {tag: tag for tag in iter}
        return cls(name, enum_data)

    @classmethod
    def filter_dataframe_cols(
        cls, df: pd.DataFrame, allow: Optional[list[str]] = None
    ) -> pd.DataFrame:
        """
        Drop every column of ``df`` that is not a value of this enum.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to filter.
        allow : list[str], optional
            Additional column names to keep.

        Returns
        -------
        pd.DataFrame
            Dataframe without the disallowed columns. A warning is both
            raised via ``warnings`` and logged when columns are dropped.
        """
        # Build a fresh set so the list returned by get_values() is never
        # mutated, and membership tests are O(1).
        allowed_columns = set(cls.get_values())
        if allow is not None:
            allowed_columns.update(allow)

        extra_cols = [col for col in df.columns if col not in allowed_columns]
        if extra_cols:
            message = (
                f"Columns {extra_cols} are not allowed. Dropping them from the dataframe"
            )
            warnings.warn(message)
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning(message)
        return df.drop(columns=extra_cols)

    @classmethod
    def get_values_underscored(cls):
        """Return all member values with "-" replaced by "_"."""
        return [e.value.replace("-", "_") for e in cls]

    def get_value_underscored(self):
        """Return this member's value with "-" replaced by "_"."""
        return self.value.replace("-", "_")
# OK finally we can actually make the enums
# Everything below runs at import time and reads the YAML tag spec.
# NOTE(review): the make_* factories called here are not defined in the visible
# part of this file; what they return is inferred from the unpacking only.
# static path to the spec (built from the location of the postera package)
manifold_data_spec = get_path_string(postera) + "/manifold_data_tags.yaml"
# make target enum and set
# presumably the three *Map objects are lookups between targets, viruses and
# proteins -- confirm against make_target_tags
(
TargetTags,
target_tag_set,
TargetVirusMap,
VirusTargetMap,
TargetProteinMap,
) = make_target_tags(manifold_data_spec)
# make virus enum
VirusTags = make_virus_tags(manifold_data_spec)
# make Output enum and set
# MANIFOLD_PREFIX_POSTFIX_DICT is indexed by output tag value elsewhere in this
# module and yields a (prefix, postfix) pair
OutputTags, output_tag_set, MANIFOLD_PREFIX_POSTFIX_DICT = make_output_tags(
manifold_data_spec
)
# make static and legacy enum and set
StaticTags, static_tag_set = make_static_tags(manifold_data_spec)
def make_manifold_tag_name_from_components(
    pref: str, target: str, product: str, postf: str
) -> str:
    """
    Assemble a Manifold tag name by joining its four parts with underscores.

    Parameters
    ----------
    pref : str
        Prefix
    target : str
        Target
    product : str
        Product
    postf : str
        Postfix

    Returns
    -------
    str
        The tag name in the form ``<pref>_<target>_<product>_<postf>``.
    """
    components = (pref, target, product, postf)
    return "_".join(components)
def make_tag_combinations_and_combine_with_static(
    target_tags: set, output_tags: set, static_tags: set, prefix_postfix_dict: dict
) -> tuple[Enum, list[str]]:
    """
    Make all possible combinations of target_tags and output_tags
    then add in the static and legacy tags

    Parameters
    ----------
    target_tags : set
        Target names.
    output_tags : set
        Output (product) tag names.
    static_tags : set
        Tags that are added as-is, without being combined with a target.
    prefix_postfix_dict : dict
        Maps each output tag to its ``(prefix, postfix)`` pair.

    Returns
    -------
    tuple[Enum, list[str]]
        The ``ManifoldAllowedTags`` Enum class built from the tags, and the
        alphabetically sorted list of those tags.

    Raises
    ------
    KeyError
        If an output tag has no entry in ``prefix_postfix_dict``.
    """
    combined = set()
    # iterate the cartesian product lazily; duplicates collapse in `combined`
    for product, target in itertools.product(output_tags, target_tags):
        pref, postf = prefix_postfix_dict[product]
        combined.add(
            make_manifold_tag_name_from_components(pref, target, product, postf)
        )

    # sort the tags so that they are in alphabetical order
    final_tags = sorted(combined.union(static_tags))
    return TagEnumBase.from_iterable("ManifoldAllowedTags", final_tags), final_tags
# Enum of every tag name that is allowed: all output/target combinations plus
# the static tags. The sorted tag list that is returned alongside the enum is
# not needed at module level and is discarded.
ManifoldAllowedTags, _ = make_tag_combinations_and_combine_with_static(
target_tag_set, output_tag_set, static_tag_set, MANIFOLD_PREFIX_POSTFIX_DICT
)
def map_output_col_to_manifold_tag(output_tags: Enum, target: str) -> dict[str, str]:
    """
    Build Postera tags given output tags and target. Only valid output tags in the enum
    are mapped to Postera tags.

    Parameters
    ----------
    output_tags : Enum
        Enum of output tags to produce Postera tags for.
    target : str
        Target name to inject into each Postera tag.

    Returns
    -------
    mapping
        Output tags as keys, Postera tags as values. Members whose value is
        not an ``OutputTags`` value are skipped.
    """
    # loop-invariant: look the valid output values up once, not per member
    valid_outputs = OutputTags.get_values()

    mapping = {}
    for col in output_tags:
        tag = col.value
        if tag in valid_outputs:
            pref, post = MANIFOLD_PREFIX_POSTFIX_DICT[tag]
            mapping[tag] = make_manifold_tag_name_from_components(
                pref, target, tag, post
            )
    return mapping
def drop_non_output_columns(
    df: pd.DataFrame, allow: Optional[list[str]] = None
) -> pd.DataFrame:
    """
    Drop columns of a docking result dataframe that are not allowed OutputTags
    ie the members of OutputTags.get_values() and StaticTags.get_values()

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of docking results
    allow : list[str], optional
        List of additional columns to allow. ``None`` (the default) means
        no additional columns.

    Returns
    -------
    df : pd.DataFrame
        Pandas dataframe with invalid columns dropped
    """
    # Collect into a fresh set: the lists returned by get_values() are left
    # untouched and membership tests are O(1).
    keep = set(OutputTags.get_values())
    # combine output and static columns
    keep.update(StaticTags.get_values())
    # add allowed columns; None is accepted, as the Optional annotation promises
    if allow is not None:
        keep.update(allow)

    # drop all columns that are not in the output
    return df.drop(columns=[col for col in df.columns if col not in keep])
def rename_output_columns_for_manifold(
    df: pd.DataFrame,
    target: str,
    output_enums: list[Enum],
    manifold_validate: Optional[bool] = True,
    drop_non_output: Optional[bool] = True,
    allow: Optional[list[str]] = None,
) -> pd.DataFrame:
    """
    Rename columns of a result dataframe that are available to be
    updated in the Postera Manifold for a specific target. i.e inject the
    target name into the column name to satisfy validation for Postera Manifold.

    for example:
    docking-pose-POSIT -> in-silico_SARS-CoV-2-Mpro_docking-pose-POSIT_msk

    Also optionally drop columns of a result dataframe that are not allowed output tags
    ie the members of OutputTags.get_values() and StaticTags.get_values()

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe of docking results
    target : str
        Target name
    output_enums : list[Enum]
        List of enums to rename the columns of
    manifold_validate : bool, optional
        If True, validate that the columns are valid for Postera Manifold
    drop_non_output : bool, optional
        If True, drop columns that are not allowed output tags
    allow : list[str], optional
        List of additional columns to allow when dropping. ``None`` (the
        default) means no additional columns.

    Returns
    -------
    df : pd.DataFrame
        Pandas dataframe with invalid columns dropped and valid columns renamed

    Raises
    ------
    ValueError
        If ``target`` is not a valid target, or if ``manifold_validate`` is
        True and a generated column name is not an allowed Manifold tag.
    """
    if not TargetTags.is_in_values(target):
        raise ValueError(
            f"Target {target} is not valid. Valid targets are: {TargetTags.get_values()}"
        )

    if drop_non_output:
        # always hand the callee a real list rather than None
        extra_allowed = [] if allow is None else allow
        df = drop_non_output_columns(df, allow=extra_allowed)

    # later enums override earlier ones if they share an output tag
    mapping = {}
    for col_enum in output_enums:
        mapping.update(map_output_col_to_manifold_tag(col_enum, target))

    if manifold_validate and not ManifoldAllowedTags.all_in_values(mapping.values()):
        raise ValueError(
            f"Columns in dataframe {mapping.values()} are not all valid for updating in postera. "
            f"Valid columns are: {ManifoldAllowedTags.get_values()}"
        )

    # rename columns
    return df.rename(columns=mapping)