Source code for drugforge.data.schema.schema_base

from __future__ import annotations

import json
from enum import Enum
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, ByteSize, ConfigDict, Field

_SCHEMA_VERSION = "0.1.0"



[docs]
class DataStorageType(str, Enum):
    """
    String-valued enumeration of data storage type identifiers.

    Each member's value is identical to its name, and because the class also
    derives from ``str`` a member compares equal to that plain string.
    """

    sdf = "sdf"
    pdb = "pdb"
    # NOTE(review): presumably a base64-encoded OpenEye design-unit payload --
    # confirm against the code that produces/consumes this value.
    b64oedu = "b64oedu"




[docs]
def read_file_directly(file: str | Path) -> str:
    """
    Read a text file and return everything in it as one string.

    Parameters
    ----------
    file : str | Path
        Location of the file; it is converted with ``str`` before opening.

    Returns
    -------
    str
        The complete contents of the file.
    """
    # Text mode with the platform's default encoding, as no encoding is given.
    with open(str(file)) as handle:
        return handle.read()




[docs]
def write_file_directly(file: str | Path, data: str, mode: str = "w") -> None:
    """
    Write a string to a file, either overwriting or appending.

    Parameters
    ----------
    file : str | Path
        Destination file; it is converted with ``str`` before opening.
    data : str
        Text to write.
    mode : str
        ``"w"`` to overwrite (default) or ``"a"`` to append.

    Raises
    ------
    ValueError
        If ``mode`` is anything other than ``"w"`` or ``"a"``.
    """
    permitted_modes = ("w", "a")
    if mode in permitted_modes:
        with open(str(file), mode) as handle:
            handle.write(data)
        return
    raise ValueError(f"mode must be either 'w' or 'a', got {mode}")




[docs]
def utf8len(s: str) -> int:
    """
    Return the number of bytes ``s`` occupies when encoded as UTF-8.

    Parameters
    ----------
    s : str
        Text to measure.

    Returns
    -------
    int
        Length in bytes of the UTF-8 encoding of ``s``.
    """
    encoded = s.encode("utf-8")
    return len(encoded)




[docs]
def check_strings_for_equality_with_exclusion(string1, string2, exclusion_string):
    """
    Compare two multi-line strings while ignoring lines that contain a marker.

    Both strings are split on ``"\\n"``; any line that contains
    ``exclusion_string`` is dropped, and the remaining lines are compared
    in order.

    Parameters
    ----------
    string1 : str
        First text to compare.
    string2 : str
        Second text to compare.
    exclusion_string : str
        Substring marking the lines to leave out of the comparison.

    Returns
    -------
    bool
        True when the remaining lines of both strings are identical.
    """

    def _surviving_lines(text):
        # Keep only the lines that do not mention the exclusion marker.
        return [row for row in text.split("\n") if exclusion_string not in row]

    return _surviving_lines(string1) == _surviving_lines(string2)




[docs]
class DataModelAbstractBase(BaseModel):
    """
    Base class for drugforge pydantic models that simplify dictionary, JSON
    and other behaviour

    Provides alternate constructors (``from_dict``, ``from_json``,
    ``from_json_file``), JSON file output, a size report, and equality
    helpers. Assignments to fields are validated (``validate_assignment``).
    """

    model_config = ConfigDict(validate_assignment=True)

    def __hash__(self) -> int:
        """Hash the JSON serialisation of the model.

        NOTE(review): the hash covers the full JSON dump, whereas ``__eq__``
        compares only ``data`` when that attribute exists, so two instances
        that compare equal may hash differently -- confirm this is intended.
        """
        return self.model_dump_json().__hash__()

    @classmethod
    def from_dict(cls, dict):
        """Build and validate an instance from a dictionary of field values."""
        return cls.model_validate(dict)

    @classmethod
    def from_json(cls, json_str):
        """Build and validate an instance from a JSON string."""
        return cls.model_validate(json.loads(json_str))

    @classmethod
    def from_json_file(cls, file: str | Path):
        """Build and validate an instance from a JSON file on disk."""
        # first load the file, then use the json parser
        contents = read_file_directly(file)
        return cls.from_json(contents)

    def to_json_file(self, file: str | Path):
        """Write the JSON serialisation of this model to ``file`` (overwrites)."""
        write_file_directly(file, self.model_dump_json())

    @property
    def size(self) -> str:
        """Size of the resulting JSON object for this class

        Returned as the human-readable string produced by
        ``ByteSize.human_readable()``, measured on the UTF-8 encoded JSON.
        """
        return ByteSize(utf8len(self.model_dump_json())).human_readable()

    def full_equal(self, other: DataModelAbstractBase) -> bool:
        """Return True when both models dump to equal dictionaries."""
        return self.model_dump() == other.model_dump()

    def data_equal(self, other: DataModelAbstractBase) -> bool:
        """Return True when the ``data`` attributes of both objects are equal."""
        return self.data == other.data

    def get_schema_version(self) -> str:
        """Return the module-level schema version string."""
        return _SCHEMA_VERSION

    # use data_equal instead
    def __eq__(self, other: object) -> bool:
        """Compare by ``data`` when this model has it, otherwise by full dump.

        Returns ``NotImplemented`` when ``other`` lacks what the comparison
        needs (e.g. ``None`` or an unrelated object), so such a comparison
        falls back to Python's default instead of raising ``AttributeError``.
        """
        # check if has a data attribute
        if hasattr(self, "data"):
            if not hasattr(other, "data"):
                return NotImplemented
            return self.data_equal(other)
        if not hasattr(other, "model_dump"):
            return NotImplemented
        return self.full_equal(other)

    # use data_equal instead
    def __ne__(self, other: object) -> bool:
        """Inverse of ``__eq__``; propagates ``NotImplemented`` unchanged."""
        outcome = self.__eq__(other)
        # Negating NotImplemented is deprecated and would give a wrong answer.
        if outcome is NotImplemented:
            return NotImplemented
        return not outcome




[docs]
def schema_dict_get_val_overload(obj: dict | BaseModel):
    """
    Return the values of either a plain dictionary or a pydantic model.

    Parameters
    ----------
    obj : Union[Dict, Schema]
        Dictionary, or pydantic model whose ``model_dump()`` values are wanted

    Returns
    -------
    Iterable[Any]
        The values view of the dictionary (or of the model's dump).

    Raises
    ------
    TypeError
        If ``obj`` is neither a dict nor a pydantic ``BaseModel``.
    """
    if isinstance(obj, dict):
        return obj.values()
    if isinstance(obj, BaseModel):
        dumped = obj.model_dump()
        return dumped.values()
    raise TypeError(f"Unsupported type {type(obj)}")




[docs]
class MoleculeComponent(str, Enum):
    """
    String-valued enumeration of the component categories of a molecule.

    Each member's value is the lower-case form of its name, and because the
    class also derives from ``str`` a member compares equal to that string.
    """

    PROTEIN = "protein"
    LIGAND = "ligand"
    WATER = "water"
    OTHER = "other"




[docs]
class MoleculeFilter(BaseModel):
    """Filter for selecting components of a molecule.

    Every field has a default, so the filter can be built with no arguments.
    Unknown field names are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    protein_chains: list = Field(
        list(),
        description="List of chains containing the desired protein. An empty list will return all chains.",
    )
    # The default is None, so the annotation has to admit None as well:
    # otherwise a default instance cannot be re-validated from its own
    # dump/JSON, and passing ligand_chain=None explicitly is rejected.
    ligand_chain: Optional[str] = Field(
        None,
        description="Chain containing the desired ligand. An empty list will return all chains.",
    )
    water_chains: list = Field(
        list(),
        description="List of chains containing the desired water. An empty list will return all chains.",
    )
    other_chains: list = Field(
        list(),
        description="List of chains containing other items. An empty list will return all chains.",
    )
    # The default keeps every component category.
    components_to_keep: list[MoleculeComponent] = Field(
        ["protein", "ligand", "water", "other"],
        description="List of components to keep. An empty list will return all components.",
    )