Source code for cyto_dl.dataframe.transforms.misc

from typing import Sequence

import numpy as np
import pandas as pd


def append_one_hot(dataframe: pd.DataFrame, column: str):
    """Append the one hot encoding of `column` to the given dataframe, in place.

    The encoding is computed with scikit-learn's `OneHotEncoder`. Column `idx`
    of the encoder output is stored as `{column}_one_hot_{idx}`.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe. It is modified in place.

    column: str
        Column to convert into one hot encoding

    Returns
    -------
    pd.DataFrame
        The same `dataframe` object, with the one hot columns appended.
    """

    # import here to optimize CLIs / Fire usage
    from sklearn.preprocessing import OneHotEncoder

    try:
        # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only accept the original keyword.
        encoder = OneHotEncoder(sparse=False)

    # Double brackets keep the input 2-D, which is what the encoder expects.
    one_hot = encoder.fit_transform(dataframe[[column]])

    for idx in range(one_hot.shape[1]):
        dataframe[f"{column}_one_hot_{idx}"] = one_hot[:, idx]

    return dataframe


def append_labels_to_integers(dataframe: pd.DataFrame, column: str):
    """Append the integer-encoded values of `column` to the given dataframe, in place.

    The encoding is computed with scikit-learn's `LabelEncoder` and stored in a
    new column named `{column}_integer`.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe. It is modified in place.

    column: str
        Column to convert into integer labels

    Returns
    -------
    pd.DataFrame
        The same `dataframe` object, with the integer column appended.
    """

    # import here to optimize CLIs / Fire usage
    from sklearn.preprocessing import LabelEncoder

    # LabelEncoder works on a 1-D array of labels. Selecting with `[[column]]`
    # would hand it an (n, 1) frame that scikit-learn has to flatten while
    # emitting a DataConversionWarning, so pass the column itself.
    dataframe[f"{column}_integer"] = LabelEncoder().fit_transform(dataframe[column])

    return dataframe


def append_class_weights(dataframe: pd.DataFrame, column: str):
    """Add class weights (based on `column`) to a dataframe, in place.

    Each row receives `total_rows / rows_in_its_class`, so rarer classes get
    larger weights. The result is stored in `{column}_class_weights`.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe. It is modified in place.

    column: str
        Column to base the weights on

    Returns
    -------
    pd.DataFrame
        The same `dataframe` object, with the weights column appended.
    """
    labels = np.asarray(dataframe[column])
    _, inverse, counts = np.unique(labels, return_inverse=True, return_counts=True)

    # One weight per distinct class; the total is computed once, not per class.
    class_weights = counts.sum() / counts

    # `inverse` maps every row to the position of its class. Indexing by
    # position instead of looking labels up in a dict also covers NaN labels,
    # which never compare equal to themselves and would raise KeyError.
    dataframe[f"{column}_class_weights"] = class_weights[inverse.reshape(-1)]
    return dataframe


def make_random_df(columns: Sequence[str] = ("A", "B", "C", "D"), n_rows: int = 100):
    """Generate a random dataframe. Useful to test data wrangling pipelines.

    Values are drawn with `np.random.randn`, i.e. from NumPy's global random
    state, so calling `np.random.seed` beforehand makes the output repeatable.

    Parameters
    ----------
    columns: Sequence[str] = ("A", "B", "C", "D")
        Names of the columns of the random dataframe. If none are provided,
        a dataframe with columns ["A","B","C","D"] is created

    n_rows: int = 100
        Number of rows to create for the random dataframe

    Returns
    -------
    pd.DataFrame
        Dataframe of shape `(n_rows, len(columns))` filled with random floats.
    """

    # Materialize once so any sequence of names (tuple, list, or even a plain
    # string of single-character names) is sized and labelled consistently.
    column_names = list(columns)

    data = np.random.randn(n_rows, len(column_names))
    return pd.DataFrame(data, columns=column_names)