Source code for cyto_dl.dataframe.transforms.misc
from typing import Sequence
import numpy as np
import pandas as pd
def append_one_hot(dataframe: pd.DataFrame, column: str):
    """Append the one hot encoding of `column` to the given dataframe.

    The dataframe is modified in place: scikit-learn's ``OneHotEncoder`` is
    fitted on `column`, and each column of its output is stored under the
    name ``f"{column}_one_hot_{idx}"``, where ``idx`` is the position of
    that column in the encoder output.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe
    column: str
        Column to convert into one hot encoding

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in, with the one hot
        columns appended.
    """
    # import here to optimize CLIs / Fire usage
    from sklearn.preprocessing import OneHotEncoder

    # The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2
    # and removed in 1.4. Try the current spelling first and fall back to the
    # old one so the function works on both sides of the rename.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    # Double brackets: the encoder expects a 2d (n_samples, n_features) input.
    one_hot = encoder.fit_transform(dataframe[[column]])

    for idx in range(one_hot.shape[1]):
        dataframe[f"{column}_one_hot_{idx}"] = one_hot[:, idx]

    return dataframe
def append_labels_to_integers(dataframe: pd.DataFrame, column: str):
    """Append the integer-encoded values of `column` to the given dataframe.

    The dataframe is modified in place: scikit-learn's ``LabelEncoder`` is
    fitted on `column` and its output is stored in a new column named
    ``f"{column}_integer"``.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe
    column: str
        Column to convert into integer labels

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in, with the integer
        label column appended.
    """
    # import here to optimize CLIs / Fire usage
    from sklearn.preprocessing import LabelEncoder

    # LabelEncoder expects a 1d array-like. Passing the single-column frame
    # `dataframe[[column]]` makes scikit-learn emit a DataConversionWarning,
    # so hand it the column itself.
    encoded = LabelEncoder().fit_transform(dataframe[column])
    dataframe[f"{column}_integer"] = encoded

    return dataframe
def append_class_weights(dataframe: pd.DataFrame, column: str):
    """Add class weights (based on `column`) to a dataframe.

    The dataframe is modified in place: a new column named
    ``f"{column}_class_weights"`` is added in which every row holds
    ``n_rows / count``, where ``count`` is the number of rows that share
    that row's value in `column`.

    Parameters
    -----------
    dataframe: pd.DataFrame
        Input dataframe
    column: str
        Column to base the weights on

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in, with the class weight
        column appended.
    """
    # `inverse[i]` is the index into the unique values for row i, so the
    # per-row weights can be gathered with one indexing operation instead of
    # a Python-level dict lookup per row (which also failed on NaN labels,
    # since NaN never compares equal to the dict key).
    _, inverse, counts = np.unique(
        dataframe[column], return_inverse=True, return_counts=True
    )

    # Total computed once, rather than re-summing `counts` for every class.
    class_weights = counts.sum() / counts

    dataframe[f"{column}_class_weights"] = class_weights[inverse]
    return dataframe
def make_random_df(columns: Sequence[str] = list("ABCD"), n_rows: int = 100):
    """Build a dataframe of random values for exercising data wrangling code.

    Values are drawn with ``np.random.randn``, so they depend on NumPy's
    global random state.

    Parameters
    ----------
    columns: Sequence[str] = ["A","B","C","D"]
        Names of the columns of the generated dataframe. When omitted, the
        dataframe has the columns ["A","B","C","D"].
    n_rows: int = 100
        Number of rows of the generated dataframe

    Returns
    -------
    pd.DataFrame
        Dataframe with `n_rows` rows and one column per entry of `columns`.
    """
    n_cols = len(columns)
    values = np.random.randn(n_rows, n_cols)
    frame = pd.DataFrame(values, columns=columns)
    return frame