Source code for actk.steps.raw.raw

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import logging
from pathlib import Path
from typing import Union

import dask.dataframe as dd
import pandas as pd
from datastep import Step, log_run_params

from ...constants import DatasetFields
from ...utils import dataset_utils

###############################################################################

# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)

###############################################################################

# The union of the required fields of all the other steps.
# Reasoning: a user will only want to upload the raw data when doing a full
# pipeline run, so the dataset must already satisfy every other step's needs.
REQUIRED_DATASET_FIELDS = DatasetFields.AllExpectedInputs

###############################################################################


class Raw(Step):
    """
    Pipeline step that stores the provided dataset as a manifest in this step's
    local staging directory, without copying any of the referenced files.
    """

    def __init__(
        self,
        filepath_columns=None,
        metadata_columns=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        filepath_columns: Optional[List[str]]
            Names of the manifest columns that hold file paths.
            Default: None (the source, nucleus segmentation, and membrane
            segmentation read path columns)
        metadata_columns: Optional[List[str]]
            Names of the manifest columns that hold metadata.
            Default: None (the FOV id column)
        **kwargs
            Forwarded unchanged to ``Step.__init__``.
        """
        # Build the defaults per call instead of using list literals in the
        # signature: default values are evaluated once at definition time, so a
        # list default would be shared (and mutable) across every instance.
        if filepath_columns is None:
            filepath_columns = [
                DatasetFields.SourceReadPath,
                DatasetFields.NucleusSegmentationReadPath,
                DatasetFields.MembraneSegmentationReadPath,
            ]
        if metadata_columns is None:
            metadata_columns = [DatasetFields.FOVId]

        super().__init__(
            filepath_columns=filepath_columns,
            metadata_columns=metadata_columns,
            **kwargs,
        )

    @log_run_params
    def run(self, dataset: Union[str, Path, pd.DataFrame, dd.DataFrame], **kwargs):
        """
        Simple passthrough to store the dataset in local_staging/raw.
        This does not copy any of the image files to local_staging/raw, only the
        manifest.
        This is an optional step that will only run if you want to upload the raw
        data.

        Parameters
        ----------
        dataset: Union[str, Path, pd.DataFrame, dd.DataFrame]
            The dataset to use for the rest of the pipeline run.
            A str or Path is treated as the location of a CSV file and is read
            with pandas.

            **Required dataset columns:** *["CellId", "CellIndex", "FOVId",
            "SourceReadPath", "NucleusSegmentationReadPath",
            "MembraneSegmentationReadPath", "ChannelIndexDNA", "ChannelIndexMembrane",
            "ChannelIndexStructure", "ChannelIndexBrightfield"]*

        Returns
        -------
        manifest_save_path: Path
            The path to the manifest in local_staging with the raw data.

        Raises
        ------
        FileNotFoundError
            If ``dataset`` is a str or Path that does not exist
            (``Path.resolve(strict=True)``).
        """
        if isinstance(dataset, (str, Path)):
            # strict=True makes a missing file fail here, before the CSV read
            dataset = Path(dataset).expanduser().resolve(strict=True)

            # Read dataset
            dataset = pd.read_csv(dataset)

        # Check dataset and manifest have required fields
        # NOTE(review): presumably this raises when a field is missing --
        # confirm against dataset_utils.check_required_fields.
        dataset_utils.check_required_fields(
            dataset=dataset,
            required_fields=REQUIRED_DATASET_FIELDS,
        )

        # Save manifest to CSV
        # NOTE(review): when `dataset` is a dask DataFrame, `to_csv` on a path
        # without a "*" glob may write a directory of part files rather than a
        # single CSV (see the `single_file` option) -- confirm intended behavior.
        self.manifest = dataset
        manifest_save_path = self.step_local_staging_dir / "manifest.csv"
        self.manifest.to_csv(manifest_save_path, index=False)

        return manifest_save_path