#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import logging
from pathlib import Path
from typing import List, NamedTuple, Optional, Union
import aicsimageio
import aicsimageprocessing as proc
import dask.dataframe as dd
import numpy as np
import pandas as pd
from aics_dask_utils import DistributedHandler
from aicsimageio import AICSImage, transforms
from aicsimageio.writers import OmeTiffWriter
from datastep import Step, log_run_params
from imageio import imwrite
from ...constants import Channels, DatasetFields
from ...utils import dataset_utils, image_utils
from ..single_cell_features import SingleCellFeatures
###############################################################################
log = logging.getLogger(__name__)

###############################################################################

# Columns that must be present in the input dataset before this step can run.
REQUIRED_DATASET_FIELDS = [
    DatasetFields.CellId,
    DatasetFields.StandardizedFOVPath,
    DatasetFields.CellFeaturesPath,
]
class CellImagesResult(NamedTuple):
    """
    Successful single cell image generation result: the cell id plus the
    save paths of the 3D crop and the two 2D projection images.
    """

    cell_id: Union[int, str]
    path_3d: Path
    path_2d_all_proj: Path
    path_2d_yx_proj: Path
class CellImagesError(NamedTuple):
    """
    Failed single cell image generation result: the cell id and the
    stringified exception that was raised while processing it.
    """

    cell_id: Union[int, str]
    error: str
###############################################################################
class SingleCellImages(Step):
    """
    Workflow step that, given a dataset of cell features and standardized FOV
    images, generates a registered 3D single cell crop (OME-TIFF) and 2D
    projection images (PNG) for every cell in the dataset.
    """

    def __init__(
        self,
        direct_upstream_tasks=None,
        filepath_columns=None,
        **kwargs,
    ):
        # Build the defaults here rather than in the signature to avoid the
        # shared-mutable-default-argument pitfall; behavior is unchanged for
        # callers that rely on the defaults.
        if direct_upstream_tasks is None:
            direct_upstream_tasks = [SingleCellFeatures]
        if filepath_columns is None:
            filepath_columns = [
                DatasetFields.CellImage3DPath,
                DatasetFields.CellImage2DAllProjectionsPath,
                DatasetFields.CellImage2DYXProjectionPath,
            ]
        super().__init__(
            direct_upstream_tasks=direct_upstream_tasks,
            filepath_columns=filepath_columns,
            **kwargs,
        )

    @staticmethod
    def _get_registered_image_size(row_index: int, row: pd.Series) -> List[int]:
        """
        Read the registered image size from the cell's features JSON.

        Parameters
        ----------
        row_index: int
            Index of the row in the dataset (unused; kept for the mapped
            iterrows signature).
        row: pd.Series
            Dataset row; must provide `CellFeaturesPath`.

        Returns
        -------
        List[int]
            The "imsize_registered" value stored in the features JSON.
        """
        # Open cell features JSON
        with open(row.CellFeaturesPath, "r") as read_in:
            cell_features = json.load(read_in)

        # Return registered image size
        return cell_features["imsize_registered"]

    @staticmethod
    def _generate_single_cell_images(
        row_index: int,
        row: pd.Series,
        cell_ceiling_adjustment: int,
        bounding_box: np.ndarray,
        projection_method: str,
        cell_images_3d_dir: Path,
        cell_images_2d_all_proj_dir: Path,
        cell_images_2d_yx_proj_dir: Path,
        overwrite: bool,
    ) -> Union[CellImagesResult, CellImagesError]:
        """
        Generate and save the 3D crop and 2D projection images for one cell.

        Returns a CellImagesResult with the three save paths on success, or a
        CellImagesError wrapping the raised exception on failure. If all three
        output files already exist and `overwrite` is False, the work is
        skipped and the existing paths are returned.
        """
        # Don't use dask for image reading
        aicsimageio.use_dask(False)

        # Get the ultimate end save paths for this cell
        cell_image_3d_save_path = cell_images_3d_dir / f"{row.CellId}.ome.tiff"
        cell_image_2d_all_proj_save_path = (
            cell_images_2d_all_proj_dir / f"{row.CellId}.png"
        )
        cell_image_2d_yx_proj_save_path = (
            cell_images_2d_yx_proj_dir / f"{row.CellId}.png"
        )

        # Check skip
        if (
            not overwrite
            # Only skip if all images exist for this cell
            and all(
                p.is_file()
                for p in [
                    cell_image_3d_save_path,
                    cell_image_2d_all_proj_save_path,
                    cell_image_2d_yx_proj_save_path,
                ]
            )
        ):
            log.info(f"Skipping single cell image generation for CellId: {row.CellId}")
            return CellImagesResult(
                row.CellId,
                cell_image_3d_save_path,
                cell_image_2d_all_proj_save_path,
                cell_image_2d_yx_proj_save_path,
            )

        # Overwrite or didn't exist
        log.info(f"Beginning single cell image generation for CellId: {row.CellId}")

        # Wrap errors for debugging later
        try:
            # Initialize image object with standardized FOV
            standardized_image = AICSImage(row.StandardizedFOVPath)
            channels = standardized_image.get_channel_names()

            # Preload image data
            standardized_image.data

            # Select and adjust cell shape ceiling for this cell
            image = image_utils.select_and_adjust_segmentation_ceiling(
                # Unlike most other operations, we can read in normal "CZYX" dimension
                # order here as all future operations are expecting it
                image=standardized_image.get_image_data("CYXZ", S=0, T=0),
                cell_index=row.CellIndex,
                cell_ceiling_adjustment=cell_ceiling_adjustment,
            )

            # Perform a rigid registration on the image
            image, _, _ = proc.cell_rigid_registration(
                image,
                # Reorder bounding box as image is currently CYXZ
                bbox_size=bounding_box[[0, 2, 3, 1]],
            )

            # Reduce size: scale to [0, 255] and store as uint8
            crop_3d = image * 255
            crop_3d = crop_3d.astype(np.uint8)

            # Transpose to CZYX for saving
            crop_3d = transforms.transpose_to_dims(crop_3d, "CYXZ", "CZYX")

            # Save to OME-TIFF
            with OmeTiffWriter(cell_image_3d_save_path, overwrite_file=True) as writer:
                writer.save(
                    crop_3d,
                    dimension_order="CZYX",
                    # Reuse the channel names fetched above
                    channel_names=channels,
                    pixels_physical_size=standardized_image.get_physical_pixel_size(),
                )

            # Generate 2d image projections
            # Crop raw channels using segmentations
            image = image_utils.crop_raw_channels_with_segmentation(image, channels)

            # Transpose to CZYX for projections
            image = transforms.transpose_to_dims(image, "CYXZ", "CZYX")

            # Select the DNA, Membrane, and Structure channels
            image = image[
                [
                    channels.index(target)
                    for target in [Channels.DNA, Channels.Membrane, Channels.Structure]
                ]
            ]

            # Set RGB colors
            # This will set:
            # DNA to Blue
            # Membrane to Red
            # Structure to Green
            colors = [[0, 0, 1], [1, 0, 0], [0, 1, 0]]

            # Get all axes projection image
            all_proj = proc.imgtoprojection(
                image,
                proj_all=True,
                proj_method=projection_method,
                local_adjust=False,
                global_adjust=True,
                colors=colors,
            )

            # Convert to YXC for PNG writing
            all_proj = transforms.transpose_to_dims(all_proj, "CYX", "YXC")

            # Drop size to uint8
            all_proj = all_proj.astype(np.uint8)

            # Save to PNG
            imwrite(cell_image_2d_all_proj_save_path, all_proj)

            # Get YX axes projection image
            yx_proj = proc.imgtoprojection(
                image,
                proj_all=False,
                proj_method=projection_method,
                local_adjust=False,
                global_adjust=True,
                colors=colors,
            )

            # Convert to YXC for PNG writing
            yx_proj = transforms.transpose_to_dims(yx_proj, "CYX", "YXC")

            # Drop size to uint8
            yx_proj = yx_proj.astype(np.uint8)

            # Save to PNG
            imwrite(cell_image_2d_yx_proj_save_path, yx_proj)

            log.info(f"Completed single cell image generation for CellId: {row.CellId}")

            # Return ready to save image
            return CellImagesResult(
                row.CellId,
                cell_image_3d_save_path,
                cell_image_2d_all_proj_save_path,
                cell_image_2d_yx_proj_save_path,
            )

        # Catch and return error
        except Exception as e:
            # BUGFIX: the original second line was missing the f-prefix, so
            # the literal text "{e}" was logged instead of the exception.
            # Also log at error level since this is a failure path.
            log.error(
                f"Failed single cell image generation for CellId: {row.CellId}. "
                f"Error: {e}"
            )
            return CellImagesError(row.CellId, str(e))

    @log_run_params
    def run(
        self,
        dataset: Union[str, Path, pd.DataFrame, dd.DataFrame],
        cell_ceiling_adjustment: int = 0,
        bounding_box_percentile: float = 95.0,
        projection_method: str = "max",
        distributed_executor_address: Optional[str] = None,
        batch_size: Optional[int] = None,
        overwrite: bool = False,
        bbox: Optional[Union[tuple, list, dict]] = None,
        **kwargs,
    ):
        """
        Provided a dataset of cell features and standardized FOV images, generate 3D
        single cell crops and 2D projections.

        Parameters
        ----------
        dataset: Union[str, Path, pd.DataFrame, dd.DataFrame]
            The primary cell dataset to generate 3D single cell images for.

            **Required dataset columns:** *["CellId", "StandardizedFOVPath",
            "CellFeaturesPath"]*

        cell_ceiling_adjustment: int
            The adjust to use for raising the cell shape ceiling. If <= 0, this will be
            ignored and cell data will be selected but not adjusted.
            Default: 0

        bounding_box_percentile: float
            A float used to generate the actual bounding box for all cells by finding
            provided percentile of all cell image sizes.
            Default: 95.0

        bbox: tuple, list, dict
            Hard coded ZYX dimensions to set the bounding box.
            Note: This overrides the `bounding_box_percentile` parameter.
            Example: (64, 168, 104)

        projection_method: str
            The method to use for generating the flat projection.
            Default: max

            More details:
            https://allencellmodeling.github.io/aicsimageprocessing/aicsimageprocessing.html#aicsimageprocessing.imgToProjection.imgtoprojection

        distributed_executor_address: Optional[str]
            An optional executor address to pass to some computation engine.
            Default: None

        batch_size: Optional[int]
            An optional batch size to process n features at a time.
            Default: None (Process all at once)

        overwrite: bool
            If this step has already partially or completely run, should it overwrite
            the previous files or not.
            Default: False (Do not overwrite or regenerate files)

        Returns
        -------
        manifest_save_path: Path
            Path to the produced manifest with the various cell image path fields added.
        """
        # Handle dataset provided as string or path
        if isinstance(dataset, (str, Path)):
            dataset = Path(dataset).expanduser().resolve(strict=True)

            # Read dataset
            dataset = pd.read_csv(dataset)

        # Check dataset and manifest have required fields
        dataset_utils.check_required_fields(
            dataset=dataset,
            required_fields=REQUIRED_DATASET_FIELDS,
        )

        # Create save directories
        cell_images_3d_dir = self.step_local_staging_dir / "cell_images_3d"
        cell_images_2d_all_proj_dir = (
            self.step_local_staging_dir / "cell_images_2d_all_proj"
        )
        cell_images_2d_yx_proj_dir = (
            self.step_local_staging_dir / "cell_images_2d_yx_proj"
        )
        cell_images_3d_dir.mkdir(exist_ok=True)
        cell_images_2d_all_proj_dir.mkdir(exist_ok=True)
        cell_images_2d_yx_proj_dir.mkdir(exist_ok=True)

        # Process each row
        with DistributedHandler(distributed_executor_address) as handler:
            # Start processing: collect each cell's registered image size
            bbox_results = handler.batched_map(
                self._get_registered_image_size,
                # Convert dataframe iterrows into two lists of items to iterate over
                # One list will be row index
                # One list will be the pandas series of every row
                *zip(*list(dataset.iterrows())),
                batch_size=batch_size,
            )

            if bbox:
                # Prepend the channel count from a measured size so the hard
                # coded ZYX bbox has the same CZYX layout as measured results.
                num_of_channels = bbox_results[0][0]
                bbox = [num_of_channels] + list(bbox)
                bbox_results = [bbox] * len(bbox_results)
                log.info(f"Using hard coded bounding box with ZYX dimensions: {bbox}.")

            # Compute bounding box with percentile
            bbox_results = np.array(bbox_results)
            bounding_box = np.percentile(bbox_results, bounding_box_percentile, axis=0)
            bounding_box = np.ceil(bounding_box)

            # Generate bounded arrays
            n_rows = len(dataset)
            results = handler.batched_map(
                self._generate_single_cell_images,
                # Convert dataframe iterrows into two lists of items to iterate over
                # One list will be row index
                # One list will be the pandas series of every row
                *zip(*list(dataset.iterrows())),
                # Pass the other parameters as list of the same thing for each
                # mapped function call
                [cell_ceiling_adjustment] * n_rows,
                [bounding_box] * n_rows,
                [projection_method] * n_rows,
                [cell_images_3d_dir] * n_rows,
                [cell_images_2d_all_proj_dir] * n_rows,
                [cell_images_2d_yx_proj_dir] * n_rows,
                [overwrite] * n_rows,
                batch_size=batch_size,
            )

        # Generate single cell images dataset rows
        single_cell_images_dataset = []
        errors = []
        for r in results:
            if isinstance(r, CellImagesResult):
                single_cell_images_dataset.append(
                    {
                        DatasetFields.CellId: r.cell_id,
                        DatasetFields.CellImage3DPath: r.path_3d,
                        DatasetFields.CellImage2DAllProjectionsPath: r.path_2d_all_proj,
                        DatasetFields.CellImage2DYXProjectionPath: r.path_2d_yx_proj,
                    }
                )
            else:
                errors.append({DatasetFields.CellId: r.cell_id, "Error": r.error})

        # Convert features paths rows to dataframe
        single_cell_images_dataset = pd.DataFrame(single_cell_images_dataset)

        # Drop the various single cell image columns if they already exist
        drop_columns = []
        if DatasetFields.CellImage3DPath in dataset.columns:
            drop_columns.append(DatasetFields.CellImage3DPath)
        if DatasetFields.CellImage2DAllProjectionsPath in dataset.columns:
            drop_columns.append(DatasetFields.CellImage2DAllProjectionsPath)
        if DatasetFields.CellImage2DYXProjectionPath in dataset.columns:
            drop_columns.append(DatasetFields.CellImage2DYXProjectionPath)
        dataset = dataset.drop(columns=drop_columns)

        # Join original dataset to the fov paths
        self.manifest = dataset.merge(
            single_cell_images_dataset, on=DatasetFields.CellId
        )

        # Save manifest to CSV
        manifest_save_path = self.step_local_staging_dir / "manifest.csv"
        self.manifest.to_csv(manifest_save_path, index=False)

        # Save errored cells to JSON
        with open(self.step_local_staging_dir / "errors.json", "w") as write_out:
            json.dump(errors, write_out)

        return manifest_save_path