Source code for aicsimageio.readers.tiff_glob_reader

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import glob
import re
from collections import OrderedDict
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union

import dask.array as da
import numpy as np
import pandas as pd
import xarray as xr
from fsspec.spec import AbstractFileSystem
from tifffile import TiffFile, TiffFileError, TiffSequence, imread
from tifffile.tifffile import TiffTags

from .. import constants, exceptions, types
from ..dimensions import (
    DEFAULT_CHUNK_DIMS,
    DEFAULT_DIMENSION_ORDER,
    DEFAULT_DIMENSION_ORDER_LIST_WITH_SAMPLES,
    REQUIRED_CHUNK_DIMS,
    DimensionNames,
)
from ..metadata import utils as metadata_utils
from ..utils import io_utils
from .reader import Reader


# TIFF tag code of the ImageDescription field (270 in the TIFF 6.0 spec).
# The class below reads this name but no definition of it is visible in this
# file, so it is declared here to keep the module self-contained.
TIFF_IMAGE_DESCRIPTION_TAG_INDEX = 270


class TiffGlobReader(Reader):
    r"""
    Wraps the tifffile imread API to provide the same aicsimageio Reader API but for
    multifile tiff datasets (and other tifffile supported) images.

    Parameters
    ----------
    glob_in: Union[PathLike, Iterable[PathLike]]
        Glob string that identifies all files to be loaded or an iterable
        of paths to the files as returned by glob.
    indexer: Union[Callable, pandas.DataFrame]
        If callable, should consume each filename and return a pd.Series with series
        index corresponding to the dimensions and values corresponding to the array
        index of that image file within the larger array.
        If a DataFrame, its rows are matched positionally to the files (its index
        is reset before the filenames are attached).
        Default: None (Look for 4 numbers in the file name and use them as
        S, T, C, and Z indices.)
    scene_glob_character: str
        Character to represent different scenes.
        Default: "S"
    chunk_dims: Union[str, List[str]]
        Which dimensions to create chunks for.
        Default: DEFAULT_CHUNK_DIMS
        Note: Dimensions.SpatialY, Dimensions.SpatialX, and DimensionNames.Samples,
        will always be added to the list if not present during dask array
        construction.
    dim_order: Optional[Union[List[str], str]]
        A string of dimensions to be applied to all array(s) or a
        list of string dimension names to be mapped onto the list of arrays
        provided to image. I.E. "TYX".
        Default: None (guess dimensions for single array or multiple arrays)
    channel_names: Optional[Union[List[str], List[List[str]]]]
        A list of string channel names to be applied to all array(s) or a
        list of lists of string channel names to be mapped onto the list of arrays
        provided to image.
        Default: None (create OME channel IDs for names for single or multiple arrays)
    single_file_shape : Optional[Tuple]
        Expected shape for a single file of the set. If not provided, will attempt to
        determine the shape from the first file found in the glob.
        Default : None
    single_file_dims : Optional[Tuple]
        Dimensions that correspond to the data dimensions of a single file in the glob.
        Default : ('Y', 'X')
    fs_kwargs: Optional[Dict[str, Any]]
        Any specific keyword arguments to pass down to the fsspec created filesystem.
        Default: None (an empty dict is passed down)

    Raises
    ------
    TypeError
        If ``glob_in`` or ``indexer`` is of an unsupported type.
    ValueError
        If no files are found, or ``chunk_dims`` is not a str / list of str.
    exceptions.ConflictingArgumentsError
        If per-scene arguments do not match the number of scenes, or
        ``single_file_dims`` does not match the single file shape.
    exceptions.UnsupportedFileFormatError
        If the first file of the set cannot be opened as a TIFF.

    Examples
    --------
    # Given files with names like "Position001_t002_c03_z04.tif"

    reader = TiffGlobReader("path/to/data/*.tif")

    # We can use this to read single image tiffs generated by MicroManager
    # Micromanager creates directories for each position so we need to recursively glob
    # for the images files and pass the list to TiffGlobReader. Note that all images are
    # named according to "img_channel000_position001_time000000003_z004.tif"

    import glob

    files = glob.glob("path/to/data/**/*.tif", recursive=True)

    # since the numbers in Micromanager files are not in STCZ order we
    # need to use a different indexer than default. For convenience
    # when working MicroManager generated files you can use the provided
    # TiffGlobReader.MicroManagerIndexer

    mm_reader = TiffGlobReader(files, indexer=TiffGlobReader.MicroManagerIndexer)

    # as an example of making a custom indexer you can manually create
    # the MicroManagerIndexer like so:

    import pandas as pd
    from pathlib import Path
    import re

    def mm_indexer(path_to_img):
        inds = re.findall(r"\d+", Path(path_to_img).name)
        series = pd.Series(inds, index=['C', 'S', 'T', 'Z']).astype(int)
        return series

    mm_reader = TiffGlobReader(files, indexer=mm_indexer)
    """

    @staticmethod
    def _is_supported_image(
        fs: AbstractFileSystem, path: types.PathLike, **kwargs: Any
    ) -> bool:
        """
        Return True when ``path`` on ``fs`` can be opened by ``TiffFile``.

        ``TiffFileError`` and ``TypeError`` raised while opening are treated as
        "not supported" and produce False; any other exception propagates.
        """
        try:
            with fs.open(path) as open_resource:
                with TiffFile(open_resource):
                    return True
        except (TiffFileError, TypeError):
            return False

    def __init__(
        self,
        glob_in: Union[types.PathLike, Iterable[types.PathLike]],
        indexer: Optional[Union[pd.DataFrame, Callable]] = None,
        scene_glob_character: str = "S",
        chunk_dims: Union[str, List[str]] = DEFAULT_CHUNK_DIMS,
        dim_order: Optional[Union[List[str], str]] = None,
        channel_names: Optional[Union[List[str], List[List[str]]]] = None,
        single_file_shape: Optional[Tuple] = None,
        single_file_dims: Sequence[str] = (
            DimensionNames.SpatialY,
            DimensionNames.SpatialX,
        ),
        fs_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ):
        # Assemble glob list if given a string
        if isinstance(glob_in, str):
            file_series = pd.Series(glob.glob(glob_in))
        elif isinstance(glob_in, Path) and "*" in str(glob_in):
            file_series = pd.Series(glob.glob(str(glob_in)))
        elif isinstance(glob_in, pd.Series):
            # Ensure all of our indices line up
            file_series = glob_in.reset_index(drop=True, inplace=False)
        elif isinstance(glob_in, Iterable):
            file_series = pd.Series(glob_in)
        else:
            raise TypeError(f"Invalid type glob_in - got type {type(glob_in)}")

        if len(file_series) == 0:
            raise ValueError("No files found matching glob pattern")

        self.scene_glob_character = scene_glob_character

        if indexer is None:
            series_idx = [
                self.scene_glob_character,
                DimensionNames.Time,
                DimensionNames.Channel,
                DimensionNames.SpatialZ,
            ]

            # By default we will attempt to parse 4 numbers out of the filename
            # and assign them in order to be the corresponding S, T, C, and Z indices.
            # So indexer("path/to/data/S0_T1_C2_Z3.tif") returns
            # pd.Series([0,1,2,3], index=['S','T','C', 'Z'])
            def indexer(x: str) -> pd.Series:
                return pd.Series(
                    re.findall(r"\d+", Path(x).name), index=series_idx
                ).astype(int)

        if callable(indexer):
            self._all_files = file_series.apply(indexer)
            self._all_files["filename"] = file_series
        elif isinstance(indexer, pd.DataFrame):
            # make a copy of the indexing dataframe and reset its index
            # to ensure that we don't generate nans when aligning with
            # file_series.
            self._all_files = indexer.reset_index(drop=True, inplace=False)
            self._all_files["filename"] = file_series
        else:
            # Without this guard self._all_files would never be set and the
            # failure would surface later as an unrelated AttributeError.
            raise TypeError(f"Invalid type indexer - got type {type(indexer)}")

        # If a dim doesn't exist on the file set the column value for that dim to zero.
        # If the dim is present, add it to the sort order. Because we are using
        # the default dimension ordering, this will naturally create a sort order
        # based off the standard dimension order.
        sort_order = []
        for dim in DEFAULT_DIMENSION_ORDER_LIST_WITH_SAMPLES:
            if dim not in self._all_files.columns and dim not in single_file_dims:
                self._all_files[dim] = 0
            if dim in self._all_files.columns:
                sort_order.append(dim)

        self._all_files = self._all_files.sort_values(sort_order).reset_index(drop=True)

        # The first file (after sorting) stands in for the whole set when
        # resolving the filesystem and probing shape / format.
        self._fs, self._path = io_utils.pathlike_to_fs(
            self._all_files.iloc[0].filename,
            fs_kwargs={} if fs_kwargs is None else fs_kwargs,
        )

        # Store params. Always build a fresh, upper-cased list so that the
        # caller's list (or the shared default) is never mutated below.
        if isinstance(chunk_dims, str):
            self.chunk_dims = [d.upper() for d in chunk_dims]
        elif isinstance(chunk_dims, list) and all(
            isinstance(d, str) for d in chunk_dims
        ):
            self.chunk_dims = [d.upper() for d in chunk_dims]
        else:
            raise ValueError("chunk_dims must be str or list of str")

        # Run basic checks on dims and channel names
        if isinstance(dim_order, list):
            if len(dim_order) != len(self.scenes):
                raise exceptions.ConflictingArgumentsError(
                    f"Number of dimension strings provided does not match the "
                    f"number of scenes found in the file. "
                    f"Number of scenes: {len(self.scenes)}, "
                    f"Number of provided dimension order strings: {len(dim_order)}"
                )

        # If provided a list of lists, ensure that the outer list is the
        # number of scenes
        if (
            isinstance(channel_names, list)
            and len(channel_names) > 0
            and isinstance(channel_names[0], list)
        ):
            if len(channel_names) != len(self.scenes):
                raise exceptions.ConflictingArgumentsError(
                    f"Number of channel name lists provided does not match the "
                    f"number of scenes found in the file. "
                    f"Number of scenes: {len(self.scenes)}, "
                    f"Provided channel name lists: {channel_names}"
                )

        self._channel_names = channel_names

        # Required dims are always chunked; add whichever ones are missing.
        for dim in REQUIRED_CHUNK_DIMS:
            if dim.upper() not in self.chunk_dims:
                self.chunk_dims.append(dim.upper())

        if dim_order is not None:
            self._dim_order = dim_order
        else:
            self._dim_order = "".join(
                d
                for d in DEFAULT_DIMENSION_ORDER
                if d in self._all_files.columns or d in self.chunk_dims
            )

        if single_file_shape is None:
            with self._fs.open(self._path) as open_resource:
                with TiffFile(open_resource) as tiff:
                    self._single_file_shape = tiff.series[0].shape
        else:
            self._single_file_shape = single_file_shape

        if len(single_file_dims) != len(self._single_file_shape):
            raise exceptions.ConflictingArgumentsError(
                f"Number of single file dimensions does not match the "
                f"number of dimensions in a test file. "
                f"Number of dimensions in file: {len(self._single_file_shape)}, "
                f"Provided number of dimensions: {len(single_file_dims)}."
            )

        self._single_file_dims = list(single_file_dims)
        # Maps each in-file dimension name to its size within one file.
        self._single_file_sizes = dict(
            zip(self._single_file_dims, self._single_file_shape)
        )

        # Enforce valid image
        if not self._is_supported_image(self._fs, self._path):
            raise exceptions.UnsupportedFileFormatError(
                self.__class__.__name__, self._path
            )

    @property
    def scenes(self) -> Tuple[str, ...]:
        """
        One generated OME image id per distinct value in the scene column.

        The tuple is computed on first access and cached on ``self._scenes``.
        NOTE(review): ``self._scenes`` is not initialised in this class;
        presumably the ``Reader`` base class provides a ``None`` default - confirm.
        """
        if self._scenes is None:
            self._scenes = tuple(
                metadata_utils.generate_ome_image_id(s)
                for s in range(self._all_files[self.scene_glob_character].nunique())
            )
        return self._scenes

    def _get_scene_files(self) -> pd.DataFrame:
        """Return the file-table rows of the current scene, scene column dropped."""
        in_scene = (
            self._all_files[self.scene_glob_character] == self.current_scene_index
        )
        return self._all_files.loc[in_scene].drop(self.scene_glob_character, axis=1)

    @staticmethod
    def _build_attrs(tiff_tags: Dict[int, Any]) -> Dict[str, Any]:
        """
        Build the xarray ``attrs`` dict from the tags of the first file.

        The raw tag dict is always stored as unprocessed metadata; the
        ImageDescription tag, when present, is stored as processed metadata.
        """
        try:
            return {
                constants.METADATA_UNPROCESSED: tiff_tags,
                constants.METADATA_PROCESSED: tiff_tags[
                    TIFF_IMAGE_DESCRIPTION_TAG_INDEX
                ],
            }
        except KeyError:
            return {constants.METADATA_UNPROCESSED: tiff_tags}

    def _read_delayed(self) -> xr.DataArray:
        """
        Build a dask-backed ``xr.DataArray`` for the current scene.

        Files are grouped by every indexed dimension that is not a chunk
        dimension; each group becomes one dask block and the blocks are
        stitched together with ``da.block``.
        """
        scene_files = self._get_scene_files()
        scene_nunique = scene_files.nunique()

        # Close the handle as soon as the tag values have been copied out.
        with TiffFile(scene_files.filename.iloc[0]) as tiff:
            tiff_tags = self._get_tiff_tags(tiff)

        group_dims = [
            x for x in scene_files.columns if x not in ["filename", *self.chunk_dims]
        ]

        # xxx_sizes are modeled after xr.DataArray.sizes
        # These are OrderedDicts that map a dimension name to a shape.
        # Use these to align and reshape the arrays that come from imread
        # dims and sizes are not always necessary but they keep things much
        # clearer internally.

        # sizes of dimensions we are grouping by i.e. not chunks
        group_sizes = OrderedDict([(d, scene_nunique[d]) for d in group_dims])

        # sizes of each chunk
        chunk_sizes = self._get_chunk_sizes(scene_nunique, group_dims)

        # sizes that will be used to reshape the array representing
        # the full glob into separate dimensions.
        unpack_dims = chunk_sizes.keys() - group_sizes.keys()
        unpack_sizes = OrderedDict(
            [(d, s) for d, s in scene_nunique.items() if d in unpack_dims]
        )

        reshape_sizes = tuple(unpack_sizes.values()) + tuple(
            self._single_file_sizes.values()
        )

        # after unpacking the result of imread we sometimes need to rearrange dims
        # in case they are in the glob and single files.
        axes_order = self._get_axes_order(chunk_sizes, unpack_sizes, group_sizes)

        # expand the sizes with singleton dimensions to facilitate da.block at the end
        expanded_blocks_sizes, expanded_chunk_sizes = self._get_expanded_shapes(
            group_sizes, chunk_sizes
        )

        # Assemble the dask array
        if len(group_dims) > 0:
            # use groupby to assemble array out of chunks
            blocks = np.zeros(tuple(group_sizes.values()), dtype="object")
            for idx, val in scene_files.groupby(group_dims):
                with TiffSequence(val.filename.tolist()) as tif:
                    with tif.aszarr() as zarr_im:
                        darr = da.from_zarr(zarr_im).rechunk(-1)
                # unpack the first dimension if it contains multiple axes
                darr = darr.reshape(reshape_sizes)
                # Then reorder dimensions so matching ones from the glob
                # and the file are adjacent (glob then file)
                darr = darr.transpose(axes_order)
                # Then reshape the array to chunk_sizes
                darr = darr.reshape(tuple(expanded_chunk_sizes.values()))
                # The group key doubles as the block position, so this relies
                # on the indexed values being usable as array indices.
                blocks[idx] = darr

            blocks = blocks.reshape(tuple(expanded_blocks_sizes.values()))
            d_data = da.block(blocks.tolist())
            dims = list(expanded_blocks_sizes.keys())
        else:
            # assemble array in a single chunk
            zarr_im = imread(scene_files.filename.tolist(), aszarr=True, level=0)
            darr = da.from_zarr(zarr_im).rechunk(-1)
            darr = darr.reshape(reshape_sizes)
            darr = darr.transpose(axes_order)
            d_data = darr.reshape(tuple(chunk_sizes.values()))
            dims = list(expanded_chunk_sizes.keys())

        # Assign dims and coords to construct xarray
        channel_names = self._get_channel_names_for_scene(dims, d_data.shape)
        coords = self._get_coords(
            dims, d_data.shape, self.current_scene_index, channel_names
        )

        x_data = xr.DataArray(
            d_data, dims=dims, coords=coords, attrs=self._build_attrs(tiff_tags)
        )
        x_data = x_data.transpose(*self._dim_order)
        return x_data

    def _read_immediate(self) -> xr.DataArray:
        """
        Read every file of the current scene into memory as an ``xr.DataArray``.

        Unlike ``_read_delayed`` no grouping is done: the whole scene is read by
        a single ``imread`` call and reshaped to the per-dimension sizes.
        """
        # Set up scene specific information
        scene_files = self._get_scene_files()
        scene_nunique = scene_files.nunique()

        # Close the handle as soon as the tag values have been copied out.
        with TiffFile(scene_files.filename.iloc[0]) as tiff:
            tiff_tags = self._get_tiff_tags(tiff)

        chunk_sizes = self._get_chunk_sizes(scene_nunique)
        unpack_sizes = OrderedDict(
            [(d, s) for d, s in scene_nunique.items() if d in chunk_sizes.keys()]
        )
        reshape_sizes = tuple(unpack_sizes.values()) + tuple(
            self._single_file_sizes.values()
        )
        axes_order = self._get_axes_order(chunk_sizes, unpack_sizes)

        # Assemble array
        arr = imread(scene_files.filename.tolist(), level=0)
        arr = arr.reshape(reshape_sizes)
        arr = arr.transpose(axes_order)
        arr = arr.reshape(tuple(chunk_sizes.values()))

        # Assign dims and coords to construct xarray
        dims = scene_files.columns.drop("filename").values.tolist()
        dims += [x for x in self._single_file_dims if x not in dims]

        channel_names = self._get_channel_names_for_scene(dims, arr.shape)
        coords = self._get_coords(
            dims, arr.shape, self.current_scene_index, channel_names
        )

        return xr.DataArray(
            arr,
            dims=dims,
            coords=coords,
            attrs=self._build_attrs(tiff_tags),
        )

    def _get_axes_order(
        self,
        chunk_sizes: OrderedDict,
        unpack_sizes: OrderedDict,
        group_sizes: Optional[OrderedDict] = None,
    ) -> Tuple:
        """
        Compute the transpose order that follows the key order of ``chunk_sizes``.

        The array being transposed has the ``unpack_sizes`` axes first, followed
        by the single-file axes. A dimension present in both contributes two
        adjacent entries (glob axis, then file axis).

        ``group_sizes`` is accepted for call compatibility and is not used.
        """
        unpack_keys = list(unpack_sizes.keys())
        file_keys = list(self._single_file_sizes.keys())
        file_axis_offset = len(unpack_sizes)

        axes_order: Tuple[int, ...] = ()
        for d in chunk_sizes:
            if d in unpack_sizes:
                axes_order += (unpack_keys.index(d),)
            if d in self._single_file_sizes:
                axes_order += (file_axis_offset + file_keys.index(d),)

        return axes_order

    def _get_chunk_sizes(
        self, scene_files_nunique: pd.Series, group_dims: Optional[List[str]] = None
    ) -> OrderedDict:
        """
        Map each non-grouped dimension to its size inside one chunk.

        Parameters
        ----------
        scene_files_nunique: pd.Series
            Number of unique values per column of the scene's file table.
        group_dims: Optional[List[str]]
            Dimensions handled by grouping; excluded from the result.
            Default: None (no grouped dimensions)

        Returns
        -------
        OrderedDict
            Dimension name -> size. A dimension that is both indexed across
            files and present inside each file gets the product of both sizes.
        """
        skipped = ["filename", *(group_dims or [])]

        sizes = OrderedDict()
        for i, x in scene_files_nunique.items():
            if i not in skipped:
                if i not in self._single_file_dims:
                    sizes[i] = x
                else:
                    sizes[i] = self._single_file_sizes[i] * x

        for d, s in self._single_file_sizes.items():
            if d not in self.chunk_dims and d not in sizes:
                sizes[d] = s

        # Dimensions that only exist inside the files keep the in-file size.
        for i, x in self._single_file_sizes.items():
            if i not in scene_files_nunique.index:
                sizes[i] = x

        return sizes

    def _get_expanded_shapes(
        self, group_sizes: OrderedDict, chunk_sizes: OrderedDict
    ) -> Tuple[OrderedDict, OrderedDict]:
        """
        Pad block and chunk sizes with singleton dimensions.

        Returns two OrderedDicts, ``(expanded_blocks_sizes, expanded_chunk_sizes)``,
        which ``_read_delayed`` uses to reshape the object array of blocks and
        each individual block before ``da.block`` is called.

        NOTE(review): the statement nesting below was reconstructed from a
        whitespace-collapsed listing - confirm against the upstream source.
        """
        expanded_blocks_sizes = OrderedDict()
        expanded_chunk_sizes = OrderedDict()

        for i, (d, s) in enumerate(group_sizes.items()):
            if d in chunk_sizes:
                if d not in expanded_blocks_sizes:
                    d_idx_in_chunks = list(chunk_sizes.keys()).index(d)
                    # Every chunk dimension ahead of d gets a singleton block axis.
                    for j in range(d_idx_in_chunks):
                        c_key = list(chunk_sizes.keys())[j]
                        if c_key not in expanded_blocks_sizes:
                            expanded_blocks_sizes[c_key] = 1
                    expanded_blocks_sizes[d] = s
            if d not in chunk_sizes:
                if len(expanded_blocks_sizes) <= i:
                    expanded_blocks_sizes[d] = s
                else:
                    for d2 in expanded_blocks_sizes:
                        expanded_chunk_sizes[d2] = chunk_sizes[d2]
                    expanded_chunk_sizes[d] = 1
                    expanded_blocks_sizes[d] = group_sizes[d]

        for d2, s2 in chunk_sizes.items():
            if d2 not in expanded_chunk_sizes:
                expanded_chunk_sizes[d2] = s2

        for d, s in chunk_sizes.items():
            if d not in expanded_blocks_sizes:
                expanded_blocks_sizes[d] = 1

        if len(expanded_chunk_sizes) == 0:
            expanded_chunk_sizes = chunk_sizes

        return expanded_blocks_sizes, expanded_chunk_sizes

    def _get_channel_names_for_scene(
        self, dims: List[str], image_shape: Tuple[int, ...]
    ) -> Optional[List[str]]:
        """
        Resolve the user supplied channel names for the current scene.

        Returns None when no names were supplied, or when the supplied value is
        neither a list of lists nor a list of strings.

        Raises
        ------
        exceptions.ConflictingArgumentsError
            If names were supplied but ``dims`` has no channel dimension, or
            the number of names differs from the channel dimension size.
        """
        # Fast return in None case
        if self._channel_names is None:
            return None

        # If channels was provided as a list of lists.
        # The emptiness guard lets an empty list fall through to the explicit
        # mismatch errors below instead of failing on the [0] lookup.
        if self._channel_names and isinstance(self._channel_names[0], list):
            scene_channels = self._channel_names[self.current_scene_index]
        elif all(isinstance(c, str) for c in self._channel_names):
            scene_channels = self._channel_names  # type: ignore
        else:
            return None

        # If scene channels isn't None and no channel dimension raise error
        if DimensionNames.Channel not in dims:
            raise exceptions.ConflictingArgumentsError(
                f"Provided channel names for scene with no channel dimension. "
                f"Scene dims: {dims}, "
                f"Provided channel names: {scene_channels}"
            )

        # If scene channels isn't the same length as the size of channel dim
        if len(scene_channels) != image_shape[dims.index(DimensionNames.Channel)]:
            raise exceptions.ConflictingArgumentsError(
                f"Number of channel names provided does not match the "
                f"size of the channel dimension for this scene. "
                f"Scene shape: {image_shape}, "
                f"Dims: {dims}, "
                f"Provided channel names: {self._channel_names}",
            )

        return scene_channels  # type: ignore

    @staticmethod
    def _get_coords(
        dims: List[str],
        shape: Tuple[int, ...],
        scene_index: int,
        channel_names: Optional[List[str]],
    ) -> Dict[str, Any]:
        """
        Build the xarray coords dict; only the channel dimension gets coords.

        Supplied ``channel_names`` are used as-is. Otherwise, when ``dims``
        has a channel dimension, one generated OME channel id per channel
        index is used.
        """
        # Use dims for coord determination
        coords: Dict[str, Any] = {}

        if channel_names is None:
            # Get ImageId for channel naming
            image_id = metadata_utils.generate_ome_image_id(scene_index)

            # Use range for channel indices
            if DimensionNames.Channel in dims:
                coords[DimensionNames.Channel] = [
                    metadata_utils.generate_ome_channel_id(
                        image_id=image_id, channel_id=i
                    )
                    for i in range(shape[dims.index(DimensionNames.Channel)])
                ]
        else:
            coords[DimensionNames.Channel] = channel_names

        return coords

    def _get_tiff_tags(self, tiff: TiffFile) -> Dict[int, Any]:
        """
        Copy the tags of the first page of the first series into a plain dict.

        Every tag value is read here, so the result stays usable after
        ``tiff`` has been closed.
        """
        unprocessed_tags = tiff.series[0].pages[0].tags

        # Create dict of tag and value
        return {code: tag.value for code, tag in unprocessed_tags.items()}

    @staticmethod
    def MicroManagerIndexer(path_to_img: Union[str, Path]) -> pd.Series:
        """
        An indexer function to transform Micromanager MDA tiff filenames
        to indices. To use::

            reader = TiffGlobReader(files, indexer=TiffGlobReader.MicroManagerIndexer)

        Expects images to have names of the form:
        img_channel_[0-9]+_position[0-9]+_time[0-9]+_z[0-9]+.tif[f]

        Parameters
        ----------
        path_to_img : [str, Path]
            The path to an image.

        Returns
        -------
        pd.Series
            Integer indices labelled 'C', 'S', 'T' and 'Z', taken in that
            order from the runs of digits found in the file name.
        """
        inds = re.findall(r"\d+", Path(path_to_img).name)
        series = pd.Series(inds, index=["C", "S", "T", "Z"]).astype(int)
        return series