Source code for cyto_dl.utils.download_test_data

import shutil
from pathlib import Path

import boto3
import pandas as pd
import pyrootutils
from botocore import UNSIGNED
from botocore.client import Config

EXAMPLE_DATA_DIR = None
EXAMPLE_DATA_FILENAME = "s3_paths.csv"


DATA_PATHS = {
    "raw": "s3://allencell/aics/variance_project_dataset/fov_path/008f53c7_3500001156_100X_20170807_1-Scene-09-P9-E07.ome.tiff",
    "seg": "s3://allencell/aics/variance_project_dataset/fov_seg_path/a7c64690_3500001156_100X_20170807_1-Scene-09-P9-E07_CellNucSegCombined.ome.tiff",
    "struct": "SEC61B",
}


[docs]def parse_s3_path(fn): path = Path(fn) assert path.parts[0] == "s3:", f"Expected an s3 path, got {fn}" return path.parts[1], "/".join(path.parts[2:]), path.parts[-1]
[docs]def setup_paths(): global EXAMPLE_DATA_DIR root = pyrootutils.setup_root( search_from=__file__, project_root_env_var=True, dotenv=True, pythonpath=True, cwd=False, # do NOT change working directory to root (would cause problems in DDP mode) indicator=("pyproject.toml", "README.md"), ) EXAMPLE_DATA_DIR = root / "data" / "example_experiment_data" for subdir in ("s3_data", "segmentation", "labelfree"): (EXAMPLE_DATA_DIR / subdir).mkdir(exist_ok=True, parents=True)
[docs]def download_test_data(limit=-1): setup_paths() s3 = boto3.client("s3", config=Config(signature_version=UNSIGNED)) df = pd.DataFrame(DATA_PATHS, index=[0]) if limit > 0: df = df.iloc[:limit] local_data = {col: [] for col in df.columns} local_data["struct"] = DATA_PATHS["struct"] for col in ("raw", "seg"): for fn in df[col].unique(): bucket, s3_path, local_file_name = parse_s3_path(fn) local_file_path = EXAMPLE_DATA_DIR / "s3_data" / local_file_name if not local_file_path.exists(): s3.download_file(bucket, s3_path, str(local_file_path)) local_data[col].append(local_file_path) local_data = pd.DataFrame(local_data) labelfree_data = local_data[["raw", "raw"]] labelfree_data.columns = ["brightfield", "signal"] for data, data_type in zip((local_data, labelfree_data), ("segmentation", "labelfree")): for csv_type in ("train", "test", "valid"): data.to_csv(EXAMPLE_DATA_DIR / data_type / f"{csv_type}.csv")
[docs]def delete_test_data(): for subdir in ("segmentation", "labelfree", "s3_data"): shutil.rmtree(EXAMPLE_DATA_DIR / subdir)