Source code for datastep.file_utils

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import hashlib
import json
import logging
from pathlib import Path
from shutil import rmtree
from typing import Any, Dict, List, Optional, Union

import pandas as pd

###############################################################################

# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)

###############################################################################


def resolve_filepath(f: Union[str, Path], strict: bool = True) -> Path:
    """
    Expand and validate a path that must refer to an existing non-directory.

    Parameters
    ----------
    f: Union[str, Path]
        The path to check. A leading ``~`` is expanded.
    strict: bool
        When True (default) the path is fully resolved with
        ``Path.resolve(strict=True)``. When False the path is only checked
        for existence and returned expanded but otherwise unresolved.

    Returns
    -------
    Path
        The resolved path (strict) or the user-expanded path (non-strict).

    Raises
    ------
    IsADirectoryError
        The path points at a directory.
    FileNotFoundError
        The path does not exist, in either mode.
    """
    candidate = Path(f).expanduser()

    # A directory is never an acceptable answer here
    if candidate.is_dir():
        raise IsADirectoryError(candidate)

    # Non-strict: confirm existence only and hand the path back untouched
    if not strict:
        if candidate.exists():
            return candidate
        raise FileNotFoundError(candidate)

    # Strict: resolve(strict=True) raises FileNotFoundError itself when missing
    return candidate.resolve(strict=True)
def resolve_directory(
    d: Union[str, Path], make: bool = False, strict: bool = True
) -> Path:
    """
    Expand and validate a path that must refer to a directory.

    Parameters
    ----------
    d: Union[str, Path]
        The directory path to check. A leading ``~`` is expanded.
    make: bool
        When True, create the directory (and any missing parents) before
        validating it. Default: False
    strict: bool
        When True (default) the path is fully resolved with
        ``Path.resolve(strict=True)``. When False the path is only checked
        for existence and returned expanded but otherwise unresolved.

    Returns
    -------
    Path
        The resolved path (strict) or the user-expanded path (non-strict).

    Raises
    ------
    FileExistsError
        The path points at an existing file.
    FileNotFoundError
        The directory does not exist (and ``make`` was False).
    """
    target = Path(d).expanduser()

    # An existing regular file can never be used as a directory
    if target.is_file():
        raise FileExistsError(target)

    # Optionally create the directory tree up front
    if make:
        target.mkdir(parents=True, exist_ok=True)

    # Non-strict: confirm existence only and hand the path back untouched
    if not strict:
        if target.exists():
            return target
        raise FileNotFoundError(target)

    # Strict: resolve(strict=True) raises FileNotFoundError itself when missing
    return target.resolve(strict=True)
def create_unique_logical_key(physical_key: Union[str, Path]) -> str:
    """
    Build a logical key of the form ``<short hash>_<file name>`` for a path.

    The short hash is the first eight hex characters of the SHA-256 digest of
    the fully resolved path string, so files that share a name but live in
    different locations receive different keys.

    Parameters
    ----------
    physical_key: Union[str, Path]
        Path to an existing filesystem entry. A leading ``~`` is expanded.

    Returns
    -------
    str
        The logical key.

    Raises
    ------
    FileNotFoundError
        The path does not exist (it is resolved with ``strict=True``).
    """
    # The hash is taken over the fully resolved location
    resolved = Path(physical_key).expanduser().resolve(strict=True)

    # Only the leading eight hex characters of the digest are kept
    digest = hashlib.sha256(str(resolved).encode("utf-8")).hexdigest()

    return "_".join((digest[:8], resolved.name))
def make_json_serializable(
    value: Any, context: Optional[str] = None
) -> Union[bool, float, int, str, List, Dict]:
    """
    Return ``value`` unchanged if it can be dumped to JSON, else its string form.

    Parameters
    ----------
    value: Any
        The object to check.
    context: Optional[str]
        Extra text appended to the debug log message emitted when the value
        has to be cast to a string. Default: None (nothing appended)

    Returns
    -------
    Union[bool, float, int, str, List, Dict]
        The original object when ``json.dumps`` accepts it, otherwise
        ``str(value)``.
    """
    # Probe serializability; only the dumps call itself belongs in the try body
    try:
        json.dumps(value)

    # TypeError: unsupported type or dictionary key.
    # ValueError: json.dumps reports circular references this way; those are
    # just as un-serializable, so they take the same string fallback rather
    # than escaping to the caller.
    except (TypeError, ValueError):
        if context is None:
            context = ""

        log.debug(f"Casting {value} to string to make JSON serializable. {context}")
        return str(value)

    return value
def _filepath_rel2abs(filepath: Path, prefixpath: Path = Path(".")) -> Path:
    """
    Join ``filepath`` onto ``prefixpath`` and return the resolved result.

    Per pathlib's join semantics, an already-absolute ``filepath`` replaces
    the prefix entirely.
    """
    joined = prefixpath / filepath
    return joined.resolve()


def _filepath_abs2rel(filepath: Path, otherpath: Path = Path(".")) -> Path:
    """
    Express ``filepath`` relative to the resolved ``otherpath``.

    ``filepath`` is first joined onto ``otherpath`` and resolved, so relative
    inputs are interpreted from ``otherpath``. ``Path.relative_to`` raises
    ValueError when the resolved path does not sit underneath ``otherpath``.
    """
    target = (otherpath / filepath).resolve()
    anchor = otherpath.resolve()
    return target.relative_to(anchor)
def manifest_filepaths_rel2abs(
    manifest: pd.DataFrame, filepath_columns: List[str], relative_dir: Path
):
    """
    Return a copy of ``manifest`` with the given path columns made absolute.

    Parameters
    ----------
    manifest: pd.DataFrame
        The table to convert. It is deep-copied; the input is not modified.
    filepath_columns: List[str]
        Names of the columns whose cells hold paths. Each cell must be
        accepted by ``Path(...)``.
    relative_dir: Path
        Directory the stored paths are joined onto before resolving.

    Returns
    -------
    pd.DataFrame
        The copy, with every cell of the listed columns replaced by the string
        form of its resolved path.
    """

    def _to_absolute(cell) -> str:
        # Cells are stored back as plain strings rather than Path objects
        return str(_filepath_rel2abs(Path(cell), relative_dir))

    # Work on a deep copy so the caller's manifest is left untouched
    converted = manifest.copy(deep=True)

    for column in filepath_columns:
        converted[column] = converted[column].apply(_to_absolute)

    return converted
def manifest_filepaths_abs2rel(
    manifest: pd.DataFrame, filepath_columns: List[str], relative_dir: Path
):
    """
    Return a copy of ``manifest`` with the given path columns made relative.

    Parameters
    ----------
    manifest: pd.DataFrame
        The table to convert. It is deep-copied; the input is not modified.
    filepath_columns: List[str]
        Names of the columns whose cells hold paths. Each cell must be
        accepted by ``Path(...)``.
    relative_dir: Path
        Directory the resulting paths are expressed relative to.

    Returns
    -------
    pd.DataFrame
        The copy, with every cell of the listed columns replaced by the string
        form of its path relative to ``relative_dir``.

    Raises
    ------
    ValueError
        Raised by the underlying ``Path.relative_to`` call when a path does
        not sit underneath ``relative_dir``.
    """

    def _to_relative(cell) -> str:
        # Cells are stored back as plain strings rather than Path objects
        return str(_filepath_abs2rel(Path(cell), relative_dir))

    # Work on a deep copy so the caller's manifest is left untouched
    converted = manifest.copy(deep=True)

    for column in filepath_columns:
        converted[column] = converted[column].apply(_to_relative)

    return converted
def _clean(dirpath: Path) -> Optional[Exception]:
    """
    Empty ``dirpath`` by deleting it recursively and recreating it.

    As written this always returns None, despite the ``Optional[Exception]``
    annotation. Errors from ``rmtree`` (for example a ``dirpath`` that does
    not exist) are not caught here and propagate to the caller.
    """
    # Delete the directory together with everything beneath it
    rmtree(dirpath)

    # Put an empty directory back in its place
    dirpath.mkdir(parents=True, exist_ok=True)
    return None


def _sanitize_name(input_str: str):
    """Return ``input_str`` with every space character replaced by ``_``."""
    # Splitting on a literal space keeps runs of spaces as runs of underscores
    return "_".join(input_str.split(" "))