Source code for hdnnpy.preprocess.standardization

# coding: utf-8

"""Scale all feature values to be zero-mean and unit-variance."""

import numpy as np

from hdnnpy.preprocess.preprocess_base import PreprocessBase
from hdnnpy.utils import (MPI, pprint)


[docs]class Standardization(PreprocessBase): """Scale all feature values to be zero-mean and unit-variance.""" name = 'standardization' """str: Name of this class.""" def __init__(self): super().__init__() self._mean = {} self._std = {} @property def mean(self): """dict [~numpy.ndarray]: Initialized mean values in each feature dimension and each element.""" return self._mean @property def std(self): """dict [~numpy.ndarray]: Initialized standard deviation values in each feature dimension and each element.""" return self._std
[docs] def apply(self, dataset, elemental_composition, verbose=True): """Apply the same pre-processing for each element to dataset. It accepts 1 or 2 for length of ``dataset``, each element of which is regarded as ``0th-order``, ``1st-order``, ... Args: dataset (list [~numpy.ndarray]): Input dataset to be scaled. elemental_composition (list [str]): Element symbols corresponding to 1st dimension of ``dataset``. verbose (bool, optional): Print log to stdout. Returns: list [~numpy.ndarray]: Processed dataset to be zero-mean and unit-variance. """ order = len(dataset) - 1 assert 0 <= order <= 2 self._initialize_params(dataset[0], elemental_composition, verbose) mean = np.array( [self._mean[element] for element in elemental_composition]) std = np.array( [self._std[element] for element in elemental_composition]) if order >= 0: dataset[0] -= mean dataset[0] /= std if order >= 1: dataset[1] /= std[..., None] if order >= 2: dataset[2] /= std[..., None, None] return dataset
[docs] def dump_params(self): """Dump its own parameters as :obj:`str`. Returns: str: Formed parameters. """ params_str = '' for element in self._elements: mean = self._mean[element] std = self._std[element] mean_str = ' '.join(map(str, mean)) std_str = ' '.join(map(str, std)) params_str += f''' {element} {mean.shape[0]} # mean {mean_str} # standard deviation {std_str} ''' return params_str
[docs] def load(self, file_path, verbose=True): """Load internal parameters for each element. Only root MPI process loads parameters. Args: file_path (~pathlib.Path): File path to load parameters. verbose (bool, optional): Print log to stdout. """ if MPI.rank == 0: ndarray = np.load(file_path) self._elements = ndarray['elements'].item() self._mean = {element: ndarray[f'mean:{element}'] for element in self._elements} self._std = {element: ndarray[f'std:{element}'] for element in self._elements} if verbose: pprint(f'Loaded Standardization parameters from {file_path}.')
[docs] def save(self, file_path, verbose=True): """Save internal parameters for each element. Only root MPI process saves parameters. Args: file_path (~pathlib.Path): File path to save parameters. verbose (bool, optional): Print log to stdout. """ if MPI.rank == 0: info = {'elements': self._elements} mean = {f'mean:{k}': v for k, v in self._mean.items()} std = {f'std:{k}': v for k, v in self._std.items()} np.savez(file_path, **info, **mean, **std) if verbose: pprint(f'Saved Standardization parameters to {file_path}.')
def _initialize_params(self, data, elemental_composition, verbose): """Initialize parameters only once for new elements.""" for element in set(elemental_composition) - self._elements: n_feature = data.shape[2] mask = np.array(elemental_composition) == element X = data[:, mask].reshape(-1, n_feature) self._elements.add(element) self._mean[element] = X.mean(axis=0) self._std[element] = X.std(axis=0, ddof=1) if verbose: pprint(f'Initialized Standardization parameters for {element}')