# Source code for bspysmg.data.postprocess

"""
File containing functions for postprocessing raw data gathered from the sampler and information for the model's info dictionary.
"""
import os

import numpy as np

from brainspy.utils.io import load_configs
from bspysmg.utils.plots import output_hist
from typing import Tuple
from datetime import timedelta


def get_sampling_data(filename: str, activation_electrode_no: int,
                      readout_electrode_no: int) -> Tuple[np.array, np.array]:
    """
    Reads the sampling data from a text file (IO.dat) and returns the values loaded in
    numpy arrays, split into device inputs and device outputs.

    Parameters
    ----------
    filename : str
        Path to the file containing the values read during the data gathering process.
        Typically, named IO.dat. The file is read with np.loadtxt using its default
        delimiter, so the columns are expected to be separated by whitespace. Each row is
        one sample; the first columns are the activation electrodes and the remaining
        columns are the readout electrodes.
    activation_electrode_no : int
        Number of activation electrodes used for the device during the data gathering process.
    readout_electrode_no : int
        Number of current readout/output electrodes used for the device during the data gathering
        process.

    Returns
    -------
    inputs : np.array
        Array of shape (samples, activation_electrode_no) containing all the inputs that were
        sent to the device during sampling.
    outputs : np.array
        Array of shape (samples, readout_electrode_no) containing all the outputs of the device
        obtained during sampling, which correspond to the inputs to the device.

    Raises
    ------
    AssertionError
        If any of the electrode numbers is not an int, or if the number of columns in the
        file differs from activation_electrode_no + readout_electrode_no.
    """
    print(f"\nLoading file: {filename}")
    print("This may take some time. Please wait.\n")
    assert type(activation_electrode_no
                ) is int, "Activation electrode number expected to be int"
    assert type(readout_electrode_no
                ) is int, "Readout electrode number expected to be int"
    # ndmin=2 keeps the (samples, electrodes) layout even when the file holds a
    # single sample; without it loadtxt returns a 1-D array and shape[1] fails.
    data = np.loadtxt(filename, ndmin=2)
    assert data.shape[1] == (
        activation_electrode_no + readout_electrode_no
    ), "Data from the file has a different electrode configuration. Check the activation electrode no and the readout electrode no"
    inputs = data[:, :activation_electrode_no]
    # Slice from the activation boundary instead of using a negative index:
    # data[:, -0:] would return every column when there are no readout electrodes.
    outputs = data[:, activation_electrode_no:]
    return inputs, outputs


def _last_batches(data: np.array, batch_no: int,
                  batch_length: int) -> np.array:
    """Return the samples of the last batch_no batches found at the end of data."""
    sample_no = batch_no * batch_length
    # data[-0:] selects the whole array, so zero batches is handled explicitly
    # to obtain an empty selection instead of the complete data set.
    if sample_no == 0:
        return data[:0]
    return data[-sample_no:]


def post_process(data_dir: str,
                 clipping_value="default",
                 charging_signal_batch_no: int = 40,
                 reference_signal_batch_no: int = 15,
                 filename: str = "postprocessed_data",
                 **kwargs) -> Tuple[np.array, np.array, dict]:
    """
    Postprocesses the raw sampling data of a directory: it checks the data size against the
    sampling configurations, stores the charging and reference signals, plots and saves the
    histogram of the outputs, removes the samples outside of the clipping range (optional)
    and saves the result.

    Parameters
    ----------
    data_dir: str
        A string with path to the directory with the data: it is assumed at least two
        files exist, named sampler_configs.json and IO.dat respectively.
    clipping_value : [float,float], 'default' or None
        Will apply a clipping to the input and output sampling data within the
        specified values. The setups have a limit in the range they can read.
        They typically clip at approximately +-4 V. Note that in order to
        calculate the clipping range, it needs to be multiplied by the
        amplification value of the setup (e.g., in the Brains setup the
        amplification is 28.5 and the clipping value is +-4 (V); therefore, the
        clipping value should be +-4 * 28.5, which is approximately
        [-110,110] (nA) ).
        This variable represents a lower and upper clipping value to crop data.
        It can be either None, 'default' or [float,float]. The 'default' str
        input will automatically take the clipping value by multiplying the
        amplification of the data by -4 and 4. The None input will not apply any
        clipping.

        N O T E: When the clipping value is set to None, the model will accurately
        represent the hardware setup (feedback resistance of the operational
        amplifier). When the clipping value is set to the values that
        are clipping, the model will extrapolate the results outside of the clipping
        range caused by the hardware setup.
    charging_signal_batch_no: int
        Number of batches, taken from the end of the data, that will be stored as the
        charging signal.
    reference_signal_batch_no: int
        Number of batches, taken from the end of the data, that will be stored as the
        reference signal.
    filename: str
        The name of the file that will be produced after postprocessing. By default:
        postprocessed_data (saved as postprocessed_data.npz).
    kwargs:
        Accepted for backward compatibility only. The merging of several data sets through
        a list_data keyword is currently disabled, so any keyword passed here is ignored.

    Returns
    -------
    inputs : np.array
        Inputs sent to the device, after removing the clipped samples (if clipping applies).
    outputs : np.array
        Outputs of the device corresponding to the returned inputs.
    configs : dict
        Sampling configurations loaded from sampler_configs.json, extended with the
        electrode_info key described in the notes.

    Raises
    ------
    AssertionError
        If the batch numbers are not int, if data_dir is None, or if the number of samples
        in IO.dat differs from number_batches * batch_length.

    Examples
    --------

    >>> inputs, outputs, configs = post_process('tmp/data/training/TEST/17-02-2021/')

    Notes
    -----
    Three .npz files are written to data_dir: charging_signal.npz, reference_batch.npz and
    <filename>.npz. Each of them has the keys: inputs, outputs and sampling_configs (dict).

    1. inputs: np.array
    The input(s) is(are) gathered for all activation electrodes. The units are in Volts.

    2. outputs: The output(s) is(are) gathered from all the readout electrodes. The units are in nA.
    The output data is raw. Additional amplification correction might be needed, this is
    left for the user to decide.

    3. sampling_configs: dict
    Data structure of output and input are arrays of NxD, where N is the number of samples
    and D is the dimension.

    The configs dictionary contains a copy of the configurations used for sampling the data.
    In addition, the configs dictionary has a key named electrode_info, which is created
    during the postprocessing step (the charging and reference files are saved before this
    key is added). The electrode_info key contains the following keys:
    3.1 electrode_no: int
    Total number of electrodes in the device

    3.2 activation_electrodes: dict

    3.2.1 electrode_no: int
    Number of activation electrodes used for gathering the data

    3.2.2 voltage_ranges: np.array
    Voltage ranges used for gathering the data. It contains the ranges per
    electrode, where the shape is (electrode_no,2). Being 2 the minimum and
    maximum of the ranges, respectively.

    3.3 output_electrodes: dict

    3.3.1 electrode_no : int
    Number of output electrodes used for gathering the data

    3.3.2 clipping_value: list[float,float]
    Value used to apply a clipping to the sampling data within the specified
    values.

    3.3.3 amplification: float
    Amplification correction factor used in the device to correct the
    amplification applied to the output current in order to convert it into
    voltage before its readout.

    """
    assert type(charging_signal_batch_no
                ) is int, "charging_signal_batch_no should be an integer"
    assert type(reference_signal_batch_no
                ) is int, "reference_signal_batch_no should be an integer"
    assert data_dir is not None
    configs = load_configs(os.path.join(data_dir, "sampler_configs.json"))
    activation_electrode_no = configs["input_data"]["activation_electrode_no"]
    readout_electrode_no = configs["input_data"]["readout_electrode_no"]

    # NOTE: merging data from multiple sources (kwargs["list_data"]) is
    # disabled; kwargs are accepted but not used.

    inputs, outputs = get_sampling_data(
        os.path.join(data_dir, "IO.dat"),
        activation_electrode_no=activation_electrode_no,
        readout_electrode_no=readout_electrode_no)

    batch_time = configs["input_data"]["batch_time"]
    # Number of samples per batch: batch duration times sampling frequency.
    batch_length = int(
        batch_time *
        configs["driver"]["instruments_setup"]["activation_sampling_frequency"]
    )
    nr_raw_samples = len(outputs)
    print("Number of raw samples: ", nr_raw_samples)
    assert (nr_raw_samples == configs["input_data"]["number_batches"] *
            batch_length), "Data size mismatch!"
    output_scales = [np.min(outputs), np.max(outputs)]
    print(f"Output scales: [Min., Max.] = {output_scales}")
    print(f"Lower bound input scales: {np.min(inputs,axis=0)}")
    print(f"Upper bound input scales: {np.max(inputs,axis=0)}\n")

    # Charging signal: the last charging_signal_batch_no batches of the data.
    print("Charging signal contains " + str(charging_signal_batch_no) +
          " batches. Total time: " + str(
              timedelta(seconds=int(charging_signal_batch_no * batch_time))))
    save_npz(
        data_dir,
        "charging_signal",
        _last_batches(inputs, charging_signal_batch_no, batch_length),
        _last_batches(outputs, charging_signal_batch_no, batch_length),
        configs,
    )

    # Reference signal: the last reference_signal_batch_no batches of the data.
    print("\nReference signal contains " + str(reference_signal_batch_no) +
          " batches. Total time: " + str(
              timedelta(seconds=int(reference_signal_batch_no * batch_time))))
    save_npz(
        data_dir,
        "reference_batch",
        _last_batches(inputs, reference_signal_batch_no, batch_length),
        _last_batches(outputs, reference_signal_batch_no, batch_length),
        configs,
    )

    # Plot samples histogram and save (every third sample is used).
    output_hist(outputs[::3], data_dir, bins=100)

    # Clean data: electrode_info resolves 'default' into an actual range.
    configs["electrode_info"] = get_electrode_info(configs, clipping_value)
    resolved_clipping = configs["electrode_info"]["output_electrodes"][
        "clipping_value"]
    if resolved_clipping is not None:
        inputs, outputs = clip_data(inputs, outputs, resolved_clipping)
        print("% of points cropped: ",
              (1 - len(outputs) / nr_raw_samples) * 100)
        print("\n")

    # Save the (possibly clipped) data together with the extended configs.
    save_npz(data_dir, filename, inputs, outputs, configs)

    return inputs, outputs, configs


def save_npz(data_dir: str, file_name: str, inputs: np.array,
             outputs: np.array, configs: dict) -> None:
    """
    Write the inputs, the outputs and the sampling configurations to a single .npz file.

    The file is created at <data_dir>/<file_name>.npz with the keys inputs, outputs and
    sampling_configs. Because sampling_configs holds a dictionary, the file has to be
    opened with np.load(..., allow_pickle=True).

    Parameters
    ----------
    data_dir : str
        Folder where the data is going to be stored.
    file_name : str
        Name of the file to create, without the .npz extension.
    inputs : np.array
        Array containing all the inputs that were sent to the device during sampling.
    outputs : np.array
        Array containing all the outputs of the device obtained during sampling, which
        correspond to the inputs to the device.
    configs : dict
        Sampling configurations. This function stores the dictionary as it is; the keys
        expected from the sampling process are:

        1. save_directory: str
        Directory where all the sampling data will be stored.

        2. data_name: str
        Prefix of the folder created inside save_directory, which is named with the
        format <data_name>+<current_timestamp>.

        3. driver: dict
        Dictionary containing the driver configurations. For more information check the
        documentation of
        brainspy.processors.hardware.drivers.ni.setup.NationalInstrumentsSetup

        4. input_data : dict
        Dictionary containing the information necessary to create the input sampling data.

        4.1 input_distribution: str
        Wave shape of the input. Two main options are available: 'sawtooth' and 'sine'.
        The first creates saw-like signals and the second sine-wave signals. Sawtooth
        signals have more coverage on the edges of the input range.

        4.2 activation_electrode_no: int
        Number of activation electrodes in the device that is sampled.

        4.3 readout_electrode_no : int
        Number of readout electrodes in the device that is sampled.

        4.4 input_frequency: list
        Base frequencies of the input waves that will be created. In order to optimise
        coverage, irrational numbers are recommended. The list should have the same
        length as the activation electrode number. E.g., for 7 activation electrodes:
        input_frequency = [2, 3, 5, 7, 13, 17, 19]

        4.5 phase : float
        Horizontal shift of the input signals. It is recommended to have random numbers
        which are different for the training, validation and test datasets. These
        numbers will be square rooted and multiplied by a given factor.

        4.6 factor : float
        Factor by which the input frequencies are multiplied after square rooting them.

        4.7 amplitude : Optional[list[float]]
        Amplitude of the generated input wave signal, one value per electrode, which
        should correspond with (max_range_value - min_range_value) / 2. If no amplitude
        is given it will be automatically calculated from the driver configurations for
        activation electrode ranges. If it is set manually, the offset should also be
        included in the dictionary.

        4.8 offset: Optional[list[float]]
        Vertical offset of the generated input wave signal, one value per electrode,
        which should correspond with (max_range_value + min_range_value) / 2. If no
        offset is given it will be automatically calculated from the driver
        configurations for activation electrode ranges. If it is set manually, it is
        presumably expected together with the amplitude (to be confirmed).

        4.9 ramp_time: float
        Time taken before sending each batch to go from zero to the first point of the
        batch, and to go back to zero from the last point of the batch.

        4.10 batch_time:
        Time that the sampling of each batch will take.

        4.11 number_batches: int
        Number of batches that will be sampled. A default value of 3880 is recommended.
    """
    # np.savez appends the .npz extension to the path by itself.
    destination = os.path.join(data_dir, file_name)
    print(f"Data saved to: {destination}.npz")
    contents = {
        "inputs": inputs,
        "outputs": outputs,
        "sampling_configs": configs,
    }
    np.savez(destination, **contents)


def get_electrode_info(configs: dict, clipping_value) -> dict:
    """
    Build the electrode information dictionary out of the data sampling configurations
    and print it on screen.

    Parameters
    ----------
    configs : dict
        Sampling configurations. The following keys are read:
        1. driver: dict
        Driver configurations, from which the amplification key is taken. For more
        information check the documentation of
        brainspy.processors.hardware.drivers.ni.setup.NationalInstrumentsSetup

        2. input_data : dict
        Information that was used to create the input sampling data.
        2.1 activation_electrode_no: int
        Number of activation electrodes in the device that was sampled.

        2.2 readout_electrode_no : int
        Number of readout electrodes in the device that was sampled.

        2.3 amplitude : [list[float]]
        Amplitude of the generated input wave signal per electrode, corresponding with
        (max_range_value - min_range_value) / 2. It has to be present in the dictionary.

        2.4 offset: [list[float]]
        Vertical offset of the generated input wave signal per electrode, corresponding
        with (max_range_value + min_range_value) / 2. It has to be present in the
        dictionary.
    clipping_value : str, list or None
        The value that will be used to clip the sampling data within a specific range.
        If 'default' is passed, the range is the amplification multiplied by -4 and 4.
        Any other value is stored unchanged.

    Returns
    -------
    electrode_info : dict
        Configuration dictionary containing all the keys related to the electrode information:
        1. electrode_no: int
        Total number of electrodes in the device (activation plus readout).

        2. activation_electrodes: dict
        2.1 electrode_no: int
        Number of activation electrodes used for gathering the data

        2.2 voltage_ranges: np.array
        Voltage ranges used for gathering the data, as returned by get_voltage_ranges.
        It contains the ranges per electrode, where the shape is (electrode_no,2).
        Being 2 the minimum and maximum of the ranges, respectively.

        3. output_electrodes: dict
        3.1 electrode_no : int
        Number of output electrodes used for gathering the data

        3.2 amplification: float
        Amplification correction factor used in the device to correct the amplification
        applied to the output current in order to convert it into voltage before its
        readout.

        3.3 clipping_value: list[float,float]
        Value used to apply a clipping to the sampling data within the specified values.
    """
    input_data = configs["input_data"]
    activation_no = input_data["activation_electrode_no"]
    readout_no = input_data["readout_electrode_no"]

    electrode_info = {
        "electrode_no": activation_no + readout_no,
        "activation_electrodes": {
            "electrode_no": activation_no,
            "voltage_ranges": get_voltage_ranges(input_data["offset"],
                                                 input_data["amplitude"]),
        },
    }

    amplification = configs["driver"]["amplification"]
    # 'default' is resolved into +-4 times the amplification; anything else
    # (an explicit range or None) is kept exactly as received.
    resolved_clipping = ((amplification * np.array([-4, 4])).tolist()
                         if clipping_value == "default" else clipping_value)
    electrode_info["output_electrodes"] = {
        "electrode_no": readout_no,
        "amplification": amplification,
        "clipping_value": resolved_clipping,
    }

    print_electrode_info(electrode_info)
    return electrode_info


def get_voltage_ranges(offset: list, amplitude: list) -> np.array:
    """
    Derive the voltage range of each activation electrode from the vertical offset and the
    amplitude that were used to compute the input waves during the data gathering process.

    Parameters
    ----------
    offset : list
        Offset values that vertically displace the input signal so that it fits the
        activation electrode ranges. One value per activation electrode.
    amplitude : list
        Amplitude values that scale the input signal so that it fits the activation
        electrode ranges.

    Returns
    -------
    np.array
        float32 array with the ranges per electrode, where the shape is (electrode_no,2).
        The first column is the minimum (offset - amplitude) and the second column is the
        maximum (offset + amplitude).
    """
    centre = np.asarray(offset, dtype=np.float32)
    half_span = np.asarray(amplitude, dtype=np.float32)
    lower_bound = centre - half_span
    upper_bound = centre + half_span
    # Stacking along a new second axis puts minimum and maximum side by side.
    return np.stack((lower_bound, upper_bound), axis=1)


def print_electrode_info(configs: dict) -> None:
    """
    Show on screen a summary of the electrode information that was gathered from the
    configuration file used for gathering the data from the device.

    Parameters
    ----------
    configs : dict
        Configuration dictionary containing all the keys related to the electrode information:
        1. electrode_no: int
        Total number of electrodes in the device

        2. activation_electrodes: dict
        2.1 electrode_no: int
        Number of activation electrodes used for gathering the data

        2.2 voltage_ranges: np.array
        Voltage ranges used for gathering the data. It is indexed by column, so it has to
        be a two-dimensional array with the ranges per electrode, where the shape is
        (electrode_no,2). Being 2 the minimum and maximum of the ranges, respectively.

        3. output_electrodes: dict
        3.1 electrode_no : int
        Number of output electrodes used for gathering the data

        3.2 clipping_value: list[float,float]
        Value used to apply a clipping to the sampling data within the specified values.

        3.3 amplification: float
        Amplification correction factor used in the device to correct the amplification
        applied to the output current in order to convert it into voltage before its
        readout.
    """
    print(
        "\nThe following data is inferred from the input data. Please check if it is correct. "
    )
    total_electrodes = configs['electrode_no']
    print(
        f"Data is gathered from a device with {total_electrodes} electrodes, from which: "
    )

    # Activation electrodes: amount and per-electrode voltage bounds.
    activation = configs['activation_electrodes']
    print(f"There are {activation['electrode_no']} activation electrodes: ")
    voltage_ranges = activation["voltage_ranges"]
    print(f"\t * Lower bound of voltage ranges: {voltage_ranges[:, 0]!s}")
    print(f"\t * Upper bound of voltage ranges: {voltage_ranges[:, 1]!s}")

    # Output electrodes: amount, clipping range and amplification.
    readout = configs['output_electrodes']
    print(f"There are {readout['electrode_no']} output electrodes: ")
    print(f"\t * Clipping value: {readout['clipping_value']!s}")
    print(f"\t * Amplification correction value: {readout['amplification']!s}")


def clip_data(inputs: np.array, outputs: np.array,
              clipping_value_range: list) -> Tuple[np.array, np.array]:
    """
    Removes all the outputs and corresponding inputs where the output is outside a given maximum
    and minimum range.

    A sample is kept only when the mean of its outputs over the readout electrodes is
    strictly between the lower and the upper bound; samples whose mean is exactly equal
    to one of the bounds are removed as well.

    Parameters
    ----------
    inputs : np.array
        Two-dimensional array (samples, activation electrodes) containing all the inputs that
        were sent to the device during sampling.
    outputs : np.array
        Two-dimensional array (samples, readout electrodes) containing all the outputs of the
        device obtained during sampling, which correspond to the inputs to the device.
    clipping_value_range : list[float,float], tuple[float,float] or None
        A sequence of length two. The first element will be the lower clipping range, and the
        second element will be the higher clipping range. If None, no clipping is applied and
        the arrays are returned untouched.

    Returns
    -------
    inputs : np.array
        Array containing all the inputs that were sent to the device during sampling, except for
        those values for which its corresponding output is above and below the specified clipping
        range.
    outputs : np.array
        Array containing all the outputs of the device obtained during sampling, except for those
        values for which its corresponding output is above and below the specified clipping range.

    Raises
    ------
    TypeError
        If clipping_value_range is neither a list, a tuple nor None.
    """
    # Nothing to clip: skip the mean computation altogether.
    if clipping_value_range is None:
        return inputs, outputs

    if not isinstance(clipping_value_range, (list, tuple)):
        raise TypeError(
            f"Clipping value not recognized! Must be a list with lower and upper bound or None, was {type(clipping_value_range)}"
        )

    lower_bound = clipping_value_range[0]
    upper_bound = clipping_value_range[1]

    # The decision is taken per sample on the mean over all readout electrodes.
    mean_output = np.mean(outputs, axis=1)
    cropping_mask = (mean_output > lower_bound) & (mean_output < upper_bound)
    print(f"\nClipping data outside range {lower_bound} and {upper_bound}")
    return inputs[cropping_mask, :], outputs[cropping_mask]


# def merge_postprocessed_data(file_names,
#                              output_file_name='merged_postprocessed_data.npz'):
#     """[summary]

#     Parameters
#     ----------
#     file_names : [type]
#         [description]
#     output_file_name : str, optional
#         [description], by default 'merged_postprocessed_data.npz'

#     Example
#     ----------
#     file_names = ['tmp/data/training/Brains_testing_2020_09_04_182557/postprocessed_data.npz',
#      'tmp/data/training/Brains_testing_2020_09_11_093200/postprocessed_data.npz']
#     merge_postprocessed_data(file_names)
#     """
#     ref_data = dict(np.load(file_names[0], allow_pickle='True'))
#     for i in range(1, len(file_names)):
#         data = np.load(file_names[i])
#         for key in list(data):
#             if key != 'info':
#                 ref_data[key] = np.append(ref_data[key], data[key], axis=0)
#     np.savez(output_file_name, **ref_data)

# def data_merger(main_dir, activation_electrode_no=7, readout_electrode_no=1):
#     # EXAMPLE
#     #  main_dir = "tmp/output/model_nips"
#     # The post_process function should have a clipping value which is in an amplified scale.
#     # E.g., for an amplitude of 100 -> 345.5
#     # process_multiple(main_dir)
#     shape = 0
#     dirs = list([
#         name for name in os.listdir(main_dir)
#         if os.path.isdir(os.path.join(main_dir, name))
#         and not name.startswith('.')
#     ])

#     assert len(dirs) > 0
#     for i in range(len(dirs)):
#         shape += np.load(os.path.join(main_dir, dirs[i],
#                                       'postprocessed_data.npz'),
#                          allow_pickle=True)['inputs'].shape[0]

#     input_results = np.zeros([shape, activation_electrode_no])
#     output_results = np.zeros([shape, readout_electrode_no])
#     previous_shape = 0
#     for i in range(len(dirs)):
#         data = np.load(os.path.join(main_dir, dirs[i],
#                                     'postprocessed_data.npz'),
#                        allow_pickle=True)
#         current_shape = previous_shape + data['inputs'].shape[0]
#         input_results[previous_shape:current_shape] = data['inputs']
#         output_results[previous_shape:current_shape] = data['outputs']
#         previous_shape = current_shape
#         info = data['info']

#     info = dict(np.ndenumerate(info))[()]
#     info['input_data']['input_distribution'] = 'mixed'
#     info['input_data']['phase'] = 'mixed'
#     index = np.random.permutation(np.arange(shape))
#     input_results = input_results[index]
#     output_results = output_results[index]

#     limit = int(shape * 0.75)

#     np.savez(os.path.join(main_dir, 'training_data'),
#              inputs=input_results[:limit],
#              outputs=output_results[:limit],
#              info=info)
#     np.savez(os.path.join(main_dir, 'test_data'),
#              inputs=input_results[limit:],
#              outputs=output_results[limit:],
#              info=info)

#if __name__ == "__main__":
# import matplotlib

# matplotlib.use('TkAgg')
# main_dir = "C:/Users/Unai/Documents/github/brainspy-smg/tmp/brains_setup/sampling_data_1KSPS_arsenic_test_2022_03_17_171248"
# inputs, outputs, info = post_process(main_dir,
#                                      clipping_value=[-114, 114],
#                                      filename="postprocessed_data_clipped")
# dirs = list(
#     [
#         name
#         for name in os.listdir(main_dir)
#         if os.path.isdir(os.path.join(main_dir, name)) and not name.startswith(".")
#     ]
# )

# assert len(dirs) > 0
# for i in range(len(dirs)):
# inputs, outputs, info = post_process(main_dir)
# output_hist(outputs, os.path.join(main_dir, dirs[i]), bins=1000, show=True)