Provide default options for chunking of datasets (issue #635) #636

Draft · wants to merge 14 commits into master
73 changes: 73 additions & 0 deletions +io/+config/+internal/computeChunkSizeFromConfig.m
@@ -0,0 +1,73 @@
function chunkSize = computeChunkSizeFromConfig(A, datasetConfig)
% computeChunkSizeFromConfig - Compute the chunk size for a dataset using the provided configuration.
% This function determines the chunk size for a dataset based on the chunk
% dimensions provided in the datasetConfig structure. Dimensions are resolved
% according to the following rules: 'max' uses the full dataset size, a fixed
% number is used as-is, and 'null' computes a dimension size so that the chunk
% approximates the target chunk size in bytes.
%
% Inputs:
% A - A numeric dataset whose chunk size is to be computed.
% datasetConfig (1,1) struct - Struct defining chunk dimensions and chunk target size.
%
% Output:
% chunkSize - A vector specifying the chunk size for each dimension.

arguments
A {mustBeNumeric}
datasetConfig (1,1) struct
end

assert(isfield(datasetConfig, 'chunk_dimensions'), ...
'Expected datasetConfig to have field "chunk_dimensions"')
assert(isfield(datasetConfig, 'target_chunk_size'), ...
'Expected datasetConfig to have field "target_chunk_size"')

% Get dataset size
dataSize = size(A);
dataSize = fliplr(dataSize); % MatNWB quirk: dimensions are flipped relative to the HDF5 dataset layout
numDimensions = numel(dataSize);

% Extract relevant configuration parameters
chunkDimensions = datasetConfig.chunk_dimensions;
if iscell(chunkDimensions)
numChunkDimensions = cellfun(@numel, chunkDimensions);
chunkDimensions = chunkDimensions{numChunkDimensions == numDimensions};
end

defaultChunkSize = datasetConfig.target_chunk_size.value; % in bytes
dataByteSize = io.config.internal.getDataByteSize(A);

% Initialize chunk size array
chunkSize = zeros(1, numDimensions);

% Calculate chunk size for each dimension
for dim = 1:numDimensions
if dim > numel(chunkDimensions)
% Use full size for dimensions beyond the specification
chunkSize(dim) = dataSize(dim);

else
dimSpec = chunkDimensions{dim};
if isempty(dimSpec)
% Compute the size of a 'null' dimension: distribute the target
% chunk volume evenly across all 'null' dimensions
targetRatio = defaultChunkSize / dataByteSize; % fraction of the data that fits in one target-sized chunk
nullDimensions = find(cellfun(@isempty, chunkDimensions));
proportionalSize = nthroot(targetRatio, numel(nullDimensions));
chunkSize(dim) = max(1, round(proportionalSize * dataSize(dim)));
elseif isnumeric(dimSpec)
% Fixed chunk size
chunkSize(dim) = dimSpec;

elseif ischar(dimSpec) && strcmp(dimSpec, 'max')
% Use full dimension size
chunkSize(dim) = dataSize(dim);
else
error('Invalid chunk specification for dimension %d.', dim);

end
end
end

% Ensure chunk size does not exceed dataset dimensions
chunkSize = min(chunkSize, dataSize);
chunkSize = fliplr(chunkSize);
end
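
A minimal usage sketch (the configuration struct is a hypothetical stand-in for one profile entry, not the shipped configuration; field names follow the assertions above):

% One chunk specification per rank; [] marks a 'null' dimension to be
% computed, 'max' spans the full dimension
datasetConfig.chunk_dimensions = { {[]}, {[], 'max'} };
datasetConfig.target_chunk_size.value = 2^20; % target ~1 MiB chunks
A = rand(10000, 64); % ~5.1 MB of doubles
chunkSize = io.config.internal.computeChunkSizeFromConfig(A, datasetConfig);
% expected result here: [10000 13] (note the flipped dimension mapping)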
42 changes: 42 additions & 0 deletions +io/+config/+internal/configureDataPipeFromData.m
@@ -0,0 +1,42 @@
function dataPipe = configureDataPipeFromData(numericData, datasetConfig)
% configureDataPipeFromData - Configure a DataPipe from numeric data and dataset configuration

import io.config.internal.computeChunkSizeFromConfig
import types.untyped.datapipe.properties.DynamicFilter

chunkSize = computeChunkSizeFromConfig(numericData, datasetConfig);
maxSize = size(numericData);

dataPipeArgs = {...
"data", numericData, ...
"maxSize", maxSize, ...
"chunkSize", chunkSize };

hasShuffle = ~isempty(datasetConfig.compression.prefilters)...
&& contains(datasetConfig.compression.prefilters, 'shuffle');

if strcmpi(datasetConfig.compression.algorithm, "Deflate")
% Use standard compression filters
dataPipeArgs = [ dataPipeArgs, ...
{'hasShuffle', hasShuffle, ...
'compressionLevel', datasetConfig.compression.level} ...
];
else
% Create property list of custom filters for dataset creation
compressionFilter = DynamicFilter( ...
datasetConfig.compression.algorithm, ...
datasetConfig.compression.level );

if hasShuffle
shuffleFilter = types.untyped.datapipe.properties.Shuffle();
filters = [shuffleFilter compressionFilter];

else
filters = compressionFilter;

end
dataPipeArgs = [ dataPipeArgs, ...
{'filters', filters} ];

end

% Create the datapipe.
dataPipe = types.untyped.DataPipe( dataPipeArgs{:} );
end
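
A hedged usage sketch (the configuration fields mirror what this function reads; 'Deflate' takes the built-in filter path rather than a DynamicFilter):

datasetConfig.chunk_dimensions = { {[], 'max'} };
datasetConfig.target_chunk_size.value = 2^20;
datasetConfig.compression.algorithm = 'Deflate';
datasetConfig.compression.level = 3;
datasetConfig.compression.prefilters = 'shuffle';
data = int32(randi(1000, 5000, 100));
dataPipe = io.config.internal.configureDataPipeFromData(data, datasetConfig);
% dataPipe is a types.untyped.DataPipe configured with shuffle + deflate
% level 3 and the chunk size computed by computeChunkSizeFromConfig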
7 changes: 7 additions & 0 deletions +io/+config/+internal/getDataByteSize.m
@@ -0,0 +1,7 @@
function byteSize = getDataByteSize(data)
% getDataByteSize - Get the size in bytes of a numeric array
dataType = class(data);
bytesPerDataPoint = io.getMatTypeSize(dataType);

byteSize = numel(data) * bytesPerDataPoint;
end
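
For example (assuming io.getMatTypeSize reports 8 bytes for a double):

byteSize = io.config.internal.getDataByteSize(zeros(100, 50)); % 100*50*8 = 40000 bytes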
3 changes: 3 additions & 0 deletions +io/+config/+internal/reconfigureDataPipe.m
@@ -0,0 +1,3 @@
function dataPipe = reconfigureDataPipe(dataPipe, datasetConfig) %#ok<INUSD>
% reconfigureDataPipe - Reconfigure an existing DataPipe from a dataset configuration
% TODO: not yet implemented; the DataPipe is currently returned unchanged.
end
81 changes: 81 additions & 0 deletions +io/+config/+internal/resolveDatasetConfigForDataType.m
@@ -0,0 +1,81 @@
function resolvedOptions = resolveDatasetConfigForDataType(datasetConfig, nwbObject, datasetName)
% resolveDatasetConfigForDataType - Resolve the dataset configuration for individual neurodata types
% This function resolves the dataset configuration options for a given NWB object
% by traversing its type hierarchy from the base type to the most specific type,
% letting options defined for more specific types override inherited ones.
%
% Input:
% datasetConfig (struct): A struct representation of the dataset configuration JSON.
% nwbObject (types.untyped.MetaClass): An NWB object whose dataset configuration will be resolved.
% datasetName (string): Name of the dataset (object property) to resolve options for.
%
% Output:
% resolvedOptions (struct): A struct containing the resolved dataset configuration options.

arguments
datasetConfig (1,1) struct
nwbObject (1,1) types.untyped.MetaClass
datasetName (1,1) string
end

% Initialize resolvedOptions with default options.
resolvedOptions = datasetConfig.Default;

% Get the NWB object type hierarchy (from most specific to base type)
typeHierarchy = getTypeHierarchy(nwbObject);

% Traverse the type hierarchy from base type to most specific type so
% that options from more specific types take precedence
for i = numel(typeHierarchy):-1:1
typeName = typeHierarchy{i};

% Check if the neurodata type has a datasetConfig
if isfield(datasetConfig, typeName)
typeOptions = datasetConfig.(typeName);

Check warning on line 32 in +io/+config/+internal/resolveDatasetConfigForDataType.m

View check run for this annotation

Codecov / codecov/patch

+io/+config/+internal/resolveDatasetConfigForDataType.m#L32

Added line #L32 was not covered by tests

% Is datasetName part of typeOptions?
if isfield(typeOptions, datasetName)

% Merge options into resolvedOptions
datasetOptions = typeOptions.(datasetName);
resolvedOptions = mergeStructs(resolvedOptions, datasetOptions);

end
end
end
end

function typeHierarchy = getTypeHierarchy(nwbObject)
% getTypeHierarchy - Retrieve the type hierarchy of an NWB object.
% This function returns a cell array of type names, starting from the specific
% type of the given NWB object up to its base type.

typeHierarchy = {}; % Initialize an empty cell array
currentType = class(nwbObject); % Start with the specific type

while ~isempty(currentType)
shortClassName = regexp(currentType, '[^.]+$', 'match', 'once');
typeHierarchy{end+1} = shortClassName; %#ok<AGROW>

% Use MetaClass information to get the parent type
metaClass = meta.class.fromName(currentType);
if isempty(metaClass.SuperclassList)
break; % Reached the base type
end
currentType = metaClass.SuperclassList(1).Name;
end
end

function merged = mergeStructs(baseStruct, newStruct)
% mergeStructs - Merge two structs, with fields in newStruct overriding those in baseStruct.

merged = baseStruct; % Start with the base struct

fields = fieldnames(newStruct);
for i = 1:numel(fields)
field = fields{i};
if isstruct(newStruct.(field)) && isfield(baseStruct, field) && isstruct(baseStruct.(field))

% Recursively merge if both fields are structs
merged.(field) = mergeStructs(baseStruct.(field), newStruct.(field));

else
% Otherwise, override the field
merged.(field) = newStruct.(field);

end
end
end
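
A sketch of the resulting precedence, using a hypothetical configuration with a Default section and a TimeSeries-specific override (assumes the generated core types are on the MATLAB path):

cfg.Default.compression.algorithm = 'Deflate';
cfg.Default.compression.level = 3;
cfg.TimeSeries.data.compression.level = 5;
ts = types.core.TimeSeries();
resolved = io.config.internal.resolveDatasetConfigForDataType(cfg, ts, "data");
% resolved.compression.algorithm is 'Deflate' (inherited from Default)
% resolved.compression.level is 5 (the TimeSeries override wins)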
84 changes: 84 additions & 0 deletions +io/+config/applyDatasetConfiguration.m
@@ -0,0 +1,84 @@
function applyDatasetConfiguration(nwbObject, datasetConfiguration, options)
% applyDatasetConfiguration - Apply dataset configuration to datasets of an NWB object

arguments
nwbObject (1,1) NwbFile
datasetConfiguration (1,1) struct = io.config.readDatasetConfiguration()

options.OverrideExisting (1,1) logical = false
end

import io.config.internal.resolveDatasetConfigForDataType

neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject);

for iNeurodataObject = 1:numel(neurodataObjects)
thisNeurodataObject = neurodataObjects{iNeurodataObject};
thisNeurodataClassName = class(thisNeurodataObject);

% A dataset can be defined on multiple levels of the class hierarchy,
% so we need to keep track of which datasets have already been processed.
processedDatasets = string.empty;

isFinished = false;
while ~isFinished % Iterate over the type and its ancestor types (superclasses)

datasetNames = schemes.listDatasetsOfNeurodataType( thisNeurodataClassName );
for thisDatasetName = datasetNames % Iterate over all datasets of a type

if ismember(thisDatasetName, processedDatasets)
continue

end

datasetConfig = resolveDatasetConfigForDataType(...
datasetConfiguration, ...
thisNeurodataObject, ...
thisDatasetName);

datasetData = thisNeurodataObject.(thisDatasetName);

if isnumeric(datasetData)
% Create a datapipe object for a numeric dataset value.
dataByteSize = io.config.internal.getDataByteSize(datasetData);
if dataByteSize > datasetConfig.target_chunk_size.value
dataPipe = io.config.internal.configureDataPipeFromData(datasetData, datasetConfig);
end
elseif isa(datasetData, 'types.untyped.DataPipe')
if options.OverrideExisting
dataPipe = io.config.internal.reconfigureDataPipe(datasetData, datasetConfig);
end
elseif isa(datasetData, 'types.untyped.DataStub')
% todo
% error('Not implemented for files obtained by nwbRead')
else
% Unhandled dataset value type; left as-is for now
disp( class(datasetData) ) % debug output
end

if exist('dataPipe', 'var')
thisNeurodataObject.(thisDatasetName) = dataPipe;
processedDatasets = [processedDatasets, thisDatasetName]; %#ok<AGROW>
clear dataPipe
end
end

parentType = matnwb.common.getParentType(thisNeurodataClassName);

if isempty(parentType)
isFinished = true;
else
thisNeurodataClassName = parentType;
end
end
end
end

function neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject)
% getNeurodataObjectsFromNwbFile - Return all neurodata objects in an NwbFile object

objectMap = nwbObject.searchFor('types.');

neurodataObjects = objectMap.values();
neurodataClassNames = cellfun(@(c) class(c), neurodataObjects, 'uni', 0);

toIgnore = startsWith(neurodataClassNames, "types.untyped");
neurodataObjects(toIgnore) = [];
end
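
An end-to-end sketch (assumes the generated core types are on the path; the acquisition content is illustrative):

nwb = NwbFile( ...
'identifier', 'chunking-demo', ...
'session_description', 'dataset configuration demo', ...
'session_start_time', datetime('now', 'TimeZone', 'local'));
nwb.acquisition.set('demo', types.core.TimeSeries( ...
'data', rand(250000, 16), ...
'data_unit', 'n/a', ...
'starting_time', 0, ...
'starting_time_rate', 1000));
io.config.applyDatasetConfiguration(nwb); % uses the "default" profile
% Sufficiently large numeric datasets are now wrapped in DataPipe objects,
% so they will be chunked and compressed on export.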
44 changes: 44 additions & 0 deletions +io/+config/readDatasetConfiguration.m
@@ -0,0 +1,44 @@
function datasetConfig = readDatasetConfiguration(profile)
% READDATASETCONFIGURATION Reads the default dataset configuration from a JSON file.
%
% Syntax:
% configObject = io.config.READDATASETCONFIGURATION() loads the default
% dataset configuration parameters from a JSON file located in the
% "configuration" folder in the MatNWB root directory.
%
% configObject = io.config.READDATASETCONFIGURATION(profile) loads the
% dataset configuration parameters for the specified "configuration profile"
% from a JSON file located in the "configuration" folder in the MatNWB root
% directory.
%
% Input Arguments:
% - profile - Name of the configuration profile. One of "default",
% "cloud" or "archive". Defaults to "default".
%
% Output Arguments:
% - datasetConfig - A MATLAB structure containing the dataset configuration
% parameters (chunking & compression) defined in the JSON
% configuration file.
%
% Example 1 - Load default dataset configuration:
%
% % Load the default dataset configuration
% datasetConfig = io.config.readDatasetConfiguration();
% disp(datasetConfig);

arguments
profile (1,1) string {mustBeMember(profile, [ ...
"default", ...
"cloud", ...
"archive"
])} = "default"
end

switch profile
case "default"
filename = 'default_dataset_configuration.json';
case "cloud"
filename = 'cloud_dataset_configuration.json';
case "archive"
filename = 'archive_dataset_configuration.json';

end

configFilePath = fullfile(misc.getMatnwbDir, 'configuration', filename);
datasetConfig = jsondecode(fileread(configFilePath));
end
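
The decoded configuration is expected to have roughly the following shape (a hypothetical sketch inferred from how the companion functions read it, not the shipped JSON verbatim):

datasetConfig = io.config.readDatasetConfiguration();
% datasetConfig.Default.chunk_dimensions        e.g. { {[]}, {[], 'max'} }
% datasetConfig.Default.target_chunk_size.value e.g. 1048576 (bytes)
% datasetConfig.Default.compression.algorithm   e.g. 'Deflate'
% datasetConfig.Default.compression.level       e.g. 3
% datasetConfig.Default.compression.prefilters  e.g. {'shuffle'}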
7 changes: 7 additions & 0 deletions +matnwb/+common/getParentType.m
@@ -0,0 +1,7 @@
function parentTypeClassName = getParentType(typeClassName)
% getParentType - Get the parent type (superclass) of a neurodata type class.
% Returns string.empty when the parent is the types.untyped.MetaClass base class.
mc = meta.class.fromName(typeClassName);
parentTypeClassName = mc.SuperclassList(1).Name;
if strcmp(parentTypeClassName, "types.untyped.MetaClass")
parentTypeClassName = string.empty;
end
end
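
A usage sketch (assumes the generated core types are on the MATLAB path):

parent = matnwb.common.getParentType('types.core.TimeSeries');
% e.g. 'types.core.NWBDataInterface'; returns string.empty once the
% parent is types.untyped.MetaClass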
50 changes: 50 additions & 0 deletions +schemes/listDatasetsOfNeurodataType.m
@@ -0,0 +1,50 @@
function datasetNames = listDatasetsOfNeurodataType(typeClassName)
% listDatasetsOfNeurodataType - List names of datasets of a neurodata type
%
% Input Arguments:
% - typeClassName (string) -
% Full MatNWB class name for a neurodata type, e.g. "types.core.TimeSeries"
%
% Output Arguments:
% - datasetNames (string) -
% Names of datasets contained in the specified neurodata type

arguments
typeClassName (1,1) string
end

classNameSplit = string( split(typeClassName, '.') );
typesIdx = find(classNameSplit == "types");

assert(~isempty(typesIdx), 'Expected class name to contain "types"')
namespaceName = classNameSplit(typesIdx+1);
namespaceName = strrep(namespaceName, '_', '-');
namespace = schemes.loadNamespace(namespaceName, misc.getMatnwbDir);

neurodataTypeName = classNameSplit(typesIdx+2);
typeScheme = namespace.registry(neurodataTypeName);

switch typeScheme('class_type')
case 'groups'
if isKey(typeScheme, 'datasets')
datasetMaps = typeScheme('datasets');

datasetNames = repmat("", size(datasetMaps));
for i = 1:numel(datasetMaps)
if isKey(datasetMaps{i}, 'name')
datasetNames(i) = datasetMaps{i}('name');
else
% Unnamed dataset specification is unexpected; fail loudly
error('Expected dataset specification of type "%s" to have a name.', neurodataTypeName)

end
end
datasetNames(datasetNames=="") = [];
else
datasetNames = string.empty;
end

case 'datasets'
datasetNames = "data";

otherwise
error('Unexpected class type "%s".', typeScheme('class_type'))

end
end
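
For example (the exact names depend on the loaded namespace specifications):

datasetNames = schemes.listDatasetsOfNeurodataType("types.core.TimeSeries");
% expected to include "data" among the named datasets of the TimeSeries group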