Home > database > ensemble_combine_datastructs.m

ensemble_combine_datastructs

PURPOSE ^

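Combines two Ensemble datastructs in one of three ways (identical variable sets, differing variable sets, or an explicit 'merge' heuristic), as detailed under DESCRIPTION.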

SYNOPSIS ^

function out_st = ensemble_combine_datastructs(dst,params)

DESCRIPTION ^

 Combines two Ensemble datastructs in one of the three following ways:
 (1) If the set of variables is identical, the function checks for an 
     exact match of rows across all variables. If the match is not exact,
     ensemble_concat_datastruct is called to concatenate the
     data structures.

 (2) If the two sets of variables differ, a union of the variables will be
     created if and only if all of the rows of the originally intersecting
     variable sets match.

 (3) If params.heuristic = 'merge', variables from data structure 2 are
     added to data structure 1, populating rows in data structure 1 whose
     values on the set of intersecting variables match. For each
     structure, uniqueness of rows (created from intersecting variables)
     is required and tested.

 In order to accomplish the multiple variable merge, it was necessary to
 utilize the categorical data types from the Statistics Toolbox.

 See also: ensemble_concat_datastruct

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

function out_st = ensemble_combine_datastructs(dst,params)
% Combines two Ensemble datastructs in one of the three following ways:
% (1) If the set of variables is identical, the function checks for an
%     exact match of rows across all variables. If the match is not exact,
%     ensemble_concat_datastruct is called to concatenate the
%     data structures.
%
% (2) If the two sets of variables differ, a union of the variables will be
%     created if and only if all of the rows of the originally intersecting
%     variable sets match.
%
% (3) If params.heuristic = 'merge', variables from data structure 2 are
%     added to data structure 1, populating rows in data structure 1 whose
%     values on the set of intersecting variables match. For each
%     structure, uniqueness of rows (created from intersecting variables)
%     is required and tested.
%
% INPUTS
%   dst    - cell array holding exactly two data structures. This function
%            reads .vars (cell array of variable names) and .data (cell
%            array indexed in parallel with .vars) from each of them;
%            .name is read only when reporting non-unique rows.
%   params - (optional) struct. Only params.heuristic is examined, and
%            'merge' is the only recognized value.
%
% OUTPUT
%   out_st - the combined data structure. It is [] when nothing was
%            combined, i.e. when the inputs are found to be identical, or
%            when the intersecting variables differ and no merge was
%            carried out.
%
% In order to accomplish the multiple variable merge on MATLAB versions
% without the table class, it is necessary to utilize the categorical data
% types (ordinal/nominal) from the Statistics Toolbox.
%
% See also: ensemble_concat_datastruct

% 26Jan2013 Petr Janata
% 22Aug2014 PJ - added merge heuristic
% 03Sep2014 PJ - added support for merge based on multiple intersecting
%                variables
% 18Sep2014 PJ - fixed functioning if only the first input argument is
%                passed in

if ~iscell(dst) || length(dst) ~= 2
  error('%s: Two data structures are required', mfilename)
end

if nargin < 2
  params = struct;
end

% Make sure the output is assigned on every path. Without this, a call
% that requests an output fails with an "output argument not assigned"
% error whenever the two inputs turn out to be identical.
out_st = [];

% Determine whether the sets of variables differ
diffVars = setxor(dst{1}.vars, dst{2}.vars);

if isempty(diffVars) % Sets of variables are the same
  % See if the two structures are the same.
  % NOTE(review): setdiff is one-directional, so rows that exist only in
  % dst{2} are not detected by this test. Also, .data is indexed as a cell
  % array elsewhere in this function; confirm that setdiff(...,'rows')
  % behaves as intended for that kind of input.
  diffRows = setdiff(dst{1}.data, dst{2}.data,'rows');

  if ~isempty(diffRows)
    out_st = ensemble_concat_datastruct(dst);
  else
    fprintf('%s: Input data structures are the same. Nothing combined.\n', mfilename);
  end
else % Create a union of the variables if intersection matches
  % Get the set of intersecting variables
  intersectVars = intersect(dst{1}.vars, dst{2}.vars);
  numIntersect = length(intersectVars);

  % Get indices into data structure for each data structure
  ds_idxs = cell(1,2);
  for ids = 1:2
    [~,ds_idxs{ids}] = ismember(intersectVars, dst{ids}.vars);
  end

  % Compare each of the intersecting variables
  varDiffers = false(1,numIntersect);
  for ivar = 1:numIntersect
    % Make sure that if we are dealing with a cell array that contains
    % strings that any empty values are empty strings, rather than empty
    % arrays of some other type, so that setxor can compare them
    for ist = 1:2
      colIdx = ds_idxs{ist}(ivar);
      if iscell(dst{ist}.data{colIdx})
        emptyMask = cellfun('isempty',dst{ist}.data{colIdx});
        [dst{ist}.data{colIdx}{emptyMask}] = deal('');
      end
    end

    % The comparison is set-based: it ignores row order and repetitions
    varDiffers(ivar) = ~isempty(setxor(dst{1}.data{ds_idxs{1}(ivar)}, ...
      dst{2}.data{ds_idxs{2}(ivar)}));
  end

  if any(varDiffers)
    % Stay quiet if a heuristic was requested, because it gets its chance
    % to combine the data below. out_st remains [].
    if ~isfield(params,'heuristic')
      fprintf('%s: Data structures differ in variables and rows. Nothing combined.\n', mfilename);
    end
  else
    % Initialize the output data struct with the first data struct
    out_st = dst{1};

    % Determine which of the variables from the second data struct need to
    % be copied
    copyVars = setdiff(dst{2}.vars, intersectVars);

    % Append the variables that need to be copied to the output data
    % structure list
    out_st.vars = [out_st.vars, copyVars];

    % Get the locations of the variables to be copied from the second data
    % struct
    [~,srcCols] = ismember(copyVars, dst{2}.vars);

    % Copy the data struct 2 data to the output structure
    out_st.data = [out_st.data dst{2}.data(srcCols)];
  end
end

% If a heuristic has been specified, apply it. Note that this happens
% regardless of whether the code above already produced a result; a
% successful merge replaces out_st.
CAN_HANDLE_MULTIPLE_VARS = true;
if isfield(params,'heuristic')
  switch params.heuristic
    case 'merge'
      % Get the intersecting variables
      intersectVars = intersect(dst{1}.vars, dst{2}.vars);

      numIntersect = length(intersectVars);
      if numIntersect > 1 && ~CAN_HANDLE_MULTIPLE_VARS
        error('Too many intersecting variables')
      end

      % Create tables for each data structure consisting only of the
      % intersecting variables. We can then test these tables for
      % uniqueness of rows, and find row matches between the two data
      % structs.
      % If the table class is not available, we have to first convert the
      % data to a categorical class (requires Statistics Toolbox), with
      % each of the variables being an ordinal type. Unique rows can then
      % be found.
      usingTableClass = exist('table','class') ~= 0;
      ordinalTbl = cell(1,2);
      for ids = 1:2
        [~,idxs] = ismember(intersectVars,dst{ids}.vars);
        tbl = dst{ids}.data(idxs);

        % Convert tbl into a table type if possible
        if usingTableClass
          ordinalTbl{ids} = table(tbl{:});
        else
          for iint = 1:length(intersectVars)
            ordinalTbl{ids}(:,iint) = ordinal(tbl{iint});
          end
        end

        % Check for uniqueness of rows
        uniqueRows = unique(ordinalTbl{ids},'rows');
        numNonUnique = size(ordinalTbl{ids},1) - size(uniqueRows,1);
        if numNonUnique
          % Report the structure that is actually being examined
          fprintf('Found %d non-unique rows in data struct: %s\n',numNonUnique,dst{ids}.name);

          % Map every row onto its entry in uniqueRows
          [~,idxs] = ismember(ordinalTbl{ids},uniqueRows,'rows');

          % Count how many times each unique row occurs. Indices of zero
          % (rows that ismember could not match) are skipped.
          rowCounts = accumarray(idxs(idxs > 0), 1);

          % Display the nonUnique rows
          disp(uniqueRows(rowCounts > 1,:))

          fprintf('Cannot merge data ...\n');
          return
        end
      end

      % In the non-table case, convert our ordinal table to a nominal
      % table in order to accomplish the following ismember operation
      if ~usingTableClass
        [matchMask, srcIdxs] = ismember(nominal(ordinalTbl{1}),nominal(ordinalTbl{2}),'rows');
      else
        [matchMask, srcIdxs] = ismember(ordinalTbl{1},ordinalTbl{2},'rows');
      end

      % Figure out which variables we'll be appending to dst1
      copyVars = setdiff(dst{2}.vars, dst{1}.vars);
      dst{1}.vars = [dst{1}.vars copyVars];

      % The structs returned by set_var_col_const are used below to look
      % up, by variable name, the index of that variable within .data
      d1cols = set_var_col_const(dst{1}.vars);
      d2cols = set_var_col_const(dst{2}.vars);

      nvars = length(copyVars);
      for ivar = 1:nvars
        currVar = copyVars{ivar};
        srcCol = dst{2}.data{d2cols.(currVar)};

        % Initialize the d1 data cell to the correct type. The class of
        % the column itself is tested (rather than of its first element)
        % so that an empty source column does not raise an indexing error.
        if iscell(srcCol)
          dst{1}.data{d1cols.(currVar)} = cell(size(matchMask));
        elseif isnumeric(srcCol)
          dst{1}.data{d1cols.(currVar)} = nan(size(matchMask));
        elseif islogical(srcCol)
          dst{1}.data{d1cols.(currVar)} = false(size(matchMask));
        end

        % Populate only those rows of dst1 that have a match in dst2
        dst{1}.data{d1cols.(currVar)}(matchMask) = srcCol(srcIdxs(matchMask));
      end
      out_st = dst{1};
    otherwise
      fprintf('%s: Unknown heuristic: %s\nData structures not combined ...\n', mfilename, params.heuristic);
  end
end % if isfield(params,'heuristic')

return

Generated on Sun 25-Aug-2019 04:00:39 by m2html © 2003