function out_st = ensemble_combine_datastructs(dst,params)
% Combines two data structures into a single data structure.
%
% out_st = ensemble_combine_datastructs(dst,params)
%
% INPUTS
%   dst    - 1x2 cell array of data structures. Each structure is expected
%            to have a .vars field (cell array of variable names) and a
%            .data field (cell array with one column of data per variable).
%            A .name field is read when reporting non-unique rows.
%   params - (optional) parameter structure.
%     params.heuristic - combination strategy. Currently only 'merge' is
%            recognized: variables that exist only in dst{2} are added to
%            dst{1}, and their values are copied into those rows of dst{1}
%            whose values on the shared variables match a row of dst{2}.
%
% OUTPUT
%   out_st - the combined data structure, or [] if nothing was combined.
%
% Behavior without a heuristic:
%   - same variables, different data: the structures are concatenated via
%     ensemble_concat_datastruct
%   - same variables, same data: nothing is combined
%   - different variables, identical values on all shared variables: the
%     variables unique to dst{2} are appended to dst{1}
%   - different variables, differing values on shared variables: nothing
%     is combined

% Give the output a defined value on every exit path (previously the
% "nothing combined" paths could return without assigning it).
out_st = [];

if ~iscell(dst) || length(dst) ~= 2
  error('%s: Two data structures are required', mfilename)
end

if nargin < 2
  params = struct;
end

% Determine whether the two structures have the same set of variables
diffVars = setxor(dst{1}.vars, dst{2}.vars);

if isempty(diffVars)
  % Same set of variables. Align the columns of dst{2} to the variable
  % order of dst{1} and compare the data directly. isequaln treats NaNs
  % in corresponding positions as equal. (setdiff with 'rows' cannot be
  % used here because .data is a cell array of columns, not a matrix.)
  [~,colOrder] = ismember(dst{1}.vars, dst{2}.vars);
  data1 = dst{1}.data(:);
  data2 = dst{2}.data(colOrder);
  dataDiffers = ~isequaln(data1, data2(:));

  if dataDiffers
    out_st = ensemble_concat_datastruct(dst);
  else
    fprintf('%s: Input data structures are the same. Nothing combined.\n', mfilename);
  end
else
  % The variable sets differ. Find the variables that are shared.
  intersectVars = intersect(dst{1}.vars, dst{2}.vars);
  numIntersect = length(intersectVars);

  % Locate the shared variables within each data structure
  ds_idxs = cell(1,2);
  for ids = 1:2
    [~,ds_idxs{ids}] = ismember(intersectVars, dst{ids}.vars);
  end

  % Check whether the values of each shared variable are the same in both
  % structures
  varDiffers = false(1,numIntersect);
  for ivar = 1:numIntersect
    % Replace empty cells with empty strings so that cell-valued columns
    % can be passed to setxor
    for ist = 1:2
      col = ds_idxs{ist}(ivar);
      if iscell(dst{ist}.data{col})
        emptyMask = cellfun('isempty',dst{ist}.data{col});
        dst{ist}.data{col}(emptyMask) = {''};
      end
    end
    varDiffers(ivar) = ~isempty(setxor(dst{1}.data{ds_idxs{1}(ivar)}, dst{2}.data{ds_idxs{2}(ivar)}));
  end

  if any(varDiffers)
    % Only report the failure if no heuristic will be attempted below
    if ~isfield(params,'heuristic')
      fprintf('%s: Data structures differ in variables and rows. Nothing combined.\n', mfilename);
    end
    out_st = [];
  else
    % Start with the first data structure and append those variables
    % that exist only in the second one.
    % NOTE(review): columns are copied as-is, which presumably relies on
    % both structures having their rows in the same order -- confirm.
    out_st = dst{1};
    copyVars = setdiff(dst{2}.vars, intersectVars);
    out_st.vars = [out_st.vars, copyVars];
    [~,srcCols] = ismember(copyVars, dst{2}.vars);
    out_st.data = [out_st.data dst{2}.data(srcCols)];
  end
end

% Apply the requested heuristic, if any. A successful heuristic replaces
% whatever result was obtained above.
CAN_HANDLE_MULTIPLE_VARS = 1;
if isfield(params,'heuristic')
  switch params.heuristic
    case 'merge'
      % The shared variables act as the key on which rows are matched
      intersectVars = intersect(dst{1}.vars, dst{2}.vars);

      numIntersect = length(intersectVars);
      if numIntersect > 1 && ~CAN_HANDLE_MULTIPLE_VARS
        error('Too many intersecting variables')
      end

      % Use the table class when it is available; otherwise fall back on
      % ordinal/nominal arrays
      usingTableClass = exist('table','class') ~= 0;

      % Build a table of key values for each data structure and make
      % sure that each key combination occurs only once
      keyTbl = cell(1,2);
      for ids = 1:2
        [~,keyCols] = ismember(intersectVars,dst{ids}.vars);
        keyData = dst{ids}.data(keyCols);

        if usingTableClass
          keyTbl{ids} = table(keyData{:});
        else
          for iint = 1:numIntersect
            keyTbl{ids}(:,iint) = ordinal(keyData{iint});
          end
        end

        uniqueRows = unique(keyTbl{ids},'rows');
        numNonUnique = size(keyTbl{ids},1) - size(uniqueRows,1);
        if numNonUnique
          % Report the structure that actually contains the duplicates
          fprintf('Found %d non-unique rows in data struct: %s\n',numNonUnique,dst{ids}.name);

          % Count how often each unique key combination occurs
          [~,rowIdxs] = ismember(keyTbl{ids},uniqueRows,'rows');
          rowCounts = accumarray(rowIdxs(:),1,[size(uniqueRows,1) 1]);

          % Show the key combinations that occur more than once
          disp(uniqueRows(rowCounts>1,:));

          fprintf('Cannot merge data ...\n');
          return
        end
      end

      % For each row of the first structure, find the matching row (if
      % any) in the second structure
      if usingTableClass
        [matchMask, srcIdxs] = ismember(keyTbl{1},keyTbl{2},'rows');
      else
        [matchMask, srcIdxs] = ismember(nominal(keyTbl{1}),nominal(keyTbl{2}),'rows');
      end

      % set_var_col_const is used here as a lookup from variable name to
      % column index
      d2cols = set_var_col_const(dst{2}.vars);

      % Add the variables that exist only in the second structure
      copyVars = setdiff(dst{2}.vars, dst{1}.vars);
      dst{1}.vars = [dst{1}.vars copyVars];
      d1cols = set_var_col_const(dst{1}.vars);
      nvars = length(copyVars);
      for ivar = 1:nvars
        currVar = copyVars{ivar};
        srcCol = dst{2}.data{d2cols.(currVar)};

        % Initialize the new column with a fill value appropriate to the
        % type of the source column; rows without a match keep this value
        if iscell(srcCol)
          dst{1}.data{d1cols.(currVar)} = cell(size(matchMask));
        elseif isnumeric(srcCol)
          dst{1}.data{d1cols.(currVar)} = nan(size(matchMask));
        elseif islogical(srcCol)
          dst{1}.data{d1cols.(currVar)} = false(size(matchMask));
        end

        % Copy the values for the rows that have a match
        dst{1}.data{d1cols.(currVar)}(matchMask) = srcCol(srcIdxs(matchMask));
      end
      out_st = dst{1};
    otherwise
      fprintf('%s: Unknown heuristic: %s\nData structures not combined ...\n', mfilename, params.heuristic);
  end
end

return