Description of repack_formdata

function as = repack_formdata(as,ed)
% repackages data from one or more forms into a more useable format
%
% as = repack_formdata(as,ed)
%
% as -- analysis structure (see init_analysis_struct.m for details)
% ed -- experiment data
%
% Fields read by this function:
%   as.params.conn_id -- database connection ID handed to mysql()
%   as.forms          -- cell array of form names to process
%   as.num, as.txt    -- output structures for numeric and text responses
%   as.stims.ids      -- master list of stimulus IDs (extended as needed)
%   ed.form_names     -- names of the forms stored in ed.form_data
%   ed.form_data      -- cell array of form data structures (.vars, .data)
%
% Fields written in as.num and as.txt:
%   .subid, .qid, .qtxt, .niter, .data, .datenum.by_question,
%   .datenum.by_stim, .stimidx, and (as.num only) .dfid
%
% Multiple instances of a stimulus are associated with unique entries along the 3rd dimension
% of the .data field and the 2nd dimension of the .stimidx field.  Scripts that
% subsequently operate on the as structure can easily locate the multiple
% instances by searching for a specific stimidx in the stimidx field.

% 08/19/05 Petr Janata
% 02/19/06 PJ - Added checking to make sure that there are entries in the form
%               to be processed.
% 03/05/06 PJ - Added code to make sure that the rows in the .num and .txt data
%               matrices correspond to the same subjects.
%               Added writing of datenum information.
% 07/18/06 PJ - Improved handling of stimulus vs non-stimulus numerical
%               responses
% 07/19/06 PJ - Resolved handling of multiple iterations of a stimulus.
% 07/27/06 PJ - Added provisions for two types of timestamps, one that is
%               stimulus based, and one that is question based
% 08/03/06 PJ - Eliminated separate handling of single iteration data since
%               this was causing minor problems.

% Value of itype that selects the numeric (as.num) pass; any other value
% selects the text (as.txt) pass.
NUMERIC = 1;

% DEBUG == 2 reports subjects that have to be added on the fly.
% DEBUG == NUMERIC (1) echoes the destination columns and source values of
% the numeric data being copied.
DEBUG = 2;

% Data format types whose responses are stored in the numeric structure
numeric_types = {'int16','int32','int64','double','enum'};

p = as.params;

nforms = length(as.forms);

% Running counts (subjects x questions) of the response slots filled so far.
% They are kept separately for the numeric and text structures and must
% persist across forms so that repeated non-stimulus responses are appended
% rather than written over earlier ones.
iter_cnt_num = [];
iter_cnt_txt = [];

% We need to go through the forms twice.
% The first time is to extract all of the existing subject IDs and then create
% a sorted list of these.  This list will structure the .data tables and will
% ensure that the rows of the .num and .txt data structures are in register.
% It is much easier to ensure that this reasonable assumption is true at this
% stage rather than having to do explicit matching by subject ID in scripts
% that analyze the analysis structure. Note that mismatches in subids may
% persist across elements of an analysis structure, e.g. as(1) vs as(2).

master_sublist = {};
for ifrm = 1:nforms
  cfname = as.forms{ifrm};  % get the current form name
  frm_idx = strmatch(cfname,ed.form_names,'exact');

  % copy the current formdata
  fd = ed.form_data{frm_idx};

  % make sure there are entries to process
  nentries = size(fd.data{1},1);
  if nentries == 0
    continue
  end

  % Map our form column constants
  FD = set_form_col_const(fd.vars);

  % Get the list of subjects for whom we have data in this form
  subids = fd.data{FD.SUB_ID};

  % Update the master list
  master_sublist = union(master_sublist,subids);
end  % for ifrm=

% set the subid fields to have the master list
as.num.subid = master_sublist;
as.txt.subid = master_sublist;

for ifrm = 1:nforms
  cfname = as.forms{ifrm};  % get the current form name
  frm_idx = strmatch(cfname,ed.form_names,'exact');

  % Make a local copy of the form data
  fd = ed.form_data{frm_idx};
  nentries = size(fd.data{1},1);

  if nentries == 0
    continue
  end

  % Map our form column constants
  FD = set_form_col_const(fd.vars);

  % Get some basic subject info
  subids = fd.data{FD.SUB_ID};
  unique_subs = unique(subids);
  nsub = length(unique_subs);

  % Determine what the questions are on this form and create appropriate
  % entries in the output structures if they don't yet exist

  qid = cat(2,fd.data{[FD.QUEST_ID FD.SUBQUEST_ID]});  % get question IDs

  % Composite ID of the form qid.sqid, one per response row
  compqid = qid(:,1)+qid(:,2)/10;

  % quest_idxs holds, for each unique question, the index of a row in the
  % form data where that question occurs
  [unique_quest_ids, quest_idxs] = unique(qid,'rows'); % get the unique questions

  % Create the output question ID field
  out_ids = unique_quest_ids(:,1)+unique_quest_ids(:,2)/10; % qid.sqid

  % Retrieve the data formats
  qid_str = sprintf('(question_id=%d AND subquestion=%d) OR ', unique_quest_ids');
  qid_str(end-3:end) = [];  % strip the trailing ' OR '
  mysql_str = sprintf(['SELECT type, data_format_id FROM data_format ' ...
    'RIGHT JOIN question_x_data_format ON' ...
    ' data_format.data_format_id=question_x_data_format.answer_format_id ' ...
    'WHERE (%s);'], qid_str);
  % NOTE(review): types and dfid are used below as if they were ordered like
  % unique_quest_ids (one row per question, same order). The query has no
  % ORDER BY, so this relies on the order in which the database returns the
  % rows -- TODO confirm.
  [types, dfid] = mysql(p.conn_id,mysql_str);

  % Find out which of the entries have to do with a stimulus ID
  stim_mask = ~isnan(fd.data{FD.STIM_ID});

  % Determine which ones are numeric
  is_numeric = ismember(types,numeric_types);

  for itype = 1:2

    % Define a few variables that depend on the data type
    if itype == NUMERIC  % numeric data
      ts = as.num;
      type_mask = is_numeric;
      src_col = FD.RESP_ENUM;  % column in fd.data that we pull data from
      iter_cnt = iter_cnt_num;
    else  % non-numeric data
      ts = as.txt;
      type_mask = ~is_numeric;
      src_col = FD.RESP_TXT;  % column in fd.data that we pull data from
      iter_cnt = iter_cnt_txt;
    end

    % Check to see if we need to handle any data of this type
    if ~any(type_mask)
      continue
    end

    % Check to see which ones we need to add to the numeric and text output structures
    proc_qid_idxs = find(~ismember(out_ids,ts.qid) & type_mask);
    nproc = length(proc_qid_idxs);

    if nproc
      % copy the new question IDs
      insert_idxs = (length(ts.qid)+1):(length(ts.qid)+nproc);
      ts.qid(insert_idxs) = out_ids(proc_qid_idxs);

      % copy the new data format IDs (for decoding enums)
      if itype == NUMERIC
        ts.dfid(insert_idxs) = dfid(proc_qid_idxs);
      end

      % copy the question text
      tmp = cat(2,fd.data{[FD.QUEST_TXT FD.SUBQUEST_TXT]});
      for iproc = 1:nproc
        % Row of the form data that holds the text for this new question.
        % The question text and subquestion text must be taken from the
        % same row for the comparison to be meaningful.
        src_row = quest_idxs(proc_qid_idxs(iproc));

        if ~strcmp(tmp{src_row,1},tmp{src_row,2})
          % Question and subquestion text differ, so keep both
          ts.qtxt{insert_idxs(iproc)} = cell2str(tmp(src_row,:),'\n');
        else
          ts.qtxt{insert_idxs(iproc)} = tmp{src_row,1};
        end
      end
    end % if nproc

    %
    % COPY THE DATA FOR EACH SUBJECT.
    %
    % Exactly how this happens depends on the how we want to handle multiple
    % responses to the same questions, as happens when the same form is
    % presented multiple times. This can happen either in the context of
    % non-stimulus based forms, e.g. PANAS, or with forms that are associated
    % with a stimulus ID.
    %
    % The easiest way to handle this might be to place repetitions into a 3rd
    % dimension in the data field.
    %

    for isub = 1:nsub
      sid = unique_subs{isub};

      % Check to see if we already have an entry for this subject and create if
      % necessary.
      % This conditional code should not be entered given the changes made of
      % initializing the subid lists in the .num and .txt structures to be the
      % same (same elements, same order).

      row_idx = find(strcmp(ts.subid,sid));
      if isempty(row_idx)
        if DEBUG == 2
          fprintf('Creating Type %d entry for subject: %s\n', itype, sid);
        end
        ts.subid{end+1} = sid;
        row_idx = length(ts.subid);
      end

      % Make sure our iteration counters are up to date, i.e. large enough to
      % be indexed by this subject's row and by every known question column
      if (size(iter_cnt,1) < row_idx) || (size(iter_cnt,2) < length(ts.qid))
        ts.niter(row_idx,length(ts.qid)) = 0;
        iter_cnt(row_idx,length(ts.qid)) = 0;
      end

      % Determine which entries in the form data belong to this subject
      submask = strcmp(subids,sid);

      % Determine destination columns for the data. compqid(submask) pulls out
      % the question IDs that we have responses for for this subject, and
      % ts.qid is a vector that maps each question ID to a column in the output
      % data matrix.  have_dest indicates which of the responses can be copied,
      % and dest_idx gives the appropriate column in the output data matrix.

      destmask = zeros(size(submask));
      [have_dest,dest_idx] = ismember(compqid(submask),ts.qid);

      % Make sure we have some data to enter
      if any(have_dest)
        destmask(submask) = have_dest;

        if (itype == NUMERIC) && (DEBUG == NUMERIC)
          % Intentionally unterminated so that the values are echoed
          [dest_idx(have_dest) fd.data{src_col}(submask&destmask)]
          [sum(submask&destmask) length(dest_idx(have_dest))]
        end

        % update the iteration count
        dest_cols = unique(dest_idx(have_dest));
        ndest = length(dest_cols);
        if ndest > 1
          % Count the responses falling on each destination column
          ts.niter(row_idx,dest_cols) = ...
              ts.niter(row_idx,dest_cols) + ...
              hist(dest_idx(have_dest),dest_cols);
        else
          % Single destination column: every response belongs to it
          ts.niter(row_idx,dest_cols) = ...
              ts.niter(row_idx,dest_cols) + ...
              sum(have_dest);
        end

        % Now copy the data.

        % Now loop over destination columns (individual questions)
        for idest = 1:ndest
          col_idx = dest_cols(idest);

          % Find which of this subject's responses pertain to this question.
          curr_mask = (compqid == ts.qid(col_idx)) & submask;

          % check to see if we are dealing with stimulus IDs
          is_stim = any(fd.data{FD.STIM_ID}(curr_mask&stim_mask));

          if is_stim
            curr_mask = curr_mask&stim_mask;
          end

          % Figure out how many responses we have to this particular question
          nresp = sum(curr_mask);

          % Copy the data. If we aren't dealing with a stimulus, then it is easy
          % because we don't have to go into specific stim_id slots, and can
          % just pack things into the last place we left off.
          %
          % NOTE: This is a somewhat weak solution, meaning that there might
          % be situations where we want things aligned by iteration even if we
          % aren't dealing with a stimulus.  So, the handling of this may need
          % to be changed in the future.
          if ~is_stim
            % Figure out where the responses are going to go in the 3D data
            % field.
            rep_idxs = ...
                iter_cnt(row_idx,col_idx)+1:iter_cnt(row_idx,col_idx)+nresp;

            % update the iteration count
            iter_cnt(row_idx,col_idx) = max(rep_idxs);

            % copy the data
            ts.data(row_idx,col_idx,rep_idxs) = ...
                fd.data{src_col}(curr_mask);

            ts.datenum.by_question(row_idx,col_idx,rep_idxs) = ...
                fd.data{FD.DATE_TIME}(curr_mask);
          else
            % Handle each stimulus individually.
            % We have to go through some extra hoops to preserve the temporal
            % order in which they occurred.
            %
            % HANDLING OF MULTIPLE ITERATIONS OF A STIMULUS
            %
            % Multiple iterations of a stimulus ID are handled by simply
            % adding another instance of the stimulus ID to the .data and
            % .stimidx matrices (the 3rd and 2nd dimensions, respectively),
            % rather than creating an extra dimension to handle multiple
            % stimulus iterations.  The advantage of this scheme is that it
            % preserves the temporal ordering of stimulus presentation in
            % these variables. Thus, unique instances of a stimulus are now
            % identified by a combination of their stimulus ID and timestamp.

            [curr_stim_ids,idx1,idx2] = ...
                unique(fd.data{FD.STIM_ID}(curr_mask));
            curr_mask_idxs = find(curr_mask);

            % Make sure we have no NaNs among the stim ids
            bad_ids = isnan(curr_stim_ids);
            if any(bad_ids)
              warning(sprintf('Found %d NaNs among the stim IDs. subid=%s\n', sum(bad_ids), sid))
            end

            % sort them into original temporal order (idx2 maps each response
            % back onto its unique stimulus ID)
            curr_stim_ids = curr_stim_ids(idx2);
            curr_stim_times = fd.data{FD.DATE_TIME}(curr_mask_idxs);

            % Make sure that there are no duplicate stimulus submissions,
            % i.e. consecutive responses with identical stimulus ID and
            % timestamp
            dup_idxs = find((diff(curr_stim_ids)==0) & ...
                (diff(curr_stim_times)==0))+1;

            if ~isempty(dup_idxs)
              warning(sprintf(['\n%d duplicated responses with same stimulus ' ...
                    'ID (%s) and timestamp: subject (%s)\n'], length(dup_idxs), sprintf('%d,', curr_stim_ids(dup_idxs)), sid))
              curr_stim_ids(dup_idxs) = [];
              curr_stim_times(dup_idxs) = [];

              % Discount every duplicate that was removed, not just one
              nresp = nresp-length(dup_idxs);
            end

            num_presented_stims = length(curr_stim_ids);

            % issue a warning if there are more responses than stimuli,
            % i.e. some stimulus was responded to more than once
            if num_presented_stims ~= nresp
              warning(sprintf('Encountered %d stimuli and %d responses\n', num_presented_stims, nresp))
            end

            for istim = 1:num_presented_stims
              curr_stim_id = curr_stim_ids(istim);

              % Check to see if this particular stim_id already exists in the
              % master stimulus list. If not, add it to the list.
              master_stim_idx = find(as.stims.ids==curr_stim_id);

              if isempty(master_stim_idx)
                master_stim_idx = length(as.stims.ids)+1;

                % Copy the stim ID to this location
                as.stims.ids(master_stim_idx) = curr_stim_id;
              end % if isempty(master_stim_idx)

              % Record which stimulus occupies this presentation slot for
              % the subject, along with its timestamp.

              % Make sure we've started a row in the stimidx matrix for this subject
              if size(ts.stimidx,1) < row_idx
                ts.stimidx(row_idx,1) = 0;
              end

              ts.stimidx(row_idx,istim) = master_stim_idx;
              ts.datenum.by_stim(row_idx,istim) = curr_stim_times(istim);

              % make a mask for this instance of the stimulus
              curr_stimmask = (fd.data{FD.STIM_ID} == curr_stim_id) & ...
                  (fd.data{FD.DATE_TIME} == curr_stim_times(istim));

              % Copy the data to this location
              tmpdata = fd.data{src_col}(curr_mask&curr_stimmask);

              % Only 1 data value should be returned, though in the case of a
              % doubly submitted response, there will be more than
              % one. Therefore, explicitly take the first.
              ts.data(row_idx,col_idx,istim) = tmpdata(1);

              % update the iteration count
              iter_cnt(row_idx,col_idx) = iter_cnt(row_idx,col_idx)+1;
            end % for istim=
          end % if ~is_stim
        end % for idest=

      end % if any(have_dest)
    end % for isub=

    % put the temporary structure back into the proper output structure and
    % save the iteration counters so that the next form continues from them
    if itype == NUMERIC
      as.num = ts;
      iter_cnt_num = iter_cnt;
    else
      as.txt = ts;
      iter_cnt_txt = iter_cnt;
    end

  end % for itype=
end % for ifrm=
repack_formdata

PURPOSE

SYNOPSIS

DESCRIPTION

CROSS-REFERENCE INFORMATION

SOURCE CODE