Home > database > ensemble_csv2datastruct.m

ensemble_csv2datastruct

PURPOSE ^

Loads a CSV file into an Ensemble data structure

SYNOPSIS ^

function data_st = ensemble_csv2datastruct(in_st,params)

DESCRIPTION ^

 Loads a CSV file into an Ensemble data structure

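 USAGE:
 Either 
    data_st = ensemble_csv2datastruct(fname)
 or
    data_st = ensemble_csv2datastruct([],params)
    where there is a field params.fname
 or
    data_st = ensemble_csv2datastruct(fname,params)
    where params has no fname field

 Setting params.USE_MATCH_HEURISTIC to true switches line parsing from
 splitting on commas to regexp matching (default is false).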

 KNOWN ISSUES:
   Fields containing strings with more than one comma will not parse
   correctly. Need to improve the regexp statement.

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SOURCE CODE ^

0001 function data_st = ensemble_csv2datastruct(in_st,params)
0002 % Loads a CSV file into an Ensemble data structure
0003 %
0004 % USAGE:
0005 % Either
0006 %    data_st = ensemble_csv2datastruct(fname)
0007 % or
0008 %    data_st = ensemble_csv2datastruct([],params)
0009 %    where there is a field params.fname
0010 % or
0011 %    data_st = ensemble_csv2datastruct(fname,params)
0012 %    where params has no fname field
0013 %
0014 % INPUTS:
0015 %   in_st  - name of the CSV file to load, or [] when the name is given
0016 %            in params.fname
0017 %   params - optional struct. Recognized fields:
0018 %     fname               - name of the CSV file; used in preference to
0019 %                           in_st when present
0020 %     USE_MATCH_HEURISTIC - if true, tokens are extracted with a 'match'
0021 %                           regexp instead of splitting on commas
0022 %                           (default: 0)
0023 %
0024 % OUTPUT:
0025 %   data_st - structure returned by ensemble_init_data_struct with
0026 %     name - file name without path or extension
0027 %     type - 'csvfile'
0028 %     vars - names from the header line, whitespace runs replaced by '_'
0029 %     data - 1 x nvars cell array holding one column per variable: a
0030 %            numeric column vector if the value in the first data row
0031 %            converts with str2double, otherwise a cell array of strings
0032 %
0033 % KNOWN ISSUES:
0034 %   Fields containing strings with more than one comma will not parse
0035 %   correctly. Need to improve the regexp statement.
0036 %   Column types are decided from the first data row only; later values
0037 %   in a numeric column that do not convert are stored as NaN.
0038 
0039 % 09Aug2014 Petr Janata
0040 % 26Jan2014 PJ - made compatible with ensemble_jobman
0041 % 08Feb2015 PJ - parsing of input lines is now better able to handle empty
0042 %                values, while retaining ability to preserve commas in
0043 %                quotes
0044 % 20Aug2015 PJ - enabled dynamic setting of USE_MATCH_HEURISTIC (default is false);
0045 
0046 % Work out which file to load
0047 if nargin < 2
0048   if ischar(in_st)
0049     fname = in_st;
0050   else
0051     error('%s: argument must be string if only one argument is passed in', mfilename)
0052   end
0053   params = struct;
0054 elseif isfield(params, 'fname')
0055   fname = params.fname;
0056 elseif ~isempty(in_st)
0057   fname = in_st;
0058 else
0059   error('%s: name of file to load must be provided in 1st argument or as the fname field in 2nd argument', mfilename)
0060 end
0061 
0062 % With two arguments the name is not type-checked above, so reject anything
0063 % that is not text here rather than failing inside exist() or fopen()
0064 isStringObj = exist('isstring') ~= 0 && isstring(fname);
0065 if ~ischar(fname) && ~isStringObj
0066   error('%s: name of file to load must be a string', mfilename)
0067 end
0068 
0069 data_st = ensemble_init_data_struct;
0070 
0071 if ~exist(fname,'file')
0072   error('%s: File %s does not exist', mfilename, fname)
0073 end
0074 [~,fstub] = fileparts(fname);
0075 data_st.name = fstub;
0076 data_st.type = 'csvfile';
0077 
0078 % Open the file. exist(...,'file') is also true for folders, so the result
0079 % of fopen still has to be checked.
0080 [fid, msg] = fopen(fname,'rt');
0081 if fid < 0
0082   error('%s: Could not open file %s: %s', mfilename, fname, msg)
0083 end
0084 
0085 % Make sure the file gets closed however this function exits, including error()
0086 closeFile = onCleanup(@() fclose(fid));
0087 
0088 % Read the headerline
0089 cl = fgetl(fid);
0090 if ~ischar(cl)
0091   % fgetl returns -1 when there is nothing to read
0092   error('%s: File %s is empty', mfilename, fname)
0093 end
0094 
0095 % Drop a trailing carriage return, in case one is left behind when reading a
0096 % file with CRLF line endings, so it does not become part of the last field
0097 cl = regexprep(cl,'\r$','');
0098 
0099 % Parse the line
0100 vars = regexp(cl,',','split');
0101 nvars = length(vars);
0102 
0103 % Sanitize the variable names, replacing whitespace with underscores
0104 vars = regexprep(vars,'\s+','_');
0105 data_st.vars = vars;
0106 
0107 % Read the rest of the file
0108 data  = cell(1,nvars);
0109 numRows = 0;
0110 
0111 if isfield(params, 'USE_MATCH_HEURISTIC')
0112   USE_MATCH_HEURISTIC = params.USE_MATCH_HEURISTIC;
0113 else
0114   USE_MATCH_HEURISTIC = 0;
0115 end
0116 
0117 while ~feof(fid)
0118   cl = fgetl(fid); % read the line
0119   if ~ischar(cl)
0120     % Nothing left to read
0121     break
0122   end
0123   cl = regexprep(cl,'\r$','');
0124   numRows = numRows+1;
0125   
0126   % Parse the line, taking care to preserve commas in quotes
0127   % The problem with this approach is that if there is a missing value, the
0128   % number of tokens is less than the number of columns
0129   if USE_MATCH_HEURISTIC
0130     pat = '(".+")|([^,]*)'; %  '(".+")|([^,]*)'
0131     tokens = regexp(cl,pat,'match');
0132   else
0133     % Split the string on commas, but not those occurring in quotes
0134     % KNOWN ISSUE: Will give an erroneous result if the string within quotes
0135     % contains more than one comma. Need a better lookaround assertion in
0136     % the regexp pattern.
0137     pat = '(?!(?=,[\w\d\s]+")),';
0138     tokens = regexp(cl,pat,'split');
0139   end
0140   
0141   % Replace double quotes
0142   tokens = regexprep(tokens,'"','');
0143   ntok = length(tokens);
0144   
0145   % Make sure the number of tokens is equal to the number of variables
0146   if ntok ~= nvars
0147     % Check whether the data for the last variable is simply empty. This
0148     % would be true if the last character in the current line is a comma.
0149     % Guard against an empty line, for which cl(end) is an indexing error.
0150     endsWithComma = ~isempty(cl) && strcmp(',',cl(end));
0151     if ntok == (nvars-1) && endsWithComma
0152       tokens{end+1} = ' ';
0153       ntok = length(tokens);
0154     else
0155       error('%s: Number of tokens (%d) does not match number of variables (%d)', mfilename, ntok, nvars)
0156     end
0157   end
0158   
0159   % Determine whether variables are numeric or not
0160   if numRows == 1
0161     varIsNumeric = ~isnan(str2double(tokens));
0162   end
0163   
0164   for itok = 1:ntok
0165     % Place the token into the data slot, converting it to a number when
0166     % the column was found to be numeric in the first data row
0167     if varIsNumeric(itok)
0168       data{itok}(numRows,1) = str2double(tokens{itok});
0169     else
0170       data{itok}{numRows,1} = tokens{itok};
0171     end
0172   end % for itok
0173 
0174 end % while ~feof(fid)
0175 
0176 % The file is closed by the onCleanup object when the function returns
0177 
0178 % Make sure that the number of variables matches the number of columns
0179 ncols = size(data,2);
0180 if nvars ~= ncols
0181   error('%s: Number of columns (%d) does not match number of variables (%d)',mfilename, ncols, nvars);
0182 end
0183 
0184 data_st.data = data;
0185 return

Generated on Wed 20-Sep-2023 04:00:50 by m2html © 2003