Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions assimilation_code/modules/utilities/parse_args_mod.f90
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ module parse_args_mod
private

public :: get_args_from_string, &
get_csv_words_from_string, &
get_name_val_pairs_from_string, &
get_next_arg

Expand Down Expand Up @@ -207,6 +208,178 @@ subroutine get_args_from_string(inline, argcount, argwords)

end subroutine get_args_from_string

!------------------------------------------------------------------------------
! parse a single string up into delimiter-separated words

!> Split one line of text into delimiter-separated fields (CSV style).
!>
!> Unlike blank-separated parsing, consecutive delimiters are NOT collapsed:
!> each delimiter ends one field and starts the next, so empty fields are
!> returned as blank strings.  The first field is not preceded by a delimiter
!> and the last one is not followed by one.
!>
!> A field whose first character is a single or double quote runs until the
!> matching closing quote; the quotes are stripped and delimiters inside the
!> quotes are kept as data.  A backslash removes any special meaning from the
!> character that follows it; the backslash itself is dropped.
!>
!> inline    : the text to parse; trailing blanks are ignored
!> delim     : the single-character field delimiter
!> wordcount : number of fields found; 0 for an all-blank line
!> words     : the fields; unused entries are set to blanks
!>
!> error_handler is called with E_ERR when there are more fields than
!> size(words), when a field is longer than len(words), or when a closing
!> quote is followed by something other than the delimiter.

subroutine get_csv_words_from_string(inline, delim, wordcount, words)

character(len=*), intent(in)  :: inline
character,        intent(in)  :: delim
integer,          intent(out) :: wordcount
character(len=*), intent(out) :: words(:)

! all offsets are relative to 1, the left hand char in the string:
!  firstoff is the offset of the first character of the current field
!  thisoff  is the offset of the current character
!  finaloff is the offset of the last non-blank character in the string
!  inword   is true while inside a field, false between a field and the
!           delimiter that follows it
!  inquote  is true while inside a quoted field
!  endword  is the character which terminates the current field
!  wordlen  is the number of characters in the current field
!  maxw     is the max number of words, defined by the size of the array
!           the caller passes in
!  maxl     is the max length of any one word, defined by the character
!           length of the incoming array

integer :: firstoff, finaloff, thisoff
logical :: inword, inquote
integer :: maxw, maxl
integer :: wordlen, i

character(len=len(inline)) :: wordline
character(len=512) :: msgstring
character :: endword, thisc
character(len=*), parameter :: routine = 'get_csv_words_from_string'

! built with achar() because some compilers treat a backslash inside a
! character literal as the start of an escape sequence.
character, parameter :: backslash = achar(92)

! set to .true. to debug this routine.  warning: verbose
logical, parameter :: debug = .false.


maxw = size(words)
maxl = len(words)     ! len() of the array is valid even if size(words) is 0

words = ''
wordcount = 0

finaloff = len_trim(inline)
if (finaloff <= 0) return

! work on a copy; escape processing shifts characters to the left
wordline = inline

firstoff = 1
thisoff  = 1
inword   = .true.
inquote  = .false.
wordlen  = 0
endword  = delim

if (debug) print *, 'line = ', '"'//trim(wordline)//'"'

NEXTCHAR: do

   ! end of input?  if currently in a field, complete it.
   if (thisoff > finaloff) then
      if (inword) then
         wordcount = wordcount + 1
         if (wordcount > maxw) exit NEXTCHAR
         ! thisoff can be finaloff+2 if the line ended with a backslash;
         ! never count characters past the end of the data.
         wordlen = min(thisoff, finaloff+1) - firstoff
         if (debug) print *, 'thisoff, firstoff, wordlen = ', thisoff, firstoff, wordlen
         if (wordlen > maxl) exit NEXTCHAR
         words(wordcount) = wordline(firstoff:firstoff+wordlen-1)
         if (debug) print *, 'word ', wordcount, ' is ', &
                             '"'//wordline(firstoff:firstoff+wordlen-1)//'"'
      endif
      exit NEXTCHAR
   endif

   ! next character on line
   thisc = wordline(thisoff:thisoff)

   if (debug) print *, 'thisoff, finaloff, inword, endword, thisc = ', thisoff, finaloff, &
                        inword, '"'//endword//'"', ' ', '"'//thisc//'"'

   ! escape by backslash doesn't seem to be universally supported by
   ! CSV files but it does no harm to accept it.
   ! move the remainder of the string left by one, overwriting the
   ! backslash, then step over the character that was escaped so it
   ! is never interpreted as a quote or delimiter.
   if (thisc == backslash) then
      if (thisoff < finaloff) wordline(thisoff:finaloff-1) = wordline(thisoff+1:finaloff)
      wordline(finaloff:finaloff) = ' '
      finaloff = finaloff-1
      thisoff  = thisoff+1
      cycle NEXTCHAR
   endif

   ! between fields.  the only legal character here is the delimiter,
   ! which starts the next field.
   if (.not. inword) then
      if (thisc /= delim) then
         write(msgstring, *) "error? not in word, next char not delimiter"
         call error_handler(E_ERR,routine,msgstring,source)
         ! only reached if error_handler returns; leave the loop rather
         ! than cycling forever on the same character.
         exit NEXTCHAR
      endif
      inword   = .true.
      thisoff  = thisoff+1   ! skip delimiter
      firstoff = thisoff     ! first char of field
      endword  = thisc
      cycle NEXTCHAR
   endif

   ! inside a field from here on.

   ! an opening quote is only recognized as the very first character of
   ! an unquoted field.  without the inquote test the closing quote of an
   ! empty quoted field ("") would be mistaken for a second opening quote.
   if (thisoff == firstoff .and. .not. inquote .and. thisc /= delim) then
      if (thisc == '"' .or. thisc == "'") then
         inquote  = .true.
         endword  = thisc
         thisoff  = thisoff+1
         firstoff = thisoff   ! reset start of field to skip the quote
         cycle NEXTCHAR
      endif
   endif

   ! end of the field?
   if (thisc == endword) then
      inword  = .false.
      wordlen = thisoff - firstoff
      if (inquote) then
         ! step over the closing quote; the next char must be the delimiter
         inquote = .false.
         endword = delim
         thisoff = thisoff+1
      endif
      ! for an unquoted field thisoff is left on the delimiter, which the
      ! 'between fields' code above consumes on the next pass.
      wordcount = wordcount + 1
      if (wordcount > maxw) exit NEXTCHAR
      if (debug) print *, 'thisoff, firstoff, wordlen = ', thisoff, firstoff, wordlen
      if (wordlen > maxl) exit NEXTCHAR
      words(wordcount) = wordline(firstoff:firstoff+wordlen-1)
      if (debug) print *, 'word ', wordcount, ' is ', &
                          '"'//wordline(firstoff:firstoff+wordlen-1)//'"'
      cycle NEXTCHAR
   endif

   ! normal case, ordinary field contents
   thisoff = thisoff + 1

enddo NEXTCHAR

if (wordcount > maxw) then
   write(msgstring,*) 'more delimeter-separated words than max number allowed by calling code, ', maxw
   call error_handler(E_ERR,routine,msgstring,source)
endif

if (wordlen > maxl) then
   write(msgstring,*) 'one or more words longer than max length allowed by calling code, ', maxl
   call error_handler(E_ERR,routine,msgstring,source)
endif

if (debug) then
   print *, 'wordcount = ', wordcount
   do i=1, wordcount
      print *, 'word', i, ' is "'//trim(words(i))//'"'
   enddo
endif


end subroutine get_csv_words_from_string

!------------------------------------------------------------------------------
! parse a single string up into blank-separated name=value words
! and return an array of names and values, plus a flag indicating
Expand Down
103 changes: 6 additions & 97 deletions assimilation_code/modules/utilities/read_csv_mod.f90

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently there is no documentation for this read_csv_mod module.
It would be good to list what it does and does not do.

I believe it assumes that all values in a row are reals or ints or strings, not a mixture.
If a value is unsuccessfully read, it is put in as MISSING_R8 or MISSING_I8 rather than failing.
Forcing a delimiter vs detecting a delimiter.
...

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll add documentation of the new module after I merge Nancy's PR

Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ module read_csv_mod
use utilities_mod, only : error_handler, E_ERR, find_textfile_dims, &
open_file, close_file, to_upper, &
string_to_real, string_to_integer
use parse_args_mod, only : get_args_from_string
use parse_args_mod, only : get_csv_words_from_string

implicit none
private
Expand Down Expand Up @@ -124,7 +124,7 @@ subroutine csv_get_field_char(cf, varname, varvals, context)
call error_handler(E_ERR, routine, string1, context)
endif

call split_fields(line, cf%delim, nfields, entries)
call get_csv_words_from_string(line, cf%delim, nfields, entries)

! Parse the column entry. If it's _EMPTY_ then
! treat it as empty string to make it MISSING
Expand Down Expand Up @@ -213,98 +213,6 @@ integer function csv_get_nrows_from_file(fname, context) result(nrows)
end function csv_get_nrows_from_file


!---------------------------------------------------
! Adapt get_args_from_string after adjusting delims
!> Break a delimited line into its fields.
!> The line is first rewritten by normalize_delims() and the result is
!> then handed to get_args_from_string() to do the actual splitting.
!>
!> line    : the raw text line
!> delim   : the single-character field delimiter
!> nfields : number of fields returned by get_args_from_string()
!> fields  : the individual fields
subroutine split_fields(line, delim, nfields, fields)

character(len=*), intent(in)  :: line
character,        intent(in)  :: delim
integer,          intent(out) :: nfields
character(len=*), intent(out) :: fields(:)

! holds the rewritten copy of the input line
character(len=MAX_FIELDS_LEN) :: cleaned

! step 1: rewrite the line
cleaned = normalize_delims(line, delim)

! step 2: split the rewritten line
call get_args_from_string(cleaned, nfields, fields)

end subroutine split_fields


!----------------------------------------------------------------------
! Replace ',' and ';' with blanks to use above parsers.
! We also need to treat empty fields so that we don't
! collapse with the spaces and cause any column drifts.
! This serves as a wrapper for 'get_args_from_string'
! Example:
! A;B;;;;C;; --> A B _EMPTY_ _EMPTY_ _EMPTY_ C _EMPTY_ _EMPTY_
!> Return a copy of 'line' in which every delimiter is replaced by a blank
!> and every empty field is replaced by the EMPTY_ENTRY placeholder, so the
!> number of blank-separated tokens equals the number of delimited fields.
!> Carriage returns (char(13)) are dropped wherever they occur.
!>
!> line     : the raw text line; trailing blanks are ignored
!> delim    : the single-character field delimiter
!> out_line : the rewritten line, blank padded to MAX_FIELDS_LEN
!>
!> Copying stops once the output is within 64 characters of MAX_FIELDS_LEN,
!> so very long lines are truncated rather than overflowing the result.
function normalize_delims(line, delim) result(out_line)

character(len=*), intent(in) :: line
character,        intent(in) :: delim

character(len=MAX_FIELDS_LEN) :: out_line
integer :: i, j, last, lee
logical :: prev_is_delim

! Start as if a delimiter had just been seen, so a leading delimiter
! produces an empty first field.
out_line = ' '
prev_is_delim = .true.

j   = 1
lee = len(EMPTY_ENTRY)

! Offset of the last character that is neither a blank nor a carriage
! return.  Lines from CRLF files end in char(13); ignoring it here keeps a
! trailing delimiter visible to the 'trailing empty field' test below.
! The test is inside the loop because Fortran does not guarantee
! short-circuit evaluation of .and.
last = len_trim(line)
do while (last >= 1)
   if (line(last:last) /= char(13)) exit
   last = last - 1
enddo

! Go over the line 1 character at a time
SCAN_LINE: do i = 1, last
   if (line(i:i) == char(13)) cycle SCAN_LINE

   if (line(i:i) == delim) then
      ! two delimiters in a row (or a leading one) mean an empty field:
      ! write the placeholder before the separating blank
      if (prev_is_delim) then
         out_line(j:j+lee-1) = EMPTY_ENTRY
         j = j + lee
      endif
      out_line(j:j) = ' '
      j = j + 1
      prev_is_delim = .true.
   else
      out_line(j:j) = line(i:i)
      j = j + 1
      prev_is_delim = .false.
   endif

   ! prevent overflow; 64 is a small cushion
   if (j > MAX_FIELDS_LEN - 64) exit SCAN_LINE
enddo SCAN_LINE

! Trailing empty field: line ends with a delimiter (or several)
if (last > 0) then
   if (line(last:last) == delim .and. j+lee-1 <= MAX_FIELDS_LEN) then
      out_line(j:j+lee-1) = EMPTY_ENTRY
      j = j + lee
   endif
endif

! No right-trim is needed: out_line was blank filled and is returned at
! its full fixed length, so everything from offset j onward is blank.

end function normalize_delims


!---------------------------------------------------
! Find field index using cached header in csv_file_type.
integer function csv_find_field(cf, key) result(idx)
Expand Down Expand Up @@ -460,10 +368,11 @@ end function csv_get_nrows
! Open a CSV handle: cache header/dims.
! By doing so, we won't need to open the file
! every time to read header or get dimensions.
subroutine csv_open(fname, cf, context)
subroutine csv_open(fname, cf, forced_delim, context)

character(len=*), intent(in) :: fname
type(csv_file_type), intent(out) :: cf
character(len=*), intent(in), optional :: forced_delim
character(len=*), intent(in), optional :: context

character(len=*), parameter :: routine = 'csv_open'
Expand Down Expand Up @@ -496,9 +405,9 @@ subroutine csv_open(fname, cf, context)
endif

! Can also enforce a specific delim as a second argument
cf%delim = detect_delim(line)
cf%delim = detect_delim(line, forced_delim)

call split_fields(line, cf%delim, cf%ncols, cf%fields)
call get_csv_words_from_string(line, cf%delim, cf%ncols, cf%fields)

cf%is_open = .true.

Expand Down
Loading