From 0976d04b21679a66041a5b21a8df2604288c8fc4 Mon Sep 17 00:00:00 2001 From: Alex Shinn Date: Wed, 23 Oct 2024 23:17:03 +0900 Subject: [PATCH] Adding initial CSV library. --- lib/chibi/csv-test.sld | 72 +++++++++++ lib/chibi/csv.scm | 286 +++++++++++++++++++++++++++++++++++++++++ lib/chibi/csv.sld | 9 ++ 3 files changed, 367 insertions(+) create mode 100644 lib/chibi/csv-test.sld create mode 100644 lib/chibi/csv.scm create mode 100644 lib/chibi/csv.sld diff --git a/lib/chibi/csv-test.sld b/lib/chibi/csv-test.sld new file mode 100644 index 00000000..3842667d --- /dev/null +++ b/lib/chibi/csv-test.sld @@ -0,0 +1,72 @@ + +(define-library (chibi csv-test) + (import (scheme base) + (srfi 227) + (chibi csv) + (chibi test)) + (export run-tests) + (begin + (define string->csv + (opt-lambda (str (reader (csv-read->list))) + (reader (open-input-string str)))) + (define (run-tests) + (test-begin "(chibi csv)") + (test-assert (eof-object? (string->csv ""))) + (test '("1997" "Ford" "E350") + (string->csv "1997,Ford,E350")) + (test '("1997" "Ford" "E350") + (string->csv "\n1997,Ford,E350")) + (test '(" ") + (string->csv " \n1997,Ford,E350")) + (test '("" "") + (string->csv ",\n1997,Ford,E350")) + (test '("1997" "Ford" "E350") + (string->csv "\"1997\",\"Ford\",\"E350\"")) + (test '("1997" "Ford" "E350" "Super, luxurious truck") + (string->csv "1997,Ford,E350,\"Super, luxurious truck\"")) + (test '("1997" "Ford" "E350" "Super, \"luxurious\" truck") + (string->csv "1997,Ford,E350,\"Super, \"\"luxurious\"\" truck\"")) + (test '("1997" "Ford" "E350" "Go get one now\nthey are going fast") + (string->csv "1997,Ford,E350,\"Go get one now +they are going fast\"")) + (test '("1997" "Ford" "E350") + (string->csv + "# this is a comment\n1997,Ford,E350" + (csv-read->list + (csv-parser (csv-grammar '((comment-chars #\#))))))) + (test '("1997" "Fo\"rd" "E3\"50") + (string->csv "1997\tFo\"rd\tE3\"50" + (csv-read->list (csv-parser default-tsv-grammar)))) + (test '#("1997" "Ford" "E350") + (string->csv "1997,Ford,E350" (csv-read->vector))) + (test '#("1997" "Ford" "E350") + (string->csv "1997,Ford,E350" (csv-read->fixed-vector 3))) + (test-error + (string->csv "1997,Ford,E350" (csv-read->fixed-vector 2))) + (let ((city-csv "Los Angeles,34°03′N,118°15′W +New York City,40°42′46″N,74°00′21″W +Paris,48°51′24″N,2°21′03″E")) + (test '(*TOP* + (row (col-0 "Los Angeles") + (col-1 "34°03′N") + (col-2 "118°15′W")) + (row (col-0 "New York City") + (col-1 "40°42′46″N") + (col-2 "74°00′21″W")) + (row (col-0 "Paris") + (col-1 "48°51′24″N") + (col-2 "2°21′03″E"))) + ((csv->sxml) (open-input-string city-csv))) + (test '(*TOP* + (city (name "Los Angeles") + (latitude "34°03′N") + (longitude "118°15′W")) + (city (name "New York City") + (latitude "40°42′46″N") + (longitude "74°00′21″W")) + (city (name "Paris") + (latitude "48°51′24″N") + (longitude "2°21′03″E"))) + ((csv->sxml 'city '(name latitude longitude)) + (open-input-string city-csv)))) + (test-end)))) diff --git a/lib/chibi/csv.scm b/lib/chibi/csv.scm new file mode 100644 index 00000000..6138d36b --- /dev/null +++ b/lib/chibi/csv.scm @@ -0,0 +1,286 @@ + +;;> \section{CSV Grammars} + +;;> CSV is a simple and compact format for tabular data, which has +;;> made it popular for a variety of tasks since the early days of +;;> computing. Unfortunately, there are many incompatible dialects +;;> requiring a grammar to specify all of the different options. + +(define-record-type Csv-Grammar + (make-csv-grammar separator-chars quote-char escape-char record-separator comment-chars) + csv-grammar? + (separator-chars csv-grammar-separator-chars csv-grammar-separator-chars-set!) + (quote-char csv-grammar-quote-char csv-grammar-quote-char-set!) + (escape-char csv-grammar-escape-char csv-grammar-escape-char-set!) + (record-separator csv-grammar-record-separator csv-grammar-record-separator-set!) + (comment-chars csv-grammar-comment-chars csv-grammar-comment-chars-set!)) + +;; TODO: Consider some minimal low-level parsing options. In general +;; this is intended to be performed by the parser, but if we can skip +;; intermediate string generation (e.g. parsing numbers directly) it +;; can save a considerable amount of garbage when parsing large files. + +;;> Creates a new CSV grammar from the given spec, an alist of symbols +;;> to values. The following options are supported: +;;> +;;> \itemlist[ +;;> \item{\scheme{'separator-chars} - A non-empty list of characters used to delimit fields, by default \scheme{'(#\,)} (comma-separated).} +;;> \item{\scheme{'quote-char} - A single character used to quote fields containing special characters, or \scheme{#f} to disable quoting, by default \scheme{#\"} (a double-quote).} +;;> \item{\scheme{'escape-char} - A single character used to escape characters within quoted fields, or \scheme{#f} to disable escapes, by default \scheme{#\"} (a double-quote). If this is the same character as the \scheme{quote-char}, then the quote char can be doubled to escape, but no other characters can be escaped.} +;;> \item{\scheme{'record-separator} - A single character used to delimit the record (row), or one of the symbols \scheme{'cr}, \scheme{'crlf}, \scheme{'lf} or \scheme{'lax}. These correspond to sequences of carriage return and line feed, or in the case of \scheme{'lax} any of the other three sequences. Defaults to \scheme{'lax}.} +;;> \item{\scheme{'comment-chars} - A list of characters which if found at the start of a record indicate it is a comment, discarding all characters through to the next record-separator. Defaults to the empty list (no comments).} +;;> ] +(define (csv-grammar spec) + (let ((grammar (make-csv-grammar '(#\,) #\" #\" 'lax '()))) + (for-each + (lambda (x) + (case (car x) + ((separator-chars delimiter) + (csv-grammar-separator-chars-set! grammar (cdr x))) + ((quote-char) + (csv-grammar-quote-char-set! grammar (cdr x))) + ((escape-char) + (csv-grammar-escape-char-set! grammar (cdr x))) + ((record-separator newline-type) + (let ((rec-sep + (case (cdr x) + ((crlf lax) (cdr x)) + ((cr) #\return) + ((lf) #\newline) + (else + (if (char? (cdr x)) + (cdr x) + (error "invalid record-separator, expected a char or one of 'lax or 'crlf" (cdr x))))))) + (csv-grammar-escape-char-set! grammar (cdr x)))) + ((comment-chars) + (csv-grammar-comment-chars-set! grammar (cdr x))) + (else + (error "unknown csv-grammar spec" x)))) + spec) + grammar)) + +;;> The default CSV grammar for convenience, with all of the defaults +;;> from \scheme{csv-grammar}, i.e. comma-delimited with \scheme{#\"} +;;> for quoting, doubled to escape. +(define default-csv-grammar + (csv-grammar '())) + +;;> The default TSV grammar for convenience, splitting fields only on +;;> tabs, with no quoting or escaping. +(define default-tsv-grammar + (csv-grammar '((separator-chars #\tab) (quote-char . #f) (escape-char . #f)))) + +;;> \section{CSV Parsers} + +;;> Parsers are low-level utilities to perform operations on records a +;;> field at a time. You generally want to work with readers, which +;;> build on this to build records into familiar data structures. + +;;> Parsers follow the rules of a grammar to parse a single CSV +;;> record, possible comprised of multiple fields. A parser is a +;;> procedure of three arguments which performs a fold operation over +;;> the fields of the record. The parser signature is: +;;> \scheme{(parser kons knil in)}, where \scheme{kons} itself is +;;> a procedure of three arguments: \scheme{(proc acc index field)}. +;;> \scheme{proc} is called on each field of the record, in order, +;;> along with its zero-based \scheme{index} and the accumulated +;;> result of the last call, starting with \scheme{knil}. + +;;> Returns a new CSV parser for the given \var{grammar}. +(define csv-parser + (opt-lambda ((grammar default-csv-grammar)) + (lambda (kons knil in) + (when (pair? (csv-grammar-comment-chars grammar)) + (let lp () + (when (memv (peek-char in) (csv-grammar-comment-chars grammar)) + (csv-skip-line in grammar) + (lp)))) + (let lp ((acc knil) + (index 0) + (out (open-output-string))) + (define (finish-row) + (let ((field (get-output-string out))) + (if (and (zero? index) (equal? field "")) + ;; empty row, read again + (lp acc index out) + (kons acc index field)))) + (let ((ch (read-char in))) + (cond + ((eof-object? ch) + (let ((field (get-output-string out))) + (if (and (zero? index) (equal? field "")) + ;; no data + ch + (kons acc index field)))) + ((memv ch (csv-grammar-separator-chars grammar)) + (lp (kons acc index (get-output-string out)) + (+ index 1) + (open-output-string))) + ((eqv? ch (csv-grammar-quote-char grammar)) + ;; TODO: Consider a strict mode to enforce no text + ;; before/after the quoted text. + (csv-read-quoted in out grammar) + (lp acc index out)) + ((eqv? ch (csv-grammar-record-separator grammar)) + (finish-row)) + ((and (eqv? ch #\return) + (memq (csv-grammar-record-separator grammar) '(crlf lax))) + (cond + ((eqv? (peek-char in) #\newline) + (read-char in) + (finish-row)) + ((eq? (csv-grammar-record-separator grammar) 'lax) + (finish-row)) + (else + (write-char ch out) + (lp acc (+ index 1) out)))) + ((and (eqv? ch #\newline) + (eq? (csv-grammar-record-separator grammar) 'lax)) + (finish-row)) + (else + (write-char ch out) + (lp acc index out)))))))) + +(define (csv-skip-line in grammar) + (let lp () + (let ((ch (read-char in))) + (cond + ((eof-object? ch)) + ((eqv? ch (csv-grammar-record-separator grammar))) + ((and (eqv? ch #\newline) + (eq? (csv-grammar-record-separator grammar) 'lax))) + ((and (eqv? ch #\return) + (memq (csv-grammar-record-separator grammar) '(crlf lax))) + (cond + ((eqv? (peek-char in) #\newline) (read-char in)) + ((eq? (csv-grammar-record-separator grammar) 'lax)) + (else (lp)))) + (else (lp)))))) + +(define (csv-read-quoted in out grammar) + (let lp () + (let ((ch (read-char in))) + (cond + ((eof-object? ch) + (error "unterminated csv quote" (get-output-string out))) + ((eqv? ch (csv-grammar-quote-char grammar)) + (when (and (eqv? ch (csv-grammar-escape-char grammar)) + (eqv? ch (peek-char in))) + (write-char (read-char in) out) + (lp))) + ((eqv? ch (csv-grammar-escape-char grammar)) + (write-char (read-char in) out) + (lp)) + (else + ;; TODO: Consider an option to disable newlines in quotes. + (write-char ch out) + (lp)))))) + +;;> \section{CSV Readers} + +;;> A CSV reader reads a single record, returning some representation +;;> of it. You can either loop manually with these or pass them to +;;> one of the high-level utilities to operate on a whole CSV file at +;;> a time. + +;;> The simplest reader, simply returns the field string values in +;;> order as a list. +(define csv-read->list + (opt-lambda ((parser (csv-parser))) + (opt-lambda ((in (current-input-port))) + (let ((res (parser (lambda (ls i field) (cons field ls)) '() in))) + (if (pair? res) + (reverse res) + res))))) + +;;> The equivalent of \scheme{csv-read->list} but returns a vector. +(define csv-read->vector + (opt-lambda ((parser (csv-parser))) + (let ((reader (csv-read->list parser))) + (opt-lambda ((in (current-input-port))) + (let ((res (reader in))) + (if (pair? res) + (list->vector res) + res)))))) + +;;> The same as \scheme{csv-read->vector} but requires the vector to +;;> be of a fixed size, and may be more efficient. +(define csv-read->fixed-vector + (opt-lambda (size (parser (csv-parser))) + (opt-lambda ((in (current-input-port))) + (let ((res (make-vector size))) + (let ((len (parser (lambda (prev-i i field) (vector-set! res i field) i) + 0 + in))) + (if (zero? len) + eof-object + res)))))) + +;;> Returns an SXML representation of the record, as a row with +;;> multiple named columns. +(define csv-read->sxml + (opt-lambda ((row-name 'row) + (column-names + (lambda (i) + (string->symbol (string-append "col-" (number->string i))))) + (parser (csv-parser))) + (define (get-column-name i) + (if (procedure? column-names) + (column-names i) + (list-ref column-names i))) + (opt-lambda ((in (current-input-port))) + (let ((res (parser (lambda (ls i field) + `((,(get-column-name i) ,field) ,@ls)) + (list row-name) + in))) + (if (pair? res) + (reverse res) + res))))) + +;;> \section{CSV Utilities} + +;;> A folding operation on records. \var{proc} is called successively +;;> on each row and the accumulated result. +(define csv-fold + (opt-lambda (proc + knil + (reader (csv-read->list)) + (in (current-input-port))) + (let lp ((acc knil)) + (let ((row (reader in))) + (cond + ((eof-object? row) acc) + (else (lp (proc row acc)))))))) + +;;> An iterator which simply calls \var{proc} on each record in the +;;> input in order. +(define csv-for-each + (opt-lambda (proc + (reader (csv-read->list)) + (in (current-input-port))) + (csv-fold (lambda (row acc) (proc row)) #f reader in))) + +;;> Returns a list containing the result of calling \var{proc} on each +;;> element in the input. +(define csv-map + (opt-lambda (proc + (reader (csv-read->list)) + (in (current-input-port))) + (reverse (csv-fold (lambda (row acc) (cons (proc row) acc)) '() reader in)))) + +;;> Returns a list of all of the read records in the input. +(define csv->list + (opt-lambda ((reader (csv-read->list)) + (in (current-input-port))) + (csv-map (lambda (row) row) reader in))) + +;;> Returns an SXML representation of the CSV. +(define csv->sxml + (opt-lambda ((row-name 'row) + (column-names + (lambda (i) + (string->symbol (string-append "col-" (number->string i))))) + (parser (csv-parser))) + (opt-lambda ((in (current-input-port))) + (cons '*TOP* + (csv->list (csv-read->sxml row-name column-names parser) in))))) diff --git a/lib/chibi/csv.sld b/lib/chibi/csv.sld new file mode 100644 index 00000000..f4df593d --- /dev/null +++ b/lib/chibi/csv.sld @@ -0,0 +1,9 @@ + +(define-library (chibi csv) + (import (scheme base) (srfi 227)) + (export csv-grammar csv-parser csv-grammar? + default-csv-grammar default-tsv-grammar + csv-read->list csv-read->vector csv-read->fixed-vector + csv-read->sxml + csv-fold csv-map csv->list csv-for-each csv->sxml) + (include "csv.scm"))