Zorba - NoSQL Query Processing

Content of csv module

xquery version "3.0";

(:
 : Copyright 2006-2009 The FLWOR Foundation.
 :
 : Licensed under the Apache License, Version 2.0 (the "License");
 : you may not use this file except in compliance with the License.
 : You may obtain a copy of the License at
 :
 : http://www.apache.org/licenses/LICENSE-2.0
 :
 : Unless required by applicable law or agreed to in writing, software
 : distributed under the License is distributed on an "AS IS" BASIS,
 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 : See the License for the specific language governing permissions and
 : limitations under the License.
:)

(:~
 : Function library providing converters from CSV/TXT to XML and back.
 : The functions are optimized to work with large amounts of data, in a streaming way.
 :
 : @author Daniel Turcanu
 : @project Zorba/Data Converters/CSV
 :)
module namespace csv = "http://zorba.io/modules/csv";

(:~
 : Import module for checking if csv options element is validated.
 :)
import module namespace schemaOptions = "http://zorba.io/modules/schema";

(:~
 : Contains the definitions of the csv options element.
  :)
import schema namespace csv-options = "http://zorba.io/modules/csv-options";

declare namespace ver = "http://zorba.io/options/versioning";
declare option ver:module-version "1.0";

(:~
 : Parse a CSV or fixed size text and convert to XML.&lt;br/&gt;
 : By default each line is converted to a &amp;lt;row&gt; element, and each field to a &amp;lt;column&gt; element inside &amp;lt;row&gt;.&lt;br/&gt;
 : The format of the param $options is:&lt;br/&gt;
 :  &lt;pre&gt;
 :    &amp;lt;csv-options:options&gt;
 :        &amp;lt;csv  [separator="default comma ,"] ?
 :          [quote-char="default double quotes &amp;amp;quote;"]?
 :          [quote-escape="default double double quotes &amp;amp;quote;&amp;amp;quote;"]? /&gt;
 : 
 :        or
 :        &amp;lt;column-widths&gt;
 :          &amp;lt;column-width&gt;&lt;i&gt;[column fixed width, unsigned int]&lt;/i&gt;&amp;lt;/column-width&gt;*
 :        &amp;lt;/column-widths&gt;
 :
 :        or
 :        &amp;lt;column-positions&gt;
 :          &amp;lt;column-position&gt;&lt;i&gt;[column position on line, unsigned int]&lt;/i&gt;&amp;lt;/column-position&gt;*
 :        &amp;lt;/column-positions&gt;
 :
 :        &amp;lt;first-row-is-header [line="&lt;i&gt;first_line[-last_line]?&lt;/i&gt;"]?/&gt;?
 :        &amp;lt;start-from-row line="&lt;i&gt;first_line[-last_line]?&lt;/i&gt;"/&gt;?
 :        &amp;lt;add-last-void-columns/&gt;?
 :        &amp;lt;xml-nodes&gt;
 :          [&amp;lt;&lt;i&gt;row-name&lt;/i&gt;&gt;
 :            [&amp;lt;&lt;i&gt;column-name/&lt;/i&gt;&gt;]?
 :          &amp;lt;/&lt;i&gt;row-name&lt;/i&gt;&gt;]?
 :        &amp;lt;/xml-nodes&gt;?
 :    &amp;lt;/csv-options:options&gt;
 :  &lt;/pre&gt;
 :    All the parameters are optional and can appear in any order.&lt;br/&gt;
 :    All the parameters are case sensitive. The namespace used is "http://zorba.io/modules/csv-options".&lt;br/&gt;
 :    All strings must have UTF-8 encoding.&lt;br/&gt;
 :    Parameters csv, column-widths, column-positions are mutually exclusive. If none is specified, 
 :    the input string is assumed to be csv.&lt;br/&gt;
 :    Description of parameters:
 :    &lt;dl&gt;
 :     &lt;dt&gt;&lt;b&gt;csv&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt; Specifies the parameters for parsing a csv string.&lt;br/&gt;
 :       &lt;dl&gt; 
 :        &lt;dt&gt;&lt;b&gt;separator&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The character or group of characters used for separating fields in a row. 
 :            If it is not specified, it defaults to comma ','.
 :        &lt;/dd&gt;
 :        &lt;dt&gt;&lt;b&gt;quote-char&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The character or group of characters used for quoting the fields that may contain special characters,
 :             like separator, new line or this quote char. The default value is double quote ".&lt;br/&gt;
 :        &lt;/dd&gt;
 :        &lt;dt&gt;&lt;b&gt;quote-escape&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The group of characters used for escaping the quote char inside a field. The whole quote escape group
 :           is translated to a quote char during parsing. The default value is double double quotes "".&lt;br/&gt;
 :        &lt;/dd&gt;
 :       &lt;/dl&gt;
 :     &lt;/dd&gt;
 :     &lt;br/&gt;
 :     &lt;dt&gt;&lt;b&gt;column-widths&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;Specifies the column widths for fixed size text. It contains multiple column-width child elements
 :        specifying the fixed width of each column, from left to right.&lt;br/&gt;
 :        If the line has more fields than specified, they are ignored. 
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;column-positions&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;This is an alternative to column-widths, and specifies instead the starting position of each column.
 :        Column positions are 1 based, and are specified in order from left to right. 
 :        The last column is read until end of line. The first column position can be greater than 1, if you want
 :        to parse only a part of the input text.
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;first-row-is-header&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;The presence of this element indicates that the first row is to be treated as the name of the columns.
 :        If it is not present, then each field is enclosed in a &amp;lt;column&gt; element, 
 :        or how it is specified in &amp;lt;xml-nodes&gt; parameter.&lt;br/&gt;
 :        If the first row is the header, then each field is enclosed in an element with the corresponding name from the header.&lt;br/&gt;
 :        For example, the csv:
 :        &lt;pre&gt;
 :        &lt;i&gt;ID,Name,Occupation
 :        1,John,student&lt;/i&gt;
 :        &lt;/pre&gt;
 :        is parsed into:
 :        &lt;pre&gt;
 :        &lt;i&gt;&amp;lt;row&gt;
 :        &amp;lt;ID&gt;1&amp;lt;/ID&gt;
 :        &amp;lt;Name&gt;John&amp;lt;/Name&gt;
 :        &amp;lt;Occupation&gt;student&amp;lt;/Occupation&gt;
 :        &amp;lt;/row&gt;&lt;/i&gt;
 :        &lt;/pre&gt;
 :        If the header names contain characters that cannot be used in a QName, they are replaced with underscore '_'.&lt;br/&gt;
 :        The namespace for the header QNames is taken from the column name specified in xml-nodes parameter, or from
 :        the row name, or if that doesn't exist either then empty namespace is used. &lt;br/&gt;
 :        If the header is not the first line in the input string, the starting line can be specified in the &lt;b&gt;line&lt;/b&gt; attribute.&lt;br/&gt;
 :        If a column does not have a name, a new name is constructed in the form &lt;i&gt;columnN&lt;/i&gt; where N is the position of the column,
 :        starting from 1.&lt;br/&gt;
 :        &lt;b&gt;Subheaders&lt;/b&gt;&lt;br/&gt;
 :        If the header consists of more than one line, this can be specified in the &lt;b&gt;line&lt;/b&gt; attribute in the form
 :        "&lt;i&gt;first_line - last_line&lt;/i&gt;". Having more lines as the header translates into a hierarchy of elements in the xml.&lt;br/&gt;
 :        For example, the csv:
 :        &lt;pre&gt;
 :        &lt;i&gt;ID,Name,,Occupation
 :        ,First Name,Last Name,
 :        1,John,Howard,student&lt;/i&gt;
 :        &lt;/pre&gt;
 :        is parsed into:
 :        &lt;pre&gt;
 :        &lt;i&gt;&amp;lt;row&gt;
 :        &amp;lt;ID&gt;1&amp;lt;/ID&gt;
 :        &amp;lt;Name&gt;
 :          &amp;lt;First_Name&gt;John&amp;lt;/First_Name&gt;
 :          &amp;lt;Last_Name&gt;Howard&amp;lt;/Last_Name&gt;
 :        &amp;lt;/Name&gt;
 :        &amp;lt;Occupation&gt;student&amp;lt;/Occupation&gt;
 :        &amp;lt;/row&gt;&lt;/i&gt;
 :        &lt;/pre&gt;
 :        This element can have an attribute "accept-all-lines" with values "false" or "true" (default "false").
 :        When set to true it tells the parser to not report lines that do not have the same number of items as 
 :        the header. If set to false, the parser will raise a csv:WrongInput error for these lines.&lt;br/&gt;
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;start-from-row&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;If the data does not start from line 1 or immediately after the header, 
 :        you can specify the starting line in the &lt;b&gt;line&lt;/b&gt; attribute.&lt;br/&gt;
 :        Also you can use this attribute in the form "&lt;i&gt;first_line - last_line&lt;/i&gt;" to specify also the last line
 :        if you don't want the whole csv to be parsed.
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;add-last-void-columns&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;In the case when using headers and some data lines are shorter than the header, by default the excess columns are ignored
 :          for those lines. You can set the add-last-void-columns parameter to make all the columns appear in xml even if they are void.
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;xml-nodes&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;With this parameter you can specify the names for the row element and for the column element if there is no header.&lt;br/&gt;
 :        The first element child of this element specifies the desired QName of the row element in the output xml. 
 :        The name of this element will be used as the name of the row element.&lt;br/&gt;
 :        The element child of this row element is the column element, and its name will be used as the name of the column elements
 :        that enclose the fields in the output xml if there is no header. &lt;br/&gt;
 :        If the csv has a header, only the namespace is used from the column element.&lt;br/&gt;
 :        For example, with parameter:
 :        &lt;pre&gt;
 :        &lt;i&gt;&amp;lt;xml-nodes&gt;
 :        &amp;lt;r&gt;
 :          &amp;lt;c/&gt;
 :        &amp;lt;/r&gt;
 :        &amp;lt;/xml-nodes&gt;&lt;/i&gt;
 :        &lt;/pre&gt;
 :        the output for each line will look like:
 :        &lt;pre&gt;
 :        &lt;i&gt;&amp;lt;r&gt;
 :          &amp;lt;c&gt;field1&amp;lt;/c&gt;
 :          &amp;lt;c&gt;field2&amp;lt;/c&gt;
 :          .......
 :        &amp;lt;/r&gt;&lt;/i&gt;
 :        &lt;/pre&gt;
 :     &lt;/dd&gt;
 :    &lt;/dl&gt;
 : @param $csv the string containing the csv or fixed size text.
 : @param $options this parameter is validated against "http://zorba.io/modules/csv-options" schema. 
 :    If this parameter is not specified, the row name is by default "row" and the column name is by default "column". 
 : @return a sequence of row elements, one for each line in csv
 : @error csv:CSV001 if the input string is a streamable string and cannot be rewound
 : @error csv:WrongInput if the input string has lines with a variable number of items, the csv has headers, and
 :         the first-row-is-header option does not set the accept-all-lines attribute to "true"
 : @error err:XQDY0027 if $options can not be validated against the csv-options schema
 : @error err:XQDY0084 if the options parameter doesn't have the name "csv-options:options".
 : @example test/Queries/converters/csv/csv_parse1.xq
 : @example test/Queries/converters/csv/csv_parse2.xq
 : @example test/Queries/converters/csv/csv_parse3.xq
 : @example test/Queries/converters/csv/csv_parse6.xq
 : @example test/Queries/converters/csv/csv_parse11.xq
 : @example test/Queries/converters/csv/csv_parse_utf8_11.xq
 : @example test/Queries/converters/csv/txt_parse5.xq
 : @example test/Queries/converters/csv/txt_parse8.xq
:)
declare function csv:parse($csv as xs:string,
                           $options as element(csv-options:options)?) as element()*
{
  (: csv:parse-internal only accepts an options element annotated with
     csv-options:optionsType. An absent options element is forwarded as the
     empty sequence, an already validated one is forwarded unchanged, and
     any other one is schema-validated before the call. :)
  csv:parse-internal(
    $csv,
    if (exists($options)) then
      (
        if (schemaOptions:is-validated($options)) then
          $options
        else
          validate { $options }
      )
    else
      ()
  )
};
                                 
(:~
 : Private backend of csv:parse. The function is declared external, so its
 : implementation is not part of this XQuery source.
 :
 : @param $csv the string handed over by csv:parse.
 : @param $options the options element, or the empty sequence. Its declared type
 :    requires the csv-options:optionsType annotation, so an unvalidated element
 :    must be validated by the caller first (csv:parse does this).
 : @return a sequence of elements.
 :)
declare %private function csv:parse-internal($csv as xs:string,
                                 $options as element(csv-options:options, csv-options:optionsType)?) as element()* external;
                                 
(:~
 : Convert XML into CSV or fixed size text.
 :
 : Note: if you want to serialize out the result, make sure that the serializer method is set to "text". 
 : For example, in zorba command line, you have to set the param --serialize-text.
 : When using the &lt;pre&gt;file:write(...)&lt;/pre&gt; function, you have to set the
 : method serialization parameter to "text":
 : &lt;pre&gt;
 : &amp;lt;output:serialization-parameters&gt;
 :   &amp;lt;output:method value="text"/&gt;
 : &amp;lt;/output:serialization-parameters&gt;
 : &lt;/pre&gt;
 :
 : The &lt;pre&gt;$options&lt;/pre&gt; parameter must have the following format:
 : &lt;pre&gt;
 :    &amp;lt;csv-options:options&gt;&lt;br/&gt;
 :        &amp;lt;csv  [separator="default comma ,"] ? &lt;br/&gt;
 :          [quote-char="default double quotes &amp;amp;quote;"]? &lt;br/&gt;
 :          [quote-escape="default double double quotes &amp;amp;quote;&amp;amp;quote;"]? /&gt; &lt;br/&gt;
 :        &lt;br/&gt;
 :        or&lt;br/&gt;
 :        &amp;lt;column-widths [align="left|right"]?&gt;&lt;br/&gt;
 :          &amp;lt;column-width [align="left|right"]?&gt;&lt;i&gt;[column fixed width, unsigned int]&lt;/i&gt;&amp;lt;/column-width&gt;*&lt;br/&gt;
 :        &amp;lt;/column-widths&gt;&lt;br/&gt;
 :        &lt;br/&gt;
 :        or&lt;br/&gt;
 :        &amp;lt;column-positions [align="left|right"]?&gt;&lt;br/&gt;
 :          &amp;lt;column-position [align="left|right"]?&gt;&lt;i&gt;[column position on line, unsigned int]&lt;/i&gt;&amp;lt;/column-position&gt;*&lt;br/&gt;
 :        &amp;lt;/column-positions&gt;&lt;br/&gt;
 :        &lt;br/&gt;
 :        &amp;lt;first-row-is-header/&gt;?&lt;br/&gt;
 :    &amp;lt;/csv-options:options&gt;
 : &lt;/pre&gt;
 :
 : All the parameters are optional and can appear in any order.&lt;br/&gt;
 : All the parameters are case sensitive. The namespace used is "http://zorba.io/modules/csv-options".&lt;br/&gt;
 : All strings must have UTF-8 encoding.&lt;br/&gt;
 : Parameters csv, column-widths, column-positions are mutually exclusive.
 : If none is specified, the xml is converted to csv.
 :
 : Description of parameters:
 :    &lt;dl&gt;
 :     &lt;dt&gt;&lt;b&gt;csv&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt; Specifies the parameters for converting to csv.&lt;br/&gt;
 :       &lt;dl&gt; 
 :        &lt;dt&gt;&lt;b&gt;separator&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The character or group of characters used for separating fields in a row. 
 :            If it is not specified, it defaults to comma ','.
 :        &lt;/dd&gt;
 :        &lt;dt&gt;&lt;b&gt;quote-char&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The character or group of characters used for quoting the fields that may contain special characters,
 :             like separator, new line or this quote char. The default value is double quote ".&lt;br/&gt;
 :        &lt;/dd&gt;
 :        &lt;dt&gt;&lt;b&gt;quote-escape&lt;/b&gt;&lt;/dt&gt;
 :        &lt;dd&gt;The group of characters used for escaping the quote char inside a field. The whole quote escape group
 :           is translated to a quote char during parsing. The default value is double double quotes "".&lt;br/&gt;
 :        &lt;/dd&gt;
 :       &lt;/dl&gt;
 :     &lt;/dd&gt;
 :     &lt;br/&gt;
 :     &lt;dt&gt;&lt;b&gt;column-widths&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;Specifies the column widths for fixed size text. It contains multiple column-width child elements
 :        specifying the fixed width of each column, from left to right.&lt;br/&gt;
 :        With the attribute &lt;b&gt;align&lt;/b&gt; you can specify how to align fields that are smaller than the column width.
 :        The default alignment is left.&lt;br/&gt;
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;column-positions&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;This is an alternative to column-widths, and specifies instead the starting position of each column.
 :        Column positions are 1 based, and are specified in order from left to right. 
 :        The last column has a variable length.&lt;br/&gt;
 :        With the attribute &lt;b&gt;align&lt;/b&gt; you can specify how to align fields that are smaller than the column width.
 :        The default alignment is left. The last column does not need alignment.&lt;br/&gt;
 :     &lt;/dd&gt;
 :     &lt;dt&gt;&lt;b&gt;first-row-is-header&lt;/b&gt;&lt;/dt&gt;
 :     &lt;dd&gt;The presence of this element indicates that the first row will contain the header, that is, the names of
 :        the column elements. Only the column names from the first row element are taken into account.&lt;br/&gt;
 :        For example, the row xml:&lt;br/&gt;
 :        &lt;i&gt;&amp;lt;row&gt;&lt;br/&gt;
 :        &amp;lt;ID&gt;1&amp;lt;/ID&gt;&lt;br/&gt;
 :        &amp;lt;Name&gt;John&amp;lt;/Name&gt;&lt;br/&gt;
 :        &amp;lt;Occupation&gt;student&amp;lt;/Occupation&gt;&lt;br/&gt;
 :        &amp;lt;/row&gt;&lt;/i&gt;&lt;br/&gt;
 :        &lt;br/&gt;
 :        is converted to&lt;br/&gt;
 :        &lt;i&gt;ID,Name,Occupation&lt;br/&gt;
 :        1,John,student&lt;/i&gt;&lt;br/&gt;
 :        &lt;br/&gt;
 :        The header names are the localnames of the column elements, and the namespace is ignored.&lt;br/&gt;
 :        &lt;b&gt;Subheaders&lt;/b&gt;&lt;br/&gt;
 :        If the row-column hierarchy is more complex, then subheaders are also generated on subsequent lines.
 :        The number of subheaders depends on the depth of the column hierarchy.&lt;br/&gt;
 :        When generating the subheaders, the non-whitespace text nodes are also taken into account, 
 :        and a separate column is generated for them too.&lt;br/&gt;
 :        For example, the xml row element:&lt;br/&gt;
 :        &lt;i&gt;&amp;lt;row&gt;&lt;br/&gt;
 :        &amp;lt;ID&gt;1&amp;lt;/ID&gt;&lt;br/&gt;
 :        &amp;lt;Name&gt;&lt;br/&gt;
 :          Mr.&lt;br/&gt;
 :          &amp;lt;First_Name&gt;John&amp;lt;/First_Name&gt;&lt;br/&gt;
 :          &amp;lt;Last_Name&gt;Howard&amp;lt;/Last_Name&gt;&lt;br/&gt;
 :        &amp;lt;/Name&gt;&lt;br/&gt;
 :        &amp;lt;Occupation&gt;student&amp;lt;/Occupation&gt;&lt;br/&gt;
 :        &amp;lt;/row&gt;&lt;/i&gt;&lt;br/&gt;
 :        is converted to&lt;br/&gt;
 :        &lt;i&gt;ID,Name,,Occupation&lt;br/&gt;
 :        ,,First Name,Last Name,&lt;br/&gt;
 :        1,Mr.,John,Howard,student&lt;/i&gt;&lt;br/&gt;
 :        &lt;br/&gt;
 :        If first-row-is-header is not specified and the columns have a deeper hierarchy,
 :          only the first layer of columns is processed, and the fields are the string values of each column.&lt;br/&gt;
 :        This element can have an attribute "ignore-foreign-input" with values "false" or "true" (default "false").
 :        When set to true it tells the serializer to ignore elements that do not match the header names.
 :        If set to false, the serializer will raise a csv:ForeignInput error for these elements.&lt;br/&gt;
 :     &lt;/dd&gt;
 :    &lt;/dl&gt;
 :
 : @param $xml a sequence of elements, each element representing a row. The name of each row element is ignored.
 :     The children of each row are the column fields.
 : @param $options The options parameter. See the function description for details. 
 : This parameter is validated against "http://zorba.io/modules/csv-options" schema.
 : @return the csv or fixed size text as string containing all the lines
 : @error csv:CSV003 if the serialize output is streamable string and cannot be reset
 : @error csv:ForeignInput if there are input elements in subsequent rows that do not match the headers,
 :    and the options specify first-row-is-header and do not specify the ignore-foreign-input attribute
 : @error err:XQDY0027 if $options can not be validated against csv-options schema
 : @error err:XQDY0084 if the options parameter doesn't have the name "csv-options:options".
 : @example test/Queries/converters/csv/csv_serialize1.xq
 : @example test/Queries/converters/csv/csv_serialize2.xq
 : @example test/Queries/converters/csv/csv_serialize3.xq
 : @example test/Queries/converters/csv/csv_serialize5.xq
 : @example test/Queries/converters/csv/csv_serialize6.xq
 : @example test/Queries/converters/csv/csv_parse_serialize6.xq
 : @example test/Queries/converters/csv/txt_serialize6.xq
 : @example test/Queries/converters/csv/txt_parse_serialize6.xq
:)
declare function csv:serialize($xml as element()*,
                               $options as element(csv-options:options)?) as xs:string
{
  (: csv:serialize-internal only accepts an options element annotated with
     csv-options:optionsType. An absent options element is forwarded as the
     empty sequence, an already validated one is forwarded unchanged, and
     any other one is schema-validated before the call. :)
  csv:serialize-internal(
    $xml,
    if (exists($options)) then
      (
        if (schemaOptions:is-validated($options)) then
          $options
        else
          validate { $options }
      )
    else
      ()
  )
};
                                    
(:~
 : Private backend of csv:serialize. The function is declared external, so its
 : implementation is not part of this XQuery source.
 :
 : @param $xml the sequence of elements handed over by csv:serialize.
 : @param $options the options element, or the empty sequence. Its declared type
 :    requires the csv-options:optionsType annotation, so an unvalidated element
 :    must be validated by the caller first (csv:serialize does this).
 : @return the result as a single string.
 :)
declare %private function csv:serialize-internal($xml as element()*,
                  $options as element(csv-options:options, csv-options:optionsType)?) as xs:string external;

(: vim:set et sw=2 ts=2: :)