Content of csv module

xquery version "3.0";

(:
 : Copyright 2006-2009 The FLWOR Foundation.
 :
 : Licensed under the Apache License, Version 2.0 (the "License");
 : you may not use this file except in compliance with the License.
 : You may obtain a copy of the License at
 :
 : http://www.apache.org/licenses/LICENSE-2.0
 :
 : Unless required by applicable law or agreed to in writing, software
 : distributed under the License is distributed on an "AS IS" BASIS,
 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 : See the License for the specific language governing permissions and
 : limitations under the License.
:)

(:~
 : Function library providing converters from CSV/TXT to XML and back.
 : The functions are optimized to work with large amounts of data, in a streaming way.
 :
 : @author Daniel Turcanu
 : @project Zorba/Data Converters/CSV
 :)
module namespace csv = "http://zorba.io/modules/csv";

(:~
 : Import module for checking if csv options element is validated.
 :)
import module namespace schemaOptions = "http://zorba.io/modules/schema";

(:~
 : Contains the definitions of the csv options element.
  :)
import schema namespace csv-options = "http://zorba.io/modules/csv-options";

declare namespace ver = "http://zorba.io/options/versioning";
declare option ver:module-version "1.0";

(:~
 : Parse a CSV or fixed size text and convert to XML.<br/>
 : By default each line is converted to a <row> element, and each field to a <column> element inside <row>.<br/>
 : The format of the param $options is:<br/>
 :  <pre>
 :    <csv-options:options>
 :        <csv  [separator="default comma ,"] ?
 :          [quote-char="default double quotes &quote;"]?
 :          [quote-escape="default double double quotes &quote;&quote;"]? />
 : 
 :        or
 :        <column-widths>
 :          <column-width><i>[column fixed width, unsigned int]</i><column-width>*
 :        </column-widths>
 :
 :        or
 :        <column-positions>
 :          <column-position><i>[column position on line, unsigned int]</i><column-position>*
 :        </column-positions>
 :
 :        <first-row-is-header [line="<i>first_line[-last_line]?</i>"]?/>?
 :        <start-from-row line="<i>first_line[-last_line]?</i>"/>?
 :        <add-last-void-columns/>?
 :        <xml-nodes>
 :          [<<i>row-name</i>>
 :            [<<i>column-name/</i>>]?
 :          </<i>row-name</i>>]?
 :        </xml-nodes>?
 :    </csv-options:options>
 :  </pre>
 :    All the parameters are optional and can appear in any order.<br/>
 :    All the parameters are case sensitive. The namespace used is "http://zorba.io/modules/csv-options".<br/>
 :    All strings must have UTF-8 encoding.<br/>
 :    Parameters csv, column-widths, column-positions are mutually exclusive. If none is specified, 
 :    the input string is assumed to be csv.<br/>
 :    Description of parameters:
 :    <dl>
 :     <dt><b>csv</b></dt>
 :     <dd> Specifies the parameters for parsing a csv string.<br/>
 :       <dl> 
 :        <dt><b>separator</b></dt>
 :        <dd>The character or group of characters used to separating fields in a row. 
 :            If it is not specified, it defaults to comma ','.
 :        </dd>
 :        <dt><b>quote-char</b></dt>
 :        <dd>The character or group of characters used for quoting the fields that may contain special characters,
 :             like separator, new line or this quote char. The default value is double quote ".<br/>
 :        </dd>
 :        <dt><b>quote-escape</b></dt>
 :        <dd>The group of characters used for escaping the quote char inside a field. The whole quote escape group
 :           is translated to a quote char during parsing. The default value is double double quotes "".<br/>
 :        </dd>
 :       </dl>
 :     </dd>
 :     <br/>
 :     <dt><b>column-widths</b></dt>
 :     <dd>Specifies the column widths for fixed size text. It contains multiple column-width child elements
 :        specifying the fixed width of each column, from left to right.<br/>
 :        If the line has more fields than specified, they are ignored. 
 :     </dd>
 :     <dt><b>column-positions</b></dt>
 :     <dd>This is an alternative to column-widths, and specifies instead the starting position of each column.
 :        Column positions are 1 based, and are specified in order from left to right. 
 :        The last column is read until end of line. The first column position can be greater than 1, if you want
 :        to parse only a part of the input text.
 :     </dd>
 :     <dt><b>first-row-is-header</b></dt>
 :     <dd>The presence of this element indicates that the first row is to be treated as the name of the columns.
 :        If it is not present, then each field is enclosed in a <column> element, 
 :        or how it is specified in <xml-nodes> parameter.<br/>
 :        If the first row is the header, then each field is enclosed in an element with the corresponding name from the header.<br/>
 :        For example, the csv:
 :        <pre>
 :        <i>ID,Name,Occupation
 :        1,John,student</i>
 :        </pre>
 :        is parsed into:
 :        <pre>
 :        <i><row>
 :        <ID>1</ID>
 :        <Name>John</Name>
 :        <Occupation>student</Occupation>
 :        </row></i>
 :        </pre>
 :        If the header names contain characters that cannot be used in a QName, they are replaced with underscore '_'.<br/>
 :        The namespace for the header QNames is taken from the column name specified in xml-nodes parameter, or from
 :        the row name, or if that doesn't exist either then empty namespace is used. <br/>
 :        If the header is not the first line in the input string, the starting line can be specified in the <b>line</b> attribute.<br/>
 :        If a column does not have a name, a new name is constructed in the form <i>columnN</i> where N is the position of the column,
 :        starting from 1.<br/>
 :        <b>Subheaders</b><br/>
 :        If the header consists of more than one line, this can be specified in the <b>line</b> attribute in the form
 :        "<i>first_line - last_line</i>". Having more lines as the header translates into a hierarchy of elements in the xml.<br/>
 :        For example, the csv:
 :        <pre>
 :        <i>ID,Name,,Occupation
 :        ,First Name,Last Name,
 :        1,John,Howard,student</i>
 :        </pre>
 :        is parsed into:
 :        <pre>
 :        <i><row>
 :        <ID>1</ID>
 :        <Name>
 :          <First_Name>John</First_Name>
 :          <Last_Name>Howard</Last_Name>
 :        </Name>
 :        <Occupation>student</Occupation>
 :        </row></i>
 :        </pre>
 :        This element can have an attribute "accept-all-lines" with values "false" or "true" (default "false").
 :        When set to true it tells the parser to not report lines that do not have the same number of items as 
 :        the header. If set to false, the parser will raise a csv:WrongInput error for these lines.<br/>
 :     </dd>
 :     <dt><b>start-from-row</b></dt>
 :     <dd>If the data does not start from line 1 or immediately after the header, 
 :        you can specify the starting line in the <b>line</b> attribute.<br/>
 :        Also you can use this attribute in the form "<i>first_line - last_line</i>" to specify also the last line
 :        if you don't want the whole csv to be parsed.
 :     </dd>
 :     <dt><b>add-last-void-columns</b></dt>
 :     <dd>In the case when using headers and some data lines are shorter than the header, by default the excess columns are ignored
 :          for those lines. You can set the add-last-void-columns parameter to make all the columns appear in xml even if they are void.
 :     </dd>
 :     <dt><b>xml-nodes</b></dt>
 :     <dd>With this parameter you can specify the names for the row element and for the column element if there is no header.<br/>
 :        The first element child of this element specifies the desired QName of the row element in the output xml. 
 :        The name of this element will be used as the name of the row element.<br/>
 :        The element child of this row element is the column element, and its name will be used as the name of the column elements
 :        that enclose the fields in the output xml if there is no header. <br/>
 :        If the csv has a header, only the namespace is used from the column element.<br/>
 :        For example, with parameter:
 :        <pre>
 :        <i><xml-nodes>
 :        <r>
 :          <c/>
 :        </r>
 :        </xml-nodes></i>
 :        </pre>
 :        the output for each line will look like:
 :        <pre>
 :        <i><r>
 :          <c>field1</c>
 :          <c>field2</c>
 :          .......
 :        </r></i>
 :        </pre>
 :     </dd>
 :    </dl>
 : @param $csv the string containing the csv or fixed size text.
 : @param $options this parameter is validated against "http://zorba.io/modules/csv-options" schema. 
 :    If this parameter is not specified, the row name is by default "row" and the column name is by default "column". 
 : @return a sequence of row elements, one for each line in csv
 : @error csv:CSV001 if the input string is streamable string and cannot be rewinded
 : @error csv:WrongInput if the input string has lines with variable number of items, and the csv has headers and
 :         the options do not specify the ignore-foreign-input attribute
 : @error err:XQDY0027 if $options can not be validated against the csv-options schema
 : @error err:XQDY0084 if the options parameter doesn't have the name "csv-options:options".
 : @example test/Queries/converters/csv/csv_parse1.xq
 : @example test/Queries/converters/csv/csv_parse2.xq
 : @example test/Queries/converters/csv/csv_parse3.xq
 : @example test/Queries/converters/csv/csv_parse6.xq
 : @example test/Queries/converters/csv/csv_parse11.xq
 : @example test/Queries/converters/csv/csv_parse_utf8_11.xq
 : @example test/Queries/converters/csv/txt_parse5.xq
 : @example test/Queries/converters/csv/txt_parse8.xq
:)
declare function csv:parse($csv as xs:string,
                           $options as element(csv-options:options)?) as element()*
{
  let $validated-options :=
  if(empty($options)) then
    $options
  else
  if(schemaOptions:is-validated($options)) then
    $options
  else
    validate{$options}
  return
    csv:parse-internal($csv, $validated-options)
};
                                 
declare %private function csv:parse-internal($csv as xs:string,
                                 $options as element(csv-options:options, csv-options:optionsType)?) as element()* external;
                                 
(:~
 : Convert XML into CSV or fixed size text.
 :
 : Note: if you want to serialize out the result, make sure that the serializer method is set to "text". 
 : For example, in zorba command line, you have to set the param --serialize-text.
 : When using the <pre>file:write(...)</pre> function, you have to set the
 : method serialization parameter to "text":
 : <pre>
 : <output:serialization-parameters<
 :   <output:method value="text"/<
 : </output:serialization-parameters<
 : </pre>
 :
 : The <pre>$options</pre> parameter must have the following format:
 : <pre>
 :    <csv-options:options><br/>
 :        <csv  [separator="default comma ,"] ? <br/>
 :          [quote-char="default double quotes &quote;"]? <br/>
 :          [quote-escape="default double double quotes &quote;&quote;"]? /> <br/>
 :        <br/>
 :        or<br/>
 :        <column-widths [align="left|right"]?><br/>
 :          <column-width [align="left|right"]?><i>[column fixed width, unsigned int]</i><column-width>*<br/>
 :        </column-widths><br/>
 :        <br/>
 :        or<br/>
 :        <column-positions [align="left|right"]?><br/>
 :          <column-position [align="left|right"]?><i>[column position on line, unsigned int]</i><column-position>*<br/>
 :        </column-positions><br/>
 :        <br/>
 :        <first-row-is-header/>?<br/>
 :    </csv-options:options>
 : </pre>
 :
 : All the parameters are optional and can appear in any order.<br/>
 : All the parameters are case sensitive. The namespace used is "http://zorba.io/modules/csv-options".<br/>
 : All strings must have UTF-8 encoding.<br/>
 : Parameters csv, column-widths, column-positions are mutually exclusive.
 : If none is specified, the xml is converted to csv.
 :
 : Description of parameters:
 :    <dl>
 :     <dt><b>csv</b></dt>
 :     <dd> Specifies the parameters for converting to csv.<br/>
 :       <dl> 
 :        <dt><b>separator</b></dt>
 :        <dd>The character or group of characters used to separating fields in a row. 
 :            If it is not specified, it defaults to comma ','.
 :        </dd>
 :        <dt><b>quote-char</b></dt>
 :        <dd>The character or group of characters used for quoting the fields that may contain special characters,
 :             like separator, new line or this quote char. The default value is double quote ".<br/>
 :        </dd>
 :        <dt><b>quote-escape</b></dt>
 :        <dd>The group of characters used for escaping the quote char inside a field. The whole quote escape group
 :           is translated to a quote char during parsing. The default value is double double quotes "".<br/>
 :        </dd>
 :       </dl>
 :     </dd>
 :     <br/>
 :     <dt><b>column-widths</b></dt>
 :     <dd>Specifies the column widths for fixed size text. It contains multiple column-width child elements
 :        specifying the fixed width of each column, from left to right.<br/>
 :        With the attribute <b>align</b> you can specify how to align fields that are smaller than the column width.
 :        The default alignment is left.<br/>
 :     </dd>
 :     <dt><b>column-positions</b></dt>
 :     <dd>This is an alternative to column-widths, and specifies instead the starting position of each column.
 :        Column positions are 1 based, and are specified in order from left to right. 
 :        The last column has a variable length.<br/>
 :        With the attribute <b>align</b> you can specify how to align fields that are smaller than the column width.
 :        The default alignment is left. The last column does not need alignment.<br/>
 :     </dd>
 :     <dt><b>first-row-is-header</b></dt>
 :     <dd>The presence of this element indicates that the first row will contain the header, that is, the names of
 :        the column elements. Only the column names from the first row element are taken into account.<br/>
 :        For example, the row xml:<br/>
 :        <i><row><br/>
 :        <ID>1</ID><br/>
 :        <Name>John</Name><br/>
 :        <Occupation>student</Occupation><br/>
 :        </row></i><br/>
 :        <br/>
 :        is converted to<br/>
 :        <i>ID,Name,Occupation<br/>
 :        1,John,student</i><br/>
 :        <br/>
 :        The header names are the localnames of the column elements, and the namespace is ignored.<br/>
 :        <b>Subheaders</b><br/>
 :        If the row-column hierarchy is more complex, then subheaders are also generated on subsequent lines.
 :        The number of subheaders depends on the depth of the column hierarchy.<br/>
 :        When generating the subheaders, the non-whitespace text nodes are also taken into account, 
 :        and a separate column is generated for them too.<br/>
 :        For example, the xml row element:<br/>
 :        <i><row><br/>
 :        <ID>1</ID><br/>
 :        <Name><br/>
 :          Mr.<br/>
 :          <First_Name>John</First_Name><br/>
 :          <Last_Name>Howard</Last_Name><br/>
 :        </Name><br/>
 :        <Occupation>student</Occupation><br/>
 :        </row></i><br/>
 :        is converted to<br/>
 :        <i>ID,Name,,Occupation<br/>
 :        ,,First Name,Last Name,<br/>
 :        1,Mr.,John,Howard,student</i><br/>
 :        <br/>
 :        If first-row-is-header is not specified and the columns have a deeper hierarchy,
 :          only the first layer of columns is processed, and the fields are the string values of each column.<br/>
 :        This element can have an attribute "ignore-foreign-input" with values "false" or "true" (default "false").
 :        When set to true it tells the serializer to ignore elements that to not match the header names.
 :        If set to false, the serializer will raise a csv:ForeignInput error for these elements.<br/>
 :     </dd>
 :    </dl>
 :
 : @param $xml a sequence of elements, each element representing a row. The name of each row element is ignored.
 :     The childs of each row are the column fields.
 : @param $options The options parameter. See the function description for details. 
 : This parameter is validated against "http://zorba.io/modules/csv-options" schema.
 : @return the csv or fixed size text as string containing all the lines
 : @error csv:CSV003 if the serialize output is streamable string and cannot be reset
 : @error csv:ForeignInput if there are input elements in subsequent rows that do not match the headers,
 :    and the options specify first-row-is-header and do not specify the ignore-foreign-input attribute
 : @error err:XQDY0027 if $options can not be validated against csv-options schema
 : @error err:XQDY0084 if the options parameter doesn't have the name "csv-options:options".
 : @example test/Queries/converters/csv/csv_serialize1.xq
 : @example test/Queries/converters/csv/csv_serialize2.xq
 : @example test/Queries/converters/csv/csv_serialize3.xq
 : @example test/Queries/converters/csv/csv_serialize5.xq
 : @example test/Queries/converters/csv/csv_serialize6.xq
 : @example test/Queries/converters/csv/csv_parse_serialize6.xq
 : @example test/Queries/converters/csv/txt_serialize6.xq
 : @example test/Queries/converters/csv/txt_parse_serialize6.xq
:)
declare function csv:serialize($xml as element()*,
                               $options as element(csv-options:options)?) as xs:string
{
  let $validated-options :=
  if(empty($options)) then
    $options
  else
  if(schemaOptions:is-validated($options)) then
    $options
  else
    validate{$options}
  return
    csv:serialize-internal($xml, $validated-options)
};
                                    
declare %private function csv:serialize-internal($xml as element()*,
                  $options as element(csv-options:options, csv-options:optionsType)?) as xs:string external;

(: vim:set et sw=2 ts=2: :)