blob: 7f5c2b24c01153968012004df7b0a7aab80ebc35 [file] [log] [blame]
//Written in the D programming language
/**
* Implements functionality to read Comma Separated Values and its variants
* from an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of `dchar`.
*
* Comma Separated Values provide a simple means to transfer and store
* tabular data. It has been common for programs to use their own
* variant of the CSV format. This parser will loosely follow the
* $(HTTP tools.ietf.org/html/rfc4180, RFC-4180). CSV input should adhere
* to the following criteria (differences from RFC-4180 in parentheses):
*
* $(UL
* $(LI A record is separated by a new line (CRLF,LF,CR))
* $(LI A final record may end with a new line)
* $(LI A header may be provided as the first record in input)
* $(LI A record has fields separated by a comma (customizable))
* $(LI A field containing new lines, commas, or double quotes
* should be enclosed in double quotes (customizable))
* $(LI Double quotes in a field are escaped with a double quote)
* $(LI Each record should contain the same number of fields)
* )
*
* Example:
*
* -------
* import std.algorithm;
* import std.array;
* import std.csv;
* import std.stdio;
* import std.typecons;
*
* void main()
* {
* auto text = "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
*
* foreach (record; csvReader!(Tuple!(string, string, int))(text))
* {
* writefln("%s works as a %s and earns $%d per year",
* record[0], record[1], record[2]);
* }
*
* // To read the same string from the file "filename.csv":
*
* auto file = File("filename.csv", "r");
* foreach (record;
* file.byLine.joiner("\n").csvReader!(Tuple!(string, string, int)))
* {
* writefln("%s works as a %s and earns $%d per year",
* record[0], record[1], record[2]);
* }
* }
* -------
*
* When an input contains a header the `Contents` can be specified as an
* associative array. Pass null as the header argument to signify that a
* header is present.
*
* -------
* auto text = "Name,Occupation,Salary\r" ~
* "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
*
* foreach (record; csvReader!(string[string])
* (text, null))
* {
* writefln("%s works as a %s and earns $%s per year.",
* record["Name"], record["Occupation"],
* record["Salary"]);
* }
*
* // To read the same string from the file "filename.csv":
*
* auto file = File("filename.csv", "r");
*
* foreach (record; csvReader!(string[string])
* (file.byLine.joiner("\n"), null))
* {
* writefln("%s works as a %s and earns $%s per year.",
* record["Name"], record["Occupation"],
* record["Salary"]);
* }
* -------
*
* This module allows content to be iterated by record stored in a struct,
* class, associative array, or as a range of fields. Upon detection of an
* error a CSVException is thrown (can be disabled). csvNextToken has been
* made public to allow for attempted recovery.
*
* Disabling exceptions will lift many restrictions specified above. A quote
* can appear in a field if the field was not quoted. If in a quoted field any
* quote by itself, not at the end of a field, will end processing for that
* field. The field is ended when there is no input, even if the quote was not
* closed.
*
* See_Also:
* $(HTTP en.wikipedia.org/wiki/Comma-separated_values, Wikipedia
* Comma-separated values)
*
* Copyright: Copyright 2011
* License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
* Authors: Jesse Phillips
* Source: $(PHOBOSSRC std/csv.d)
*/
module std.csv;
import std.conv;
import std.exception : basicExceptionCtors;
import std.range.primitives;
import std.traits;
/**
 * Exception carrying the row and column at which a CSV problem was detected.
 *
 * Both `row` and `col` are numbered from one and describe the location in
 * the file rather than in any specified header. A failure to match the
 * header is a special case, see $(LREF HeaderMismatchException) for details.
 *
 * When a type conversion fails, the originating
 * $(REF ConvException, std,conv) is stored in the `next` field.
 */
class CSVException : Exception
{
    ///
    size_t row, col;

    // FIXME: Use std.exception.basicExceptionCtors here once
    // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed

    /// Message-only form; `next` optionally chains a previous exception.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Chaining form: `next` directly follows the message.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Positional form: records where in the input the problem occurred.
    this(string msg, size_t row, size_t col, Throwable next = null,
         string file = __FILE__, size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, next, file, line);
        this.col = col;
        this.row = row;
    }

    /// Renders the exception as `(Row: R, Col: C) message`.
    override string toString() @safe pure const
    {
        return text("(Row: ", row, ", Col: ", col, ") ", msg);
    }
}
///
@safe unittest
{
    import std.algorithm.searching : count;
    import std.exception : collectException;

    // The second row is one field shorter than the first.
    string input = "a,b,c\nHello,65";
    auto caught = collectException!CSVException(csvReader(input).count);
    assert(caught.toString ==
        "(Row: 0, Col: 0) Row 2's length 2 does not match previous length of 3.");
}
///
@safe unittest
{
    import std.algorithm.searching : count;
    import std.exception : collectException;
    import std.typecons : Tuple;

    // "b" cannot be converted to the int requested for the second column.
    alias Row = Tuple!(string, int);
    string input = "a,b\nHello,65";
    auto caught = collectException!CSVException(csvReader!Row(input).count);
    assert(caught.toString ==
        "(Row: 1, Col: 2) Unexpected 'b' when converting from type string to type int");
}
@safe pure unittest
{
    import std.string;

    // Chaining constructor keeps the wrapped exception.
    auto cause = new Exception("Foobar");
    auto chained = new CSVException("args", cause);
    assert(chained.next is cause);

    // Positional constructor stores row and column, and toString shows both.
    size_t wantRow = 13;
    size_t wantCol = 37;
    auto located = new CSVException("argv", wantRow, wantCol);
    assert(located.row == wantRow);
    assert(located.col == wantCol);

    string rendered = located.toString();
    assert(rendered.indexOf("13") != -1);
    assert(rendered.indexOf("37") != -1);
}
/**
* Exception thrown when a token is found to be incomplete: a quote is
* found in an unquoted field, data continues after a closing quote, or a
* quoted field was not closed before the input ran out.
*
* Derives from $(LREF CSVException), so `row` and `col` are available as
* well.
*/
class IncompleteCellException : CSVException
{
/**
* Data pulled from the input before the problem was found.
*
* This field is populated when using $(LREF csvReader)
* but not by $(LREF csvNextToken), as in that case the data will have
* already been fed to the output range.
*/
dstring partialData;
// Provides the standard (msg, file, line, next) and (msg, next, file, line)
// constructors.
mixin basicExceptionCtors;
}
///
@safe unittest
{
    import std.exception : assertThrown;

    // The quote opened before "b" is never closed.
    string unterminated = "a,\"b,c\nHello,65,2.5";
    string[] wanted = ["a", "b", "c"];
    assertThrown!IncompleteCellException(csvReader(unterminated, wanted));
}
@safe pure unittest
{
    // The mixed-in chaining constructor must preserve `next`.
    auto cause = new Exception("Foobar");
    auto chained = new IncompleteCellException("args", cause);
    assert(chained.next is cause);
}
/**
* Exception thrown under different conditions based on the type of
* `Contents`.
*
* Structure, Class, and Associative Array
* $(UL
* $(LI When a header is provided but a matching column is not found)
* )
*
* Other
* $(UL
* $(LI When a header is provided but a matching column is not found)
* $(LI Order did not match that found in the input)
* )
*
* Since a row and column are not meaningful when a column specified by the
* header is not found in the data, both row and col will be zero in that
* case. Otherwise row is always one and col is the first instance found in
* the header that occurred before the previous one, starting at one.
*/
class HeaderMismatchException : CSVException
{
// Provides the standard (msg, file, line, next) and (msg, next, file, line)
// constructors.
mixin basicExceptionCtors;
}
///
@safe unittest
{
    import std.exception : assertThrown;

    // "invalid" does not name any column of the input header.
    string input = "a,b,c\nHello,65,2.5";
    string[] wanted = ["b", "c", "invalid"];
    assertThrown!HeaderMismatchException(csvReader(input, wanted));
}
@safe pure unittest
{
    // The mixed-in chaining constructor must preserve `next`.
    auto cause = new Exception("Foobar");
    auto chained = new HeaderMismatchException("args", cause);
    assert(chained.next is cause);
}
/**
* Determines the behavior for when an error is detected.
*
* Disabling exceptions will follow these rules:
* $(UL
* $(LI A quote can appear in a field if the field was not quoted.)
* $(LI If in a quoted field any quote by itself, not at the end of a
* field, will end processing for that field.)
* $(LI The field is ended when there is no input, even if the quote was
* not closed.)
* $(LI If the given header does not match the order in the input, the
* content will be returned as it is found in the input.)
* $(LI If the given header contains columns not found in the input they
* will be ignored.)
* )
*/
enum Malformed
{
ignore, /// No exceptions are thrown due to incorrect CSV; parsing is best effort.
throwException /// Incorrect CSV is reported by throwing an exception.
}
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : count;
    import std.exception : assertThrown;

    // The last field opens a quote that is never closed.
    string text = "a,b,c\nHello,65,\"2.5";
    assertThrown!IncompleteCellException(csvReader(text).count);

    // ignore the exceptions and try to handle invalid CSV
    auto lenient = csvReader!(string, Malformed.ignore)(text, null);
    assert(equal(lenient.front, ["Hello", "65", "2.5"]));
}
/**
Returns an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
for iterating over records found in `input`.
An optional `header` can be provided. The first record will be read in
as the header. If `Contents` is a struct then the header provided is
expected to correspond to the fields in the struct. When `Contents` is
not a type which can contain the entire record, the `header` must be
provided in the same order as the input or an exception is thrown.
Returns:
An input range R as defined by
$(REF isInputRange, std,range,primitives). When `Contents` is a
struct, class, or an associative array, the element type of R is
`Contents`, otherwise the element type of R is itself a range with
element type `Contents`.
If a `header` argument is provided,
the returned range provides a `header` field for accessing the header
from the input in array form.
Throws:
$(LREF CSVException) When a quote is found in an unquoted field,
data continues after a closing quote, the quoted field was not
closed before data was empty, a conversion failed, or when the row's
length does not match the previous length.
$(LREF HeaderMismatchException) when a header is provided but a
matching column is not found or the order did not match that found in
the input. Read the exception documentation for specific details of
when the exception is thrown for different types of `Contents`.
*/
auto csvReader(Contents = string,
               Malformed ErrorLevel = Malformed.throwException,
               Range, Separator = char)
              (Range input,
               Separator delimiter = ',', Separator quote = '"',
               bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
    && isSomeChar!(Separator)
    && !is(Contents T : T[U], U : string))
{
    // Header-less overload: the reader is typed with string[] as its header
    // type, but no header is passed, so the first record is data.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), string[]);
    return Reader(input, delimiter, quote, allowInconsistentDelimiterCount);
}
/// ditto
auto csvReader(Contents = string,
               Malformed ErrorLevel = Malformed.throwException,
               Range, Header, Separator = char)
              (Range input, Header header,
               Separator delimiter = ',', Separator quote = '"',
               bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
    && isSomeChar!(Separator)
    && isForwardRange!Header
    && isSomeString!(ElementType!Header))
{
    // Overload taking an explicit list of column names; the caller's
    // header range type is forwarded to the reader unchanged.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), Header);
    return Reader(input, header, delimiter, quote,
                  allowInconsistentDelimiterCount);
}
/// ditto
auto csvReader(Contents = string,
               Malformed ErrorLevel = Malformed.throwException,
               Range, Header, Separator = char)
              (Range input, Header header,
               Separator delimiter = ',', Separator quote = '"',
               bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
    && isSomeChar!(Separator)
    && is(Header : typeof(null)))
{
    // Overload for a literal `null` header: the input has a header row, but
    // no particular columns are requested. `header` itself is not used; a
    // typed null string[] is handed to the header constructor instead.
    alias Reader = CsvReader!(Contents, ErrorLevel, Range,
                              Unqual!(ElementType!Range), string[]);
    string[] noColumns = null;
    return Reader(input, noColumns, delimiter, quote,
                  allowInconsistentDelimiterCount);
}
/**
The `Contents` of the input can be provided if all the records are the
same type such as all integer data:
*/
@safe unittest
{
    import std.algorithm.comparison : equal;

    string csv = "76,26,22";
    int[][] expected = [[76, 26, 22]];

    // Every field is converted to int.
    auto rows = csvReader!int(csv);
    assert(equal!equal(rows, expected));
}
/**
Using a struct with modified delimiter:
*/
@safe unittest
{
    import std.algorithm.comparison : equal;

    struct Layout
    {
        string name;
        int value;
        double other;
    }

    string csv = "Hello;65;2.5\nWorld;123;7.5";
    auto expected = [
        Layout("Hello", 65, 2.5),
        Layout("World", 123, 7.5),
    ];

    // Fields are separated by ';' instead of the default ','.
    auto rows = csvReader!Layout(csv, ';');
    assert(equal(rows, expected));
}
/**
Specifying `ErrorLevel` as $(LREF Malformed.ignore) will lift restrictions
on the format. This example shows that an exception is not thrown when
finding a quote in a field not quoted.
*/
@safe unittest
{
    string line = "A \" is now part of the data";

    // With Malformed.ignore the stray quote is kept as ordinary field data.
    auto firstRecord = csvReader!(string, Malformed.ignore)(line).front;
    assert(firstRecord.front == line);
}
/// Read only column "b"
@safe unittest
{
    import std.algorithm.comparison : equal;

    string csv = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
    int[][] expected = [[65], [123]];

    // Only the column named "b" is returned.
    auto rows = csvReader!int(csv, ["b"]);
    assert(equal!equal(rows, expected));
}
/// Read while rearranging the columns by specifying a header with a different order
@safe unittest
{
    import std.algorithm.comparison : equal;

    struct Layout
    {
        int value;
        double other;
        string name;
    }

    string csv = "a,b,c\nHello,65,2.5\nWorld,123,7.5";
    auto expected = [
        Layout(65, 2.5, "Hello"),
        Layout(123, 7.5, "World"),
    ];

    // The header order maps input columns onto the struct's field order.
    auto rows = csvReader!Layout(csv, ["b", "c", "a"]);
    assert(equal(rows, expected));
}
/**
The header can also be left empty if the input contains a header row
and all columns should be iterated.
The header from the input can always be accessed from the `header` field.
*/
@safe unittest
{
    string csv = "a,b,c\nHello,65,63.63";

    // A null header consumes the first row and exposes it via `header`.
    auto rows = csvReader(csv, null);
    string[] expectedHeader = ["a", "b", "c"];
    assert(rows.header == expectedHeader);
}
/**
Handcrafted csv files tend to have a variable number of columns.
By default `std.csv` will throw if the number of columns on a line
differs from the number of columns of the first line.
To allow, or disallow, a variable number of columns a `bool` can be passed to
all overloads of the `csvReader` function as shown below.
*/
@safe unittest
{
    import std.algorithm.comparison : equal;

    string csv = "76,26,22\n1,2\n3,4,5,6";
    int[][] expected = [
        [76, 26, 22],
        [1, 2],
        [3, 4, 5, 6],
    ];

    // The trailing `true` permits rows of differing length.
    auto rows = csvReader!int(csv, ',', '"', true);
    assert(equal!equal(rows, expected));
}
/// ditto
@safe unittest
{
    import std.algorithm.comparison : equal;

    static struct Three
    {
        int a;
        int b;
        int c;
    }

    string csv = "76,26,22\n1,2\n3,4,5,6";

    // Missing fields keep their default value, surplus fields are dropped.
    auto expected = [
        Three(76, 26, 22),
        Three(1, 2, 0),
        Three(3, 4, 5),
    ];
    auto rows = csvReader!Three(csv, ',', '"', true);
    assert(equal(rows, expected));
}
/// ditto
@safe unittest
{
    import std.algorithm.comparison : equal;

    auto text = "Name,Occupation,Salary\r" ~
        "Joe,Carpenter,300000\nFred,Blacksmith\r\n";

    // The short second record simply lacks the "Salary" key.
    string[string][] expected = [
        [ "Name" : "Joe", "Occupation" : "Carpenter", "Salary" : "300000" ],
        [ "Name" : "Fred", "Occupation" : "Blacksmith" ],
    ];
    auto rows = csvReader!(string[string])(text, null, ',', '"', true);
    assert(equal(rows, expected));
}
// Test standard iteration over input.
@safe pure unittest
{
    string str = `one,"two ""quoted"""` ~ "\n\"three\nnew line\",\nfive,six";

    // Three records of two fields each, including an empty trailing field.
    size_t cells;
    foreach (record; csvReader(str))
    {
        foreach (field; record)
            ++cells;
    }
    assert(cells == 6);
}
// Test newline on last record
@safe pure unittest
{
    string str = "one,two\nthree,four\n";
    auto rows = csvReader(str);

    // The final line break must not produce a third, empty record.
    foreach (step; 0 .. 2)
        rows.popFront();
    assert(rows.empty);
}
// Test shorter row length
@safe pure unittest
{
    struct Layout
    {
        string name;
        int value;
    }

    wstring str = "one,1\ntwo\nthree"w;

    // Rows lacking the second field leave `value` at its default.
    auto expected = [
        Layout("one", 1),
        Layout("two", 0),
        Layout("three", 0),
    ];

    size_t idx;
    foreach (record; csvReader!(Layout, Malformed.ignore)(str))
    {
        assert(expected[idx].name == record.name);
        assert(expected[idx].value == record.value);
        ++idx;
    }
}
// Test shorter row length exception
@safe pure unittest
{
    import std.exception;

    struct A
    {
        string a, b, c;
    }

    // Each input contains at least one row whose length differs from the first.
    string[] malformed = [
        "one,1\ntwo",
        "one\ntwo,2,二\nthree,3,三",
        "one\ntwo,2\nthree,3",
        "one,1\ntwo\nthree,3",
    ];

    foreach (input; malformed)
    {
        auto records = csvReader!A(input);
        assertThrown!CSVException(() {
            foreach (unused; records) { }
        }());
    }
}
// Test structure conversion interface with unicode.
@safe pure unittest
{
    import std.math.algebraic : abs;

    struct Layout
    {
        string name;
        int value;
        double other;
    }

    wstring str = "\U00010143Hello,65,63.63\nWorld,123,3673.562"w;
    auto expected = [
        Layout("\U00010143Hello", 65, 63.63),
        Layout("World", 123, 3673.562),
    ];

    size_t seen;
    foreach (record; csvReader!Layout(str))
    {
        auto want = expected[seen];
        assert(want.name == record.name);
        assert(want.value == record.value);
        // Floating point fields are compared with a tolerance.
        assert(abs(want.other - record.other) < 0.00001);
        ++seen;
    }
    assert(seen == expected.length);
}
// Test input conversion interface
@safe pure unittest
{
    import std.algorithm;

    string str = `76,26,22`;
    int[] expected = [76, 26, 22];

    // Every record of the input must convert to the same int sequence.
    foreach (record; csvReader!int(str))
        assert(equal(record, expected));
}
// Test struct & header interface and same unicode
@safe unittest
{
    import std.math.algebraic : abs;

    struct Layout
    {
        int value;
        double other;
        string name;
    }

    string str = "a,b,c\nHello,65,63.63\n➊➋➂❹,123,3673.562";
    auto expected = [
        Layout(65, 63.63, "Hello"),
        Layout(123, 3673.562, "➊➋➂❹"),
    ];

    // The header maps input columns b, c, a onto value, other, name.
    auto records = csvReader!Layout(str, ["b", "c", "a"]);

    size_t seen;
    foreach (record; records)
    {
        auto want = expected[seen];
        assert(want.name == record.name);
        assert(want.value == record.value);
        assert(abs(want.other - record.other) < 0.00001);
        ++seen;
    }
    assert(seen == expected.length);
}
// Test header interface
@safe unittest
{
    import std.algorithm;
    import std.exception : collectException;

    string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";

    // Selecting a single column by name.
    int[][] wantedInts = [[65], [123]];
    size_t intRow;
    foreach (record; csvReader!int(str, ["b"]))
    {
        assert(equal(record, wantedInts[intRow]));
        ++intRow;
    }

    // A header given in a different order than the input is rejected.
    auto mismatch = collectException!HeaderMismatchException(
        csvReader(str, ["c", "b"]));
    assert(mismatch !is null);
    assert(mismatch.col == 2);

    // With Malformed.ignore the columns come back in input order.
    string[][] wantedStrings = [["Hello", "65"], ["World", "123"]];
    size_t stringRow;
    foreach (record; csvReader!(string, Malformed.ignore)
                        (str, ["b", "a"], ',', '"'))
    {
        assert(equal(record, wantedStrings[stringRow]));
        ++stringRow;
    }

    // Requested columns missing from the input are skipped.
    str = "a,c,e\nJoe,Carpenter,300000\nFred,Fly,4";
    wantedStrings = [["Joe", "Carpenter"], ["Fred", "Fly"]];
    stringRow = 0;
    foreach (record; csvReader!(string, Malformed.ignore)
                        (str, ["a", "b", "c", "d"], ',', '"'))
    {
        assert(equal(record, wantedStrings[stringRow]));
        ++stringRow;
    }
}
// Test that the header field exposes the complete header row of the input
@safe unittest
{
    string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";

    // Only "a" is requested, yet `header` lists every input column.
    string[] requested = ["a"];
    auto records = csvReader(str, requested);
    string[] fullHeader = ["a", "b", "c"];
    assert(records.header == fullHeader);
}
// Test unchecked read
@safe pure unittest
{
    struct Ans
    {
        string a, b;
    }

    // Quotes inside an unquoted field are kept verbatim.
    string str = "one \"quoted\"";
    foreach (record; csvReader!(string, Malformed.ignore)(str))
    {
        foreach (cell; record)
            assert(cell == "one \"quoted\"");
    }

    // The same holds when the fields are mapped onto a struct.
    str = "one \"quoted\",two \"quoted\" end";
    foreach (record; csvReader!(Ans, Malformed.ignore)(str))
    {
        assert(record.a == "one \"quoted\"");
        assert(record.b == "two \"quoted\" end");
    }
}
// Test partial data returned
@safe pure unittest
{
    string str = "\"one\nnew line";

    // The unterminated quoted field must raise, carrying what was read so far.
    bool raised;
    try
    {
        foreach (record; csvReader(str)) { }
    }
    catch (IncompleteCellException ice)
    {
        raised = true;
        assert(ice.partialData == "one\nnew line");
    }
    assert(raised);
}
// Test Windows line break
@safe pure unittest
{
    string str = "one,two\r\nthree";
    auto rows = csvReader(str);

    // First record: two fields terminated by CRLF.
    auto first = rows.front;
    assert(first.front == "one");
    first.popFront();
    assert(first.front == "two");

    // Second record starts right after the CRLF.
    rows.popFront();
    auto second = rows.front;
    assert(second.front == "three");
}
// Test associative array support with unicode separator
@safe unittest
{
    string str = "1❁2❁3\n34❁65❁63\n34❁65❁63";

    // Columns "3" and "1" are requested; the separator is a non-ASCII character.
    size_t seen;
    foreach (record; csvReader!(string[string])(str, ["3", "1"], '❁'))
    {
        ++seen;
        assert(record["1"] == "34");
        assert(record["3"] == "63");
    }
    assert(seen == 2);
}
// Test restricted range
@safe unittest
{
    import std.typecons;

    // A range of dchar offering only empty/front/popFront.
    struct InputRange
    {
        dstring text;

        this(dstring txt)
        {
            text = txt;
        }

        @property auto empty()
        {
            return text.empty;
        }

        @property dchar front()
        {
            return text[0];
        }

        void popFront()
        {
            text.popFront();
        }
    }

    auto source = InputRange("Name,Occupation,Salary\r"d~
          "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n"d);

    // `source` is passed by value, so each reader sees the full text.
    foreach (record; csvReader(source, cast(string[]) null))
    {
        foreach (cell; record) { }
    }

    foreach (record; csvReader!(Tuple!(string, string, int))
                        (source, cast(string[]) null))
    {
    }

    foreach (record; csvReader!(string[string])
                        (source, cast(string[]) null))
    {
    }
}
@safe unittest // const/immutable dchars
{
    import std.algorithm.iteration : map;
    import std.array : array;

    string[][] expected = [["foo", "bar"]];

    const(dchar)[] constInput = "foo,bar\n";
    assert(csvReader(constInput).map!array.array == expected);

    immutable(dchar)[] immutableInput = "foo,bar\n";
    assert(csvReader(immutableInput).map!array.array == expected);
}
/*
 * Parsing state shared between a reader and the record ranges it creates.
 *
 * This struct is stored on the heap so that the state stays shared when
 * the structures holding a pointer to it are passed around.
 */
private pure struct Input(Range, Malformed ErrorLevel)
{
    // Remaining, not yet consumed, input.
    Range range;
    // Position counters maintained by the reader and its record ranges.
    size_t row;
    size_t col;
    // Only present when row length mismatches are reported.
    static if (ErrorLevel == Malformed.throwException)
    {
        size_t rowLength;
    }
}
/*
 * Range for iterating CSV records.
 *
 * This range is returned by the $(LREF csvReader) functions. It can be
 * created in a similar manner to allow `ErrorLevel` be set to $(LREF
 * Malformed).ignore if best guess processing should take place.
 *
 * The parsing state (remaining input, row and column counters) lives in a
 * heap allocated `Input` which is shared, by pointer, with every
 * `CsvRecord` this reader creates.
 */
private struct CsvReader(Contents, Malformed ErrorLevel, Range, Separator, Header)
if (isSomeChar!Separator && isInputRange!Range
    && is(immutable ElementType!Range == immutable dchar)
    && isForwardRange!Header && isSomeString!(ElementType!Header))
{
private:
    Input!(Range, ErrorLevel)* _input;
    Separator _separator;
    Separator _quote;
    // Positions, within the input's header row, of the requested columns.
    size_t[] indices;
    bool _empty;
    bool _allowInconsistentDelimiterCount;

    // Structs, classes and associative arrays are filled eagerly into
    // `recordContent`; any other `Contents` is served lazily by `recordRange`.
    static if (is(Contents == struct) || is(Contents == class))
    {
        Contents recordContent;
        CsvRecord!(string, ErrorLevel, Range, Separator) recordRange;
    }
    else static if (is(Contents T : T[U], U : string))
    {
        Contents recordContent;
        CsvRecord!(T, ErrorLevel, Range, Separator) recordRange;
    }
    else
        CsvRecord!(Contents, ErrorLevel, Range, Separator) recordRange;

public:
    /**
     * Header from the input in array form.
     *
     * -------
     * string str = "a,b,c\nHello,65,63.63";
     * auto records = csvReader(str, ["a"]);
     *
     * assert(records.header == ["a","b","c"]);
     * -------
     */
    string[] header;

    /**
     * Constructor to initialize the input, delimiter and quote for input
     * without a header.
     *
     * -------
     * string str = `76;^26^;22`;
     * int[] ans = [76,26,22];
     * auto records = CsvReader!(int,Malformed.ignore,string,char,string[])
     *       (str, ';', '^', false);
     *
     * foreach (record; records)
     * {
     *    assert(equal(record, ans));
     * }
     * -------
     *
     * Params:
     *   input = Range of `dchar` holding the CSV text
     *   delimiter = Separator between fields
     *   quote = Character used for quotation
     *   allowInconsistentDelimiterCount = If `true`, rows may differ in
     *       length from the first row without an exception being thrown
     */
    this(Range input, Separator delimiter, Separator quote,
         bool allowInconsistentDelimiterCount)
    {
        _input = new Input!(Range, ErrorLevel)(input);
        _separator = delimiter;
        _quote = quote;
        _allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;

        if (_input.range.empty)
        {
            _empty = true;
            return;
        }

        // Read the first record right away so that `front` is valid.
        prime();
    }

    /**
     * Constructor to initialize the input, delimiter and quote for input
     * with a header.
     *
     * -------
     * string str = "high;mean;low\n76;^26^;22";
     * auto records = CsvReader!(int,Malformed.ignore,string,char,string[])
     *       (str, ["high","low"], ';', '^', false);
     *
     * int[] ans = [76,22];
     * foreach (record; records)
     * {
     *    assert(equal(record, ans));
     * }
     * -------
     *
     * Params:
     *   input = Range of `dchar` holding the CSV text
     *   colHeaders = Names of the columns to read; may be empty
     *   delimiter = Separator between fields
     *   quote = Character used for quotation
     *   allowInconsistentDelimiterCount = If `true`, rows may differ in
     *       length from the first row without an exception being thrown
     *
     * Throws:
     *       $(LREF HeaderMismatchException) when a header is provided but a
     *       matching column is not found or the order did not match that found
     *       in the input (non-struct). For an order mismatch `row` is one and
     *       `col` is taken from the first out-of-order pair of requested
     *       columns.
     */
    this(Range input, Header colHeaders, Separator delimiter, Separator quote,
         bool allowInconsistentDelimiterCount)
    {
        _input = new Input!(Range, ErrorLevel)(input);
        _separator = delimiter;
        _quote = quote;
        _allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;

        if (_input.range.empty)
        {
            _empty = true;
            return;
        }

        // Every requested column starts out as "not found".
        size_t[string] colToIndex;
        foreach (h; colHeaders)
        {
            colToIndex[h] = size_t.max;
        }

        // Read the header row of the input; `indices` is still empty here,
        // so every column of that row is visited.
        auto r = CsvRecord!(string, ErrorLevel, Range, Separator)
            (_input, _separator, _quote, indices,
             _allowInconsistentDelimiterCount);

        size_t colIndex;
        foreach (col; r)
        {
            header ~= col;
            auto ptr = col in colToIndex;
            if (ptr)
                *ptr = colIndex;
            colIndex++;
        }
        // The above loop empties the header row.
        recordRange._empty = true;
        recordRange._allowInconsistentDelimiterCount =
            allowInconsistentDelimiterCount;

        // Translate the requested names into input column positions.
        indices.length = colToIndex.length;
        size_t i;
        foreach (h; colHeaders)
        {
            immutable index = colToIndex[h];
            static if (ErrorLevel != Malformed.ignore)
                if (index == size_t.max)
                    throw new HeaderMismatchException
                        ("Header not found: " ~ to!string(h));
            indices[i++] = index;
        }

        static if (!is(Contents == struct) && !is(Contents == class))
        {
            static if (is(Contents T : T[U], U : string))
            {
                // Values are keyed by name, so the order is irrelevant.
                import std.algorithm.sorting : sort;
                sort(indices);
            }
            else static if (ErrorLevel == Malformed.ignore)
            {
                // Best effort: return the columns in input order.
                import std.algorithm.sorting : sort;
                sort(indices);
            }
            else
            {
                import std.algorithm.searching : findAdjacent;
                import std.algorithm.sorting : isSorted;
                if (!isSorted(indices))
                {
                    auto ex = new HeaderMismatchException
                        ("Header in input does not match specified header.");
                    // findAdjacent does not modify its argument; the range
                    // it returns starts at the first out-of-order pair.
                    // isSorted returning false guarantees it is non-empty.
                    auto outOfOrder = findAdjacent!"a > b"(indices);
                    ex.row = 1;
                    ex.col = outOfOrder.front;

                    throw ex;
                }
            }
        }

        // popFront records the header's length as the expected row length,
        // steps over the line break and primes the first data record.
        popFront();
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     *
     * Returns:
     *      If `Contents` is a struct, will be filled with record data.
     *
     *      If `Contents` is a class, will be filled with record data.
     *
     *      If `Contents` is a associative array, will be filled
     *      with record data.
     *
     *      If `Contents` is non-struct, a $(LREF CsvRecord) will be
     *      returned.
     */
    @property auto front()
    {
        assert(!empty, "Attempting to fetch the front of an empty CsvReader");
        static if (is(Contents == struct) || is(Contents == class))
        {
            return recordContent;
        }
        else static if (is(Contents T : T[U], U : string))
        {
            return recordContent;
        }
        else
        {
            return recordRange;
        }
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     */
    @property bool empty() @safe @nogc pure nothrow const
    {
        return _empty;
    }

    /**
     * Part of an input range as defined by
     * $(REF isInputRange, std,range,primitives).
     *
     * Throws:
     *       $(LREF CSVException) When a quote is found in an unquoted field,
     *       data continues after a closing quote, the quoted field was not
     *       closed before data was empty, a conversion failed, or when the
     *       row's length does not match the previous length.
     */
    void popFront()
    {
        // Consume whatever is left of the current record.
        while (!recordRange.empty)
        {
            recordRange.popFront();
        }

        // The first completed row defines the expected row length.
        static if (ErrorLevel == Malformed.throwException)
            if (_input.rowLength == 0)
                _input.rowLength = _input.col;

        _input.col = 0;

        // Step over the record break: CR, LF or CRLF.
        if (!_input.range.empty)
        {
            if (_input.range.front == '\r')
            {
                _input.range.popFront();
                if (!_input.range.empty && _input.range.front == '\n')
                    _input.range.popFront();
            }
            else if (_input.range.front == '\n')
                _input.range.popFront();
        }

        if (_input.range.empty)
        {
            _empty = true;
            return;
        }

        prime();
    }

    /*
     * Starts the next record: advances the row counter, creates a fresh
     * record range and, for struct, class and associative array `Contents`,
     * reads the whole record into `recordContent`.
     */
    private void prime()
    {
        if (_empty)
            return;
        _input.row++;
        static if (is(Contents == struct) || is(Contents == class))
        {
            // All columns are visited; `indices` is applied below when the
            // fields are assigned.
            recordRange = typeof(recordRange)
                (_input, _separator, _quote, null,
                 _allowInconsistentDelimiterCount);
        }
        else
        {
            recordRange = typeof(recordRange)
                (_input, _separator, _quote, indices,
                 _allowInconsistentDelimiterCount);
        }

        static if (is(Contents T : T[U], U : string))
        {
            T[U] aa;
            try
            {
                for (; !recordRange.empty; recordRange.popFront())
                {
                    // `col` is one-based and refers to the field just read.
                    aa[header[_input.col-1]] = recordRange.front;
                }
            }
            catch (ConvException e)
            {
                throw new CSVException(e.msg, _input.row, _input.col, e);
            }

            recordContent = aa;
        }
        else static if (is(Contents == struct) || is(Contents == class))
        {
            static if (is(Contents == class))
                recordContent = new typeof(recordContent)();
            else
                recordContent = typeof(recordContent).init;
            size_t colIndex;
            try
            {
                for (; !recordRange.empty;)
                {
                    auto colData = recordRange.front;
                    // Also runs while a ConvException unwinds, which is why
                    // the reported column ends up one-based.
                    scope(exit) colIndex++;
                    if (indices.length > 0)
                    {
                        // Header given: field `ti` takes the input column
                        // recorded in `indices[ti]`.
                        foreach (ti, ToType; Fields!(Contents))
                        {
                            if (indices[ti] == colIndex)
                            {
                                static if (!isSomeString!ToType) skipWS(colData);
                                recordContent.tupleof[ti] = to!ToType(colData);
                            }
                        }
                    }
                    else
                    {
                        // No header: fields are filled in declaration order.
                        foreach (ti, ToType; Fields!(Contents))
                        {
                            if (ti == colIndex)
                            {
                                static if (!isSomeString!ToType) skipWS(colData);
                                recordContent.tupleof[ti] = to!ToType(colData);
                            }
                        }
                    }
                    recordRange.popFront();
                }
            }
            catch (ConvException e)
            {
                throw new CSVException(e.msg, _input.row, colIndex, e);
            }
        }
    }
}
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // Construct the reader type directly, with ';' and '^' as delimiter and quote.
    alias Reader = CsvReader!(int, Malformed.ignore, string, char, string[]);

    string str = `76;^26^;22`;
    int[] expected = [76, 26, 22];

    foreach (record; Reader(str, ';', '^', false))
        assert(equal(record, expected));
}
// https://issues.dlang.org/show_bug.cgi?id=15545
// @system due to the catch for Throwable
@system pure unittest
{
    import std.exception : assertNotThrown;

    // Input ending in a lone carriage return must iterate without throwing.
    enum failData =
"name, surname, age
Joe, Joker, 99\r";

    auto reader = csvReader(failData);
    assertNotThrown(() {
        foreach (unused; reader) { }
    }());
}
/*
* This input range is accessible through $(LREF CsvReader) when the
* requested `Contents` type is neither a structure or an associative array.
*/
private struct CsvRecord(Contents, Malformed ErrorLevel, Range, Separator)
if (!is(Contents == class) && !is(Contents == struct))
{
import std.array : appender;
private:
Input!(Range, ErrorLevel)* _input;
Separator _separator;
Separator _quote;
Contents curContentsoken;
typeof(appender!(dchar[])()) _front;
bool _empty;
bool _allowInconsistentDelimiterCount;
size_t[] _popCount;
public:
/*
* Params:
* input = Pointer to a character $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
* delimiter = Separator for each column
* quote = Character used for quotation
* indices = An array containing which columns will be returned.
* If empty, all columns are returned. List must be in order.
*/
this(Input!(Range, ErrorLevel)* input, Separator delimiter,
Separator quote, size_t[] indices,
bool allowInconsistentDelimiterCount)
{
_input = input;
_separator = delimiter;
_quote = quote;
_front = appender!(dchar[])();
_popCount = indices.dup;
_allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;
// If a header was given, each call to popFront will need
// to eliminate so many tokens. This calculates
// how many will be skipped to get to the next header column
size_t normalizer;
foreach (ref c; _popCount)
{
static if (ErrorLevel == Malformed.ignore)
{
// If we are not throwing exceptions
// a header may not exist, indices are sorted
// and will be size_t.max if not found.
if (c == size_t.max)
break;
}
c -= normalizer;
normalizer += c + 1;
}
prime();
}
/**
* Part of an input range as defined by
* $(REF isInputRange, std,range,primitives).
*/
@property Contents front() @safe pure
{
assert(!empty, "Attempting to fetch the front of an empty CsvRecord");
return curContentsoken;
}
/**
* Part of an input range as defined by
* $(REF isInputRange, std,range,primitives).
*/
@property bool empty() @safe pure nothrow @nogc const
{
return _empty;
}
/*
* CsvRecord is complete when input
* is empty or starts with record break
*/
private bool recordEnd()
{
if (_input.range.empty
|| _input.range.front == '\n'
|| _input.range.front == '\r')
{
return true;
}
return false;
}
/**
 * Input range primitive, see
 * $(REF isInputRange, std,range,primitives). Advances the record to
 * its next field, or marks it empty when the record is finished.
 *
 * Throws:
 *       $(LREF CSVException) When a quote is found in an unquoted field,
 *       data continues after a closing quote, the quoted field was not
 *       closed before data was empty, a conversion failed, or when the
 *       row's length does not match the previous length.
 */
void popFront()
{
    static if (ErrorLevel == Malformed.throwException)
        import std.format : format;

    // Every requested header column has been delivered: discard
    // whatever is left of this record.
    if (_popCount.ptr && _popCount.empty)
    {
        while (!recordEnd())
            prime(1);
    }

    if (recordEnd())
    {
        _empty = true;
        static if (ErrorLevel == Malformed.throwException)
        {
            // A finished row must be exactly as long as the previous ones,
            // unless the caller opted out of that check.
            immutable lengthDiffers =
                _input.rowLength != 0 && _input.col != _input.rowLength;
            if (lengthDiffers && !_allowInconsistentDelimiterCount)
            {
                throw new CSVException(
                    format("Row %s's length %s does not match "~
                           "previous length of %s.", _input.row,
                           _input.col, _input.rowLength));
            }
        }
        return;
    }

    static if (ErrorLevel == Malformed.throwException)
    {
        // More data follows although the row is already longer than
        // the established length.
        if (_input.rowLength != 0 && _input.col > _input.rowLength)
        {
            if (_allowInconsistentDelimiterCount)
            {
                _empty = true;
                return;
            }
            throw new CSVException(
                format("Row %s's length %s does not match "~
                       "previous length of %s.", _input.row,
                       _input.col, _input.rowLength));
        }
    }

    // Separator is left on the end of input from the last call.
    // This cannot be moved to after the call to csvNextToken as
    // there may be an empty record after it.
    if (_input.range.front == _separator)
        _input.range.popFront();

    _front.shrinkTo(0);
    prime();
}
/*
 * Advances over `skipNum` tokens. After the call `_front` holds the
 * last token that was read and `_input.col` has been incremented once
 * per token.
 *
 * Errors raised by csvNextToken are annotated with the current row and
 * column before being rethrown.
 */
private void prime(size_t skipNum)
{
    foreach (i; 0 .. skipNum)
    {
        _input.col++;
        _front.shrinkTo(0);
        // The input can already be exhausted here (e.g. the last row is
        // shorter than the header requires), so `front` may only be
        // inspected when the range is non-empty. csvNextToken returns
        // immediately on empty input, yielding an empty token.
        if (!_input.range.empty && _input.range.front == _separator)
            _input.range.popFront();

        try
            csvNextToken!(Range, ErrorLevel, Separator)
                         (_input.range, _front, _separator, _quote,false);
        catch (IncompleteCellException ice)
        {
            // Attach the position and the partially read data.
            ice.row = _input.row;
            ice.col = _input.col;
            ice.partialData = _front.data.idup;
            throw ice;
        }
        catch (ConvException e)
        {
            throw new CSVException(e.msg, _input.row, _input.col, e);
        }
    }
}
/*
 * Reads the next token, skips as many further tokens as the header
 * mapping in `_popCount` asks for, and stores the converted result as
 * the current front value.
 */
private void prime()
{
    _input.col++;
    try
    {
        csvNextToken!(Range, ErrorLevel, Separator)
                     (_input.range, _front, _separator, _quote,false);
    }
    catch (IncompleteCellException ice)
    {
        // Attach the position and the partially read data.
        ice.row = _input.row;
        ice.col = _input.col;
        ice.partialData = _front.data.idup;
        throw ice;
    }

    // Number of tokens to skip to reach the next requested column.
    auto pending = _popCount.empty ? 0 : _popCount.front;
    if (!_popCount.empty)
        _popCount.popFront();

    if (pending == size_t.max)
    {
        // size_t.max marks a header column that was not found:
        // drain the remainder of the record and finish.
        while (!recordEnd())
            prime(1);
        _empty = true;
        return;
    }

    if (pending != 0)
        prime(pending);

    auto text = _front.data;
    static if (!isSomeString!Contents)
        skipWS(text);

    try
    {
        curContentsoken = to!Contents(text);
    }
    catch (ConvException e)
    {
        throw new CSVException(e.msg, _input.row, _input.col, e);
    }
}
}
/**
 * Lower level control over parsing CSV
 *
 * This function consumes the input. After each call the input will
 * start with either a delimiter or record break (\n, \r\n, \r) which
 * must be removed for subsequent calls.
 *
 * Params:
 *       input = Any CSV input
 *       ans   = The first field in the input
 *       sep   = The character to represent a comma in the specification
 *       quote = The character to represent a quote in the specification
 *       startQuoted = Whether the input should be considered to already be in
 *                     quotes. When `true`, a leading record break is treated
 *                     as field data and a leading quote is treated as a
 *                     closing (or escaping) quote rather than an opening one.
 *
 * Throws:
 *       $(LREF IncompleteCellException) When a quote is found in an unquoted
 *       field, data continues after a closing quote, or the quoted field was
 *       not closed before data was empty.
 */
void csvNextToken(Range, Malformed ErrorLevel = Malformed.throwException,
                           Separator, Output)
                          (ref Range input, ref Output ans,
                           Separator sep, Separator quote,
                           bool startQuoted = false)
if (isSomeChar!Separator && isInputRange!Range
    && is(immutable ElementType!Range == immutable dchar)
    && isOutputRange!(Output, dchar))
{
    bool quoted = startQuoted;
    bool escQuote;
    if (input.empty)
        return;

    // These shortcuts only make sense at the start of a fresh field.
    // When the caller resumes inside a quoted field, a record break is
    // ordinary data and a quote closes (or escapes within) the field, so
    // both must be left to the main loop.
    if (!quoted)
    {
        // A field starting on a record break is empty.
        if (input.front == '\n')
            return;
        if (input.front == '\r')
            return;

        // Opening quote of a quoted field.
        if (input.front == quote)
        {
            quoted = true;
            input.popFront();
        }
    }

    while (!input.empty)
    {
        assert(!(quoted && escQuote),
               "Invalid quotation state in csvNextToken");
        if (!quoted)
        {
            // When not quoted the token ends at sep
            if (input.front == sep)
                break;
            if (input.front == '\r')
                break;
            if (input.front == '\n')
                break;
        }
        if (!quoted && !escQuote)
        {
            if (input.front == quote)
            {
                // Not quoted, but quote found
                static if (ErrorLevel == Malformed.throwException)
                    throw new IncompleteCellException(
                          "Quote located in unquoted token");
                else static if (ErrorLevel == Malformed.ignore)
                    ans.put(quote);
            }
            else
            {
                // Not quoted, non-quote character
                ans.put(input.front);
            }
        }
        else
        {
            if (input.front == quote)
            {
                // Quoted, quote found
                // By turning off quoted and turning on escQuote
                // I can tell when to add a quote to the string
                // escQuote is turned to false when it escapes a
                // quote or is followed by a non-quote (see outside else).
                // They are mutually exclusive, but provide different
                // information.
                if (escQuote)
                {
                    escQuote = false;
                    quoted = true;
                    ans.put(quote);
                }
                else
                {
                    escQuote = true;
                    quoted = false;
                }
            }
            else
            {
                // Quoted, non-quote character
                if (escQuote)
                {
                    static if (ErrorLevel == Malformed.throwException)
                        throw new IncompleteCellException(
                          "Content continues after end quote, " ~
                          "or needs to be escaped.");
                    else static if (ErrorLevel == Malformed.ignore)
                        break;
                }
                ans.put(input.front);
            }
        }
        input.popFront();
    }

    // Still inside quotes when the loop ended: the field was never closed.
    static if (ErrorLevel == Malformed.throwException)
        if (quoted && (input.empty || input.front == '\n' || input.front == '\r'))
            throw new IncompleteCellException(
                  "Data continues on future lines or trailing quote");
}
///
@safe unittest
{
import std.array : appender;
import std.range.primitives : popFront;
string str = "65,63\n123,3673";
auto a = appender!(char[])();
// Read the first field; the separator is left at the front of the input.
csvNextToken(str,a,',','"');
assert(a.data == "65");
assert(str == ",63\n123,3673");
// Drop the separator and reset the output before reading the next field.
str.popFront();
a.shrinkTo(0);
csvNextToken(str,a,',','"');
assert(a.data == "63");
assert(str == "\n123,3673");
// The same applies to a record break: remove it, then continue.
str.popFront();
a.shrinkTo(0);
csvNextToken(str,a,',','"');
assert(a.data == "123");
assert(str == ",3673");
}
// Test csvNextToken on simplest form and correct format.
@safe pure unittest
{
    import std.array;

    // Expected token and the input remaining after reading it.
    static struct Step
    {
        dstring token;
        string rest;
    }

    string text = "\U00010143Hello,65,63.63\nWorld,123,3673.562";
    auto token = appender!(dchar[])();

    // First field, with the range type given explicitly.
    csvNextToken!string(text, token, ',', '"');
    assert(token.data == "\U00010143Hello");
    assert(text == ",65,63.63\nWorld,123,3673.562");

    auto steps = [
        Step("65"d, ",63.63\nWorld,123,3673.562"),
        Step("63.63"d, "\nWorld,123,3673.562"),
        Step("World"d, ",123,3673.562"),
        Step("123"d, ",3673.562"),
        Step("3673.562"d, ""),
    ];

    foreach (step; steps)
    {
        // Remove the separator / record break left by the previous call.
        text.popFront();
        token.shrinkTo(0);
        csvNextToken(text, token, ',', '"');
        assert(token.data == step.token);
        assert(text == step.rest);
    }
}
// Test quoted tokens
@safe pure unittest
{
    import std.array;

    // Expected token and the input remaining after reading it.
    static struct Step
    {
        dstring token;
        string rest;
    }

    string text = `one,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix";
    auto token = appender!(dchar[])();

    // First field, with the range type given explicitly.
    csvNextToken!string(text, token, ',', '"');
    assert(token.data == "one");
    assert(text == `,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix");

    auto steps = [
        Step("two"d, `,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix"),
        Step("three \"quoted\""d, `,"",` ~ "\"five\nnew line\"\nsix"),
        Step(""d, ",\"five\nnew line\"\nsix"),
        Step("five\nnew line"d, "\nsix"),
        Step("six"d, ""),
    ];

    foreach (step; steps)
    {
        // Remove the separator / record break left by the previous call.
        text.popFront();
        token.shrinkTo(0);
        csvNextToken(text, token, ',', '"');
        assert(token.data == step.token);
        assert(text == step.rest);
    }
}
// Test empty data is pulled at end of record.
@safe pure unittest
{
    import std.array;

    string text = "one,";
    auto token = appender!(dchar[])();

    csvNextToken(text, token, ',', '"');
    assert(token.data == "one");
    assert(text == ",");

    // The separator is deliberately left in place: the next call stops
    // on it immediately and yields an empty token.
    token.shrinkTo(0);
    csvNextToken(text, token, ',', '"');
    assert(token.data == "");
}
// Test exceptions
@safe pure unittest
{
    import std.array;

    // Quoted field never closed: all input is consumed before the error.
    string text = "\"one\nnew line";
    auto sink = appender!(dchar[])();
    bool threw = false;
    try
        csvNextToken(text, sink, ',', '"');
    catch (IncompleteCellException)
        threw = true;
    assert(threw);
    assert(sink.data == "one\nnew line");
    assert(text == "");

    // Quote inside an unquoted field: parsing stops on the quote.
    text = "Hello world\"";
    sink = appender!(dchar[])();
    threw = false;
    try
        csvNextToken(text, sink, ',', '"');
    catch (IncompleteCellException)
        threw = true;
    assert(threw);
    assert(sink.data == "Hello world");
    assert(text == "\"");

    // With Malformed.ignore stray quotes are kept as data.
    text = "one, two \"quoted\" end";
    sink = appender!(dchar[])();
    csvNextToken!(string,Malformed.ignore)(text, sink, ',', '"');
    assert(sink.data == "one");

    text.popFront();
    sink.shrinkTo(0);
    csvNextToken!(string,Malformed.ignore)(text, sink, ',', '"');
    assert(sink.data == " two \"quoted\" end");
}
// Test modifying token delimiter
@safe pure unittest
{
    import std.array;

    // Expected token and the input remaining after reading it.
    static struct Step
    {
        dstring token;
        string rest;
    }

    string text = `one|two|/three "quoted"/|//`;
    auto token = appender!(dchar[])();

    auto steps = [
        Step("one"d, `|two|/three "quoted"/|//`),
        Step("two"d, `|/three "quoted"/|//`),
        Step(`three "quoted"`d, `|//`),
    ];

    foreach (i, step; steps)
    {
        if (i != 0)
        {
            // Remove the separator left by the previous call.
            text.popFront();
            token.shrinkTo(0);
        }
        csvNextToken(text, token, '|', '/');
        assert(token.data == step.token);
        assert(text == step.rest);
    }

    // Final field is an empty quoted token.
    text.popFront();
    token.shrinkTo(0);
    csvNextToken(text, token, '|', '/');
    assert(token.data == ""d);
}
// https://issues.dlang.org/show_bug.cgi?id=8908
// Leading whitespace before non-string fields must not break conversion.
@safe pure unittest
{
    string csv = `  1.0, 2.0, 3.0
4.0, 5.0, 6.0`;

    static struct Data { real a, b, c; }

    // Struct-typed records.
    size_t rowIndex = 0;
    foreach (data; csvReader!Data(csv))
    {
        int[] row = [cast(int) data.a, cast(int) data.b, cast(int) data.c];
        int[] expected = rowIndex == 0 ? [1, 2, 3] : [4, 5, 6];
        assert(row == expected);
        ++rowIndex;
    }

    // Records iterated field by field.
    rowIndex = 0;
    foreach (record; csvReader!real(csv))
    {
        int[] row;
        row ~= cast(int) record.front;
        foreach (_; 0 .. 2)
        {
            record.popFront();
            row ~= cast(int) record.front;
        }
        int[] expected = rowIndex == 0 ? [1, 2, 3] : [4, 5, 6];
        assert(row == expected);
        ++rowIndex;
    }
}
// https://issues.dlang.org/show_bug.cgi?id=21629
// Empty input must yield an empty reader for every kind of Contents.
@safe pure unittest
{
    import std.typecons : Tuple;

    struct Row
    {
        string a;
        string b;
    }

    enum msg = "This should be empty";
    auto header = ["a" ,"b"];
    string input = "";

    // Struct contents, without and with an explicit header.
    assert(csvReader!Row(input).empty, msg);
    assert(csvReader!Row(input, header).empty, msg);

    // Tuple contents.
    assert(csvReader!(Tuple!(string,string))(input).empty, msg);

    // Associative array contents, explicit and auto-detected header.
    assert(csvReader!(string[string])(input, header).empty, msg);
    assert(csvReader!(string[string])(input, null).empty, msg);

    // Scalar contents with auto-detected header.
    assert(csvReader!(int)(input, null).empty, msg);
}