Skip to content

Commit 7102087

Browse files
committed
Allow for escaping the delimiter in reader
1 parent 40ea4c4 commit 7102087

File tree

2 files changed

+80
-10
lines changed

2 files changed

+80
-10
lines changed

csv-core/src/reader.rs

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -428,14 +428,15 @@ enum NfaState {
428428
InQuotedField = 3,
429429
InEscapedQuote = 4,
430430
InDoubleEscapedQuote = 5,
431-
InComment = 6,
431+
InEscapeSequence = 6,
432+
InComment = 7,
432433
// All states below are "final field" states.
433434
// Namely, they indicate that a field has been parsed.
434-
EndFieldDelim = 7,
435+
EndFieldDelim = 8,
435436
// All states below are "final record" states.
436437
// Namely, they indicate that a record has been parsed.
437-
EndRecord = 8,
438-
CRLF = 9,
438+
EndRecord = 9,
439+
CRLF = 10,
439440
}
440441

441442
/// A list of NFA states that have an explicit representation in the DFA.
@@ -447,6 +448,7 @@ const NFA_STATES: &'static [NfaState] = &[
447448
NfaState::InQuotedField,
448449
NfaState::InEscapedQuote,
449450
NfaState::InDoubleEscapedQuote,
451+
NfaState::InEscapeSequence,
450452
NfaState::InComment,
451453
NfaState::EndRecord,
452454
NfaState::CRLF,
@@ -805,9 +807,9 @@ impl Reader {
805807
self.dfa.classes.add(self.delimiter);
806808
if self.quoting {
807809
self.dfa.classes.add(self.quote);
808-
if let Some(escape) = self.escape {
809-
self.dfa.classes.add(escape);
810-
}
810+
}
811+
if let Some(escape) = self.escape {
812+
self.dfa.classes.add(escape);
811813
}
812814
if let Some(comment) = self.comment {
813815
self.dfa.classes.add(comment);
@@ -970,7 +972,7 @@ impl Reader {
970972
match state {
971973
End | StartRecord | EndRecord | InComment | CRLF => End,
972974
StartField | EndFieldDelim | EndFieldTerm | InField
973-
| InQuotedField | InEscapedQuote | InDoubleEscapedQuote
975+
| InQuotedField | InEscapedQuote | InDoubleEscapedQuote | InEscapeSequence
974976
| InRecordTerm => EndRecord,
975977
}
976978
}
@@ -1007,6 +1009,8 @@ impl Reader {
10071009
(EndFieldDelim, NfaInputAction::Discard)
10081010
} else if self.term.equals(c) {
10091011
(EndFieldTerm, NfaInputAction::Epsilon)
1012+
} else if !self.quoting && self.escape == Some(c) {
1013+
(InEscapeSequence, NfaInputAction::Discard)
10101014
} else {
10111015
(InField, NfaInputAction::CopyToOutput)
10121016
}
@@ -1018,6 +1022,8 @@ impl Reader {
10181022
(EndFieldDelim, NfaInputAction::Discard)
10191023
} else if self.term.equals(c) {
10201024
(EndFieldTerm, NfaInputAction::Epsilon)
1025+
} else if !self.quoting && self.escape == Some(c) {
1026+
(InEscapeSequence, NfaInputAction::Discard)
10211027
} else {
10221028
(InField, NfaInputAction::CopyToOutput)
10231029
}
@@ -1043,6 +1049,7 @@ impl Reader {
10431049
(InField, NfaInputAction::CopyToOutput)
10441050
}
10451051
}
1052+
InEscapeSequence => (InField, NfaInputAction::CopyToOutput),
10461053
InComment => {
10471054
if b'\n' == c {
10481055
(StartRecord, NfaInputAction::Discard)
@@ -1087,7 +1094,7 @@ impl Reader {
10871094
/// be reached by epsilon transitions will never have explicit usage in the
10881095
/// DFA.
10891096
const TRANS_CLASSES: usize = 7;
1090-
const DFA_STATES: usize = 10;
1097+
const DFA_STATES: usize = 11;
10911098
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;
10921099

10931100
/// The number of possible transition classes. (See the comment on `TRANS_SIZE`
@@ -1119,6 +1126,8 @@ struct Dfa {
11191126
in_field: DfaState,
11201127
/// The DFA state corresponding to being inside a quoted field.
11211128
in_quoted: DfaState,
1129+
/// The DFA state corresponding to being in an escape sequence.
1130+
in_escape_sequence: DfaState,
11221131
/// The minimum DFA state that indicates a field has been parsed. All DFA
11231132
/// states greater than this are also final-field states.
11241133
final_field: DfaState,
@@ -1135,6 +1144,7 @@ impl Dfa {
11351144
classes: DfaClasses::new(),
11361145
in_field: DfaState(0),
11371146
in_quoted: DfaState(0),
1147+
in_escape_sequence: DfaState(0),
11381148
final_field: DfaState(0),
11391149
final_record: DfaState(0),
11401150
}
@@ -1170,6 +1180,7 @@ impl Dfa {
11701180
fn finish(&mut self) {
11711181
self.in_field = self.new_state(NfaState::InField);
11721182
self.in_quoted = self.new_state(NfaState::InQuotedField);
1183+
self.in_escape_sequence = self.new_state(NfaState::InEscapeSequence);
11731184
self.final_field = self.new_state(NfaState::EndFieldDelim);
11741185
self.final_record = self.new_state(NfaState::EndRecord);
11751186
}
@@ -1665,6 +1676,15 @@ mod tests {
16651676
}
16661677
);
16671678

1679+
parses_to!(
1680+
escape_sequence,
1681+
"a\\,b\\\\c,\\,fo\"o\\,,bar",
1682+
csv![["a,b\\c", ",fo\"o,", "bar"]],
1683+
|b: &mut ReaderBuilder| {
1684+
b.quoting(false).escape(Some(b'\\'));
1685+
}
1686+
);
1687+
16681688
parses_to!(
16691689
delimiter_tabs,
16701690
"a\tb",
@@ -1863,6 +1883,25 @@ mod tests {
18631883
assert_read!(rdr, &[], out, 0, 0, End);
18641884
}
18651885

1886+
// Test that we can read escape sequences correctly in a stream.
1887+
#[test]
1888+
fn stream_escape_sequence() {
1889+
use crate::ReadFieldResult::*;
1890+
1891+
let out = &mut [0; 10];
1892+
let mut builder = ReaderBuilder::new();
1893+
let mut rdr = builder.quoting(false).escape(Some(b'\\')).build();
1894+
1895+
assert_read!(rdr, b("\\,f\\\\o\\"), out, 7, 4, InputEmpty);
1896+
assert_eq!(&out[..4], b(",f\\o"));
1897+
1898+
assert_read!(rdr, b(",o\\,"), &mut out[4..], 4, 3, InputEmpty);
1899+
assert_eq!(&out[..7], b(",f\\o,o,"));
1900+
1901+
assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
1902+
assert_read!(rdr, &[], out, 0, 0, End);
1903+
}
1904+
18661905
// Test that empty output buffers don't wreak havoc.
18671906
#[test]
18681907
fn stream_empty_output() {

src/reader.rs

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,9 +428,12 @@ impl ReaderBuilder {
428428
/// In some variants of CSV, quotes are escaped using a special escape
429429
/// character like `\` (instead of escaping quotes by doubling them).
430430
///
431+
/// Other variants of CSV may use an escape character to escape delimiters instead
432+
/// of using quoted fields. This is supported only when quoting is disabled.
433+
///
431434
/// By default, recognizing these idiosyncratic escapes is disabled.
432435
///
433-
/// # Example
436+
/// # Example with escaped quotes
434437
///
435438
/// ```
436439
/// use std::error::Error;
@@ -457,6 +460,34 @@ impl ReaderBuilder {
457460
/// }
458461
/// }
459462
/// ```
463+
///
464+
/// # Example with escaped delimiters
465+
///
466+
/// ```
467+
/// use std::error::Error;
468+
/// use csv::ReaderBuilder;
469+
/// # fn main() { example().unwrap(); }
470+
/// fn example() -> Result<(), Box<dyn Error>> {
471+
/// let data = "\
472+
/// city,country,pop
473+
/// Boston,The\\, United\\, States,4628910
474+
/// ";
475+
/// let mut rdr = ReaderBuilder::new()
476+
/// .quoting(false)
477+
/// .escape(Some(b'\\'))
478+
/// .from_reader(data.as_bytes());
479+
///
480+
/// if let Some(result) = rdr.records().next() {
481+
/// let record = result?;
482+
/// assert_eq!(record, vec![
483+
/// "Boston", "The, United, States", "4628910",
484+
/// ]);
485+
/// Ok(())
486+
/// } else {
487+
/// Err(From::from("expected at least one record but got none"))
488+
/// }
489+
/// }
490+
/// ```
460491
pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
461492
self.builder.escape(escape);
462493
self

0 commit comments

Comments
 (0)