// Source: src/encoding/csv/reader.go - go - Git at Google

// Copyright 2011 The Go Authors. All rights reserved.

// Use of this source code is governed by a BSD-style

// license that can be found in the LICENSE file.

// Package csv reads and writes comma-separated values (CSV) files.

// There are many kinds of CSV files; this package supports the format

// described in RFC 4180, except that [Writer] uses LF

// instead of CRLF as newline character by default.

//

// A csv file contains zero or more records of one or more fields per record.

// Each record is separated by the newline character. The final record may

// optionally be followed by a newline character.

//

// field1,field2,field3

//

// White space is considered part of a field.

//

// Carriage returns before newline characters are silently removed.

//

// Blank lines are ignored. A line with only whitespace characters (excluding

// the ending newline character) is not considered a blank line.

//

// Fields which start and stop with the quote character " are called

// quoted-fields. The beginning and ending quote are not part of the

// field.

//

// The source:

//

// normal string,"quoted-field"

//

// results in the fields

//

// {`normal string`, `quoted-field`}

//

// Within a quoted-field a quote character followed by a second quote

// character is considered a single quote.

//

// "the ""word"" is true","a ""quoted-field"""

//

// results in

//

// {`the "word" is true`, `a "quoted-field"`}

//

// Newlines and commas may be included in a quoted-field:

//

// "Multi-line

// field","comma is ,"

//

// results in

//

// {`Multi-line

// field`, `comma is ,`}

package csv

import (

"bufio"

"bytes"

"errors"

"fmt"

"io"

"unicode"

"unicode/utf8"

)

// ParseError describes a failure to parse CSV input.
// Line and column numbers are 1-indexed.
type ParseError struct {
	// StartLine is the line on which the offending record begins.
	StartLine int

	// Line is the line on which the error was detected.
	Line int

	// Column is the 1-based byte index within Line at which the error
	// was detected.
	Column int

	// Err is the underlying cause.
	Err error
}

// Error formats the error together with its location.
//
// Field-count errors report only the line. Other errors report line and
// column, and additionally the record's starting line when the error was
// detected on a different line than the one the record started on.
func (e *ParseError) Error() string {
	switch {
	case e.Err == ErrFieldCount:
		return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
	case e.StartLine != e.Line:
		return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
	default:
		return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
	}
}

// Unwrap returns the underlying error so that errors.Is and errors.As can
// inspect it.
func (e *ParseError) Unwrap() error {
	return e.Err
}

// These are the errors that can be returned in [ParseError.Err].
var (
	// ErrBareQuote reports a quote character inside a non-quoted field.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote reports a stray or unterminated quote inside a quoted field.
	ErrQuote = errors.New("extraneous or missing \" in quoted-field")

	// ErrFieldCount reports a record whose number of fields differs from
	// the expected count.
	ErrFieldCount = errors.New("wrong number of fields")

	// Deprecated: ErrTrailingComma is no longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")
)

// errInvalidDelim is returned by readRecord when Comma or Comment fails
// validDelim, or when Comma and Comment are the same rune.
var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")

// validDelim reports whether r may serve as a field or comment delimiter.
// NUL, the quote character, carriage return, line feed, the Unicode
// replacement character and anything that is not a valid rune are rejected.
func validDelim(r rune) bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RuneError:
		return false
	}
	return utf8.ValidRune(r)
}

// A Reader reads records from a CSV-encoded file.
//
// As returned by [NewReader], a Reader expects input conforming to RFC 4180.
// The exported fields can be changed to customize the details before the
// first call to [Reader.Read] or [Reader.ReadAll].
//
// The Reader converts all \r\n sequences in its input to plain \n,
// including in multiline field values, so that the returned data does
// not depend on which line-ending convention an input file uses.
type Reader struct {
	// Comma is the field delimiter.
	// It is set to comma (',') by NewReader.
	// Comma must be a valid rune and must not be NUL (0), the quote
	// character ("), \r, \n, or the Unicode replacement character (0xFFFD).
	// Reading with an invalid Comma fails with an invalid-delimiter error.
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with the
	// Comment character without preceding whitespace are ignored.
	// With leading whitespace the Comment character becomes part of the
	// field, even if TrimLeadingSpace is true.
	// Comment must be a valid rune and must not be the quote character ("),
	// \r, \n, or the Unicode replacement character (0xFFFD).
	// It must also not be equal to Comma.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If FieldsPerRecord is positive, Read requires each record to
	// have the given number of fields. If FieldsPerRecord is 0, Read sets it to
	// the number of fields in the first record, so that future records must
	// have the same field count. If FieldsPerRecord is negative, no check is
	// made and records may have a variable number of fields.
	FieldsPerRecord int

	// If LazyQuotes is true, a quote may appear in an unquoted field and a
	// non-doubled quote may appear in a quoted field.
	LazyQuotes bool

	// If TrimLeadingSpace is true, leading white space in a field is ignored.
	// This is done even if the field delimiter, Comma, is white space.
	TrimLeadingSpace bool

	// ReuseRecord controls whether calls to Read may return a slice sharing
	// the backing array of the previous call's returned slice for performance.
	// By default, each call to Read returns newly allocated memory owned by the caller.
	// ReadAll always allocates a fresh slice per record, regardless of this setting.
	ReuseRecord bool

	// Deprecated: TrailingComma is no longer used.
	TrailingComma bool

	// r is the buffered input source; NewReader wraps the caller's
	// io.Reader in it.
	r *bufio.Reader

	// numLine is the current line being read in the CSV file.
	// readLine increments it once per line read.
	numLine int

	// offset is the input stream byte offset of the current reader position.
	offset int64

	// rawBuffer is a line buffer only used by the readLine method.
	// It accumulates lines that do not fit in r's internal buffer.
	rawBuffer []byte

	// recordBuffer holds the unescaped fields, one after another.
	// The fields can be accessed by using the indexes in fieldIndexes.
	// E.g., For the row `a,"b","c""d",e`, recordBuffer will contain `abc"de`
	// and fieldIndexes will contain the indexes [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes is an index of fields inside recordBuffer.
	// The i'th field ends at offset fieldIndexes[i] in recordBuffer.
	fieldIndexes []int

	// fieldPositions is an index of field positions for the
	// last record returned by Read.
	fieldPositions []position

	// lastRecord is a record cache and only used when ReuseRecord == true.
	lastRecord []string
}

// NewReader returns a new Reader that reads from r.
// The input is wrapped in a buffered reader and the field delimiter is
// initialised to ','; every other setting keeps its zero value.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.Comma = ','
	rd.r = bufio.NewReader(r)
	return rd
}

// Read reads one record (a slice of fields) from r.
//
// A record with an unexpected number of fields is returned together with
// the error [ErrFieldCount]. When a field cannot be parsed, Read returns
// the fields read before the failure together with the parse error.
// When no data is left, Read returns nil, [io.EOF].
//
// If [Reader.ReuseRecord] is true, the returned slice may be shared
// between multiple calls to Read.
func (r *Reader) Read() (record []string, err error) {
	if !r.ReuseRecord {
		return r.readRecord(nil)
	}
	// Offer the previous result as the destination and remember the new
	// one, so its backing array can be recycled on the next call.
	record, err = r.readRecord(r.lastRecord)
	r.lastRecord = record
	return record, err
}

// FieldPos reports where the field with the given index begins, for the
// slice most recently returned by [Reader.Read]. Lines and columns are
// numbered from 1; columns are counted in bytes, not runes.
//
// FieldPos panics if field is out of bounds.
func (r *Reader) FieldPos(field int) (line, column int) {
	if field >= 0 && field < len(r.fieldPositions) {
		p := r.fieldPositions[field]
		return p.line, p.col
	}
	panic("out of range index passed to FieldPos")
}

// InputOffset reports the byte offset within the input stream of the
// Reader's current position. That offset marks the end of the most
// recently read row, which is also where the next row begins.
func (r *Reader) InputOffset() int64 {
	return r.offset
}

// position holds the position of a field in the input: line is the
// 1-based line number and col is the 1-based byte column on that line.
type position struct {
	line, col int
}

// ReadAll reads all the remaining records from r.
// Each record is a slice of fields.
//
// Reaching end of input is the expected way for ReadAll to finish, so a
// successful call returns err == nil rather than err == [io.EOF].
// On any other error the records gathered so far are discarded and
// nil is returned together with that error.
func (r *Reader) ReadAll() (records [][]string, err error) {
	for {
		rec, readErr := r.readRecord(nil)
		switch {
		case readErr == io.EOF:
			return records, nil
		case readErr != nil:
			return nil, readErr
		}
		records = append(records, rec)
	}
}

// readLine returns the next line of input, including its terminating
// newline when there is one. A final line that ends at EOF without a
// newline is returned as is, and in that case the error is nil: io.EOF is
// only reported when no bytes at all were read.
// A trailing \r\n is rewritten to \n.
// The returned slice is only valid until the next call to readLine.
func (r *Reader) readLine() ([]byte, error) {
	buf, err := r.r.ReadSlice('\n')
	if err == bufio.ErrBufferFull {
		// The line is longer than the bufio buffer: collect the pieces in
		// rawBuffer until ReadSlice stops reporting a full buffer.
		r.rawBuffer = r.rawBuffer[:0]
		for {
			r.rawBuffer = append(r.rawBuffer, buf...)
			if err != bufio.ErrBufferFull {
				break
			}
			buf, err = r.r.ReadSlice('\n')
		}
		buf = r.rawBuffer
	}

	consumed := len(buf)
	if err == io.EOF && consumed > 0 {
		// Some data preceded EOF, so hand it back as a regular line.
		err = nil
		// For backwards compatibility, drop trailing \r before EOF.
		if buf[consumed-1] == '\r' {
			buf = buf[:consumed-1]
		}
	}

	// Account for the line using the number of bytes actually read, before
	// any line-ending normalisation shortens the slice.
	r.numLine++
	r.offset += int64(consumed)

	// Normalize \r\n to \n on all input lines.
	if end := len(buf); end >= 2 && buf[end-2] == '\r' && buf[end-1] == '\n' {
		buf[end-2] = '\n'
		buf = buf[:end-1]
	}
	return buf, err
}

// lengthNL reports how many bytes the trailing \n of b occupies:
// 1 when b ends in a newline, 0 otherwise (including when b is empty).
func lengthNL(b []byte) int {
	if n := len(b); n == 0 || b[n-1] != '\n' {
		return 0
	}
	return 1
}

// nextRune decodes and returns the first rune of b.
// It returns utf8.RuneError when b is empty or does not start with a
// valid UTF-8 encoding.
func nextRune(b []byte) rune {
	first, _ := utf8.DecodeRune(b)
	return first
}

// readRecord reads and parses the next record from the input.
//
// Blank lines and, when Comment is set, comment lines are skipped first.
// The record is then split into fields; a quoted field may span several
// input lines, in which case further lines are read until it is closed.
//
// dst is used for the returned slice when its capacity is sufficient;
// otherwise a new slice is allocated. All returned fields are substrings
// of a single string built from recordBuffer.
//
// The returned error is:
//   - errInvalidDelim (with a nil record) if Comma or Comment is unusable;
//   - io.EOF (with a nil record) if no data is left;
//   - a *ParseError for a bare or unbalanced quote, returned together with
//     the fields parsed before the error, or for a field-count mismatch,
//     returned together with the whole record;
//   - any other error reported by readLine.
//
// If FieldsPerRecord is 0 it is set to the number of fields in the slice
// being returned.
func (r *Reader) readRecord(dst []string) ([]string, error) {
	// Validate the delimiter configuration before consuming any input.
	if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
		return nil, errInvalidDelim
	}

	// Read line (automatically skipping past empty lines and any comments).
	var line []byte
	var errRead error
	for errRead == nil {
		line, errRead = r.readLine()
		if r.Comment != 0 && nextRune(line) == r.Comment {
			line = nil
			continue // Skip comment lines
		}
		if errRead == nil && len(line) == lengthNL(line) {
			line = nil
			continue // Skip empty lines
		}
		break
	}
	if errRead == io.EOF {
		return nil, errRead
	}

	// Parse each field in the record.
	var err error
	const quoteLen = len(`"`)
	commaLen := utf8.RuneLen(r.Comma)
	recLine := r.numLine // Starting line for record
	// Reuse the scratch buffers from the previous record.
	r.recordBuffer = r.recordBuffer[:0]
	r.fieldIndexes = r.fieldIndexes[:0]
	r.fieldPositions = r.fieldPositions[:0]
	// pos tracks the line and byte column of the start of the unparsed
	// remainder of line.
	pos := position{line: r.numLine, col: 1}
parseField:
	for {
		if r.TrimLeadingSpace {
			// Note: the closure parameter r shadows the receiver here.
			i := bytes.IndexFunc(line, func(r rune) bool {
				return !unicode.IsSpace(r)
			})
			if i < 0 {
				// The rest of the line is all white space. The trailing
				// newline is consumed too but must not count as a column.
				i = len(line)
				pos.col -= lengthNL(line)
			}
			line = line[i:]
			pos.col += i
		}
		if len(line) == 0 || line[0] != '"' {
			// Non-quoted string field
			i := bytes.IndexRune(line, r.Comma)
			field := line
			if i >= 0 {
				field = field[:i]
			} else {
				// Last field on the line: drop the trailing newline, if any.
				field = field[:len(field)-lengthNL(field)]
			}
			// Check to make sure a quote does not appear in field.
			if !r.LazyQuotes {
				if j := bytes.IndexByte(field, '"'); j >= 0 {
					col := pos.col + j
					err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
					break parseField
				}
			}
			r.recordBuffer = append(r.recordBuffer, field...)
			r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
			r.fieldPositions = append(r.fieldPositions, pos)
			if i >= 0 {
				// Step over the delimiter and parse the next field.
				line = line[i+commaLen:]
				pos.col += i + commaLen
				continue parseField
			}
			break parseField
		} else {
			// Quoted string field
			fieldPos := pos // The field's position is that of its opening quote.
			line = line[quoteLen:]
			pos.col += quoteLen
			for {
				i := bytes.IndexByte(line, '"')
				if i >= 0 {
					// Hit next quote.
					r.recordBuffer = append(r.recordBuffer, line[:i]...)
					line = line[i+quoteLen:]
					pos.col += i + quoteLen
					// Decide what the quote means from the rune after it.
					switch rn := nextRune(line); {
					case rn == '"':
						// `""` sequence (append quote).
						r.recordBuffer = append(r.recordBuffer, '"')
						line = line[quoteLen:]
						pos.col += quoteLen
					case rn == r.Comma:
						// `",` sequence (end of field).
						line = line[commaLen:]
						pos.col += commaLen
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						continue parseField
					case lengthNL(line) == len(line):
						// `"\n` sequence (end of line).
						// This also matches an empty remainder, i.e. a
						// closing quote that is the last byte of the input.
						r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
						r.fieldPositions = append(r.fieldPositions, fieldPos)
						break parseField
					case r.LazyQuotes:
						// `"` sequence (bare quote).
						r.recordBuffer = append(r.recordBuffer, '"')
					default:
						// `"*` sequence (invalid non-escaped quote).
						// pos.col already points past the quote, so step
						// back to report the quote's own column.
						err = &ParseError{StartLine: recLine, Line: r.numLine, Column: pos.col - quoteLen, Err: ErrQuote}
						break parseField
					}
				} else if len(line) > 0 {
					// Hit end of line (copy all data so far).
					r.recordBuffer = append(r.recordBuffer, line...)
					if errRead != nil {
						// An earlier read already failed; stop and let
						// errRead be reported below.
						break parseField
					}
					pos.col += len(line)
					line, errRead = r.readLine()
					if len(line) > 0 {
						pos.line++
						pos.col = 1
					}
					if errRead == io.EOF {
						// Clear EOF so the empty line falls into the
						// "abrupt end" branch on the next iteration, which
						// decides between ErrQuote and lazy acceptance.
						errRead = nil
					}
				} else {
					// Abrupt end of file (EOF or error).
					if !r.LazyQuotes && errRead == nil {
						err = &ParseError{StartLine: recLine, Line: pos.line, Column: pos.col, Err: ErrQuote}
						break parseField
					}
					r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
					r.fieldPositions = append(r.fieldPositions, fieldPos)
					break parseField
				}
			}
		}
	}
	// A parse error takes precedence over a read error.
	if err == nil {
		err = errRead
	}

	// Create a single string and create slices out of it.
	// This pins the memory of the fields together, but allocates once.
	str := string(r.recordBuffer) // Convert to string once to batch allocations
	dst = dst[:0]
	if cap(dst) < len(r.fieldIndexes) {
		dst = make([]string, len(r.fieldIndexes))
	}
	dst = dst[:len(r.fieldIndexes)]
	var preIdx int
	for i, idx := range r.fieldIndexes {
		dst[i] = str[preIdx:idx]
		preIdx = idx
	}

	// Check or update the expected fields per record.
	if r.FieldsPerRecord > 0 {
		// Report a field-count mismatch only if nothing else went wrong.
		if len(dst) != r.FieldsPerRecord && err == nil {
			err = &ParseError{
				StartLine: recLine,
				Line: recLine,
				Column: 1,
				Err: ErrFieldCount,
			}
		}
	} else if r.FieldsPerRecord == 0 {
		r.FieldsPerRecord = len(dst)
	}
	return dst, err
}