csv-parser/csvparser.go

package csvparser

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// CsvParser splits a delimited text line into named fields.
//
// Configure it with Initialize before calling Parse. A parser that was never
// initialized has no field names, so Parse returns an empty map.
type CsvParser struct {
	enclosers       []string // two-rune strings: opening rune followed by closing rune
	delimiters      string   // every rune of this string is a field delimiter
	mergeDelimiters bool     // a run of successive delimiters counts as one
	fields          []string // field names, in the order they appear on a line
	ignore          string   // field name whose values are parsed but not returned
	maxFieldIndex   int      // len(fields) - 1
}

// Initialize configures the parser.
//
//   - delimiters: string holding every delimiter rune; must not be empty.
//   - mergeDelimiters: if true, successive delimiters are treated as one.
//   - enclosers: each entry is an opening/closing pair of exactly two runes,
//     such as `""`, `[]` or `{}`.
//   - lineFormat: whitespace-separated field names, in line order.
//   - ignore: a field name in lineFormat equal to this value is parsed but
//     its value is not returned by Parse.
//
// All arguments are validated before the receiver is modified, so a failed
// call leaves the previous configuration untouched.
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
	if len(delimiters) == 0 {
		return fmt.Errorf("delimiter should be at least one character")
	}
	for _, encloser := range enclosers {
		if utf8.RuneCountInString(encloser) != 2 {
			return fmt.Errorf("encloser %q should have two characters", encloser)
		}
	}

	p.delimiters = delimiters
	p.mergeDelimiters = mergeDelimiters
	// Copy so that later changes to the caller's slice do not affect the parser.
	p.enclosers = append([]string(nil), enclosers...)
	p.ignore = ignore
	p.fields = strings.Fields(lineFormat)
	p.maxFieldIndex = len(p.fields) - 1
	return nil
}

// ParserState is the scanning state Parse keeps while walking one line.
type ParserState struct {
	inField       bool // a field has started and has not been emitted yet
	delimiter     bool // previous rune was a delimiter (also true at start of line)
	escape        bool // previous rune was an unescaped backslash
	enclosed      bool // the current field started with an opening encloser
	enclosedMode  bool // inside an enclosure: delimiters are not searched
	encloserStart rune // opening rune of the current enclosure
	encloserEnd   rune // closing rune of the current enclosure
}

// Parse splits line into fields and returns them keyed by the names given to
// Initialize. Fields named like the ignore value, and fields beyond the last
// configured name, are parsed but not returned. The returned error is always
// nil.
//
// Scanning rules:
//   - A backslash makes the following rune literal (it is never a delimiter
//     or an encloser). Backslashes are kept in the returned values.
//   - A field whose first rune is an opening encloser runs at least to the
//     matching closing rune; delimiters in between belong to the value. One
//     opening and one closing rune are removed from the returned value.
//   - With mergeDelimiters, leading delimiters and runs of delimiters are
//     collapsed. Without it, every delimiter ends a field, so empty fields
//     are returned as "".
//   - A delimiter at the very end of the line ends the last field; it does
//     not create an additional empty field.
func (p *CsvParser) Parse(line string) (map[string]string, error) {
	ret := make(map[string]string)
	state := ParserState{delimiter: true}
	fieldIndex := 0
	valueStart := 0 // byte offset of the first rune of the current field

	// emit stores line[valueStart:end] under the current field name (when it
	// has one and is not ignored), then resets the state for the next field.
	emit := func(end int) {
		field := line[valueStart:end]
		if state.enclosed {
			field = strings.TrimPrefix(field, string(state.encloserStart))
			// An enclosure that is still open has no closing rune to remove.
			if !state.enclosedMode {
				field = strings.TrimSuffix(field, string(state.encloserEnd))
			}
		}
		if fieldIndex < len(p.fields) && p.fields[fieldIndex] != p.ignore {
			ret[p.fields[fieldIndex]] = field
		}
		fieldIndex++
		state = ParserState{delimiter: true}
	}

	// begin records index as the first byte of a new field.
	begin := func(index int) {
		state.inField = true
		state.delimiter = false
		valueStart = index
	}

	// index is the byte offset of r, so slicing line with it never cuts a
	// multi-byte rune.
	for index, r := range line {
		// Checked before the backslash test so that an escaped backslash
		// does not escape the rune after it.
		if state.escape {
			state.escape = false
			continue
		}
		if r == '\\' {
			if !state.inField {
				begin(index)
			}
			state.escape = true
			continue
		}
		if state.enclosedMode {
			if r == state.encloserEnd {
				state.enclosedMode = false
			}
			continue
		}
		if strings.ContainsRune(p.delimiters, r) {
			if p.mergeDelimiters && state.delimiter {
				continue
			}
			if !state.inField {
				// Empty field: nothing between the previous delimiter (or
				// the start of the line) and this one.
				valueStart = index
			}
			emit(index)
			continue
		}
		if state.inField {
			continue
		}

		// First rune of a new field: check whether it opens an enclosure.
		begin(index)
		for _, e := range p.enclosers {
			opening, size := utf8.DecodeRuneInString(e)
			if r == opening {
				closing, _ := utf8.DecodeRuneInString(e[size:])
				state.enclosed = true
				state.enclosedMode = true
				state.encloserStart = opening
				state.encloserEnd = closing
			}
		}
	}

	// The last field has no trailing delimiter.
	if state.inField {
		emit(len(line))
	}
	return ret, nil
}