// Package csvparser splits delimiter-separated text lines into named fields.
package csvparser
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// CsvParser splits delimiter-separated lines into named fields.
// It is configured by Initialize and used through Parse.
type CsvParser struct {
	enclosers       []string // two-rune opener/closer pairs such as `""`, "[]" or "{}"
	delimiters      string   // every rune of this string is a field delimiter
	mergeDelimiters bool     // when true, successive delimiters are considered as one
	fields          []string // field names, in the order the values appear on a line
	ignore          string   // field name marking a value that is parsed but not retained
	maxFieldIndex   int      // index of the last entry of fields (len(fields) - 1)
}
|
|
|
|
/*
|
|
* delimiters: string with all delimiter chars
|
|
* mergeDelimiters: if true, all successive delimiters are considered as one
|
|
* enclosers: array of string, each string contains a enclorser tuple: "", [], {}...
|
|
* lineFormat: format of the line (each fieldname or ignore)
|
|
*/
|
|
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
|
|
|
|
if utf8.RuneCountInString(delimiters) == 0 {
|
|
return fmt.Errorf("delimiter shoud be at least one character")
|
|
}
|
|
p.delimiters = delimiters
|
|
p.mergeDelimiters = mergeDelimiters
|
|
for _, encloser := range enclosers {
|
|
if utf8.RuneCountInString(encloser) != 2 {
|
|
return fmt.Errorf("encolser should have to characters")
|
|
}
|
|
}
|
|
p.enclosers = enclosers
|
|
|
|
p.ignore = ignore
|
|
// line format is in the form of: field1 field2 ignore ...
|
|
// if field name is ignore, it is parsed but not retained
|
|
fields := strings.Split(lineFormat, " ")
|
|
|
|
p.fields = make([]string, 0)
|
|
for _, f := range fields {
|
|
if len(f) > 0 {
|
|
p.fields = append(p.fields, f)
|
|
}
|
|
}
|
|
p.maxFieldIndex = len(p.fields) - 1
|
|
return nil
|
|
}
|
|
|
|
// ParserState groups the flags tracked while scanning one field of a line.
type ParserState struct {
	inField       bool // reading a field value
	delimiter     bool // the last rune was a delimiter
	escape        bool // the previous rune was the escape character '\'
	enclosed      bool // the current field is enclosed
	enclosedMode  bool // scanning an enclosed string: do not search for delimiters
	encloserStart rune // opener rune of the current enclosed field
	encloserEnd   rune // closer rune of the current enclosed field
}
|
|
|
|
/*
|
|
* si pas dans la lecture d'un champ
|
|
*/
|
|
func (p *CsvParser) Parse(line string) (map[string]string, error) {
|
|
currentFieldIndex := 0
|
|
valueStart := -1
|
|
|
|
state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
|
|
|
|
ret := make(map[string]string)
|
|
|
|
maxIndex := len(line) - 1
|
|
for index, r := range line {
|
|
if r == '\\' {
|
|
state.escape = true
|
|
continue
|
|
}
|
|
// previous rune was \\: ignore the current rune
|
|
if state.escape {
|
|
state.escape = false
|
|
continue
|
|
}
|
|
// reading en enclosed field: watch for end of this field (encloserEnd)
|
|
if state.enclosedMode {
|
|
// searching for end of encloser
|
|
if r == state.encloserEnd {
|
|
state.enclosedMode = false
|
|
}
|
|
if index < maxIndex {
|
|
continue
|
|
}
|
|
}
|
|
isDelimiter := false
|
|
for _, d := range p.delimiters {
|
|
if r == d {
|
|
isDelimiter = true
|
|
break
|
|
}
|
|
}
|
|
|
|
// previous rune was a delimiter and mergeDelimiters is true, skip
|
|
if isDelimiter && state.delimiter && p.mergeDelimiters {
|
|
continue
|
|
}
|
|
|
|
// current rune is a delimiter, a value is present and the next char is the next value
|
|
if isDelimiter || index == maxIndex {
|
|
if index == maxIndex && valueStart == -1 {
|
|
valueStart = 0
|
|
}
|
|
state.delimiter = true
|
|
field := line[valueStart:index +1]
|
|
field = strings.TrimRight(field,p.delimiters)
|
|
if state.enclosed {
|
|
field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd))
|
|
}
|
|
if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore {
|
|
ret[p.fields[currentFieldIndex]] = field
|
|
}
|
|
currentFieldIndex++
|
|
state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
|
|
continue
|
|
}
|
|
if state.delimiter {
|
|
valueStart = index
|
|
for _, e := range p.enclosers {
|
|
runes := []rune(e)
|
|
if r == runes[0] {
|
|
state.enclosed = true
|
|
state.enclosedMode = true
|
|
state.encloserStart = runes[0]
|
|
state.encloserEnd = runes[1]
|
|
}
|
|
state.delimiter = false
|
|
}
|
|
}
|
|
}
|
|
return ret, nil
|
|
}
|