package csvparser import ( "fmt" "strings" "unicode/utf8" ) type CsvParser struct { enclosers []string delimiters string mergeDelimiters bool fields []string ignore string maxFieldIndex int } /* * delimiters: string with all delimiter chars * mergeDelimiters: if true, all successive delimiters are considered as one * enclosers: array of string, each string contains a enclorser tuple: "", [], {}... * lineFormat: format of the line (each fieldname or ignore) */ func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error { if utf8.RuneCountInString(delimiters) == 0 { return fmt.Errorf("delimiter shoud be at least one character") } p.delimiters = delimiters p.mergeDelimiters = mergeDelimiters for _, encloser := range enclosers { if utf8.RuneCountInString(encloser) != 2 { return fmt.Errorf("encolser should have to characters") } } p.enclosers = enclosers p.ignore = ignore // line format is in the form of: field1 field2 ignore ... // if field name is ignore, it is parsed but not retained fields := strings.Split(lineFormat, " ") p.fields = make([]string, 0) for _, f := range fields { if len(f) > 0 { p.fields = append(p.fields, f) } } p.maxFieldIndex = len(p.fields) - 1 return nil } type ParserState struct { inField bool // reading field value delimiter bool // last char was a delimiter escape bool // the previaus char was a escaper \\ enclosed bool // the current field is enclosed enclosedMode bool // the current algo is for enclosed string, do not search delimiter encloserStart rune // the current enclosed field opener char encloserEnd rune // the current enclosed field closer char } /* * si pas dans la lecture d'un champ */ func (p *CsvParser) Parse(line string) (map[string]string, error) { currentFieldIndex := 0 valueStart := -1 state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} ret := make(map[string]string) maxIndex := len(line) - 1 for index, r := range line { if r == '\\' { state.escape = true continue } // previous rune was \\: ignore the current rune if state.escape { state.escape = false continue } // reading en enclosed field: watch for end of this field (encloserEnd) if state.enclosedMode { // searching for end of encloser if r == state.encloserEnd { state.enclosedMode = false } if index < maxIndex { continue } } isDelimiter := false for _, d := range p.delimiters { if r == d { isDelimiter = true break } } // previous rune was a delimiter and mergeDelimiters is true, skip if isDelimiter && state.delimiter && p.mergeDelimiters { continue } // current rune is a delimiter, a value is present and the next char is the next value if isDelimiter || index == maxIndex { if index == maxIndex && valueStart == -1 { valueStart = 0 } state.delimiter = true field := line[valueStart:index +1] field = strings.TrimRight(field,p.delimiters) if state.enclosed { field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd)) } fmt.Printf("field:%s:\n", field) if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore { ret[p.fields[currentFieldIndex]] = field } currentFieldIndex++ state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} continue } if state.delimiter { valueStart = index for _, e := range p.enclosers { runes := []rune(e) if r == runes[0] { state.enclosed = true state.enclosedMode = true state.encloserStart = runes[0] state.encloserEnd = runes[1] } state.delimiter = false } } } return ret, nil }