Files
csv-parser/csvparser.go
T

118 lines
2.6 KiB
Go
Raw Normal View History

2024-07-06 21:03:35 +02:00
package csvparser
import (
"fmt"
"strings"
"unicode/utf8"
)
2024-07-08 14:47:34 +02:00
// CsvParser splits a delimited text line into named fields according to a
// line format given to Initialize.
type CsvParser struct {
	// enclosers lists the encloser pairs; each entry is a two-rune string
	// made of the opening and the closing rune, e.g. `""`, "[]" or "{}".
	enclosers []string
	// delimiters contains every rune that separates two fields.
	delimiters string
	// mergeDelimiters makes a run of successive delimiters count as one.
	mergeDelimiters bool
	// fields holds the field names in line order; the name "ignore" marks
	// a field that is parsed but not retained.
	fields []string
}

// Initialize configures the parser.
//
//   - delimiters: string with all delimiter characters (at least one).
//   - mergeDelimiters: if true, all successive delimiters are considered
//     as one.
//   - enclosers: each string contains an encloser pair: `""`, "[]", "{}"...
//     and must be exactly two characters long.
//   - lineFormat: whitespace-separated field names, in the order the fields
//     appear on a line, e.g. "field1 field2 ignore". A field named "ignore"
//     is parsed but not retained.
//
// An error is returned, and the parser is left unmodified, when delimiters
// is empty or when an encloser is not made of exactly two characters.
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string) error {
	// Validate everything first so a failed call never leaves the parser
	// half configured.
	if utf8.RuneCountInString(delimiters) == 0 {
		return fmt.Errorf("delimiter should be at least one character")
	}
	for _, encloser := range enclosers {
		if utf8.RuneCountInString(encloser) != 2 {
			return fmt.Errorf("encloser %q should have two characters", encloser)
		}
	}

	p.delimiters = delimiters
	p.mergeDelimiters = mergeDelimiters
	p.enclosers = enclosers
	// strings.Fields (rather than Split on " ") avoids empty field names when
	// the format contains repeated, leading or trailing spaces.
	p.fields = strings.Fields(lineFormat)
	return nil
}

// parserState groups parsing flags.
// NOTE(review): not referenced by the code visible in this file; kept in
// case other files of the package use it.
type parserState struct {
	delimiter   bool
	enclosed    bool
	encloserEnd rune
}

// closerFor reports the closing rune paired with r when r is the opening
// rune of one of the configured enclosers.
func (p *CsvParser) closerFor(r rune) (rune, bool) {
	for _, encloser := range p.enclosers {
		opener, size := utf8.DecodeRuneInString(encloser)
		if size > 0 && opener == r {
			closer, _ := utf8.DecodeRuneInString(encloser[size:])
			return closer, true
		}
	}
	return 0, false
}

// Parse splits line into fields and returns them keyed by the field names
// given to Initialize. Fields named "ignore" are skipped, and anything on the
// line beyond the last configured field is discarded. Fields missing from the
// line are simply absent from the result.
//
// Rules:
//   - Any rune of the configured delimiters ends the current field. When
//     mergeDelimiters is set, a run of delimiters counts as one separator
//     and leading/trailing delimiters produce no empty field.
//   - A field that starts with an opening encloser runs up to the matching
//     closing rune; delimiters inside it are literal. The delimiter that
//     directly follows the closing rune is consumed as the separator. An
//     enclosure left open at the end of the line keeps the rest of the line.
//   - A backslash escapes the next rune, which is then never treated as a
//     delimiter or encloser. Values are returned as written on the line,
//     backslashes included.
//
// The returned error is currently always nil.
func (p *CsvParser) Parse(line string) (map[string]string, error) {
	ret := make(map[string]string, len(p.fields))
	if len(p.fields) == 0 {
		return ret, nil
	}

	var (
		field      int   // index in p.fields of the field being read
		start      = -1  // byte offset where the current value begins, -1 if not begun
		closer     rune  // closing rune awaited while enclosed
		enclosed   bool  // inside an enclosed value
		escaped    bool  // previous rune was an unescaped backslash
		afterClose bool  // previous rune closed an enclosed value
		trailing   bool  // previous rune was a separator
	)

	// emit stores line[start:end] (or "" when no value has begun) under the
	// current field name and reports whether more fields are expected.
	emit := func(end int) bool {
		value := ""
		if start >= 0 {
			value = line[start:end]
		}
		if name := p.fields[field]; name != "ignore" {
			ret[name] = value
		}
		field++
		start = -1
		return field < len(p.fields)
	}

	for index, r := range line {
		if escaped {
			// The escaped rune belongs to the value, whatever it is.
			escaped = false
			continue
		}
		trailing = false

		if enclosed {
			switch {
			case r == '\\':
				escaped = true
			case r == closer:
				if !emit(index) {
					return ret, nil
				}
				enclosed, afterClose = false, true
				continue
			}
			if start < 0 {
				start = index
			}
			continue
		}

		isDelimiter := strings.ContainsRune(p.delimiters, r)
		if afterClose {
			afterClose = false
			if isDelimiter {
				// Separator that follows a closing encloser: the value has
				// already been stored, so just consume it.
				trailing = true
				continue
			}
		}

		switch {
		case r == '\\':
			escaped = true
		case isDelimiter:
			if start < 0 && p.mergeDelimiters {
				// Still inside a run of delimiters.
				continue
			}
			if !emit(index) {
				return ret, nil
			}
			trailing = true
			continue
		case start < 0:
			// Enclosers are only meaningful at the very start of a field.
			if c, ok := p.closerFor(r); ok {
				enclosed, closer = true, c
				continue
			}
		}
		if start < 0 {
			start = index
		}
	}

	// Flush what is left: a value in progress, an unterminated enclosure, or
	// the empty field that follows a trailing separator.
	if start >= 0 || enclosed || (trailing && !p.mergeDelimiters) {
		emit(len(line))
	}
	return ret, nil
}