Penser à gérer le cas de délimiteurs multiples qui s'enchaînent et qui ne doivent être pris en compte que pour un seul

This commit is contained in:
laurentu 2024-08-07 16:19:17 +02:00
parent 3e4b170099
commit 9f83b3ce1f
1 changed files with 57 additions and 59 deletions

View File

@ -7,28 +7,22 @@ import (
) )
type CsvParser struct { type CsvParser struct {
enclosers []string enclosers []string
delimiter rune delimiters string
fields []string fields []string
} }
func (p *CsvParser) Initialize(delimiter string, enclosers []string, lineFormat string) error { func (p *CsvParser) Initialize(delimiters string, enclosers []string, lineFormat string) error {
if utf8.RuneCountInString(delimiter) != 1 { if utf8.RuneCountInString(delimiter) == 0 {
return fmt.Errorf("delimiter shoud be one character") return fmt.Errorf("delimiter shoud be at least one character")
} }
p.enclosers = make([]string, 0) p.delimiters = delimiters
for _, encloser := range enclosers { for _, encloser := range enclosers {
if utf8.RuneCountInString(encloser) != 2 { if utf8.RuneCountInString(encloser) != 2 {
return fmt.Errorf("encolser should have to characters") return fmt.Errorf("encolser should have to characters")
} }
} }
p.enclosers = enclosers p.enclosers = enclosers
p.delimiter = []rune(delimiter)[0]
for _, pair := range enclosers {
if utf8.RuneCountInString(pair) != 2 {
return fmt.Errorf("encoloser should contain two characters: %s", pair)
}
}
// line format is in the form of: field1 field2 ignore ... // line format is in the form of: field1 field2 ignore ...
// if field name is ignore, it is parsed but not retained // if field name is ignore, it is parsed but not retained
p.fields = strings.Split(lineFormat, " ") p.fields = strings.Split(lineFormat, " ")
@ -40,56 +34,60 @@ func (p *CsvParser) Parse(line string) (map[string]string, error) {
currentFieldIndex := 0 currentFieldIndex := 0
valueStart := 0 valueStart := 0
escape := false escape := false
enclosed := false enclosed := false
ret := make(map[string]string) ret := make(map[string]string)
indexMax := len(line) - 1 indexMax := len(line) - 1
maxFieldIndex := len(p.fields) - 1 maxFieldIndex := len(p.fields) - 1
delimiter := p.delimiter delimiters := p.delimiters
for index, r := range line { for index, r := range line {
if r == '\\' { if r == '\\' {
// Check if EOL before continue // Check if EOL before continue
escape=true escape = true
continue continue
} }
if escape { if escape {
escape=false escape = false
continue continue
} }
if r == delimiter { for _, d := range delimiters {
if p.fields[currentFieldIndex] != "ignore" { delimiter = true
ret[p.fields[currentFieldIndex]] = line[valueStart : index] }
} if delimiter {
currentFieldIndex++ if p.fields[currentFieldIndex] != "ignore" {
if currentFieldIndex > maxFieldIndex { ret[p.fields[currentFieldIndex]] = line[valueStart:index]
break }
} currentFieldIndex++
valueStart = index + 1 if currentFieldIndex > maxFieldIndex {
if enclosed { break
enclosed=false }
delimiter = p.delimiter valueStart = index + 1
// Omit next delimiter if enclosed {
escape=true enclosed = false
} delimiter = p.delimiter
continue // Omit next delimiter
} escape = true
if index >= indexMax { }
if p.fields[currentFieldIndex] != "ignore" { continue
ret[p.fields[currentFieldIndex]] = line[valueStart:] }
}
continue
}
for _, encloser := range p.enclosers { if index >= indexMax {
runes := []rune(encloser) if p.fields[currentFieldIndex] != "ignore" {
if r == runes[0] { ret[p.fields[currentFieldIndex]] = line[valueStart:]
// opening encloser }
enclosed = true continue
delimiter = runes[1] }
valueStart++
break for _, encloser := range p.enclosers {
} runes := []rune(encloser)
} if r == runes[0] {
} // opening encloser
enclosed = true
delimiters = runes[1]
valueStart++
break
}
}
}
return ret, nil return ret, nil
} }