Compare commits

..

No commits in common. "master" and "v0.0.5" have entirely different histories.

3 changed files with 66 additions and 182 deletions

View File

@ -7,137 +7,87 @@ import (
) )
type CsvParser struct { type CsvParser struct {
enclosers []string enclosers []string
delimiters string delimiter rune
mergeDelimiters bool fields []string
fields []string
ignore string
maxFieldIndex int
} }
func (p *csvParser) FieldCount() int {
return maxFieldIndex + 1
}
/*
* delimiters: string with all delimiter chars
* mergeDelimiters: if true, all successive delimiters are considered as one
* enclosers: array of string, each string contains a enclorser tuple: "", [], {}...
* lineFormat: format of the line (each fieldname or ignore)
*/
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
if utf8.RuneCountInString(delimiters) == 0 { func (p *CsvParser) Initialize(delimiter string, enclosers []string, lineFormat string) error {
return fmt.Errorf("delimiter shoud be at least one character") if utf8.RuneCountInString(delimiter) != 1 {
return fmt.Errorf("delimiter shoud be one character")
} }
p.delimiters = delimiters p.enclosers = make([]string, 0)
p.mergeDelimiters = mergeDelimiters
for _, encloser := range enclosers { for _, encloser := range enclosers {
if utf8.RuneCountInString(encloser) != 2 { if utf8.RuneCountInString(encloser) != 2 {
return fmt.Errorf("encolser should have to characters") return fmt.Errorf("encolser should have to characters")
} }
} }
p.enclosers = enclosers p.enclosers = enclosers
p.delimiter = []rune(delimiter)[0]
p.ignore = ignore for _, pair := range enclosers {
// line format is in the form of: field1 field2 ignore ... if utf8.RuneCountInString(pair) != 2 {
// if field name is ignore, it is parsed but not retained return fmt.Errorf("encoloser should contain two characters: %s", pair)
fields := strings.Split(lineFormat, " ")
p.fields = make([]string, 0)
for _, f := range fields {
if len(f) > 0 {
p.fields = append(p.fields, f)
} }
} }
p.maxFieldIndex = len(p.fields) - 1 // line format is in the form of: field1 field2 ignore ...
// if field name is ignore, it is parsed but not retained
p.fields = strings.Split(lineFormat, " ")
return nil return nil
} }
type ParserState struct {
inField bool // reading field value
delimiter bool // last char was a delimiter
escape bool // the previaus char was a escaper \\
enclosed bool // the current field is enclosed
enclosedMode bool // the current algo is for enclosed string, do not search delimiter
encloserStart rune // the current enclosed field opener char
encloserEnd rune // the current enclosed field closer char
}
/*
* si pas dans la lecture d'un champ
*/
func (p *CsvParser) Parse(line string) (map[string]string, error) { func (p *CsvParser) Parse(line string) (map[string]string, error) {
inEnclosedField := false
currentEncloserEnd := ' '
escape := false
currentFieldIndex := 0 currentFieldIndex := 0
valueStart := -1
state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
ret := make(map[string]string) ret := make(map[string]string)
valueStart := 0
maxIndex := len(line) - 1 indexMax := len(line) - 1
for index, r := range line { for index, r := range line {
if index == indexMax {
if currentFieldIndex < len(p.fields) {
//fmt.Println("start:", valueStart, "end:", index)
//fmt.Println("Found a field value for:", p.fields[currentFieldIndex], line[valueStart:index])
if inEnclosedField && r == currentEncloserEnd {
ret[p.fields[currentFieldIndex]] = line[valueStart:index]
} else {
ret[p.fields[currentFieldIndex]] = line[valueStart : index+1]
}
//fmt.Println("Index is:", index)
}
}
if r == '\\' { if r == '\\' {
state.escape = true escape = !escape
continue } else if inEnclosedField {
} if r == currentEncloserEnd && !escape {
// previous rune was \\: ignore the current rune inEnclosedField = false
if state.escape {
state.escape = false
continue
}
// reading en enclosed field: watch for end of this field (encloserEnd)
if state.enclosedMode {
// searching for end of encloser
if r == state.encloserEnd {
state.enclosedMode = false
} }
if index < maxIndex { } else if r == p.delimiter {
continue
}
}
isDelimiter := false
for _, d := range p.delimiters {
if r == d {
isDelimiter = true
break
}
}
// previous rune was a delimiter and mergeDelimiters is true, skip if currentFieldIndex < len(p.fields) {
if isDelimiter && state.delimiter && p.mergeDelimiters { //fmt.Println("start:", valueStart, "end:", index)
continue //fmt.Println("Found a field value for:", p.fields[currentFieldIndex], line[valueStart:index])
} ret[p.fields[currentFieldIndex]] = line[valueStart:index]
//fmt.Println("Index is:", index)
// current rune is a delimiter, a value is present and the next char is the next value valueStart = index + 1
if isDelimiter || index == maxIndex {
if index == maxIndex && valueStart == -1 {
valueStart = 0
}
state.delimiter = true
field := line[valueStart : index+1]
field = strings.TrimRight(field, p.delimiters)
if state.enclosed {
field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd))
}
if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore {
ret[p.fields[currentFieldIndex]] = field
} }
currentFieldIndex++ currentFieldIndex++
state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} } else {
continue for _, encloser := range p.enclosers {
} runes := []rune(encloser)
if state.delimiter {
valueStart = index
for _, e := range p.enclosers {
runes := []rune(e)
if r == runes[0] { if r == runes[0] {
state.enclosed = true // opening encloser
state.enclosedMode = true inEnclosedField = true
state.encloserStart = runes[0] currentEncloserEnd = runes[1]
state.encloserEnd = runes[1] valueStart++
break
} }
state.delimiter = false
} }
} }
} }
return ret, nil return ret, nil
} }

View File

@ -7,75 +7,16 @@ import (
func TestCorrectLines(t *testing.T) { func TestCorrectLines(t *testing.T) {
var csvParser CsvParser var csvParser CsvParser
csvParser.Initialize(" ", []string{"\"\"", "[]"}, "firstname lastname complete_name")
line := "" line := ""
/*
csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name")
fmt.Println("parsing:", line)
event, err := csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John Doe"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John \"John Doe\" Doe"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John Doe \"John Doe\""
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John Doe \"John Doe\"\\\\"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
line = "John Doe I don't know him"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
*/
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\""
fmt.Println("parsing:", line) fmt.Println("parsing:", line)
event, err := csvParser.Parse(line) event, err := csvParser.Parse(line)
if err != nil { if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event) t.Fatalf("Parsing of empty line failed %v %v", err, event)
} }
fmt.Println("event is", event) fmt.Println("event is", event)
/*
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore") line = "John"
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169"
fmt.Println("parsing:", line) fmt.Println("parsing:", line)
event, err = csvParser.Parse(line) event, err = csvParser.Parse(line)
if err != nil { if err != nil {
@ -83,8 +24,7 @@ func TestCorrectLines(t *testing.T) {
} }
fmt.Println("event is", event) fmt.Println("event is", event)
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore") line = "John Doe"
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169"
fmt.Println("parsing:", line) fmt.Println("parsing:", line)
event, err = csvParser.Parse(line) event, err = csvParser.Parse(line)
if err != nil { if err != nil {
@ -92,17 +32,7 @@ func TestCorrectLines(t *testing.T) {
} }
fmt.Println("event is", event) fmt.Println("event is", event)
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore") line = "John \"John Doe\" Doe"
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
*/
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
fmt.Println("parsing:", line) fmt.Println("parsing:", line)
event, err = csvParser.Parse(line) event, err = csvParser.Parse(line)
if err != nil { if err != nil {
@ -110,8 +40,7 @@ func TestCorrectLines(t *testing.T) {
} }
fmt.Println("event is", event) fmt.Println("event is", event)
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore") line = "John Doe \"John Doe\""
line = "G"
fmt.Println("parsing:", line) fmt.Println("parsing:", line)
event, err = csvParser.Parse(line) event, err = csvParser.Parse(line)
if err != nil { if err != nil {
@ -119,5 +48,11 @@ func TestCorrectLines(t *testing.T) {
} }
fmt.Println("event is", event) fmt.Println("event is", event)
line = "John Doe I don't know him"
fmt.Println("parsing:", line)
event, err = csvParser.Parse(line)
if err != nil {
t.Fatalf("Parsing of empty line failed %v %v", err, event)
}
fmt.Println("event is", event)
} }

1
go.mod
View File

@ -1,4 +1,3 @@
module git.passke.org/laurentu/csv-parser module git.passke.org/laurentu/csv-parser
toolchain go1.22.1
go 1.22 go 1.22