diff --git a/csvparser.go b/csvparser.go index 6a26b20..07bf9c4 100644 --- a/csvparser.go +++ b/csvparser.go @@ -9,14 +9,22 @@ import ( type CsvParser struct { enclosers []string delimiters string + mergeDelimiters string fields []string } -func (p *CsvParser) Initialize(delimiters string, enclosers []string, lineFormat string) error { - if utf8.RuneCountInString(delimiter) == 0 { +/* +* delimiters: string with all delimiter chars +* mergeDelimiters: if true, all successive delimiters are considered as one +* enclosers: array of string, each string contains a enclorser tuple: "", [], {}... +* lineFormat: format of the line (each fieldname or ignore) +*/ +func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string) error { + if utf8.RuneCountInString(delimiters) == 0 { return fmt.Errorf("delimiter shoud be at least one character") } p.delimiters = delimiters + p.mergeDelimiters = mergeDelimiters for _, encloser := range enclosers { if utf8.RuneCountInString(encloser) != 2 { return fmt.Errorf("encolser should have to characters") @@ -30,9 +38,15 @@ func (p *CsvParser) Initialize(delimiters string, enclosers []string, lineFormat return nil } +type parserState struct { + delimiter bool + enclosed bool + encloserEnd rune +} + func (p *CsvParser) Parse(line string) (map[string]string, error) { currentFieldIndex := 0 - valueStart := 0 + valueStart := -1 escape := false enclosed := false @@ -40,6 +54,7 @@ func (p *CsvParser) Parse(line string) (map[string]string, error) { indexMax := len(line) - 1 maxFieldIndex := len(p.fields) - 1 delimiters := p.delimiters + delimiter := false for index, r := range line { if r == '\\' { // Check if EOL before continue @@ -51,8 +66,17 @@ func (p *CsvParser) Parse(line string) (map[string]string, error) { continue } for _, d := range delimiters { - delimiter = true + if r == d { + delimiter = true + break + } } + if delimiter && valueStart > -1 { + if p.fields[currentFieldIndex] != "ignore" { + ret[p.fields[currentFieldIndex]] = line[valueStart:index] + } + currentFieldIndex++ + } if delimiter { if p.fields[currentFieldIndex] != "ignore" { ret[p.fields[currentFieldIndex]] = line[valueStart:index] @@ -64,7 +88,7 @@ func (p *CsvParser) Parse(line string) (map[string]string, error) { valueStart = index + 1 if enclosed { enclosed = false - delimiter = p.delimiter + delimiters = p.delimiters // Omit next delimiter escape = true } @@ -83,7 +107,7 @@ func (p *CsvParser) Parse(line string) (map[string]string, error) { if r == runes[0] { // opening encloser enclosed = true - delimiters = runes[1] + delimiters = string(runes[1]) valueStart++ break } diff --git a/csvparser_test.go b/csvparser_test.go index 9b8966d..2dfbdd8 100644 --- a/csvparser_test.go +++ b/csvparser_test.go @@ -62,5 +62,14 @@ func TestCorrectLines(t *testing.T) { if err != nil { t.Fatalf("Parsing of empty line failed %v %v", err, event) } + csvParser.Initialize(" \t", []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent") fmt.Println("event is", event) + line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\"" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + }