diff --git a/csvparser.go b/csvparser.go index 07bf9c4..c275894 100644 --- a/csvparser.go +++ b/csvparser.go @@ -7,10 +7,12 @@ import ( ) type CsvParser struct { - enclosers []string - delimiters string - mergeDelimiters string - fields []string + enclosers []string + delimiters string + mergeDelimiters bool + fields []string + ignore string + maxFieldIndex int } /* @@ -18,98 +20,117 @@ type CsvParser struct { * mergeDelimiters: if true, all successive delimiters are considered as one * enclosers: array of string, each string contains a enclorser tuple: "", [], {}... * lineFormat: format of the line (each fieldname or ignore) -*/ -func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string) error { + */ +func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error { + if utf8.RuneCountInString(delimiters) == 0 { return fmt.Errorf("delimiter shoud be at least one character") } p.delimiters = delimiters - p.mergeDelimiters = mergeDelimiters + p.mergeDelimiters = mergeDelimiters for _, encloser := range enclosers { if utf8.RuneCountInString(encloser) != 2 { return fmt.Errorf("encolser should have to characters") } } p.enclosers = enclosers + + p.ignore = ignore // line format is in the form of: field1 field2 ignore ... 
// if field name is ignore, it is parsed but not retained - p.fields = strings.Split(lineFormat, " ") + fields := strings.Split(lineFormat, " ") + p.fields = make([]string, 0) + for _, f := range fields { + if len(f) > 0 { + p.fields = append(p.fields, f) + } + } + p.maxFieldIndex = len(p.fields) - 1 return nil } -type parserState struct { - delimiter bool - enclosed bool - encloserEnd rune +type ParserState struct { + inField bool // reading field value + delimiter bool // last char was a delimiter + escape bool // the previous char was an escape character \\ + enclosed bool // the current field is enclosed + enclosedMode bool // currently scanning inside an enclosed string: delimiters are not searched + encloserStart rune // the current enclosed field opener char + encloserEnd rune // the current enclosed field closer char } +/* +* Parse scans line rune by rune and stores values under the configured field names (translated original note: if not currently reading a field) + */ func (p *CsvParser) Parse(line string) (map[string]string, error) { currentFieldIndex := 0 valueStart := -1 - escape := false - enclosed := false + //valueEnd := -1 + + state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} ret := make(map[string]string) - indexMax := len(line) - 1 - maxFieldIndex := len(p.fields) - 1 - delimiters := p.delimiters - delimiter := false + + maxIndex := len(line) - 1 for index, r := range line { if r == '\\' { - // Check if EOL before continue - escape = true + state.escape = true continue } - if escape { - escape = false + // previous rune was \\: ignore the current rune + if state.escape { + state.escape = false continue } - for _, d := range delimiters { - if r == d { - delimiter = true - break - } + // reading an enclosed field: watch for end of this field (encloserEnd) + if state.enclosedMode { + // searching for end of encloser + if r == state.encloserEnd { + state.enclosedMode = false + } + if index < maxIndex { + continue + } } - if delimiter && valueStart > -1 { - if 
p.fields[currentFieldIndex] != "ignore" { - ret[p.fields[currentFieldIndex]] = line[valueStart:index] - } - currentFieldIndex++ - } - if delimiter { - if p.fields[currentFieldIndex] != "ignore" { - ret[p.fields[currentFieldIndex]] = line[valueStart:index] - } - currentFieldIndex++ - if currentFieldIndex > maxFieldIndex { + isDelimiter := false + for _, d := range p.delimiters { + if r == d { + isDelimiter = true break } - valueStart = index + 1 - if enclosed { - enclosed = false - delimiters = p.delimiters - // Omit next delimiter - escape = true - } + } + + // previous rune was a delimiter and mergeDelimiters is true, skip + if isDelimiter && state.delimiter && p.mergeDelimiters { continue } - if index >= indexMax { - if p.fields[currentFieldIndex] != "ignore" { - ret[p.fields[currentFieldIndex]] = line[valueStart:] + // current rune is a delimiter, a value is present and the next char is the next value + if isDelimiter || index == maxIndex { + state.delimiter = true + field := line[valueStart:index] + if state.enclosed { + field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd)) } + if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore { + ret[p.fields[currentFieldIndex]] = field + } + currentFieldIndex++ + state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} continue } - - for _, encloser := range p.enclosers { - runes := []rune(encloser) - if r == runes[0] { - // opening encloser - enclosed = true - delimiters = string(runes[1]) - valueStart++ - break + if state.delimiter { + valueStart = index + for _, e := range p.enclosers { + runes := []rune(e) + if r == runes[0] { + state.enclosed = true + state.enclosedMode = true + state.encloserStart = runes[0] + state.encloserEnd = runes[1] + } + state.delimiter = false } } } diff --git a/csvparser_test.go b/csvparser_test.go index 2dfbdd8..9dba0dc 
100644 --- a/csvparser_test.go +++ b/csvparser_test.go @@ -7,8 +7,66 @@ import ( func TestCorrectLines(t *testing.T) { var csvParser CsvParser - csvParser.Initialize(" ", []string{"\"\"", "[]"}, "firstname lastname complete_name") line := "" + /* + csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name") + fmt.Println("parsing:", line) + event, err := csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John Doe" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John \"John Doe\" Doe" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John Doe \"John Doe\"" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John Doe \"John Doe\"\\\\" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + + line = "John Doe I don't know him" + fmt.Println("parsing:", line) + event, err = csvParser.Parse(line) + if err != nil { + t.Fatalf("Parsing of empty line failed %v %v", err, event) + } + fmt.Println("event is", event) + */ + csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration 
referer user-agent", "ignore") + line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\"" fmt.Println("parsing:", line) event, err := csvParser.Parse(line) if err != nil { @@ -16,7 +74,7 @@ func TestCorrectLines(t *testing.T) { } fmt.Println("event is", event) - line = "John" + line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169" fmt.Println("parsing:", line) event, err = csvParser.Parse(line) if err != nil { @@ -24,47 +82,7 @@ func TestCorrectLines(t *testing.T) { } fmt.Println("event is", event) - line = "John Doe" - fmt.Println("parsing:", line) - event, err = csvParser.Parse(line) - if err != nil { - t.Fatalf("Parsing of empty line failed %v %v", err, event) - } - fmt.Println("event is", event) - - line = "John \"John Doe\" Doe" - fmt.Println("parsing:", line) - event, err = csvParser.Parse(line) - if err != nil { - t.Fatalf("Parsing of empty line failed %v %v", err, event) - } - fmt.Println("event is", event) - - line = "John Doe \"John Doe\"" - fmt.Println("parsing:", line) - event, err = csvParser.Parse(line) - if err != nil { - t.Fatalf("Parsing of empty line failed %v %v", err, event) - } - fmt.Println("event 
is", event) - - line = "John Doe \"John Doe\"\\\\" - fmt.Println("parsing:", line) - event, err = csvParser.Parse(line) - if err != nil { - t.Fatalf("Parsing of empty line failed %v %v", err, event) - } - fmt.Println("event is", event) - - line = "John Doe I don't know him" - fmt.Println("parsing:", line) - event, err = csvParser.Parse(line) - if err != nil { - t.Fatalf("Parsing of empty line failed %v %v", err, event) - } - csvParser.Initialize(" \t", []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent") - fmt.Println("event is", event) - line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\"" + line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169" fmt.Println("parsing:", line) event, err = csvParser.Parse(line) if err != nil {