From 367bd16e42746ad3d1fed7d9f8dff1113cdf7c7f Mon Sep 17 00:00:00 2001
From: laurentu <laurentu@gmail.com>
Date: Thu, 8 Aug 2024 11:02:10 +0200
Subject: [PATCH] =?UTF-8?q?Algo=20ok,=20mais=20peut-=C3=AAtre=20=C3=A0=20o?=
 =?UTF-8?q?ptimiser=20=3F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 csvparser.go      | 137 ++++++++++++++++++++++++++--------------------
 csvparser_test.go | 104 ++++++++++++++++++++---------------
 2 files changed, 140 insertions(+), 101 deletions(-)

diff --git a/csvparser.go b/csvparser.go
index 07bf9c4..c275894 100644
--- a/csvparser.go
+++ b/csvparser.go
@@ -7,10 +7,12 @@ import (
 )
 
 type CsvParser struct {
-	enclosers  []string
-	delimiters string
-  mergeDelimiters string
-	fields     []string
+	enclosers       []string
+	delimiters      string
+	mergeDelimiters bool
+	fields          []string
+	ignore          string
+	maxFieldIndex   int
 }
 
 /*
@@ -18,98 +20,117 @@ type CsvParser struct {
 * mergeDelimiters: if true, all successive delimiters are considered as one
 * enclosers: array of string, each string contains a enclorser tuple: "", [], {}...
 * lineFormat: format of the line (each fieldname or ignore)
-*/
-func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string) error {
+ */
+func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
+
 	if utf8.RuneCountInString(delimiters) == 0 {
 		return fmt.Errorf("delimiter shoud be at least one character")
 	}
 	p.delimiters = delimiters
-  p.mergeDelimiters = mergeDelimiters
+	p.mergeDelimiters = mergeDelimiters
 	for _, encloser := range enclosers {
 		if utf8.RuneCountInString(encloser) != 2 {
 			return fmt.Errorf("encolser should have to characters")
 		}
 	}
 	p.enclosers = enclosers
+
+	p.ignore = ignore
 	// line format is in the form of: field1 field2 ignore ...
 	// if field name is ignore, it is parsed but not retained
-	p.fields = strings.Split(lineFormat, " ")
+	fields := strings.Split(lineFormat, " ")
 
+	p.fields = make([]string, 0)
+	for _, f := range fields {
+		if len(f) > 0 {
+			p.fields = append(p.fields, f)
+		}
+	}
+	p.maxFieldIndex = len(p.fields) - 1
 	return nil
 }
 
-type parserState struct {
-  delimiter bool
-  enclosed bool
-  encloserEnd rune
+type ParserState struct {
+	inField       bool // reading field value
+	delimiter     bool // last char was a delimiter
+	escape        bool // the previous char was an escape character \\
+	enclosed      bool // the current field is enclosed
+	enclosedMode  bool // currently inside an enclosed value: do not look for delimiters
+	encloserStart rune // the current enclosed field opener char
+	encloserEnd   rune // the current enclosed field closer char
 }
 
+/*
+* if not currently reading a field (TODO: complete this note)
+ */
 func (p *CsvParser) Parse(line string) (map[string]string, error) {
 	currentFieldIndex := 0
 	valueStart := -1
-	escape := false
-	enclosed := false
+	//valueEnd := -1
+
+	state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
 
 	ret := make(map[string]string)
-	indexMax := len(line) - 1
-	maxFieldIndex := len(p.fields) - 1
-	delimiters := p.delimiters
-  delimiter := false
+
+	maxIndex := len(line) - 1
 	for index, r := range line {
 		if r == '\\' {
-			// Check if EOL before continue
-			escape = true
+			state.escape = true
 			continue
 		}
-		if escape {
-			escape = false
+		// previous rune was \\: ignore the current rune
+		if state.escape {
+			state.escape = false
 			continue
 		}
-		for _, d := range delimiters {
-      if r == d {
-        delimiter = true
-        break
-      }
+		// reading an enclosed field: watch for the end of this field (encloserEnd)
+		if state.enclosedMode {
+			// searching for end of encloser
+			if r == state.encloserEnd {
+				state.enclosedMode = false
+			}
+			if index < maxIndex {
+				continue
+			}
 		}
-    if delimiter && valueStart > -1 {
-			if p.fields[currentFieldIndex] != "ignore" {
-				ret[p.fields[currentFieldIndex]] = line[valueStart:index]
-			}
-			currentFieldIndex++
-    }
-		if delimiter {
-			if p.fields[currentFieldIndex] != "ignore" {
-				ret[p.fields[currentFieldIndex]] = line[valueStart:index]
-			}
-			currentFieldIndex++
-			if currentFieldIndex > maxFieldIndex {
+		isDelimiter := false
+		for _, d := range p.delimiters {
+			if r == d {
+				isDelimiter = true
 				break
 			}
-			valueStart = index + 1
-			if enclosed {
-				enclosed = false
-				delimiters = p.delimiters
-				// Omit next delimiter
-				escape = true
-			}
+		}
+
+		// previous rune was a delimiter and mergeDelimiters is true, skip
+		if isDelimiter && state.delimiter && p.mergeDelimiters {
 			continue
 		}
 
-		if index >= indexMax {
-			if p.fields[currentFieldIndex] != "ignore" {
-				ret[p.fields[currentFieldIndex]] = line[valueStart:]
+		// current rune is a delimiter, or this is the last rune of the line: close the current field and store its value
+		if isDelimiter || index == maxIndex {
+			state.delimiter = true
+			field := line[valueStart:index]
+			if state.enclosed {
+				field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd))
 			}
+			if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore {
+				ret[p.fields[currentFieldIndex]] = field
+			}
+			currentFieldIndex++
+			state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
 			continue
 		}
-
-		for _, encloser := range p.enclosers {
-			runes := []rune(encloser)
-			if r == runes[0] {
-				// opening encloser
-				enclosed = true
-				delimiters = string(runes[1])
-				valueStart++
-				break
+		if state.delimiter {
+			valueStart = index
+			for _, e := range p.enclosers {
+				runes := []rune(e)
+				if r == runes[0] {
+					state.enclosed = true
+					state.enclosedMode = true
+					state.encloserStart = runes[0]
+					state.encloserEnd = runes[1]
+				}
+				state.delimiter = false
 			}
 		}
 	}
diff --git a/csvparser_test.go b/csvparser_test.go
index 2dfbdd8..9dba0dc 100644
--- a/csvparser_test.go
+++ b/csvparser_test.go
@@ -7,8 +7,66 @@ import (
 
 func TestCorrectLines(t *testing.T) {
 	var csvParser CsvParser
-	csvParser.Initialize(" ", []string{"\"\"", "[]"}, "firstname lastname complete_name")
 	line := ""
+	/*
+		csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name")
+		fmt.Println("parsing:", line)
+		event, err := csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John"
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John Doe"
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John \"John Doe\" Doe"
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John Doe \"John Doe\""
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John Doe \"John Doe\"\\\\"
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+
+		line = "John Doe I don't know him"
+		fmt.Println("parsing:", line)
+		event, err = csvParser.Parse(line)
+		if err != nil {
+			t.Fatalf("Parsing of empty line failed %v %v", err, event)
+		}
+		fmt.Println("event is", event)
+	*/
+	csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
+	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\""
 	fmt.Println("parsing:", line)
 	event, err := csvParser.Parse(line)
 	if err != nil {
@@ -16,7 +74,7 @@ func TestCorrectLines(t *testing.T) {
 	}
 	fmt.Println("event is", event)
 
-	line = "John"
+	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169"
 	fmt.Println("parsing:", line)
 	event, err = csvParser.Parse(line)
 	if err != nil {
@@ -24,47 +82,7 @@ func TestCorrectLines(t *testing.T) {
 	}
 	fmt.Println("event is", event)
 
-	line = "John Doe"
-	fmt.Println("parsing:", line)
-	event, err = csvParser.Parse(line)
-	if err != nil {
-		t.Fatalf("Parsing of empty line failed %v %v", err, event)
-	}
-	fmt.Println("event is", event)
-
-	line = "John \"John Doe\" Doe"
-	fmt.Println("parsing:", line)
-	event, err = csvParser.Parse(line)
-	if err != nil {
-		t.Fatalf("Parsing of empty line failed %v %v", err, event)
-	}
-	fmt.Println("event is", event)
-
-	line = "John Doe \"John Doe\""
-	fmt.Println("parsing:", line)
-	event, err = csvParser.Parse(line)
-	if err != nil {
-		t.Fatalf("Parsing of empty line failed %v %v", err, event)
-	}
-	fmt.Println("event is", event)
-
-	line = "John Doe \"John Doe\"\\\\"
-	fmt.Println("parsing:", line)
-	event, err = csvParser.Parse(line)
-	if err != nil {
-		t.Fatalf("Parsing of empty line failed %v %v", err, event)
-	}
-	fmt.Println("event is", event)
-
-	line = "John Doe I don't know him"
-	fmt.Println("parsing:", line)
-	event, err = csvParser.Parse(line)
-	if err != nil {
-		t.Fatalf("Parsing of empty line failed %v %v", err, event)
-	}
-	csvParser.Initialize(" \t", []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent")
-	fmt.Println("event is", event)
-  line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\"" 
+	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169"
 	fmt.Println("parsing:", line)
 	event, err = csvParser.Parse(line)
 	if err != nil {