Compare commits
	
		
			3 Commits
		
	
	
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 367bd16e42 | |||
|  | f92b89e01e | ||
| 9f83b3ce1f | 
							
								
								
									
										135
									
								
								csvparser.go
									
									
									
									
									
								
							
							
						
						
									
										135
									
								
								csvparser.go
									
									
									
									
									
								
							| @@ -8,86 +8,129 @@ import ( | |||||||
|  |  | ||||||
| type CsvParser struct { | type CsvParser struct { | ||||||
| 	enclosers       []string | 	enclosers       []string | ||||||
| 	delimiter rune | 	delimiters      string | ||||||
|  | 	mergeDelimiters bool | ||||||
| 	fields          []string | 	fields          []string | ||||||
|  | 	ignore          string | ||||||
|  | 	maxFieldIndex   int | ||||||
| } | } | ||||||
|  |  | ||||||
| func (p *CsvParser) Initialize(delimiter string, enclosers []string, lineFormat string) error { | /* | ||||||
| 	if utf8.RuneCountInString(delimiter) != 1 { | * delimiters: string with all delimiter chars | ||||||
| 		return fmt.Errorf("delimiter shoud be one character") | * mergeDelimiters: if true, all successive delimiters are considered as one | ||||||
|  | * enclosers: array of strings, each string contains an encloser pair: "", [], {}... | ||||||
|  | * lineFormat: format of the line (each fieldname or ignore) | ||||||
|  |  */ | ||||||
|  | func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error { | ||||||
|  |  | ||||||
|  | 	if utf8.RuneCountInString(delimiters) == 0 { | ||||||
|  | 		return fmt.Errorf("delimiter shoud be at least one character") | ||||||
| 	} | 	} | ||||||
| 	p.enclosers = make([]string, 0) | 	p.delimiters = delimiters | ||||||
|  | 	p.mergeDelimiters = mergeDelimiters | ||||||
| 	for _, encloser := range enclosers { | 	for _, encloser := range enclosers { | ||||||
| 		if utf8.RuneCountInString(encloser) != 2 { | 		if utf8.RuneCountInString(encloser) != 2 { | ||||||
| 			return fmt.Errorf("encolser should have to characters") | 			return fmt.Errorf("encolser should have to characters") | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	p.enclosers = enclosers | 	p.enclosers = enclosers | ||||||
| 	p.delimiter = []rune(delimiter)[0] |  | ||||||
| 	for _, pair := range enclosers { | 	p.ignore = ignore | ||||||
| 		if utf8.RuneCountInString(pair) != 2 { |  | ||||||
| 			return fmt.Errorf("encoloser should contain two characters: %s", pair) |  | ||||||
| 		} |  | ||||||
| 	} |  | ||||||
| 	// line format is in the form of: field1 field2 ignore ... | 	// line format is in the form of: field1 field2 ignore ... | ||||||
| 	// if field name is ignore, it is parsed but not retained | 	// if field name is ignore, it is parsed but not retained | ||||||
| 	p.fields = strings.Split(lineFormat, " ") | 	fields := strings.Split(lineFormat, " ") | ||||||
|  |  | ||||||
|  | 	p.fields = make([]string, 0) | ||||||
|  | 	for _, f := range fields { | ||||||
|  | 		if len(f) > 0 { | ||||||
|  | 			p.fields = append(p.fields, f) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 	p.maxFieldIndex = len(p.fields) - 1 | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
|  |  | ||||||
|  | type ParserState struct { | ||||||
|  | 	inField       bool // reading field value | ||||||
|  | 	delimiter     bool // last char was a delimiter | ||||||
|  | 	escape        bool // the previous char was an escape character \\ | ||||||
|  | 	enclosed      bool // the current field is enclosed | ||||||
|  | 	enclosedMode  bool // the current algo is for enclosed string, do not search delimiter | ||||||
|  | 	encloserStart rune // the current enclosed field opener char | ||||||
|  | 	encloserEnd   rune // the current enclosed field closer char | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /* | ||||||
|  | * if not currently reading a field | ||||||
|  |  */ | ||||||
| func (p *CsvParser) Parse(line string) (map[string]string, error) { | func (p *CsvParser) Parse(line string) (map[string]string, error) { | ||||||
| 	currentFieldIndex := 0 | 	currentFieldIndex := 0 | ||||||
| 	valueStart := 0 | 	valueStart := -1 | ||||||
| 	escape := false | 	//valueEnd := -1 | ||||||
|   enclosed := false |  | ||||||
|  | 	state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} | ||||||
|  |  | ||||||
| 	ret := make(map[string]string) | 	ret := make(map[string]string) | ||||||
| 	indexMax := len(line) - 1 |  | ||||||
|   maxFieldIndex := len(p.fields) - 1 | 	maxIndex := len(line) - 1 | ||||||
|   delimiter := p.delimiter |  | ||||||
| 	for index, r := range line { | 	for index, r := range line { | ||||||
| 		if r == '\\' { | 		if r == '\\' { | ||||||
|       // Check if EOL before continue | 			state.escape = true | ||||||
|       escape=true |  | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
|     if escape { | 		// previous rune was \\: ignore the current rune | ||||||
|       escape=false | 		if state.escape { | ||||||
|  | 			state.escape = false | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
|     if r == delimiter { | 		// reading an enclosed field: watch for end of this field (encloserEnd) | ||||||
|       if p.fields[currentFieldIndex] != "ignore" { | 		if state.enclosedMode { | ||||||
|         ret[p.fields[currentFieldIndex]] = line[valueStart : index] | 			// searching for end of encloser | ||||||
|  | 			if r == state.encloserEnd { | ||||||
|  | 				state.enclosedMode = false | ||||||
| 			} | 			} | ||||||
|       currentFieldIndex++ | 			if index < maxIndex { | ||||||
|       if currentFieldIndex > maxFieldIndex { | 				continue | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		isDelimiter := false | ||||||
|  | 		for _, d := range p.delimiters { | ||||||
|  | 			if r == d { | ||||||
|  | 				isDelimiter = true | ||||||
| 				break | 				break | ||||||
| 			} | 			} | ||||||
|       valueStart = index + 1 |  | ||||||
|       if enclosed { |  | ||||||
|         enclosed=false |  | ||||||
|         delimiter = p.delimiter |  | ||||||
|         // Omit next delimiter |  | ||||||
|         escape=true |  | ||||||
|       } |  | ||||||
|       continue |  | ||||||
|     } |  | ||||||
|     if index >= indexMax { |  | ||||||
|       if p.fields[currentFieldIndex] != "ignore" { |  | ||||||
| 			  ret[p.fields[currentFieldIndex]] = line[valueStart:] |  | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|  | 		// previous rune was a delimiter and mergeDelimiters is true, skip | ||||||
|  | 		if isDelimiter && state.delimiter && p.mergeDelimiters { | ||||||
| 			continue | 			continue | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
|     for _, encloser := range p.enclosers { | 		// current rune is a delimiter, a value is present and the next char is the next value | ||||||
|       runes := []rune(encloser) | 		if isDelimiter || index == maxIndex { | ||||||
|  | 			state.delimiter = true | ||||||
|  | 			field := line[valueStart:index] | ||||||
|  | 			if state.enclosed { | ||||||
|  | 				field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd)) | ||||||
|  | 			} | ||||||
|  | 			if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore { | ||||||
|  | 				ret[p.fields[currentFieldIndex]] = field | ||||||
|  | 			} | ||||||
|  | 			currentFieldIndex++ | ||||||
|  | 			state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'} | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		if state.delimiter { | ||||||
|  | 			valueStart = index | ||||||
|  | 			for _, e := range p.enclosers { | ||||||
|  | 				runes := []rune(e) | ||||||
| 				if r == runes[0] { | 				if r == runes[0] { | ||||||
|         // opening encloser | 					state.enclosed = true | ||||||
|         enclosed = true | 					state.enclosedMode = true | ||||||
|         delimiter = runes[1] | 					state.encloserStart = runes[0] | ||||||
|         valueStart++ | 					state.encloserEnd = runes[1] | ||||||
|         break | 				} | ||||||
|  | 				state.delimiter = false | ||||||
| 			} | 			} | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|   | |||||||
| @@ -7,8 +7,9 @@ import ( | |||||||
|  |  | ||||||
| func TestCorrectLines(t *testing.T) { | func TestCorrectLines(t *testing.T) { | ||||||
| 	var csvParser CsvParser | 	var csvParser CsvParser | ||||||
| 	csvParser.Initialize(" ", []string{"\"\"", "[]"}, "firstname lastname complete_name") |  | ||||||
| 	line := "" | 	line := "" | ||||||
|  | 	/* | ||||||
|  | 		csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name") | ||||||
| 		fmt.Println("parsing:", line) | 		fmt.Println("parsing:", line) | ||||||
| 		event, err := csvParser.Parse(line) | 		event, err := csvParser.Parse(line) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| @@ -63,4 +64,30 @@ func TestCorrectLines(t *testing.T) { | |||||||
| 			t.Fatalf("Parsing of empty line failed %v %v", err, event) | 			t.Fatalf("Parsing of empty line failed %v %v", err, event) | ||||||
| 		} | 		} | ||||||
| 		fmt.Println("event is", event) | 		fmt.Println("event is", event) | ||||||
|  | 	*/ | ||||||
|  | 	csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore") | ||||||
|  | 	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\"" | ||||||
|  | 	fmt.Println("parsing:", line) | ||||||
|  | 	event, err := csvParser.Parse(line) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Parsing of empty line failed %v %v", err, event) | ||||||
|  | 	} | ||||||
|  | 	fmt.Println("event is", event) | ||||||
|  |  | ||||||
|  | 	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169" | ||||||
|  | 	fmt.Println("parsing:", line) | ||||||
|  | 	event, err = csvParser.Parse(line) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Parsing of empty line failed %v %v", err, event) | ||||||
|  | 	} | ||||||
|  | 	fmt.Println("event is", event) | ||||||
|  |  | ||||||
|  | 	line = "Aug  7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169" | ||||||
|  | 	fmt.Println("parsing:", line) | ||||||
|  | 	event, err = csvParser.Parse(line) | ||||||
|  | 	if err != nil { | ||||||
|  | 		t.Fatalf("Parsing of empty line failed %v %v", err, event) | ||||||
|  | 	} | ||||||
|  | 	fmt.Println("event is", event) | ||||||
|  |  | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user