Compare commits
10 Commits
Author | SHA1 | Date |
---|---|---|
|
8c241fff65 | |
|
62296e1da6 | |
|
7bf103427b | |
|
9301f353ba | |
|
c363b760d9 | |
|
1ea2bab9e8 | |
|
367bd16e42 | |
|
f92b89e01e | |
|
9f83b3ce1f | |
|
3e4b170099 |
172
csvparser.go
172
csvparser.go
|
@ -7,85 +7,137 @@ import (
|
|||
)
|
||||
|
||||
type CsvParser struct {
|
||||
enclosers []string
|
||||
delimiter rune
|
||||
fields []string
|
||||
enclosers []string
|
||||
delimiters string
|
||||
mergeDelimiters bool
|
||||
fields []string
|
||||
ignore string
|
||||
maxFieldIndex int
|
||||
}
|
||||
func (p *csvParser) FieldCount() int {
|
||||
return maxFieldIndex + 1
|
||||
}
|
||||
/*
|
||||
* delimiters: string with all delimiter chars
|
||||
* mergeDelimiters: if true, all successive delimiters are considered as one
|
||||
* enclosers: array of strings, each string contains an encloser pair: "", [], {}...
|
||||
* lineFormat: format of the line (each fieldname or ignore)
|
||||
*/
|
||||
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
|
||||
|
||||
func (p *CsvParser) Initialize(delimiter string, enclosers []string, lineFormat string) error {
|
||||
if utf8.RuneCountInString(delimiter) != 1 {
|
||||
return fmt.Errorf("delimiter shoud be one character")
|
||||
if utf8.RuneCountInString(delimiters) == 0 {
|
||||
return fmt.Errorf("delimiter shoud be at least one character")
|
||||
}
|
||||
p.enclosers = make([]string, 0)
|
||||
p.delimiters = delimiters
|
||||
p.mergeDelimiters = mergeDelimiters
|
||||
for _, encloser := range enclosers {
|
||||
if utf8.RuneCountInString(encloser) != 2 {
|
||||
return fmt.Errorf("encolser should have to characters")
|
||||
}
|
||||
}
|
||||
p.enclosers = enclosers
|
||||
p.delimiter = []rune(delimiter)[0]
|
||||
for _, pair := range enclosers {
|
||||
if utf8.RuneCountInString(pair) != 2 {
|
||||
return fmt.Errorf("encoloser should contain two characters: %s", pair)
|
||||
}
|
||||
}
|
||||
|
||||
p.ignore = ignore
|
||||
// line format is in the form of: field1 field2 ignore ...
|
||||
// if field name is ignore, it is parsed but not retained
|
||||
p.fields = strings.Split(lineFormat, " ")
|
||||
fields := strings.Split(lineFormat, " ")
|
||||
|
||||
p.fields = make([]string, 0)
|
||||
for _, f := range fields {
|
||||
if len(f) > 0 {
|
||||
p.fields = append(p.fields, f)
|
||||
}
|
||||
}
|
||||
p.maxFieldIndex = len(p.fields) - 1
|
||||
return nil
|
||||
}
|
||||
|
||||
type ParserState struct {
|
||||
inField bool // reading field value
|
||||
delimiter bool // last char was a delimiter
|
||||
escape bool // the previous char was an escape character \\
|
||||
enclosed bool // the current field is enclosed
|
||||
enclosedMode bool // the current algo is for enclosed string, do not search delimiter
|
||||
encloserStart rune // the current enclosed field opener char
|
||||
encloserEnd rune // the current enclosed field closer char
|
||||
}
|
||||
|
||||
/*
|
||||
* if not currently reading a field
|
||||
*/
|
||||
func (p *CsvParser) Parse(line string) (map[string]string, error) {
|
||||
currentFieldIndex := 0
|
||||
valueStart := 0
|
||||
escape := false
|
||||
enclosed := false
|
||||
valueStart := -1
|
||||
|
||||
state := ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
|
||||
|
||||
ret := make(map[string]string)
|
||||
indexMax := len(line) - 1
|
||||
maxFieldIndex := len(p.fields) - 1
|
||||
delimiter := p.delimiter
|
||||
for index, r := range line {
|
||||
if r == '\\' {
|
||||
// Check if EOL before continue
|
||||
escape=true
|
||||
continue
|
||||
}
|
||||
if escape {
|
||||
escape=false
|
||||
continue
|
||||
}
|
||||
if r == delimiter {
|
||||
ret[p.fields[currentFieldIndex]] = line[valueStart : index]
|
||||
currentFieldIndex++
|
||||
if currentFieldIndex > maxFieldIndex {
|
||||
break
|
||||
}
|
||||
valueStart = index + 1
|
||||
if enclosed {
|
||||
enclosed=false
|
||||
delimiter = p.delimiter
|
||||
// Omit next delimiter
|
||||
escape=true
|
||||
}
|
||||
continue
|
||||
}
|
||||
if index >= indexMax {
|
||||
ret[p.fields[currentFieldIndex]] = line[valueStart:]
|
||||
continue
|
||||
}
|
||||
|
||||
for _, encloser := range p.enclosers {
|
||||
runes := []rune(encloser)
|
||||
if r == runes[0] {
|
||||
// opening encloser
|
||||
enclosed = true
|
||||
delimiter = runes[1]
|
||||
valueStart++
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
maxIndex := len(line) - 1
|
||||
for index, r := range line {
|
||||
if r == '\\' {
|
||||
state.escape = true
|
||||
continue
|
||||
}
|
||||
// previous rune was \\: ignore the current rune
|
||||
if state.escape {
|
||||
state.escape = false
|
||||
continue
|
||||
}
|
||||
// reading an enclosed field: watch for the end of this field (encloserEnd)
|
||||
if state.enclosedMode {
|
||||
// searching for end of encloser
|
||||
if r == state.encloserEnd {
|
||||
state.enclosedMode = false
|
||||
}
|
||||
if index < maxIndex {
|
||||
continue
|
||||
}
|
||||
}
|
||||
isDelimiter := false
|
||||
for _, d := range p.delimiters {
|
||||
if r == d {
|
||||
isDelimiter = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// previous rune was a delimiter and mergeDelimiters is true, skip
|
||||
if isDelimiter && state.delimiter && p.mergeDelimiters {
|
||||
continue
|
||||
}
|
||||
|
||||
// current rune is a delimiter, a value is present and the next char is the next value
|
||||
if isDelimiter || index == maxIndex {
|
||||
if index == maxIndex && valueStart == -1 {
|
||||
valueStart = 0
|
||||
}
|
||||
state.delimiter = true
|
||||
field := line[valueStart : index+1]
|
||||
field = strings.TrimRight(field, p.delimiters)
|
||||
if state.enclosed {
|
||||
field = strings.TrimRight(strings.TrimLeft(field, string(state.encloserStart)), string(state.encloserEnd))
|
||||
}
|
||||
if currentFieldIndex <= p.maxFieldIndex && p.fields[currentFieldIndex] != p.ignore {
|
||||
ret[p.fields[currentFieldIndex]] = field
|
||||
}
|
||||
currentFieldIndex++
|
||||
state = ParserState{inField: false, delimiter: true, escape: false, enclosed: false, enclosedMode: false, encloserStart: '?', encloserEnd: '?'}
|
||||
continue
|
||||
}
|
||||
if state.delimiter {
|
||||
valueStart = index
|
||||
for _, e := range p.enclosers {
|
||||
runes := []rune(e)
|
||||
if r == runes[0] {
|
||||
state.enclosed = true
|
||||
state.enclosedMode = true
|
||||
state.encloserStart = runes[0]
|
||||
state.encloserEnd = runes[1]
|
||||
}
|
||||
state.delimiter = false
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
|
|
@ -7,16 +7,75 @@ import (
|
|||
|
||||
func TestCorrectLines(t *testing.T) {
|
||||
var csvParser CsvParser
|
||||
csvParser.Initialize(" ", []string{"\"\"", "[]"}, "firstname lastname complete_name")
|
||||
line := ""
|
||||
/*
|
||||
csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name")
|
||||
fmt.Println("parsing:", line)
|
||||
event, err := csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John \"John Doe\" Doe"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe \"John Doe\""
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe \"John Doe\"\\\\"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe I don't know him"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
*/
|
||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\""
|
||||
fmt.Println("parsing:", line)
|
||||
event, err := csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John"
|
||||
/*
|
||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
|
@ -24,7 +83,8 @@ func TestCorrectLines(t *testing.T) {
|
|||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe"
|
||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
|
@ -32,7 +92,17 @@ func TestCorrectLines(t *testing.T) {
|
|||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John \"John Doe\" Doe"
|
||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
||||
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
*/
|
||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
||||
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
|
@ -40,7 +110,8 @@ func TestCorrectLines(t *testing.T) {
|
|||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe \"John Doe\""
|
||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
||||
line = "G"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
|
@ -48,19 +119,5 @@ func TestCorrectLines(t *testing.T) {
|
|||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe \"John Doe\"\\\\"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
|
||||
line = "John Doe I don't know him"
|
||||
fmt.Println("parsing:", line)
|
||||
event, err = csvParser.Parse(line)
|
||||
if err != nil {
|
||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
||||
}
|
||||
fmt.Println("event is", event)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue