Compare commits
No commits in common. "master" and "v0.0.2" have entirely different histories.
|
@ -0,0 +1,95 @@
|
||||||
|
package csvparser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"errors"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EncloserId identifies a pair of characters that may surround a field value.
type EncloserId int

// Encloser identifiers understood by the parser. None means "no encloser"
// and deliberately has no entry in EnclosersRunes.
const (
	None EncloserId = iota
	DoubleQuotes
	SingleQuotes
	RoundBrackets
	SquareBrackets
	CurlyBrackets
)

// Encloser holds the opening and closing byte of an enclosed field.
type Encloser struct {
	Open  byte
	Close byte
}

// EnclosersRunes maps each encloser identifier (except None) to its
// opening/closing byte pair.
var EnclosersRunes = map[EncloserId]Encloser{
	DoubleQuotes:   {'"', '"'},
	SingleQuotes:   {'\'', '\''},
	RoundBrackets:  {'(', ')'},
	SquareBrackets: {'[', ']'},
	CurlyBrackets:  {'{', '}'},
}

// ErrEncloserNotClosed is returned when an enclosed field is opened but its
// closing character is never found. The message text is kept unchanged for
// callers that inspect it.
var ErrEncloserNotClosed = errors.New("Encloser close not found")

// CsvParser splits a line into fields separated by delimiter characters,
// where a field may be wrapped in one of the configured enclosers.
type CsvParser struct {
	// Enclosers lists the enclosers recognised at the start of a field.
	Enclosers []EncloserId

	// Delimiter is the set of characters that separate fields; consecutive
	// delimiters count as a single separator.
	Delimiter string

	// Fields holds the values extracted by the most recent Parse call.
	Fields []string

	// Line is the part of the input that has not been consumed yet.
	Line string
}

// Init configures the parser with its defaults: double quotes and square
// brackets as enclosers, space and tab as delimiters.
func (parser *CsvParser) Init() {
	parser.Enclosers = []EncloserId{DoubleQuotes, SquareBrackets}
	parser.Delimiter = " \t"
}

// ExtractEnclosedFieldValue consumes an enclosed field from the start of
// parser.Line, which must already have had its opening character removed.
// Everything up to the first endChar that is not immediately preceded by a
// backslash is appended to parser.Fields (backslashes are kept verbatim), and
// parser.Line is advanced past the closing character.
//
// It returns ErrEncloserNotClosed, leaving the parser untouched, when no
// closing character is found.
func (parser *CsvParser) ExtractEnclosedFieldValue(endChar byte) error {
	// Start at 0, not 1: a closer in first position means the field is empty
	// (e.g. `""` or `[]`) and must not be skipped.
	for i := 0; i < len(parser.Line); i++ {
		if parser.Line[i] != endChar {
			continue
		}
		if i > 0 && parser.Line[i-1] == '\\' {
			continue // escaped closer, part of the value
		}
		parser.Fields = append(parser.Fields, parser.Line[:i])
		parser.Line = parser.Line[i+1:]
		return nil
	}
	return ErrEncloserNotClosed
}

// closerFor reports the closing byte of the configured encloser that opens
// with the given byte, if any.
func (parser *CsvParser) closerFor(open byte) (byte, bool) {
	for _, id := range parser.Enclosers {
		// Skip ids without a table entry (None included): their zero value
		// would otherwise match a NUL byte.
		enc, known := EnclosersRunes[id]
		if known && enc.Open == open {
			return enc.Close, true
		}
	}
	return 0, false
}

// Parse splits CsvLine into fields and stores them in parser.Fields,
// replacing the result of any previous call. Leading, trailing and repeated
// delimiters are ignored. A field starting with the opening character of a
// configured encloser extends to the matching closing character and is stored
// without the enclosing characters.
//
// It returns ErrEncloserNotClosed if an enclosed field is never closed; the
// fields extracted before the error remain in parser.Fields.
func (parser *CsvParser) Parse(CsvLine string) error {
	parser.Fields = make([]string, 0)
	// Consuming fields from the left never changes the right end, so trailing
	// delimiters only need to be removed once.
	parser.Line = strings.TrimRight(CsvLine, parser.Delimiter)

	for {
		parser.Line = strings.TrimLeft(parser.Line, parser.Delimiter)
		if len(parser.Line) == 0 {
			break
		}

		if closer, ok := parser.closerFor(parser.Line[0]); ok {
			parser.Line = parser.Line[1:] // drop the opening character
			if err := parser.ExtractEnclosedFieldValue(closer); err != nil {
				return err
			}
			continue
		}

		next := strings.IndexAny(parser.Line, parser.Delimiter)
		if next == -1 {
			// Last field: it runs to the end of the line.
			parser.Fields = append(parser.Fields, parser.Line)
			parser.Line = ""
			break
		}
		parser.Fields = append(parser.Fields, parser.Line[:next])
		parser.Line = parser.Line[next:]
	}
	return nil
}
|
|
@ -0,0 +1,68 @@
|
||||||
|
package csvparser
|
||||||
|
|
||||||
|
import(
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
func TestParse(t *testing.T) {
|
||||||
|
var parser CsvParser
|
||||||
|
|
||||||
|
var CsvTestValues = [...]string {
|
||||||
|
"field1 field2 field3", // standard CSV
|
||||||
|
" field1 field2 field3 ", // Space or multiple spaces as delimiters
|
||||||
|
" field1 field2 field3 ", // Spaces + tabs as delimiters
|
||||||
|
" \"field1\" field2 field3 ", // Enclosed fields
|
||||||
|
" \"field1\" field2 [field3] ", // Enclosed fields
|
||||||
|
}
|
||||||
|
var CsvExpectedValues = [...]string {
|
||||||
|
"field1",
|
||||||
|
"field2",
|
||||||
|
"field3",
|
||||||
|
}
|
||||||
|
|
||||||
|
parser.Init()
|
||||||
|
for _, v := range CsvTestValues {
|
||||||
|
err := parser.Parse(v)
|
||||||
|
if err != nil {
|
||||||
|
t.Error("Parse error:", err, " in ", v)
|
||||||
|
}
|
||||||
|
if len(parser.Fields) != len(CsvExpectedValues) {
|
||||||
|
t.Error("Extracted field number does not match expected", parser.Fields)
|
||||||
|
}
|
||||||
|
for i,val := range CsvExpectedValues {
|
||||||
|
if parser.Fields[i] != val {
|
||||||
|
t.Error("Field values do not match", i, " expected ", val, " got ", parser.Fields[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
func TestParseWithEscape(t *testing.T) {
|
||||||
|
var parser CsvParser
|
||||||
|
|
||||||
|
var CsvTestValues = [...]string {
|
||||||
|
"\"\\\"field1 and more\" field2 [\\[field3] ", // Enclosed fields
|
||||||
|
}
|
||||||
|
var CsvExpectedValues = [...]string {
|
||||||
|
"\\\"field1 and more",
|
||||||
|
"field2",
|
||||||
|
"\\[field3",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
parser.Init()
|
||||||
|
for _, v := range CsvTestValues {
|
||||||
|
err := parser.Parse(v)
|
||||||
|
if err != nil {
|
||||||
|
t.Error("Parse error:", err, " in ", v)
|
||||||
|
}
|
||||||
|
if len(parser.Fields) != len(CsvExpectedValues) {
|
||||||
|
t.Error("Extracted field number does not match expected", parser.Fields)
|
||||||
|
}
|
||||||
|
for i,val := range CsvExpectedValues {
|
||||||
|
if parser.Fields[i] != val {
|
||||||
|
t.Error("Field values do not match", i, " expected ", val, " got ", parser.Fields[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
143
csvparser.go
143
csvparser.go
|
@ -1,143 +0,0 @@
|
||||||
package csvparser
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"strings"
|
|
||||||
"unicode/utf8"
|
|
||||||
)
|
|
||||||
|
|
||||||
// CsvParser splits a delimited line into named fields, following the line
// format given to Initialize.
type CsvParser struct {
	// enclosers lists the accepted enclosure pairs, each a two-rune string:
	// the opening rune followed by the closing rune ("\"\"", "[]", "{}"...).
	enclosers []string

	// delimiters is the set of runes that separate fields.
	delimiters string

	// mergeDelimiters makes a run of successive delimiters count as one.
	mergeDelimiters bool

	// fields holds the field names of the line format, in order.
	fields []string

	// ignore is the field name marking a column that is parsed but not kept.
	ignore string

	// maxFieldIndex is the index of the last entry of fields.
	maxFieldIndex int
}

// FieldCount returns the number of columns described by the line format,
// ignored columns included. It is 0 before Initialize has succeeded.
func (p *CsvParser) FieldCount() int {
	return len(p.fields)
}

// Initialize configures the parser.
//
//   - delimiters: string containing every delimiter character (at least one)
//   - mergeDelimiters: if true, successive delimiters are considered as one
//   - enclosers: each string is an encloser pair of exactly two characters:
//     "\"\"", "[]", "{}"...
//   - lineFormat: space-separated field names, one per column, in the form
//     "field1 field2 ignore ..."
//   - ignore: the field name marking columns that are parsed but not retained
//
// It returns an error, leaving the parser unchanged, when delimiters is empty
// or an encloser does not have exactly two characters.
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
	// Validate everything first so that a failed call does not leave the
	// parser half-configured.
	if utf8.RuneCountInString(delimiters) == 0 {
		return fmt.Errorf("delimiter should be at least one character")
	}
	for _, encloser := range enclosers {
		if utf8.RuneCountInString(encloser) != 2 {
			return fmt.Errorf("encloser %q should have two characters", encloser)
		}
	}

	// Repeated spaces in the line format produce empty names; drop them.
	parts := strings.Split(lineFormat, " ")
	names := make([]string, 0, len(parts))
	for _, name := range parts {
		if len(name) > 0 {
			names = append(names, name)
		}
	}

	p.delimiters = delimiters
	p.mergeDelimiters = mergeDelimiters
	p.enclosers = enclosers
	p.ignore = ignore
	p.fields = names
	p.maxFieldIndex = len(names) - 1
	return nil
}

// ParserState is the scanner state while a single field is being read.
type ParserState struct {
	// inField is true while a field value is being read.
	inField bool

	// delimiter is true when the last rune was a delimiter.
	delimiter bool

	// escape is true when the previous rune was a backslash.
	escape bool

	// enclosed is true when the current field started with an encloser.
	enclosed bool

	// enclosedMode is true while inside the enclosure: delimiters are not
	// searched for until the closing rune has been seen.
	enclosedMode bool

	// encloserStart is the opening rune of the current enclosed field.
	encloserStart rune

	// encloserEnd is the closing rune of the current enclosed field.
	encloserEnd rune
}

// Parse splits line into fields and returns the values of the columns named
// in the line format, keyed by field name. Columns whose name equals the
// ignore marker, and columns beyond the line format, are skipped. Columns
// missing from the line have no entry in the result.
//
// A backslash makes the following rune lose any special meaning (delimiter or
// closing encloser); backslashes are kept in the returned values. A field
// starting with an opening encloser runs to the matching closing rune and is
// returned without the enclosing runes; an enclosure still open at the end of
// the line extends to the end of the line.
//
// The returned error is always nil; it is kept for interface stability.
func (p *CsvParser) Parse(line string) (map[string]string, error) {
	ret := make(map[string]string)

	pos := 0
	// Stop as soon as every described column has been read: later columns
	// are never stored.
	for index := 0; pos < len(line) && index < len(p.fields); index++ {
		if p.mergeDelimiters {
			pos = p.skipDelimiters(line, pos)
			if pos >= len(line) {
				break
			}
		}
		value, next := p.scanField(line, pos)
		if name := p.fields[index]; name != p.ignore {
			ret[name] = value
		}
		pos = next
	}
	return ret, nil
}

// skipDelimiters returns the offset of the first rune at or after pos that is
// not a delimiter.
func (p *CsvParser) skipDelimiters(line string, pos int) int {
	for pos < len(line) {
		r, size := utf8.DecodeRuneInString(line[pos:])
		if !strings.ContainsRune(p.delimiters, r) {
			break
		}
		pos += size
	}
	return pos
}

// scanField reads the field starting at byte offset start. It returns the
// field value, stripped of its enclosers if any, and the offset at which the
// next field starts (just past the terminating delimiter, or len(line)).
func (p *CsvParser) scanField(line string, start int) (string, int) {
	state := ParserState{inField: true, encloserStart: '?', encloserEnd: '?'}

	first, firstSize := utf8.DecodeRuneInString(line[start:])
	for _, e := range p.enclosers {
		pair := []rune(e)
		if len(pair) == 2 && first == pair[0] {
			state.enclosed = true
			state.enclosedMode = true
			state.encloserStart = pair[0]
			state.encloserEnd = pair[1]
			break
		}
	}

	i := start
	if state.enclosed {
		// Step over the opener so that it is not mistaken for the closer
		// when both are the same rune (quotes).
		i += firstSize
	}

	end, next := len(line), len(line)
	closeStart, closeEnd := -1, -1
scan:
	for i < len(line) {
		r, size := utf8.DecodeRuneInString(line[i:])
		switch {
		case r == '\\':
			state.escape = true
		case state.escape:
			state.escape = false // escaped rune: no special meaning
		case state.enclosedMode:
			if r == state.encloserEnd {
				state.enclosedMode = false
				closeStart, closeEnd = i, i+size
			}
		case strings.ContainsRune(p.delimiters, r):
			end, next = i, i+size
			break scan
		}
		i += size
	}

	if !state.enclosed {
		return line[start:end], next
	}
	// Strip exactly one opener, and the closer only when it really ends the
	// field (text may follow it, or the enclosure may never be closed).
	if closeEnd == end {
		end = closeStart
	}
	return line[start+firstSize : end], next
}
|
|
|
@ -1,123 +0,0 @@
|
||||||
package csvparser
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
func TestCorrectLines(t *testing.T) {
|
|
||||||
var csvParser CsvParser
|
|
||||||
line := ""
|
|
||||||
/*
|
|
||||||
csvParser.Initialize(" ", false, []string{"\"\"", "[]"}, "firstname lastname complete_name")
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err := csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John Doe"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John \"John Doe\" Doe"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John Doe \"John Doe\""
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John Doe \"John Doe\"\\\\"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
line = "John Doe I don't know him"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
*/
|
|
||||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
|
||||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169\""
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err := csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
/*
|
|
||||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
|
||||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" 15169"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
csvParser.Initialize(" \t", true, []string{"\"\"", "[]"}, "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent", "ignore")
|
|
||||||
line = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" \"15169"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
|
||||||
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
*/
|
|
||||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
|
||||||
line = "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
csvParser.Initialize(" ", true, []string{"\"\""}, "verb uri http-version", "ignore")
|
|
||||||
line = "G"
|
|
||||||
fmt.Println("parsing:", line)
|
|
||||||
event, err = csvParser.Parse(line)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("Parsing of empty line failed %v %v", err, event)
|
|
||||||
}
|
|
||||||
fmt.Println("event is", event)
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in New Issue