Compare commits

..

20 Commits

Author SHA1 Message Date
Laurentu 8c241fff65 zz 2025-05-14 15:56:45 +02:00
Laurent Ulrich 62296e1da6 simple tabulation 2024-11-15 15:35:13 +01:00
laurentu 7bf103427b suppression log 2024-08-08 16:04:56 +02:00
laurentu 9301f353ba semble ok 2024-08-08 16:03:06 +02:00
laurentu c363b760d9 bug sur les fin de lignes non enclosed (manque le dernier char) 2024-08-08 15:23:56 +02:00
laurentu 1ea2bab9e8 petit bug sur les lignes de 1 caractère 2024-08-08 15:01:39 +02:00
laurentu 367bd16e42 Algo ok, mais peut-être à optimiser ? 2024-08-08 11:02:10 +02:00
Laurentu f92b89e01e progression 2024-08-07 20:36:25 +02:00
laurentu 9f83b3ce1f Penser à gérer le case de délimiteurs multiples qui s'enchaînent et qui ne doivent être pris en compte que pour 1 seul 2024-08-07 16:19:17 +02:00
laurentu 3e4b170099 ne renvoie plus les ignore 2024-08-07 15:51:27 +02:00
laurentu 614ec535aa Ré-écriture de l'algo 2024-08-07 15:46:14 +02:00
laurentu 5f6ef91d86 Ignorer le champs ignore 2024-08-07 13:46:53 +02:00
Laurent Ulrich d33fbd13f9 Ok ça avance 2024-07-12 15:59:03 +02:00
Laurent Ulrich b48f2dc7fe Ne fonctionne pas des masses 2024-07-11 16:03:03 +02:00
Laurent Ulrich 892e9a62ec Problème de reconnaissance des champs ? 2024-07-08 14:47:34 +02:00
Laurent Ulrich f9b8ec213a zz 2024-07-07 10:32:06 +02:00
Laurent Ulrich f5c589127a zz 2024-07-06 21:03:35 +02:00
laurentu 7d521b3d11 fusion 2022-05-04 12:56:51 +02:00
laurentu 4806216045 gofmt 2022-05-04 12:52:06 +02:00
Laurent ULRICH 8e7dae3638 Renamed Parser 2022-03-28 07:04:45 +00:00
5 changed files with 269 additions and 165 deletions

View File

@ -1,95 +0,0 @@
package csvparser
import (
"strings"
"errors"
)
// EncloserId identifies a pair of characters that may surround a field value.
type EncloserId int

// Known enclosers. None means that a field is not enclosed.
const (
	None EncloserId = iota
	DoubleQuotes
	SingleQuotes
	RoundBrackets
	SquareBrackets
	CurlyBrackets
)

// Encloser holds the opening and the closing byte of an enclosing pair.
type Encloser struct {
	Open  byte
	Close byte
}

// EnclosersRunes maps every encloser identifier except None to its opening
// and closing bytes.
var EnclosersRunes = map[EncloserId]Encloser{
	DoubleQuotes:   {'"', '"'},
	SingleQuotes:   {'\'', '\''},
	RoundBrackets:  {'(', ')'},
	SquareBrackets: {'[', ']'},
	CurlyBrackets:  {'{', '}'},
}

// CsvParser splits a line into fields separated by delimiter characters.
// Fields may be wrapped in one of the configured enclosers, in which case
// delimiters inside the enclosed value are kept.
type CsvParser struct {
	Enclosers []EncloserId // enclosers recognised at the start of a field
	Delimiter string       // every byte of this string is a delimiter
	Fields    []string     // result of the last call to Parse
	Line      string       // part of the line that is still to be parsed
}

// Init configures the parser with its defaults: double quotes and square
// brackets as enclosers, space and tabulation as delimiters.
func (parser *CsvParser) Init() {
	parser.Enclosers = []EncloserId{DoubleQuotes, SquareBrackets}
	parser.Delimiter = " \t"
}

// ExtractEnclosedFieldValue reads an enclosed value from the start of
// parser.Line, which must already have been stripped of the opening byte.
//
// The value runs up to the first unescaped endChar. A backslash escapes the
// byte that follows it; escape sequences are copied to the value unchanged.
// On success the value (possibly empty) is appended to parser.Fields and
// parser.Line is advanced past the closing byte. An error is returned when no
// closing byte is found.
func (parser *CsvParser) ExtractEnclosedFieldValue(endChar byte) error {
	escaped := false
	// Scanning starts at 0 so that an empty enclosed value ("") is accepted.
	for i := 0; i < len(parser.Line); i++ {
		c := parser.Line[i]
		switch {
		case escaped:
			// This byte is protected by the backslash before it.
			escaped = false
		case c == endChar:
			parser.Fields = append(parser.Fields, parser.Line[:i])
			parser.Line = parser.Line[i+1:]
			return nil
		case c == '\\':
			escaped = true
		}
	}
	return errors.New("Encloser close not found")
}

// Parse splits CsvLine into fields and stores them in parser.Fields,
// replacing the result of any previous call. Consecutive delimiters, as well
// as leading and trailing ones, are treated as a single separator.
//
// It returns an error when an enclosed field is not closed; parser.Fields
// then holds the fields extracted before the faulty one.
func (parser *CsvParser) Parse(CsvLine string) error {
	parser.Fields = make([]string, 0)
	parser.Line = CsvLine
	for {
		parser.Line = strings.Trim(parser.Line, parser.Delimiter)
		if len(parser.Line) == 0 {
			break
		}

		// Does the field start with one of the configured enclosers?
		encloserId := None
		for _, id := range parser.Enclosers {
			if parser.Line[0] == EnclosersRunes[id].Open {
				encloserId = id
				break
			}
		}

		if encloserId != None {
			parser.Line = parser.Line[1:]
			if err := parser.ExtractEnclosedFieldValue(EnclosersRunes[encloserId].Close); err != nil {
				return err
			}
			continue
		}

		// Plain field: it runs up to the next delimiter or the end of line.
		next := strings.IndexAny(parser.Line, parser.Delimiter)
		if next == -1 {
			parser.Fields = append(parser.Fields, parser.Line)
			parser.Line = ""
			break
		}
		parser.Fields = append(parser.Fields, parser.Line[:next])
		parser.Line = parser.Line[next:]
	}
	return nil
}

View File

@ -1,68 +0,0 @@
package csvparser
import(
"testing"
)
// TestParse checks that lines holding the same three fields, written with
// different spacing and enclosers, are all split into field1, field2, field3.
func TestParse(t *testing.T) {
	var parser CsvParser
	var CsvTestValues = [...]string{
		"field1 field2 field3",         // standard CSV
		" field1 field2 field3 ",       // Space or multiple spaces as delimiters
		" field1 field2 field3 ",       // Spaces + tabs as delimiters
		" \"field1\" field2 field3 ",   // Enclosed fields
		" \"field1\" field2 [field3] ", // Enclosed fields
	}
	var CsvExpectedValues = [...]string{
		"field1",
		"field2",
		"field3",
	}
	parser.Init()
	for _, v := range CsvTestValues {
		if err := parser.Parse(v); err != nil {
			t.Error("Parse error:", err, " in ", v)
			continue
		}
		if len(parser.Fields) != len(CsvExpectedValues) {
			t.Error("Extracted field number does not match expected", parser.Fields)
			// Comparing values would index past the end of parser.Fields.
			continue
		}
		for i, val := range CsvExpectedValues {
			if parser.Fields[i] != val {
				t.Error("Field values do not match", i, " expected ", val, " got ", parser.Fields[i])
			}
		}
	}
}
// TestParseWithEscape checks that an escaped encloser inside an enclosed
// field does not close the field, and that the escape sequence is kept
// unchanged in the extracted value.
func TestParseWithEscape(t *testing.T) {
	var parser CsvParser
	var CsvTestValues = [...]string{
		"\"\\\"field1 and more\" field2 [\\[field3] ", // Enclosed fields
	}
	var CsvExpectedValues = [...]string{
		"\\\"field1 and more",
		"field2",
		"\\[field3",
	}
	parser.Init()
	for _, v := range CsvTestValues {
		if err := parser.Parse(v); err != nil {
			t.Error("Parse error:", err, " in ", v)
			continue
		}
		if len(parser.Fields) != len(CsvExpectedValues) {
			t.Error("Extracted field number does not match expected", parser.Fields)
			// Comparing values would index past the end of parser.Fields.
			continue
		}
		for i, val := range CsvExpectedValues {
			if parser.Fields[i] != val {
				t.Error("Field values do not match", i, " expected ", val, " got ", parser.Fields[i])
			}
		}
	}
}

143
csvparser.go Normal file
View File

@ -0,0 +1,143 @@
package csvparser
import (
"fmt"
"strings"
"unicode/utf8"
)
// CsvParser splits a delimited text line into named fields.
//
// A parser must be configured with Initialize before Parse is called.
type CsvParser struct {
	enclosers       []string // two-rune strings: opening rune followed by closing rune
	delimiters      string   // each rune of this string separates fields
	mergeDelimiters bool     // when true, consecutive delimiters count as one
	fields          []string // field names, in the order the values appear on a line
	ignore          string   // field name whose values are parsed but not returned
	maxFieldIndex   int      // index of the last entry of fields
}

// FieldCount returns the number of field names declared in the line format
// given to Initialize (ignored fields included).
func (p *CsvParser) FieldCount() int {
	return len(p.fields)
}

// Initialize configures the parser.
//
//   - delimiters: string holding every delimiter character (at least one)
//   - mergeDelimiters: if true, successive delimiters are considered as one
//   - enclosers: each string holds an encloser pair: "\"\"", "[]", "{}"...
//   - lineFormat: space separated field names, in line order
//   - ignore: field name marking values that are parsed but not returned
//
// An error is returned, and the parser is left unchanged, when delimiters is
// empty or when an encloser is not made of exactly two characters.
func (p *CsvParser) Initialize(delimiters string, mergeDelimiters bool, enclosers []string, lineFormat string, ignore string) error {
	// Validate everything first so that a failed call leaves the parser untouched.
	if len(delimiters) == 0 {
		return fmt.Errorf("delimiter should be at least one character")
	}
	for _, encloser := range enclosers {
		if utf8.RuneCountInString(encloser) != 2 {
			return fmt.Errorf("encloser %q should have two characters", encloser)
		}
	}

	// The line format has the form: field1 field2 ignore ...
	// Empty names produced by repeated spaces are dropped.
	names := make([]string, 0)
	for _, name := range strings.Split(lineFormat, " ") {
		if len(name) > 0 {
			names = append(names, name)
		}
	}

	p.delimiters = delimiters
	p.mergeDelimiters = mergeDelimiters
	// Copy so that later changes to the caller's slice do not affect the parser.
	p.enclosers = append([]string(nil), enclosers...)
	p.ignore = ignore
	p.fields = names
	p.maxFieldIndex = len(names) - 1
	return nil
}

// ParserState is the state of the field scanner used by Parse.
type ParserState struct {
	inField       bool // a field value is being read
	delimiter     bool // no field is in progress (start of line or after a delimiter)
	escape        bool // the previous rune was an unescaped backslash
	enclosed      bool // the current field started with an encloser
	enclosedMode  bool // inside the enclosers: delimiters are part of the value
	encloserStart rune // opening rune of the current enclosed field
	encloserEnd   rune // closing rune of the current enclosed field
}

// Parse splits line into fields and returns the values keyed by the field
// names given to Initialize. Values of fields named like the ignore marker,
// and values found past the last declared field, are not returned.
//
// Rules:
//   - a backslash makes the next rune an ordinary character; escape
//     sequences are kept unchanged in the value;
//   - a field starting with an encloser's opening rune runs at least up to
//     the matching closing rune, delimiters in between belong to the value,
//     and the enclosing runes are removed from the value;
//   - with mergeDelimiters, runs of delimiters (leading and trailing ones
//     included) never produce empty fields; without it, a delimiter met while
//     no field is in progress produces an empty value.
//
// The returned error is always nil.
func (p *CsvParser) Parse(line string) (map[string]string, error) {
	ret := make(map[string]string)
	fieldIndex := 0

	// All positions are byte offsets in line.
	valueStart := 0   // first rune of the field being read
	contentStart := 0 // first rune after the opening encloser
	closerIndex := -1 // closing encloser, -1 while it has not been seen
	closerEnd := -1   // first byte after the closing encloser
	state := ParserState{delimiter: true, encloserStart: '?', encloserEnd: '?'}

	// store records line[valueStart:end] as the value of the current field
	// and resets the scanner for the next field.
	store := func(end int) {
		value := line[valueStart:end]
		if state.enclosed {
			if closerIndex >= 0 && closerEnd == end {
				// Regular case: the closing rune ends the field.
				value = line[contentStart:closerIndex]
			} else {
				// Unterminated encloser, or text after the closing rune:
				// only the opening rune is removed.
				value = line[contentStart:end]
			}
		}
		if fieldIndex < len(p.fields) && p.fields[fieldIndex] != p.ignore {
			ret[p.fields[fieldIndex]] = value
		}
		fieldIndex++
		closerIndex, closerEnd = -1, -1
		state = ParserState{delimiter: true, encloserStart: '?', encloserEnd: '?'}
	}

	for index, r := range line {
		// The previous rune was a backslash: this rune is plain content.
		if state.escape {
			state.escape = false
			continue
		}
		if r == '\\' {
			state.escape = true
			if !state.inField {
				// An escape sequence can be the start of a field.
				state.inField = true
				state.delimiter = false
				valueStart = index
			}
			continue
		}

		// Inside enclosers only the closing rune matters.
		if state.enclosedMode {
			if r == state.encloserEnd {
				_, size := utf8.DecodeRuneInString(line[index:])
				state.enclosedMode = false
				closerIndex = index
				closerEnd = index + size
			}
			continue
		}

		isDelimiter := strings.ContainsRune(p.delimiters, r)
		switch {
		case state.inField && isDelimiter:
			store(index)
		case state.inField:
			// Ordinary rune of the current field.
		case isDelimiter:
			if !p.mergeDelimiters {
				// Two delimiters in a row (or a leading one): empty field.
				valueStart = index
				store(index)
			}
		default:
			// First rune of a new field.
			state.inField = true
			state.delimiter = false
			valueStart = index
			// When several enclosers share an opening rune, the last one wins.
			for _, e := range p.enclosers {
				runes := []rune(e)
				if len(runes) == 2 && r == runes[0] {
					_, size := utf8.DecodeRuneInString(line[index:])
					state.enclosed = true
					state.enclosedMode = true
					state.encloserStart = runes[0]
					state.encloserEnd = runes[1]
					contentStart = index + size
				}
			}
		}
	}

	// The last field is ended by the end of the line.
	if state.inField {
		store(len(line))
	}
	return ret, nil
}

123
csvparser_test.go Normal file
View File

@ -0,0 +1,123 @@
package csvparser
import (
"fmt"
"testing"
)
// TestCorrectLines parses well-formed lines and checks the extracted values:
// an haproxy access log line (with a quoted, an unquoted and an unterminated
// last field), an HTTP request line and a line made of a single character.
func TestCorrectLines(t *testing.T) {
	const haproxyFormat = "ignore ignore ignore proxy ignore domain clientip ignore ignore apache-date request status bytes duration referer user-agent"
	// The trailing fields (after "GB") are beyond the declared format and
	// must never show up in the result, whatever their shape.
	const haproxyPrefix = "Aug 7 00:00:00 proxy-4 haproxy[17429]: www.yvelines.gouv.fr 66.249.64.10 - - [06/Aug/2024:23:59:59 +0200] \"GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1\" 301 1414 240 \"\" \"Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)\" \"-\" \"GB\" "
	haproxyWant := map[string]string{
		"proxy":       "proxy-4",
		"domain":      "www.yvelines.gouv.fr",
		"clientip":    "66.249.64.10",
		"apache-date": "06/Aug/2024:23:59:59 +0200",
		"request":     "GET /content/download/19274/117923/file/SE_EAU_20190325_LesJardines_78201900027_LetNotifCompletude+recepisse.pdf HTTP/1.1",
		"status":      "301",
		"bytes":       "1414",
		"duration":    "240",
		"referer":     "",
		"user-agent":  "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.6478.182 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
	}

	cases := []struct {
		name       string
		delimiters string
		enclosers  []string
		format     string
		line       string
		want       map[string]string
	}{
		{
			name:       "haproxy, quoted last field",
			delimiters: " \t",
			enclosers:  []string{"\"\"", "[]"},
			format:     haproxyFormat,
			line:       haproxyPrefix + "\"15169\"",
			want:       haproxyWant,
		},
		{
			name:       "haproxy, unquoted last field",
			delimiters: " \t",
			enclosers:  []string{"\"\"", "[]"},
			format:     haproxyFormat,
			line:       haproxyPrefix + "15169",
			want:       haproxyWant,
		},
		{
			name:       "haproxy, unterminated last field",
			delimiters: " \t",
			enclosers:  []string{"\"\"", "[]"},
			format:     haproxyFormat,
			line:       haproxyPrefix + "\"15169",
			want:       haproxyWant,
		},
		{
			name:       "http request line",
			delimiters: " ",
			enclosers:  []string{"\"\""},
			format:     "verb uri http-version",
			line:       "GET /Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575 HTTP/1.1",
			want: map[string]string{
				"verb":         "GET",
				"uri":          "/Actions-de-l-Etat/Vos-aides/Particuliers/Pass-culture?_escaped_fragment_=/particuliers/page/R65575",
				"http-version": "HTTP/1.1",
			},
		},
		{
			name:       "single character line",
			delimiters: " ",
			enclosers:  []string{"\"\""},
			format:     "verb uri http-version",
			line:       "G",
			want:       map[string]string{"verb": "G"},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			var csvParser CsvParser
			if err := csvParser.Initialize(tc.delimiters, true, tc.enclosers, tc.format, "ignore"); err != nil {
				t.Fatalf("Initialize failed: %v", err)
			}
			fmt.Println("parsing:", tc.line)
			event, err := csvParser.Parse(tc.line)
			if err != nil {
				t.Fatalf("Parsing of %q failed %v %v", tc.line, err, event)
			}
			fmt.Println("event is", event)

			if len(event) != len(tc.want) {
				t.Errorf("got %d fields %q, want %d fields %q", len(event), event, len(tc.want), tc.want)
			}
			for name, want := range tc.want {
				got, ok := event[name]
				if !ok {
					t.Errorf("field %q is missing", name)
					continue
				}
				if got != want {
					t.Errorf("field %q: got %q, want %q", name, got, want)
				}
			}
		})
	}
}

5
go.mod
View File

@ -1,3 +1,4 @@
module git.orange.fruit.ovh/laurentu/csv-parser
module git.passke.org/laurentu/csv-parser
go 1.17
toolchain go1.22.1
go 1.22