refactor(detect): create readUntilSafeBoundary + add tests (#1676) · gitleaks/gitleaks@4c3da6e (original) (raw)
1
1
`package detect
`
2
2
``
3
3
`import (
`
``
4
`+
"bufio"
`
4
5
`"bytes"
`
5
6
`"io"
`
6
7
`"os"
`
`@@ -49,64 +50,32 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin
`
49
50
` }
`
50
51
` }
`
51
52
``
52
``
`-
// Buffer to hold file chunks
`
53
``
`-
buf := make([]byte, chunkSize)
`
54
``
`-
totalLines := 0
`
``
53
`+
var (
`
``
54
`+
// Buffer to hold file chunks
`
``
55
`+
reader = bufio.NewReaderSize(f, chunkSize)
`
``
56
`+
buf = make([]byte, chunkSize)
`
``
57
`+
totalLines = 0
`
``
58
`+
)
`
55
59
`for {
`
56
``
`-
n, err := f.Read(buf)
`
57
``
`-
if n > 0 {
`
58
``
`-
// TODO: optimization could be introduced here
`
59
``
`-
if mimetype, err := filetype.Match(buf[:n]); err != nil {
`
60
``
`-
return nil
`
61
``
`-
} else if mimetype.MIME.Type == "application" {
`
62
``
`-
return nil // skip binary files
`
63
``
`-
}
`
64
``
-
65
``
`-
// If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.
`
66
``
`-
// This hopefully avoids splitting
`
67
``
`-
// See: https://github.com/gitleaks/gitleaks/issues/1651
`
68
``
`-
var (
`
69
``
`-
peekBuf = bytes.NewBuffer(buf[:n])
`
70
``
`-
tempBuf = make([]byte, 1)
`
71
``
`-
newlineCount = 0 // Tracks consecutive newlines
`
72
``
`-
)
`
73
``
`-
for {
`
74
``
`-
data := peekBuf.Bytes()
`
75
``
`-
if len(data) == 0 {
`
76
``
`-
break
`
77
``
`-
}
`
78
``
-
79
``
`-
// Check if the last character is a newline.
`
80
``
`-
lastChar := data[len(data)-1]
`
81
``
`-
if lastChar == '\n' || lastChar == '\r' {
`
82
``
`-
newlineCount++
`
83
``
-
84
``
`-
// Stop if two consecutive newlines are found
`
85
``
`-
if newlineCount >= 2 {
`
86
``
`-
break
`
87
``
`-
}
`
88
``
`-
} else {
`
89
``
`-
newlineCount = 0 // Reset if a non-newline character is found
`
90
``
`-
}
`
91
``
-
92
``
`-
// Stop growing the buffer if it reaches maxSize
`
93
``
`-
if (peekBuf.Len() - n) >= maxPeekSize {
`
94
``
`-
break
`
95
``
`-
}
`
``
60
`+
n, err := reader.Read(buf)
`
96
61
``
97
``
`-
// Read additional data into a temporary buffer
`
98
``
`-
m, readErr := f.Read(tempBuf)
`
99
``
`-
if m > 0 {
`
100
``
`-
peekBuf.Write(tempBuf[:m])
`
``
62
`+
// "Callers should always process the n > 0 bytes returned before considering the error err."
`
``
63
`+
// https://pkg.go.dev/io#Reader
`
``
64
`+
if n > 0 {
`
``
65
`+
// Only check the filetype at the start of file.
`
``
66
`+
if totalLines == 0 {
`
``
67
`+
// TODO: could other optimizations be introduced here?
`
``
68
`+
if mimetype, err := filetype.Match(buf[:n]); err != nil {
`
``
69
`+
return nil
`
``
70
`+
} else if mimetype.MIME.Type == "application" {
`
``
71
`+
return nil // skip binary files
`
101
72
` }
`
``
73
`+
}
`
102
74
``
103
``
`-
// Stop if EOF is reached
`
104
``
`-
if readErr != nil {
`
105
``
`-
if readErr == io.EOF {
`
106
``
`-
break
`
107
``
`-
}
`
108
``
`-
return readErr
`
109
``
`-
}
`
``
75
`+
// Try to split chunks across large areas of whitespace, if possible.
`
``
76
`+
peekBuf := bytes.NewBuffer(buf[:n])
`
``
77
`+
if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {
`
``
78
`+
return readErr
`
110
79
` }
`
111
80
``
112
81
`// Count the number of newlines in this chunk
`
`@@ -145,3 +114,73 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin
`
145
114
``
146
115
`return d.findings, nil
`
147
116
`}
`
``
117
+
``
118
`` +
// readUntilSafeBoundary consumes |f| until it finds two consecutive \n
characters, up to |maxPeekSize|.
``
``
119
`+
// This hopefully avoids splitting. (https://github.com/gitleaks/gitleaks/issues/1651)
`
``
120
`+
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
`
``
121
`+
if peekBuf.Len() == 0 {
`
``
122
`+
return nil
`
``
123
`+
}
`
``
124
+
``
125
`+
// Does the buffer end in consecutive newlines?
`
``
126
`+
var (
`
``
127
`+
data = peekBuf.Bytes()
`
``
128
`+
lastChar = data[len(data)-1]
`
``
129
`+
newlineCount = 0 // Tracks consecutive newlines
`
``
130
`+
)
`
``
131
`+
if isWhitespace(lastChar) {
`
``
132
`+
for i := len(data) - 1; i >= 0; i-- {
`
``
133
`+
lastChar = data[i]
`
``
134
`+
if lastChar == '\n' {
`
``
135
`+
newlineCount++
`
``
136
+
``
137
`+
// Stop if two consecutive newlines are found
`
``
138
`+
if newlineCount >= 2 {
`
``
139
`+
return nil
`
``
140
`+
}
`
``
141
`+
} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
`
``
142
`` +
// The presence of other whitespace characters (\r
,
, \t
) shouldn't reset the count.
``
``
143
`+
// (Intentionally do nothing.)
`
``
144
`+
} else {
`
``
145
`+
break
`
``
146
`+
}
`
``
147
`+
}
`
``
148
`+
}
`
``
149
+
``
150
`+
// If not, read ahead until we (hopefully) find some.
`
``
151
`+
newlineCount = 0
`
``
152
`+
for {
`
``
153
`+
data = peekBuf.Bytes()
`
``
154
`+
// Check if the last character is a newline.
`
``
155
`+
lastChar = data[len(data)-1]
`
``
156
`+
if lastChar == '\n' {
`
``
157
`+
newlineCount++
`
``
158
+
``
159
`+
// Stop if two consecutive newlines are found
`
``
160
`+
if newlineCount >= 2 {
`
``
161
`+
break
`
``
162
`+
}
`
``
163
`+
} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {
`
``
164
`` +
// The presence of other whitespace characters (\r
,
, \t
) shouldn't reset the count.
``
``
165
`+
// (Intentionally do nothing.)
`
``
166
`+
} else {
`
``
167
`+
newlineCount = 0 // Reset if a non-newline character is found
`
``
168
`+
}
`
``
169
+
``
170
`+
// Stop growing the buffer if it reaches maxSize
`
``
171
`+
if (peekBuf.Len() - n) >= maxPeekSize {
`
``
172
`+
break
`
``
173
`+
}
`
``
174
+
``
175
`+
// Read additional data into a temporary buffer
`
``
176
`+
b, err := r.ReadByte()
`
``
177
`+
if err != nil {
`
``
178
`+
if err == io.EOF {
`
``
179
`+
break
`
``
180
`+
}
`
``
181
`+
return err
`
``
182
`+
}
`
``
183
`+
peekBuf.WriteByte(b)
`
``
184
`+
}
`
``
185
`+
return nil
`
``
186
`+
}
`