refactor(detect): create readUntilSafeBoundary + add tests (#1676) · gitleaks/gitleaks@4c3da6e (original) (raw)

1

`package detect

2

3

`import (

4

"bufio"

4

5

`"bytes"

5

6

`"io"

6

7

`"os"

`@@ -49,64 +50,32 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin

49

50

` }

50

51

` }

51

52

// Buffer to hold file chunks

53

buf := make([]byte, chunkSize)

54

totalLines := 0

53

var (

54

// Buffer to hold file chunks

55

reader = bufio.NewReaderSize(f, chunkSize)

56

buf = make([]byte, chunkSize)

57

totalLines = 0

58

)

55

59

`for {

56

n, err := f.Read(buf)

57

if n > 0 {

58

// TODO: optimization could be introduced here

59

if mimetype, err := filetype.Match(buf[:n]); err != nil {

60

return nil

61

} else if mimetype.MIME.Type == "application" {

62

return nil // skip binary files

63

}

64

-

65

// If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.

66

// This hopefully avoids splitting

67

// See: https://github.com/gitleaks/gitleaks/issues/1651

68

var (

69

peekBuf = bytes.NewBuffer(buf[:n])

70

tempBuf = make([]byte, 1)

71

newlineCount = 0 // Tracks consecutive newlines

72

)

73

for {

74

data := peekBuf.Bytes()

75

if len(data) == 0 {

76

break

77

}

78

-

79

// Check if the last character is a newline.

80

lastChar := data[len(data)-1]

81

if lastChar == '\n' || lastChar == '\r' {

82

newlineCount++

83

-

84

// Stop if two consecutive newlines are found

85

if newlineCount >= 2 {

86

break

87

}

88

} else {

89

newlineCount = 0 // Reset if a non-newline character is found

90

}

91

-

92

// Stop growing the buffer if it reaches maxSize

93

if (peekBuf.Len() - n) >= maxPeekSize {

94

break

95

}

60

n, err := reader.Read(buf)

96

61

97

// Read additional data into a temporary buffer

98

m, readErr := f.Read(tempBuf)

99

if m > 0 {

100

peekBuf.Write(tempBuf[:m])

62

// "Callers should always process the n > 0 bytes returned before considering the error err."

63

// https://pkg.go.dev/io#Reader

64

if n > 0 {

65

// Only check the filetype at the start of file.

66

if totalLines == 0 {

67

// TODO: could other optimizations be introduced here?

68

if mimetype, err := filetype.Match(buf[:n]); err != nil {

69

return nil

70

} else if mimetype.MIME.Type == "application" {

71

return nil // skip binary files

101

72

` }

73

}

102

74

103

// Stop if EOF is reached

104

if readErr != nil {

105

if readErr == io.EOF {

106

break

107

}

108

return readErr

109

}

75

// Try to split chunks across large areas of whitespace, if possible.

76

peekBuf := bytes.NewBuffer(buf[:n])

77

if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {

78

return readErr

110

79

` }

111

80

112

81

`// Count the number of newlines in this chunk

`@@ -145,3 +114,73 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin

145

114

146

115

`return d.findings, nil

147

116

117

+

118

`` +

// readUntilSafeBoundary consumes |f| until it finds two consecutive \n characters, up to |maxPeekSize|.

119

// This hopefully avoids splitting. (https://github.com/gitleaks/gitleaks/issues/1651)

120

func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {

121

if peekBuf.Len() == 0 {

122

return nil

123

}

124

+

125

// Does the buffer end in consecutive newlines?

126

var (

127

data = peekBuf.Bytes()

128

lastChar = data[len(data)-1]

129

newlineCount = 0 // Tracks consecutive newlines

130

)

131

if isWhitespace(lastChar) {

132

for i := len(data) - 1; i >= 0; i-- {

133

lastChar = data[i]

134

if lastChar == '\n' {

135

newlineCount++

136

+

137

// Stop if two consecutive newlines are found

138

if newlineCount >= 2 {

139

return nil

140

}

141

} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {

142

`` +

// The presence of other whitespace characters (\r, , \t) shouldn't reset the count.

143

// (Intentionally do nothing.)

144

} else {

145

break

146

}

147

}

148

}

149

+

150

// If not, read ahead until we (hopefully) find some.

151

newlineCount = 0

152

for {

153

data = peekBuf.Bytes()

154

// Check if the last character is a newline.

155

lastChar = data[len(data)-1]

156

if lastChar == '\n' {

157

newlineCount++

158

+

159

// Stop if two consecutive newlines are found

160

if newlineCount >= 2 {

161

break

162

}

163

} else if lastChar == '\r' || lastChar == ' ' || lastChar == '\t' {

164

`` +

// The presence of other whitespace characters (\r, , \t) shouldn't reset the count.

165

// (Intentionally do nothing.)

166

} else {

167

newlineCount = 0 // Reset if a non-newline character is found

168

}

169

+

170

// Stop growing the buffer if it reaches maxSize

171

if (peekBuf.Len() - n) >= maxPeekSize {

172

break

173

}

174

+

175

// Read additional data into a temporary buffer

176

b, err := r.ReadByte()

177

if err != nil {

178

if err == io.EOF {

179

break

180

}

181

return err

182

}

183

peekBuf.WriteByte(b)

184

}

185

return nil

186

}