refactor(detect): create readUntilSafeBoundary + add tests (#1676) · gitleaks/gitleaks@4c3da6e (original) (raw)

1

1

`package detect

`

2

2

``

3

3

`import (

`

``

4

`+

"bufio"

`

4

5

`"bytes"

`

5

6

`"io"

`

6

7

`"os"

`

`@@ -49,64 +50,32 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin

`

49

50

` }

`

50

51

` }

`

51

52

``

52

``

`-

// Buffer to hold file chunks

`

53

``

`-

buf := make([]byte, chunkSize)

`

54

``

`-

totalLines := 0

`

``

53

`+

var (

`

``

54

`+

// Buffer to hold file chunks

`

``

55

`+

reader = bufio.NewReaderSize(f, chunkSize)

`

``

56

`+

buf = make([]byte, chunkSize)

`

``

57

`+

totalLines = 0

`

``

58

`+

)

`

55

59

`for {

`

56

``

`-

n, err := f.Read(buf)

`

57

``

`-

if n > 0 {

`

58

``

`-

// TODO: optimization could be introduced here

`

59

``

`-

if mimetype, err := filetype.Match(buf[:n]); err != nil {

`

60

``

`-

return nil

`

61

``

`-

} else if mimetype.MIME.Type == "application" {

`

62

``

`-

return nil // skip binary files

`

63

``

`-

}

`

64

``

-

65

``

`-

// If the chunk doesn't end in a newline, peek |maxPeekSize| until we find one.

`

66

``

`-

// This hopefully avoids splitting

`

67

``

`-

// See: https://github.com/gitleaks/gitleaks/issues/1651

`

68

``

`-

var (

`

69

``

`-

peekBuf = bytes.NewBuffer(buf[:n])

`

70

``

`-

tempBuf = make([]byte, 1)

`

71

``

`-

newlineCount = 0 // Tracks consecutive newlines

`

72

``

`-

)

`

73

``

`-

for {

`

74

``

`-

data := peekBuf.Bytes()

`

75

``

`-

if len(data) == 0 {

`

76

``

`-

break

`

77

``

`-

}

`

78

``

-

79

``

`-

// Check if the last character is a newline.

`

80

``

`-

lastChar := data[len(data)-1]

`

81

``

`-

if lastChar == '\n' || lastChar == '\r' {

`

82

``

`-

newlineCount++

`

83

``

-

84

``

`-

// Stop if two consecutive newlines are found

`

85

``

`-

if newlineCount >= 2 {

`

86

``

`-

break

`

87

``

`-

}

`

88

``

`-

} else {

`

89

``

`-

newlineCount = 0 // Reset if a non-newline character is found

`

90

``

`-

}

`

91

``

-

92

``

`-

// Stop growing the buffer if it reaches maxSize

`

93

``

`-

if (peekBuf.Len() - n) >= maxPeekSize {

`

94

``

`-

break

`

95

``

`-

}

`

``

60

`+

n, err := reader.Read(buf)

`

96

61

``

97

``

`-

// Read additional data into a temporary buffer

`

98

``

`-

m, readErr := f.Read(tempBuf)

`

99

``

`-

if m > 0 {

`

100

``

`-

peekBuf.Write(tempBuf[:m])

`

``

62

`+

// "Callers should always process the n > 0 bytes returned before considering the error err."

`

``

63

`+

// https://pkg.go.dev/io#Reader

`

``

64

`+

if n > 0 {

`

``

65

`+

// Only check the filetype at the start of file.

`

``

66

`+

if totalLines == 0 {

`

``

67

`+

// TODO: could other optimizations be introduced here?

`

``

68

`+

if mimetype, err := filetype.Match(buf[:n]); err != nil {

`

``

69

`+

return nil

`

``

70

`+

} else if mimetype.MIME.Type == "application" {

`

``

71

`+

return nil // skip binary files

`

101

72

` }

`

``

73

`+

}

`

102

74

``

103

``

`-

// Stop if EOF is reached

`

104

``

`-

if readErr != nil {

`

105

``

`-

if readErr == io.EOF {

`

106

``

`-

break

`

107

``

`-

}

`

108

``

`-

return readErr

`

109

``

`-

}

`

``

75

`+

// Try to split chunks across large areas of whitespace, if possible.

`

``

76

`+

peekBuf := bytes.NewBuffer(buf[:n])

`

``

77

`+

if readErr := readUntilSafeBoundary(reader, n, maxPeekSize, peekBuf); readErr != nil {

`

``

78

`+

return readErr

`

110

79

` }

`

111

80

``

112

81

`// Count the number of newlines in this chunk

`

`@@ -145,3 +114,73 @@ func (d *Detector) DetectFiles(paths <-chan sources.ScanTarget) ([]report.Findin

`

145

114

``

146

115

`return d.findings, nil

`

147

116

`}

`

``

117

+

``

118

`` +

// readUntilSafeBoundary extends |peekBuf| with bytes read from |r| until the
// buffer ends in a "safe" split point: two newline characters separated by
// nothing but other whitespace (`\r`, ` `, `\t`), i.e. a blank line.
// This hopefully avoids splitting a secret across two chunks.
// (https://github.com/gitleaks/gitleaks/issues/1651)
//
// Parameters:
//   - r: the reader positioned immediately after the bytes already held in |peekBuf|.
//   - n: the number of bytes of |peekBuf| that belong to the original chunk;
//     only bytes beyond |n| count towards the peek limit.
//   - maxPeekSize: the maximum number of extra bytes that may be appended.
//   - peekBuf: the chunk to extend in place. An empty buffer is left untouched.
//
// It returns nil when a boundary is found, when |maxPeekSize| extra bytes have
// been appended, or when |r| reaches io.EOF. Any other read error is returned
// as-is; bytes appended before the error remain in |peekBuf|.
func readUntilSafeBoundary(r *bufio.Reader, n int, maxPeekSize int, peekBuf *bytes.Buffer) error {
	if peekBuf.Len() == 0 {
		return nil
	}

	// Count the newlines inside the run of whitespace that ends the buffer.
	// A buffer whose last byte is not whitespace has an empty run, so no
	// separate pre-check of the last byte is required.
	var (
		data         = peekBuf.Bytes()
		trimmed      = bytes.TrimRight(data, "\n\r \t")
		newlineCount = bytes.Count(data[len(trimmed):], []byte{'\n'})
	)
	if newlineCount >= 2 {
		// The chunk already ends on a blank line.
		return nil
	}

	// Otherwise, read ahead one byte at a time until we (hopefully) find one.
	// newlineCount is deliberately carried over from the scan above so that a
	// blank line straddling the chunk boundary (e.g. the chunk ends with
	// "\n \t" and the next byte is "\n") is still recognised.
	for peekBuf.Len()-n < maxPeekSize {
		b, err := r.ReadByte()
		if err != nil {
			if err == io.EOF {
				// Nothing left to peek at; the chunk is as good as it gets.
				return nil
			}
			return err
		}
		peekBuf.WriteByte(b) // bytes.Buffer.WriteByte always returns a nil error.

		switch b {
		case '\n':
			newlineCount++
			if newlineCount >= 2 {
				return nil
			}
		case '\r', ' ', '\t':
			// Other whitespace between newlines doesn't reset the count.
		default:
			// Any non-whitespace byte starts a new line with content.
			newlineCount = 0
		}
	}
	return nil
}

`