1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
|
// Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary.
// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
// interface but provides automatic BOM checking and removing as necessary.
package utfbom
import (
"errors"
"io"
)
// Encoding is type alias for detected UTF encoding.
type Encoding int
// Constants to identify detected UTF encodings.
const (
// Unknown encoding, returned when no BOM was detected
Unknown Encoding = iota
// UTF8, BOM bytes: EF BB BF
UTF8
// UTF-16, big-endian, BOM bytes: FE FF
UTF16BigEndian
// UTF-16, little-endian, BOM bytes: FF FE
UTF16LittleEndian
// UTF-32, big-endian, BOM bytes: 00 00 FE FF
UTF32BigEndian
// UTF-32, little-endian, BOM bytes: FF FE 00 00
UTF32LittleEndian
)
const maxConsecutiveEmptyReads = 100
// Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
// It also returns the encoding detected by the BOM.
// If the detected encoding is not needed, you can call the SkipOnly function.
func Skip(rd io.Reader) (*Reader, Encoding) {
// Is it already a Reader?
b, ok := rd.(*Reader)
if ok {
return b, Unknown
}
enc, left, err := detectUtf(rd)
return &Reader{
rd: rd,
buf: left,
err: err,
}, enc
}
// SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
func SkipOnly(rd io.Reader) *Reader {
r, _ := Skip(rd)
return r
}
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
type Reader struct {
rd io.Reader // reader provided by the client
buf []byte // buffered data
err error // last error
}
// Read is an implementation of io.Reader interface.
// The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
func (r *Reader) Read(p []byte) (n int, err error) {
if len(p) == 0 {
return 0, nil
}
if r.buf == nil {
if r.err != nil {
return 0, r.readErr()
}
return r.rd.Read(p)
}
// copy as much as we can
n = copy(p, r.buf)
r.buf = nilIfEmpty(r.buf[n:])
return n, nil
}
func (r *Reader) readErr() error {
err := r.err
r.err = nil
return err
}
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
buf, err = readBOM(rd)
if len(buf) >= 4 {
if isUTF32BigEndianBOM4(buf) {
return UTF32BigEndian, nilIfEmpty(buf[4:]), err
}
if isUTF32LittleEndianBOM4(buf) {
return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
}
}
if len(buf) > 2 && isUTF8BOM3(buf) {
return UTF8, nilIfEmpty(buf[3:]), err
}
if (err != nil && err != io.EOF) || (len(buf) < 2) {
return Unknown, nilIfEmpty(buf), err
}
if isUTF16BigEndianBOM2(buf) {
return UTF16BigEndian, nilIfEmpty(buf[2:]), err
}
if isUTF16LittleEndianBOM2(buf) {
return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
}
return Unknown, nilIfEmpty(buf), err
}
func readBOM(rd io.Reader) (buf []byte, err error) {
const maxBOMSize = 4
var bom [maxBOMSize]byte // used to read BOM
// read as many bytes as possible
for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
if n, err = rd.Read(bom[len(buf):]); n < 0 {
panic(errNegativeRead)
}
if n > 0 {
nEmpty = 0
} else {
nEmpty++
if nEmpty >= maxConsecutiveEmptyReads {
err = io.ErrNoProgress
}
}
}
return
}
func isUTF32BigEndianBOM4(buf []byte) bool {
return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
}
func isUTF32LittleEndianBOM4(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
}
func isUTF8BOM3(buf []byte) bool {
return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
}
func isUTF16BigEndianBOM2(buf []byte) bool {
return buf[0] == 0xFE && buf[1] == 0xFF
}
func isUTF16LittleEndianBOM2(buf []byte) bool {
return buf[0] == 0xFF && buf[1] == 0xFE
}
func nilIfEmpty(buf []byte) (res []byte) {
if len(buf) > 0 {
res = buf
}
return
}
|