// Package utfbom implements the detection of the BOM (Unicode Byte Order Mark) and removing as necessary. // It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader // interface but provides automatic BOM checking and removing as necessary. package utfbom import ( "errors" "io" ) // Encoding is type alias for detected UTF encoding. type Encoding int // Constants to identify detected UTF encodings. const ( // Unknown encoding, returned when no BOM was detected Unknown Encoding = iota // UTF8, BOM bytes: EF BB BF UTF8 // UTF-16, big-endian, BOM bytes: FE FF UTF16BigEndian // UTF-16, little-endian, BOM bytes: FF FE UTF16LittleEndian // UTF-32, big-endian, BOM bytes: 00 00 FE FF UTF32BigEndian // UTF-32, little-endian, BOM bytes: FF FE 00 00 UTF32LittleEndian ) const maxConsecutiveEmptyReads = 100 // Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. // It also returns the encoding detected by the BOM. // If the detected encoding is not needed, you can call the SkipOnly function. func Skip(rd io.Reader) (*Reader, Encoding) { // Is it already a Reader? b, ok := rd.(*Reader) if ok { return b, Unknown } enc, left, err := detectUtf(rd) return &Reader{ rd: rd, buf: left, err: err, }, enc } // SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary. func SkipOnly(rd io.Reader) *Reader { r, _ := Skip(rd) return r } // Reader implements automatic BOM (Unicode Byte Order Mark) checking and // removing as necessary for an io.Reader object. type Reader struct { rd io.Reader // reader provided by the client buf []byte // buffered data err error // last error } // Read is an implementation of io.Reader interface. // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary. func (r *Reader) Read(p []byte) (n int, err error) { if len(p) == 0 { return 0, nil } if r.buf == nil { if r.err != nil { return 0, r.readErr() } return r.rd.Read(p) } // copy as much as we can n = copy(p, r.buf) r.buf = nilIfEmpty(r.buf[n:]) return n, nil } func (r *Reader) readErr() error { err := r.err r.err = nil return err } var errNegativeRead = errors.New("utfbom: reader returned negative count from Read") func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) { buf, err = readBOM(rd) if len(buf) >= 4 { if isUTF32BigEndianBOM4(buf) { return UTF32BigEndian, nilIfEmpty(buf[4:]), err } if isUTF32LittleEndianBOM4(buf) { return UTF32LittleEndian, nilIfEmpty(buf[4:]), err } } if len(buf) > 2 && isUTF8BOM3(buf) { return UTF8, nilIfEmpty(buf[3:]), err } if (err != nil && err != io.EOF) || (len(buf) < 2) { return Unknown, nilIfEmpty(buf), err } if isUTF16BigEndianBOM2(buf) { return UTF16BigEndian, nilIfEmpty(buf[2:]), err } if isUTF16LittleEndianBOM2(buf) { return UTF16LittleEndian, nilIfEmpty(buf[2:]), err } return Unknown, nilIfEmpty(buf), err } func readBOM(rd io.Reader) (buf []byte, err error) { const maxBOMSize = 4 var bom [maxBOMSize]byte // used to read BOM // read as many bytes as possible for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] { if n, err = rd.Read(bom[len(buf):]); n < 0 { panic(errNegativeRead) } if n > 0 { nEmpty = 0 } else { nEmpty++ if nEmpty >= maxConsecutiveEmptyReads { err = io.ErrNoProgress } } } return } func isUTF32BigEndianBOM4(buf []byte) bool { return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF } func isUTF32LittleEndianBOM4(buf []byte) bool { return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00 } func isUTF8BOM3(buf []byte) bool { return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF } func isUTF16BigEndianBOM2(buf []byte) bool { return buf[0] == 0xFE && buf[1] == 0xFF } func isUTF16LittleEndianBOM2(buf []byte) bool { return buf[0] == 0xFF && buf[1] == 0xFE } func nilIfEmpty(buf []byte) (res []byte) { if len(buf) > 0 { res = buf } return }