// Copyright 2013 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package encoding_test import ( "io/ioutil" "strings" "testing" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/transform" ) func TestEncodeInvalidUTF8(t *testing.T) { inputs := []string{ "hello.", "wo\ufffdld.", "ABC\xff\x80\x80", // Invalid UTF-8. "\x80\x80\x80\x80\x80", "\x80\x80D\x80\x80", // Valid rune at "D". "E\xed\xa0\x80\xed\xbf\xbfF", // Two invalid UTF-8 runes (surrogates). "G", "H\xe2\x82", // U+20AC in UTF-8 is "\xe2\x82\xac", which we split over two "\xacI\xe2\x82", // input lines. It maps to 0x80 in the Windows-1252 encoding. } // Each invalid source byte becomes '\x1a'. want := strings.Replace("hello.wo?ld.ABC??????????D??E??????FGH\x80I??", "?", "\x1a", -1) transformer := encoding.ReplaceUnsupported(charmap.Windows1252.NewEncoder()) gotBuf := make([]byte, 0, 1024) src := make([]byte, 0, 1024) for i, input := range inputs { dst := make([]byte, 1024) src = append(src, input...) atEOF := i == len(inputs)-1 nDst, nSrc, err := transformer.Transform(dst, src, atEOF) gotBuf = append(gotBuf, dst[:nDst]...) src = src[nSrc:] if err != nil && err != transform.ErrShortSrc { t.Fatalf("i=%d: %v", i, err) } if atEOF && err != nil { t.Fatalf("i=%d: atEOF: %v", i, err) } } if got := string(gotBuf); got != want { t.Fatalf("\ngot %+q\nwant %+q", got, want) } } func TestReplacement(t *testing.T) { for _, direction := range []string{"Decode", "Encode"} { enc, want := (transform.Transformer)(nil), "" if direction == "Decode" { enc = encoding.Replacement.NewDecoder() want = "\ufffd" } else { enc = encoding.Replacement.NewEncoder() want = "AB\x00CD\ufffdYZ" } sr := strings.NewReader("AB\x00CD\x80YZ") g, err := ioutil.ReadAll(transform.NewReader(sr, enc)) if err != nil { t.Errorf("%s: ReadAll: %v", direction, err) continue } if got := string(g); got != want { t.Errorf("%s:\ngot %q\nwant %q", direction, got, want) continue } } } func TestUTF8Validator(t *testing.T) { testCases := []struct { desc string dstSize int src string atEOF bool want string wantErr error }{ { "empty input", 100, "", false, "", nil, }, { "valid 1-byte 1-rune input", 100, "a", false, "a", nil, }, { "valid 3-byte 1-rune input", 100, "\u1234", false, "\u1234", nil, }, { "valid 5-byte 3-rune input", 100, "a\u0100\u0101", false, "a\u0100\u0101", nil, }, { "perfectly sized dst (non-ASCII)", 5, "a\u0100\u0101", false, "a\u0100\u0101", nil, }, { "short dst (non-ASCII)", 4, "a\u0100\u0101", false, "a\u0100", transform.ErrShortDst, }, { "perfectly sized dst (ASCII)", 5, "abcde", false, "abcde", nil, }, { "short dst (ASCII)", 4, "abcde", false, "abcd", transform.ErrShortDst, }, { "partial input (!EOF)", 100, "a\u0100\xf1", false, "a\u0100", transform.ErrShortSrc, }, { "invalid input (EOF)", 100, "a\u0100\xf1", true, "a\u0100", encoding.ErrInvalidUTF8, }, { "invalid input (!EOF)", 100, "a\u0100\x80", false, "a\u0100", encoding.ErrInvalidUTF8, }, { "invalid input (above U+10FFFF)", 100, "a\u0100\xf7\xbf\xbf\xbf", false, "a\u0100", encoding.ErrInvalidUTF8, }, { "invalid input (surrogate half)", 100, "a\u0100\xed\xa0\x80", false, "a\u0100", encoding.ErrInvalidUTF8, }, } for _, tc := range testCases { dst := make([]byte, tc.dstSize) nDst, nSrc, err := encoding.UTF8Validator.Transform(dst, []byte(tc.src), tc.atEOF) if nDst < 0 || len(dst) < nDst { t.Errorf("%s: nDst=%d out of range", tc.desc, nDst) continue } got := string(dst[:nDst]) if got != tc.want || nSrc != len(tc.want) || err != tc.wantErr { t.Errorf("%s:\ngot %+q, %d, %v\nwant %+q, %d, %v", tc.desc, got, nSrc, err, tc.want, len(tc.want), tc.wantErr) continue } } } func TestErrorHandler(t *testing.T) { testCases := []struct { desc string handler func(*encoding.Encoder) *encoding.Encoder sizeDst int src, want string nSrc int err error }{ { desc: "one rune replacement", handler: encoding.ReplaceUnsupported, sizeDst: 100, src: "\uAC00", want: "\x1a", nSrc: 3, }, { desc: "mid-stream rune replacement", handler: encoding.ReplaceUnsupported, sizeDst: 100, src: "a\uAC00bcd\u00e9", want: "a\x1abcd\xe9", nSrc: 9, }, { desc: "at end rune replacement", handler: encoding.ReplaceUnsupported, sizeDst: 10, src: "\u00e9\uAC00", want: "\xe9\x1a", nSrc: 5, }, { desc: "short buffer replacement", handler: encoding.ReplaceUnsupported, sizeDst: 1, src: "\u00e9\uAC00", want: "\xe9", nSrc: 2, err: transform.ErrShortDst, }, { desc: "one rune html escape", handler: encoding.HTMLEscapeUnsupported, sizeDst: 100, src: "\uAC00", want: "가", nSrc: 3, }, { desc: "mid-stream html escape", handler: encoding.HTMLEscapeUnsupported, sizeDst: 100, src: "\u00e9\uAC00dcba", want: "\xe9가dcba", nSrc: 9, }, { desc: "short buffer html escape", handler: encoding.HTMLEscapeUnsupported, sizeDst: 9, src: "ab\uAC01", want: "ab", nSrc: 2, err: transform.ErrShortDst, }, } for i, tc := range testCases { tr := tc.handler(charmap.Windows1250.NewEncoder()) b := make([]byte, tc.sizeDst) nDst, nSrc, err := tr.Transform(b, []byte(tc.src), true) if err != tc.err { t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err) } if got := string(b[:nDst]); got != tc.want { t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want) } if nSrc != tc.nSrc { t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc) } } }