Diffstat (limited to 'vendor/golang.org/x/net/html/token_test.go')
-rw-r--r-- | vendor/golang.org/x/net/html/token_test.go | 748
1 file changed, 0 insertions, 748 deletions
diff --git a/vendor/golang.org/x/net/html/token_test.go b/vendor/golang.org/x/net/html/token_test.go
deleted file mode 100644
index 20221c328..000000000
--- a/vendor/golang.org/x/net/html/token_test.go
+++ /dev/null
@@ -1,748 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package html
-
-import (
-	"bytes"
-	"io"
-	"io/ioutil"
-	"reflect"
-	"runtime"
-	"strings"
-	"testing"
-)
-
-type tokenTest struct {
-	// A short description of the test case.
-	desc string
-	// The HTML to parse.
-	html string
-	// The string representations of the expected tokens, joined by '$'.
-	golden string
-}
-
-var tokenTests = []tokenTest{
-	{
-		"empty",
-		"",
-		"",
-	},
-	// A single text node. The tokenizer should not break text nodes on whitespace,
-	// nor should it normalize whitespace within a text node.
-	{
-		"text",
-		"foo  bar",
-		"foo  bar",
-	},
-	// An entity.
-	{
-		"entity",
-		"one &lt; two",
-		"one &lt; two",
-	},
-	// A start, self-closing and end tag. The tokenizer does not care if the start
-	// and end tokens don't match; that is the job of the parser.
-	{
-		"tags",
-		"<a>b<c/>d</e>",
-		"<a>$b$<c/>$d$</e>",
-	},
-	// Angle brackets that aren't a tag.
-	{
-		"not a tag #0",
-		"<",
-		"&lt;",
-	},
-	{
-		"not a tag #1",
-		"</",
-		"&lt;/",
-	},
-	{
-		"not a tag #2",
-		"</>",
-		"<!---->",
-	},
-	{
-		"not a tag #3",
-		"a</>b",
-		"a$<!---->$b",
-	},
-	{
-		"not a tag #4",
-		"</ >",
-		"<!-- -->",
-	},
-	{
-		"not a tag #5",
-		"</.",
-		"<!--.-->",
-	},
-	{
-		"not a tag #6",
-		"</.>",
-		"<!--.-->",
-	},
-	{
-		"not a tag #7",
-		"a < b",
-		"a &lt; b",
-	},
-	{
-		"not a tag #8",
-		"<.>",
-		"&lt;.&gt;",
-	},
-	{
-		"not a tag #9",
-		"a<<<b>>>c",
-		"a&lt;&lt;$<b>$&gt;&gt;c",
-	},
-	{
-		"not a tag #10",
-		"if x<0 and y < 0 then x*y>0",
-		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
-	},
-	{
-		"not a tag #11",
-		"<<p>",
-		"&lt;$<p>",
-	},
-	// EOF in a tag name.
-	{
-		"tag name eof #0",
-		"<a",
-		"",
-	},
-	{
-		"tag name eof #1",
-		"<a ",
-		"",
-	},
-	{
-		"tag name eof #2",
-		"a<b",
-		"a",
-	},
-	{
-		"tag name eof #3",
-		"<a><b",
-		"<a>",
-	},
-	{
-		"tag name eof #4",
-		`<a x`,
-		``,
-	},
-	// Some malformed tags that are missing a '>'.
-	{
-		"malformed tag #0",
-		`<p</p>`,
-		`<p< p="">`,
-	},
-	{
-		"malformed tag #1",
-		`<p </p>`,
-		`<p <="" p="">`,
-	},
-	{
-		"malformed tag #2",
-		`<p id`,
-		``,
-	},
-	{
-		"malformed tag #3",
-		`<p id=`,
-		``,
-	},
-	{
-		"malformed tag #4",
-		`<p id=>`,
-		`<p id="">`,
-	},
-	{
-		"malformed tag #5",
-		`<p id=0`,
-		``,
-	},
-	{
-		"malformed tag #6",
-		`<p id=0</p>`,
-		`<p id="0&lt;/p">`,
-	},
-	{
-		"malformed tag #7",
-		`<p id="0</p>`,
-		``,
-	},
-	{
-		"malformed tag #8",
-		`<p id="0"</p>`,
-		`<p id="0" <="" p="">`,
-	},
-	{
-		"malformed tag #9",
-		`<p></p id`,
-		`<p>`,
-	},
-	// Raw text and RCDATA.
- { - "basic raw text", - "<script><a></b></script>", - "<script>$<a></b>$</script>", - }, - { - "unfinished script end tag", - "<SCRIPT>a</SCR", - "<script>$a</SCR", - }, - { - "broken script end tag", - "<SCRIPT>a</SCR ipt>", - "<script>$a</SCR ipt>", - }, - { - "EOF in script end tag", - "<SCRIPT>a</SCRipt", - "<script>$a</SCRipt", - }, - { - "scriptx end tag", - "<SCRIPT>a</SCRiptx", - "<script>$a</SCRiptx", - }, - { - "' ' completes script end tag", - "<SCRIPT>a</SCRipt ", - "<script>$a", - }, - { - "'>' completes script end tag", - "<SCRIPT>a</SCRipt>", - "<script>$a$</script>", - }, - { - "self-closing script end tag", - "<SCRIPT>a</SCRipt/>", - "<script>$a$</script>", - }, - { - "nested script tag", - "<SCRIPT>a</SCRipt<script>", - "<script>$a</SCRipt<script>", - }, - { - "script end tag after unfinished", - "<SCRIPT>a</SCRipt</script>", - "<script>$a</SCRipt$</script>", - }, - { - "script/style mismatched tags", - "<script>a</style>", - "<script>$a</style>", - }, - { - "style element with entity", - "<style>'", - "<style>$&apos;", - }, - { - "textarea with tag", - "<textarea><div></textarea>", - "<textarea>$<div>$</textarea>", - }, - { - "title with tag and entity", - "<title><b>K&R C</b></title>", - "<title>$<b>K&R C</b>$</title>", - }, - // DOCTYPE tests. - { - "Proper DOCTYPE", - "<!DOCTYPE html>", - "<!DOCTYPE html>", - }, - { - "DOCTYPE with no space", - "<!doctypehtml>", - "<!DOCTYPE html>", - }, - { - "DOCTYPE with two spaces", - "<!doctype html>", - "<!DOCTYPE html>", - }, - { - "looks like DOCTYPE but isn't", - "<!DOCUMENT html>", - "<!--DOCUMENT html-->", - }, - { - "DOCTYPE at EOF", - "<!DOCtype", - "<!DOCTYPE >", - }, - // XML processing instructions. - { - "XML processing instruction", - "<?xml?>", - "<!--?xml?-->", - }, - // Comments. - { - "comment0", - "abc<b><!-- skipme --></b>def", - "abc$<b>$<!-- skipme -->$</b>$def", - }, - { - "comment1", - "a<!-->z", - "a$<!---->$z", - }, - { - "comment2", - "a<!--->z", - "a$<!---->$z", - }, - { - "comment3", - "a<!--x>-->z", - "a$<!--x>-->$z", - }, - { - "comment4", - "a<!--x->-->z", - "a$<!--x->-->$z", - }, - { - "comment5", - "a<!>z", - "a$<!---->$z", - }, - { - "comment6", - "a<!->z", - "a$<!----->$z", - }, - { - "comment7", - "a<!---<>z", - "a$<!---<>z-->", - }, - { - "comment8", - "a<!--z", - "a$<!--z-->", - }, - { - "comment9", - "a<!--z-", - "a$<!--z-->", - }, - { - "comment10", - "a<!--z--", - "a$<!--z-->", - }, - { - "comment11", - "a<!--z---", - "a$<!--z--->", - }, - { - "comment12", - "a<!--z----", - "a$<!--z---->", - }, - { - "comment13", - "a<!--x--!>z", - "a$<!--x-->$z", - }, - // An attribute with a backslash. - { - "backslash", - `<p id="a\"b">`, - `<p id="a\" b"="">`, - }, - // Entities, tag name and attribute key lower-casing, and whitespace - // normalization within a tag. - { - "tricky", - "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", - `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, - }, - // A nonexistent entity. Tokenizing and converting back to a string should - // escape the "&" to become "&". 
- { - "noSuchEntity", - `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, - `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, - }, - { - "entity without semicolon", - `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`, - `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`, - }, - { - "entity with digits", - "½", - "½", - }, - // Attribute tests: - // http://dev.w3.org/html5/pf-summary/Overview.html#attributes - { - "Empty attribute", - `<input disabled FOO>`, - `<input disabled="" foo="">`, - }, - { - "Empty attribute, whitespace", - `<input disabled FOO >`, - `<input disabled="" foo="">`, - }, - { - "Unquoted attribute value", - `<input value=yes FOO=BAR>`, - `<input value="yes" foo="BAR">`, - }, - { - "Unquoted attribute value, spaces", - `<input value = yes FOO = BAR>`, - `<input value="yes" foo="BAR">`, - }, - { - "Unquoted attribute value, trailing space", - `<input value=yes FOO=BAR >`, - `<input value="yes" foo="BAR">`, - }, - { - "Single-quoted attribute value", - `<input value='yes' FOO='BAR'>`, - `<input value="yes" foo="BAR">`, - }, - { - "Single-quoted attribute value, trailing space", - `<input value='yes' FOO='BAR' >`, - `<input value="yes" foo="BAR">`, - }, - { - "Double-quoted attribute value", - `<input value="I'm an attribute" FOO="BAR">`, - `<input value="I'm an attribute" foo="BAR">`, - }, - { - "Attribute name characters", - `<meta http-equiv="content-type">`, - `<meta http-equiv="content-type">`, - }, - { - "Mixed attributes", - `a<P V="0 1" w='2' X=3 y>z`, - `a$<p v="0 1" w="2" x="3" y="">$z`, - }, - { - "Attributes with a solitary single quote", - `<p id=can't><p id=won't>`, - `<p id="can't">$<p id="won't">`, - }, -} - -func TestTokenizer(t *testing.T) { -loop: - for _, tt := range tokenTests { - z := NewTokenizer(strings.NewReader(tt.html)) - if tt.golden != "" { - for i, s := range strings.Split(tt.golden, "$") { - if z.Next() == ErrorToken { - t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err()) - continue loop - } - actual := z.Token().String() - if s != actual { - t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) - continue loop - } - } - } - z.Next() - if z.Err() != io.EOF { - t.Errorf("%s: want EOF got %q", tt.desc, z.Err()) - } - } -} - -func TestMaxBuffer(t *testing.T) { - // Exceeding the maximum buffer size generates ErrBufferExceeded. - z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10))) - z.SetMaxBuf(5) - tt := z.Next() - if got, want := tt, ErrorToken; got != want { - t.Fatalf("token type: got: %v want: %v", got, want) - } - if got, want := z.Err(), ErrBufferExceeded; got != want { - t.Errorf("error type: got: %v want: %v", got, want) - } - if got, want := string(z.Raw()), "<tttt"; got != want { - t.Fatalf("buffered before overflow: got: %q want: %q", got, want) - } -} - -func TestMaxBufferReconstruction(t *testing.T) { - // Exceeding the maximum buffer size at any point while tokenizing permits - // reconstructing the original input. -tests: - for _, test := range tokenTests { - for maxBuf := 1; ; maxBuf++ { - r := strings.NewReader(test.html) - z := NewTokenizer(r) - z.SetMaxBuf(maxBuf) - var tokenized bytes.Buffer - for { - tt := z.Next() - tokenized.Write(z.Raw()) - if tt == ErrorToken { - if err := z.Err(); err != io.EOF && err != ErrBufferExceeded { - t.Errorf("%s: unexpected error: %v", test.desc, err) - } - break - } - } - // Anything tokenized along with untokenized input or data left in the reader. 
-			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
-			if err != nil {
-				t.Errorf("%s: ReadAll: %v", test.desc, err)
-				continue tests
-			}
-			if got, want := string(assembled), test.html; got != want {
-				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
-				continue tests
-			}
-			// EOF indicates that we completed tokenization and hence found the max
-			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
-			if z.Err() == io.EOF {
-				break
-			}
-		} // buffer sizes
-	} // tests
-}
-
-func TestPassthrough(t *testing.T) {
-	// Accumulating the raw output for each parse event should reconstruct the
-	// original input.
-	for _, test := range tokenTests {
-		z := NewTokenizer(strings.NewReader(test.html))
-		var parsed bytes.Buffer
-		for {
-			tt := z.Next()
-			parsed.Write(z.Raw())
-			if tt == ErrorToken {
-				break
-			}
-		}
-		if got, want := parsed.String(), test.html; got != want {
-			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
-		}
-	}
-}
-
-func TestBufAPI(t *testing.T) {
-	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
-	z := NewTokenizer(bytes.NewBufferString(s))
-	var result bytes.Buffer
-	depth := 0
-loop:
-	for {
-		tt := z.Next()
-		switch tt {
-		case ErrorToken:
-			if z.Err() != io.EOF {
-				t.Error(z.Err())
-			}
-			break loop
-		case TextToken:
-			if depth > 0 {
-				result.Write(z.Text())
-			}
-		case StartTagToken, EndTagToken:
-			tn, _ := z.TagName()
-			if len(tn) == 1 && tn[0] == 'a' {
-				if tt == StartTagToken {
-					depth++
-				} else {
-					depth--
-				}
-			}
-		}
-	}
-	u := "14567"
-	v := string(result.Bytes())
-	if u != v {
-		t.Errorf("TestBufAPI: want %q got %q", u, v)
-	}
-}
-
-func TestConvertNewlines(t *testing.T) {
-	testCases := map[string]string{
-		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
-		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
-		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
-		"":         "",
-		"\n":       "\n",
-		"\n\r":     "\n\n",
-		"\r":       "\n",
-		"\r\n":     "\n",
-		"\r\n\n":   "\n\n",
-		"\r\n\r":   "\n\n",
-		"\r\n\r\n": "\n\n",
-		"\r\r":     "\n\n",
-		"\r\r\n":   "\n\n",
-		"\r\r\n\n": "\n\n\n",
-		"\r\r\r\n": "\n\n\n",
-		"\r \n":    "\n \n",
-		"xyz":      "xyz",
-	}
-	for in, want := range testCases {
-		if got := string(convertNewlines([]byte(in))); got != want {
-			t.Errorf("input %q: got %q, want %q", in, got, want)
-		}
-	}
-}
-
-func TestReaderEdgeCases(t *testing.T) {
-	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
-	testCases := []io.Reader{
-		&zeroOneByteReader{s: s},
-		&eofStringsReader{s: s},
-		&stuckReader{},
-	}
-	for i, tc := range testCases {
-		got := []TokenType{}
-		z := NewTokenizer(tc)
-		for {
-			tt := z.Next()
-			if tt == ErrorToken {
-				break
-			}
-			got = append(got, tt)
-		}
-		if err := z.Err(); err != nil && err != io.EOF {
-			if err != io.ErrNoProgress {
-				t.Errorf("i=%d: %v", i, err)
-			}
-			continue
-		}
-		want := []TokenType{
-			StartTagToken,
-			TextToken,
-			EndTagToken,
-		}
-		if !reflect.DeepEqual(got, want) {
-			t.Errorf("i=%d: got %v, want %v", i, got, want)
-			continue
-		}
-	}
-}
-
-// zeroOneByteReader is like a strings.Reader that alternates between
-// returning 0 bytes and 1 byte at a time.
-type zeroOneByteReader struct {
-	s string
-	n int
-}
-
-func (r *zeroOneByteReader) Read(p []byte) (int, error) {
-	if len(p) == 0 {
-		return 0, nil
-	}
-	if len(r.s) == 0 {
-		return 0, io.EOF
-	}
-	r.n++
-	if r.n%2 != 0 {
-		return 0, nil
-	}
-	p[0], r.s = r.s[0], r.s[1:]
-	return 1, nil
-}
-
-// eofStringsReader is like a strings.Reader but can return an (n, err) where
-// n > 0 && err != nil.
-type eofStringsReader struct {
-	s string
-}
-
-func (r *eofStringsReader) Read(p []byte) (int, error) {
-	n := copy(p, r.s)
-	r.s = r.s[n:]
-	if r.s != "" {
-		return n, nil
-	}
-	return n, io.EOF
-}
-
-// stuckReader is an io.Reader that always returns no data and no error.
-type stuckReader struct{}
-
-func (*stuckReader) Read(p []byte) (int, error) {
-	return 0, nil
-}
-
-const (
-	rawLevel = iota
-	lowLevel
-	highLevel
-)
-
-func benchmarkTokenizer(b *testing.B, level int) {
-	buf, err := ioutil.ReadFile("testdata/go1.html")
-	if err != nil {
-		b.Fatalf("could not read testdata/go1.html: %v", err)
-	}
-	b.SetBytes(int64(len(buf)))
-	runtime.GC()
-	b.ReportAllocs()
-	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		z := NewTokenizer(bytes.NewBuffer(buf))
-		for {
-			tt := z.Next()
-			if tt == ErrorToken {
-				if err := z.Err(); err != nil && err != io.EOF {
-					b.Fatalf("tokenizer error: %v", err)
-				}
-				break
-			}
-			switch level {
-			case rawLevel:
-				// Calling z.Raw just returns the raw bytes of the token. It does
-				// not unescape &lt; to <, or lower-case tag names and attribute keys.
-				z.Raw()
-			case lowLevel:
-				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
-				// whose contents may change on the next call to z.Next.
-				switch tt {
-				case TextToken, CommentToken, DoctypeToken:
-					z.Text()
-				case StartTagToken, SelfClosingTagToken:
-					_, more := z.TagName()
-					for more {
-						_, _, more = z.TagAttr()
-					}
-				case EndTagToken:
-					z.TagName()
-				}
-			case highLevel:
-				// Calling z.Token converts []byte values to strings whose validity
-				// extends beyond the next call to z.Next.
-				z.Token()
-			}
-		}
-	}
-}
-
-func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
-func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
-func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }