// Copyright 2015 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package precis import ( "bytes" "fmt" "reflect" "testing" "golang.org/x/text/internal/testtext" "golang.org/x/text/secure/bidirule" "golang.org/x/text/transform" ) type testCase struct { input string output string err error } var enforceTestCases = []struct { name string p *Profile cases []testCase }{ {"Basic", NewFreeform(), []testCase{ {"e\u0301\u031f", "\u00e9\u031f", nil}, // normalize }}, {"Context Rule 1", NewFreeform(), []testCase{ // Rule 1: zero-width non-joiner (U+200C) // From RFC: // False // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True; // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C // (Joining_Type:T)*(Joining_Type:{R,D})) Then True; // // Example runes for different joining types: // Join L: U+A872; PHAGS-PA SUPERFIXED LETTER RA // Join D: U+062C; HAH WITH DOT BELOW // Join T: U+0610; ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM // Join R: U+0627; ALEF // Virama: U+0A4D; GURMUKHI SIGN VIRAMA // Virama and Join T: U+0ACD; GUJARATI SIGN VIRAMA {"\u200c", "", errContext}, {"\u200ca", "", errContext}, {"a\u200c", "", errContext}, {"\u200c\u0627", "", errContext}, // missing JoinStart {"\u062c\u200c", "", errContext}, // missing JoinEnd {"\u0610\u200c\u0610\u0627", "", errContext}, // missing JoinStart {"\u062c\u0610\u200c\u0610", "", errContext}, // missing JoinEnd // Variants of: D T* U+200c T* R {"\u062c\u200c\u0627", "\u062c\u200c\u0627", nil}, {"\u062c\u0610\u200c\u0610\u0627", "\u062c\u0610\u200c\u0610\u0627", nil}, {"\u062c\u0610\u0610\u200c\u0610\u0610\u0627", "\u062c\u0610\u0610\u200c\u0610\u0610\u0627", nil}, {"\u062c\u0610\u200c\u0627", "\u062c\u0610\u200c\u0627", nil}, {"\u062c\u200c\u0610\u0627", "\u062c\u200c\u0610\u0627", nil}, // Variants of: L T* U+200c T* D {"\ua872\u200c\u062c", "\ua872\u200c\u062c", nil}, {"\ua872\u0610\u200c\u0610\u062c", "\ua872\u0610\u200c\u0610\u062c", nil}, {"\ua872\u0610\u0610\u200c\u0610\u0610\u062c", "\ua872\u0610\u0610\u200c\u0610\u0610\u062c", nil}, {"\ua872\u0610\u200c\u062c", "\ua872\u0610\u200c\u062c", nil}, {"\ua872\u200c\u0610\u062c", "\ua872\u200c\u0610\u062c", nil}, // Virama {"\u0a4d\u200c", "\u0a4d\u200c", nil}, {"\ua872\u0a4d\u200c", "\ua872\u0a4d\u200c", nil}, {"\ua872\u0a4d\u0610\u200c", "", errContext}, {"\ua872\u0a4d\u0610\u200c", "", errContext}, {"\u0acd\u200c", "\u0acd\u200c", nil}, {"\ua872\u0acd\u200c", "\ua872\u0acd\u200c", nil}, {"\ua872\u0acd\u0610\u200c", "", errContext}, {"\ua872\u0acd\u0610\u200c", "", errContext}, // Using Virama as join T {"\ua872\u0acd\u200c\u062c", "\ua872\u0acd\u200c\u062c", nil}, {"\ua872\u200c\u0acd\u062c", "\ua872\u200c\u0acd\u062c", nil}, }}, {"Context Rule 2", NewFreeform(), []testCase{ // Rule 2: zero-width joiner (U+200D) {"\u200d", "", errContext}, {"\u200da", "", errContext}, {"a\u200d", "", errContext}, {"\u0a4d\u200d", "\u0a4d\u200d", nil}, {"\ua872\u0a4d\u200d", "\ua872\u0a4d\u200d", nil}, {"\u0a4da\u200d", "", errContext}, }}, {"Context Rule 3", NewFreeform(), []testCase{ // Rule 3: middle dot {"·", "", errContext}, {"l·", "", errContext}, {"·l", "", errContext}, {"a·", "", errContext}, {"l·a", "", errContext}, {"a·a", "", errContext}, {"l·l", "l·l", nil}, {"al·la", "al·la", nil}, }}, {"Context Rule 4", NewFreeform(), []testCase{ // Rule 4: Greek lower numeral U+0375 {"͵", "", errContext}, {"͵a", "", errContext}, {"α͵", "", errContext}, {"͵α", "͵α", nil}, {"α͵α", "α͵α", nil}, {"͵͵α", "͵͵α", nil}, // The numeric sign is itself Greek. {"α͵͵α", "α͵͵α", nil}, {"α͵͵", "", errContext}, {"α͵͵a", "", errContext}, }}, {"Context Rule 5+6", NewFreeform(), []testCase{ // Rule 5+6: Hebrew preceding // U+05f3: Geresh {"׳", "", errContext}, {"׳ה", "", errContext}, {"a׳b", "", errContext}, {"ש׳", "ש׳", nil}, // U+05e9 U+05f3 {"ש׳׳׳", "ש׳׳׳", nil}, // U+05e9 U+05f3 // U+05f4: Gershayim {"״", "", errContext}, {"״ה", "", errContext}, {"a״b", "", errContext}, {"ש״", "ש״", nil}, // U+05e9 U+05f4 {"ש״״״", "ש״״״", nil}, // U+05e9 U+05f4 {"aש״״״", "aש״״״", nil}, // U+05e9 U+05f4 }}, {"Context Rule 7", NewFreeform(), []testCase{ // Rule 7: Katakana middle Dot {"・", "", errContext}, {"abc・", "", errContext}, {"・def", "", errContext}, {"abc・def", "", errContext}, {"aヅc・def", "aヅc・def", nil}, {"abc・dぶf", "abc・dぶf", nil}, {"⺐bc・def", "⺐bc・def", nil}, }}, {"Context Rule 8+9", NewFreeform(), []testCase{ // Rule 8+9: Arabic Indic Digit {"١٢٣٤٥۶", "", errContext}, {"۱۲۳۴۵٦", "", errContext}, {"١٢٣٤٥", "١٢٣٤٥", nil}, {"۱۲۳۴۵", "۱۲۳۴۵", nil}, }}, {"Nickname", Nickname, []testCase{ {" Swan of Avon ", "Swan of Avon", nil}, {"", "", errEmptyString}, {" ", "", errEmptyString}, {" ", "", errEmptyString}, {"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil}, {"Foo", "Foo", nil}, {"foo", "foo", nil}, {"Foo Bar", "Foo Bar", nil}, {"foo bar", "foo bar", nil}, {"\u03A3", "\u03A3", nil}, {"\u03C3", "\u03C3", nil}, // Greek final sigma is left as is (do not fold!) {"\u03C2", "\u03C2", nil}, {"\u265A", "♚", nil}, {"Richard \u2163", "Richard IV", nil}, {"\u212B", "Å", nil}, {"\uFB00", "ff", nil}, // because of NFKC {"שa", "שa", nil}, // no bidi rule {"동일조건변경허락", "동일조건변경허락", nil}, }}, {"OpaqueString", OpaqueString, []testCase{ {" Swan of Avon ", " Swan of Avon ", nil}, {"", "", errEmptyString}, {" ", " ", nil}, {" ", " ", nil}, {"a\u00A0a\u1680a\u2000a\u2001a\u2002a\u2003a\u2004a\u2005a\u2006a\u2007a\u2008a\u2009a\u200Aa\u202Fa\u205Fa\u3000a", "a a a a a a a a a a a a a a a a a", nil}, {"Foo", "Foo", nil}, {"foo", "foo", nil}, {"Foo Bar", "Foo Bar", nil}, {"foo bar", "foo bar", nil}, {"\u03C3", "\u03C3", nil}, {"Richard \u2163", "Richard \u2163", nil}, {"\u212B", "Å", nil}, {"Jack of \u2666s", "Jack of \u2666s", nil}, {"my cat is a \u0009by", "", errDisallowedRune}, {"שa", "שa", nil}, // no bidi rule }}, {"UsernameCaseMapped", UsernameCaseMapped, []testCase{ // TODO: Should this work? // {UsernameCaseMapped, "", "", errDisallowedRune}, {"juliet@example.com", "juliet@example.com", nil}, {"fussball", "fussball", nil}, {"fu\u00DFball", "fu\u00DFball", nil}, {"\u03C0", "\u03C0", nil}, {"\u03A3", "\u03C3", nil}, {"\u03C3", "\u03C3", nil}, // Greek final sigma is left as is (do not fold!) {"\u03C2", "\u03C2", nil}, {"\u0049", "\u0069", nil}, {"\u0049", "\u0069", nil}, {"\u03D2", "", errDisallowedRune}, {"\u03B0", "\u03B0", nil}, {"foo bar", "", errDisallowedRune}, {"♚", "", bidirule.ErrInvalid}, {"\u007E", "~", nil}, {"a", "a", nil}, {"!", "!", nil}, {"²", "", bidirule.ErrInvalid}, {"\t", "", errDisallowedRune}, {"\n", "", errDisallowedRune}, {"\u26D6", "", bidirule.ErrInvalid}, {"\u26FF", "", bidirule.ErrInvalid}, {"\uFB00", "", errDisallowedRune}, {"\u1680", "", bidirule.ErrInvalid}, {" ", "", errDisallowedRune}, {" ", "", errDisallowedRune}, {"\u01C5", "", errDisallowedRune}, {"\u16EE", "", errDisallowedRune}, // Nl RUNIC ARLAUG SYMBOL {"\u0488", "", bidirule.ErrInvalid}, // Me COMBINING CYRILLIC HUNDRED THOUSANDS SIGN {"\u212B", "\u00e5", nil}, // Angstrom sign, NFC -> U+00E5 {"A\u030A", "å", nil}, // A + ring {"\u00C5", "å", nil}, // A with ring {"\u00E7", "ç", nil}, // c cedille {"\u0063\u0327", "ç", nil}, // c + cedille {"\u0158", "ř", nil}, {"\u0052\u030C", "ř", nil}, {"\u1E61", "\u1E61", nil}, // LATIN SMALL LETTER S WITH DOT ABOVE // Confusable characters ARE allowed and should NOT be mapped. {"\u0410", "\u0430", nil}, // CYRILLIC CAPITAL LETTER A // Full width should be mapped to the canonical decomposition. {"AB", "ab", nil}, {"שc", "", bidirule.ErrInvalid}, // bidi rule }}, {"UsernameCasePreserved", UsernameCasePreserved, []testCase{ {"ABC", "ABC", nil}, {"AB", "AB", nil}, {"שc", "", bidirule.ErrInvalid}, // bidi rule {"\uFB00", "", errDisallowedRune}, {"\u212B", "\u00c5", nil}, // Angstrom sign, NFC -> U+00E5 {"ẛ", "", errDisallowedRune}, // LATIN SMALL LETTER LONG S WITH DOT ABOVE }}, } func doTests(t *testing.T, fn func(t *testing.T, p *Profile, tc testCase)) { for _, g := range enforceTestCases { for i, tc := range g.cases { name := fmt.Sprintf("%s:%d:%+q", g.name, i, tc.input) testtext.Run(t, name, func(t *testing.T) { fn(t, g.p, tc) }) } } } func TestString(t *testing.T) { doTests(t, func(t *testing.T, p *Profile, tc testCase) { if e, err := p.String(tc.input); tc.err != err || e != tc.output { t.Errorf("got %+q (err: %v); want %+q (err: %v)", e, err, tc.output, tc.err) } }) } func TestBytes(t *testing.T) { doTests(t, func(t *testing.T, p *Profile, tc testCase) { if e, err := p.Bytes([]byte(tc.input)); tc.err != err || string(e) != tc.output { t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err) } }) // Test that calling Bytes with something that doesn't transform returns a // copy. orig := []byte("hello") b, _ := NewFreeform().Bytes(orig) if reflect.ValueOf(b).Pointer() == reflect.ValueOf(orig).Pointer() { t.Error("original and result are the same slice; should be a copy") } } func TestAppend(t *testing.T) { doTests(t, func(t *testing.T, p *Profile, tc testCase) { if e, err := p.Append(nil, []byte(tc.input)); tc.err != err || string(e) != tc.output { t.Errorf("got %+q (err: %v); want %+q (err: %v)", string(e), err, tc.output, tc.err) } }) } func TestStringMallocs(t *testing.T) { if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.String("helloworld") }); n > 0 { // TODO: reduce this to 0. t.Skipf("got %f allocs, want 0", n) } } func TestAppendMallocs(t *testing.T) { str := []byte("helloworld") out := make([]byte, 0, len(str)) if n := testtext.AllocsPerRun(100, func() { UsernameCaseMapped.Append(out, str) }); n > 0 { t.Errorf("got %f allocs, want 0", n) } } func TestTransformMallocs(t *testing.T) { str := []byte("helloworld") out := make([]byte, 0, len(str)) tr := UsernameCaseMapped.NewTransformer() if n := testtext.AllocsPerRun(100, func() { tr.Reset() tr.Transform(out, str, true) }); n > 0 { t.Errorf("got %f allocs, want 0", n) } } func min(a, b int) int { if a < b { return a } return b } // TestTransformerShortBuffers tests that the precis.Transformer implements the // spirit, not just the letter (the method signatures), of the // transform.Transformer interface. // // In particular, it tests that, if one or both of the dst or src buffers are // short, so that multiple Transform calls are required to complete the overall // transformation, the end result is identical to one Transform call with // sufficiently long buffers. func TestTransformerShortBuffers(t *testing.T) { srcUnit := []byte("a\u0300cce\u0301nts") // NFD normalization form. wantUnit := []byte("àccénts") // NFC normalization form. src := bytes.Repeat(srcUnit, 16) want := bytes.Repeat(wantUnit, 16) const long = 4096 dst := make([]byte, long) // 5, 7, 9, 11, 13, 16 and 17 are all pair-wise co-prime, which means that // slicing the dst and src buffers into 5, 7, 13 and 17 byte chunks will // fall at different places inside the repeated srcUnit's and wantUnit's. if len(srcUnit) != 11 || len(wantUnit) != 9 || len(src) > long || len(want) > long { t.Fatal("inconsistent lengths") } tr := NewFreeform().NewTransformer() for _, deltaD := range []int{5, 7, 13, 17, long} { loop: for _, deltaS := range []int{5, 7, 13, 17, long} { tr.Reset() d0 := 0 s0 := 0 for { d1 := min(len(dst), d0+deltaD) s1 := min(len(src), s0+deltaS) nDst, nSrc, err := tr.Transform(dst[d0:d1:d1], src[s0:s1:s1], s1 == len(src)) d0 += nDst s0 += nSrc if err == nil { break } if err == transform.ErrShortDst || err == transform.ErrShortSrc { continue } t.Errorf("deltaD=%d, deltaS=%d: %v", deltaD, deltaS, err) continue loop } if s0 != len(src) { t.Errorf("deltaD=%d, deltaS=%d: s0: got %d, want %d", deltaD, deltaS, s0, len(src)) continue } if d0 != len(want) { t.Errorf("deltaD=%d, deltaS=%d: d0: got %d, want %d", deltaD, deltaS, d0, len(want)) continue } got := dst[:d0] if !bytes.Equal(got, want) { t.Errorf("deltaD=%d, deltaS=%d:\ngot %q\nwant %q", deltaD, deltaS, got, want) continue } } } }