summaryrefslogtreecommitdiffstats
path: root/vendor/golang.org/x/text/cases/icu_test.go
blob: 3d07e2588e468370fc6a38af2e701dbba9962f50 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build icu

package cases

import (
	"path"
	"strings"
	"testing"

	"golang.org/x/text/internal/testtext"
	"golang.org/x/text/language"
	"golang.org/x/text/unicode/norm"
)

func TestICUConformance(t *testing.T) {
	// Build test set.
	input := []string{
		"a.a a_a",
		"a\u05d0a",
		"\u05d0'a",
		"a\u03084a",
		"a\u0308a",
		"a3\u30a3a",
		"a\u303aa",
		"a_\u303a_a",
		"1_a..a",
		"1_a.a",
		"a..a.",
		"a--a-",
		"a-a-",
		"a\u200ba",
		"a\u200b\u200ba",
		"a\u00ad\u00ada", // Format
		"a\u00ada",
		"a''a", // SingleQuote
		"a'a",
		"a::a", // MidLetter
		"a:a",
		"a..a", // MidNumLet
		"a.a",
		"a;;a", // MidNum
		"a;a",
		"a__a", // ExtendNumlet
		"a_a",
		"ΟΣ''a",
	}
	add := func(x interface{}) {
		switch v := x.(type) {
		case string:
			input = append(input, v)
		case []string:
			for _, s := range v {
				input = append(input, s)
			}
		}
	}
	for _, tc := range testCases {
		add(tc.src)
		add(tc.lower)
		add(tc.upper)
		add(tc.title)
	}
	for _, tc := range bufferTests {
		add(tc.src)
	}
	for _, tc := range breakTest {
		add(strings.Replace(tc, "|", "", -1))
	}
	for _, tc := range foldTestCases {
		add(tc)
	}

	// Compare ICU to Go.
	for _, c := range []string{"lower", "upper", "title", "fold"} {
		for _, tag := range []string{
			"und", "af", "az", "el", "lt", "nl", "tr",
		} {
			for _, s := range input {
				if exclude(c, tag, s) {
					continue
				}
				testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
					want := doICU(tag, c, s)
					got := doGo(tag, c, s)
					if norm.NFC.String(got) != norm.NFC.String(want) {
						t.Errorf("\n    in %[3]q (%+[3]q)\n   got %[1]q (%+[1]q)\n  want %[2]q (%+[2]q)", got, want, s)
					}
				})
			}
		}
	}
}

// exclude indicates if a string should be excluded from testing.
func exclude(cm, tag, s string) bool {
	list := []struct{ cm, tags, pattern string }{
		// TODO: Go does not handle certain esoteric breaks correctly. This will be
		// fixed once we have a real word break iterator. Alternatively, it
		// seems like we're not too far off from making it work, so we could
		// fix these last steps. But first verify that using a separate word
		// breaker does not hurt performance.
		{"title", "af nl", "a''a"},
		{"", "", "א'a"},

		// All the exclusions below seem to be issues with the ICU
		// implementation (at version 57) and thus are not marked as TODO.

		// ICU does not handle leading apostrophe for Dutch and
		// Afrikaans correctly. See http://unicode.org/cldr/trac/ticket/7078.
		{"title", "af nl", "'n"},
		{"title", "af nl", "'N"},

		// Go terminates the final sigma check after a fixed number of
		// ignorables have been found. This ensures that the algorithm can make
		// progress in a streaming scenario.
		{"lower title", "", "\u039f\u03a3...............................a"},
		// This also applies to upper in Greek.
		// NOTE: we could fix the following two cases by adding state to elUpper
		// and aztrLower. However, considering a modifier to not belong to the
		// preceding letter after the maximum modifiers count is reached is
		// consistent with the behavior of unicode/norm.
		{"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
		{"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
		{"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},

		// ICU title case seems to erroneously removes \u0307 from an upper case
		// I unconditionally, instead of only when lowercasing. The ICU
		// transform algorithm transforms these cases consistently with our
		// implementation.
		{"title", "az tr", "\u0307"},

		// The spec says to remove \u0307 after Soft-Dotted characters. ICU
		// transforms conform but ucasemap_utf8ToUpper does not.
		{"upper title", "lt", "i\u0307"},
		{"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},

		// Both Unicode and CLDR prescribe an extra explicit dot above after a
		// Soft_Dotted character if there are other modifiers.
		// ucasemap_utf8ToUpper does not do this; ICU transforms do.
		// The issue with ucasemap_utf8ToUpper seems to be that it does not
		// consider the modifiers that are part of composition in the evaluation
		// of More_Above. For instance, according to the More_Above rule for lt,
		// a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with
		// two additional dots). This seems odd, but is correct. ICU is
		// definitely not correct as it produces different results for different
		// normal forms. For instance, for an İ:
		//    \u0130  (NFC) -> i\u0307         (incorrect)
		//    I\u0307 (NFD) -> i\u0307\u0307   (correct)
		// We could argue that we should not add a \u0307 if there already is
		// one, but this may be hard to get correct and is not conform the
		// standard.
		{"lower title", "lt", "\u0130"},
		{"lower title", "lt", "\u00cf"},

		// We are conform ICU ucasemap_utf8ToUpper if we remove support for
		// elUpper. However, this is clearly not conform the spec. Moreover, the
		// ICU transforms _do_ implement this transform and produces results
		// consistent with our implementation. Note that we still prefer to use
		// ucasemap_utf8ToUpper instead of transforms as the latter have
		// inconsistencies in the word breaking algorithm.
		{"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
		{"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS

		{"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
		{"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
		{"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA

		{"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
		{"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
		{"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA

		{"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
		{"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
		{"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
	}
	for _, x := range list {
		if x.cm != "" && strings.Index(x.cm, cm) == -1 {
			continue
		}
		if x.tags != "" && strings.Index(x.tags, tag) == -1 {
			continue
		}
		if strings.Index(s, x.pattern) != -1 {
			return true
		}
	}
	return false
}

func doGo(tag, caser, input string) string {
	var c Caser
	t := language.MustParse(tag)
	switch caser {
	case "lower":
		c = Lower(t)
	case "upper":
		c = Upper(t)
	case "title":
		c = Title(t)
	case "fold":
		c = Fold()
	}
	return c.String(input)
}