From e73f1d73143ebba9c7e80d21c45bba9b61f2611c Mon Sep 17 00:00:00 2001 From: Hyeseong Kim Date: Tue, 1 May 2018 01:54:11 +0900 Subject: MM-9072/MM-10185 Force-convert the encoding of OpenGraph metadata to UTF-8 (#8631) * Force-convert non-UTF8 HTML to UTF8 before opengraph processing * Split the force-encoding function * Add benchmark Test for the forceHTMLEncodingToUTF8() ``` Running tool: /home/comet/go-v1.9.2/bin/go test -benchmem -run=^$ github.com/mattermost/mattermost-server/app -bench ^BenchmarkForceHTMLEncodingToUTF8$ [03:32:58 KST 2018/04/21] [INFO] (github.com/mattermost/mattermost-server/app.TestMain:28) -test.run used, not creating temporary containers goos: linux goarch: amd64 pkg: github.com/mattermost/mattermost-server/app BenchmarkForceHTMLEncodingToUTF8/with_converting-4 100000 11201 ns/op 18704 B/op 32 allocs/op BenchmarkForceHTMLEncodingToUTF8/without_converting-4 300000 3931 ns/op 4632 B/op 13 allocs/op PASS ok github.com/mattermost/mattermost-server/app 2.703s Success: Benchmarks passed. ``` * Remove an unnecessary constraint * Add pre-check if content-type header is already utf-8 * Move the checking for utf-8 into forceHTMLEncodingToUTF8() for testing * Revert df3f347213faa0d023c26d201fa6531f46391086..HEAD, without Gopkg.lock --- vendor/golang.org/x/text/runes/cond.go | 187 +++++++++++++++++ vendor/golang.org/x/text/runes/runes.go | 355 ++++++++++++++++++++++++++++++++ 2 files changed, 542 insertions(+) create mode 100644 vendor/golang.org/x/text/runes/cond.go create mode 100644 vendor/golang.org/x/text/runes/runes.go (limited to 'vendor/golang.org/x/text/runes') diff --git a/vendor/golang.org/x/text/runes/cond.go b/vendor/golang.org/x/text/runes/cond.go new file mode 100644 index 000000000..df7aa02db --- /dev/null +++ b/vendor/golang.org/x/text/runes/cond.go @@ -0,0 +1,187 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package runes + +import ( + "unicode/utf8" + + "golang.org/x/text/transform" +) + +// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is. +// This is done for various reasons: +// - To retain the semantics of the Nop transformer: if input is passed to a Nop +// one would expect it to be unchanged. +// - It would be very expensive to pass a converted RuneError to a transformer: +// a transformer might need more source bytes after RuneError, meaning that +// the only way to pass it safely is to create a new buffer and manage the +// intermingling of RuneErrors and normal input. +// - Many transformers leave ill-formed UTF-8 as is, so this is not +// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a +// logical consequence of the operation (as for Map) or if it otherwise would +// pose security concerns (as for Remove). +// - An alternative would be to return an error on ill-formed UTF-8, but this +// would be inconsistent with other operations. + +// If returns a transformer that applies tIn to consecutive runes for which +// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset +// is called on tIn and tNotIn at the start of each run. A Nop transformer will +// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated +// to RuneError to determine which transformer to apply, but is passed as is to +// the respective transformer. +func If(s Set, tIn, tNotIn transform.Transformer) Transformer { + if tIn == nil && tNotIn == nil { + return Transformer{transform.Nop} + } + if tIn == nil { + tIn = transform.Nop + } + if tNotIn == nil { + tNotIn = transform.Nop + } + sIn, ok := tIn.(transform.SpanningTransformer) + if !ok { + sIn = dummySpan{tIn} + } + sNotIn, ok := tNotIn.(transform.SpanningTransformer) + if !ok { + sNotIn = dummySpan{tNotIn} + } + + a := &cond{ + tIn: sIn, + tNotIn: sNotIn, + f: s.Contains, + } + a.Reset() + return Transformer{a} +} + +type dummySpan struct{ transform.Transformer } + +func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) { + return 0, transform.ErrEndOfSpan +} + +type cond struct { + tIn, tNotIn transform.SpanningTransformer + f func(rune) bool + check func(rune) bool // current check to perform + t transform.SpanningTransformer // current transformer to use +} + +// Reset implements transform.Transformer. +func (t *cond) Reset() { + t.check = t.is + t.t = t.tIn + t.t.Reset() // notIn will be reset on first usage. +} + +func (t *cond) is(r rune) bool { + if t.f(r) { + return true + } + t.check = t.isNot + t.t = t.tNotIn + t.tNotIn.Reset() + return false +} + +func (t *cond) isNot(r rune) bool { + if !t.f(r) { + return true + } + t.check = t.is + t.t = t.tIn + t.tIn.Reset() + return false +} + +// This implementation of Span doesn't help all too much, but it needs to be +// there to satisfy this package's Transformer interface. +// TODO: there are certainly room for improvements, though. For example, if +// t.t == transform.Nop (which will a common occurrence) it will save a bundle +// to special-case that loop. +func (t *cond) Span(src []byte, atEOF bool) (n int, err error) { + p := 0 + for n < len(src) && err == nil { + // Don't process too much at a time as the Spanner that will be + // called on this block may terminate early. + const maxChunk = 4096 + max := len(src) + if v := n + maxChunk; v < max { + max = v + } + atEnd := false + size := 0 + current := t.t + for ; p < max; p += size { + r := rune(src[p]) + if r < utf8.RuneSelf { + size = 1 + } else if r, size = utf8.DecodeRune(src[p:]); size == 1 { + if !atEOF && !utf8.FullRune(src[p:]) { + err = transform.ErrShortSrc + break + } + } + if !t.check(r) { + // The next rune will be the start of a new run. + atEnd = true + break + } + } + n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src))) + n += n2 + if err2 != nil { + return n, err2 + } + // At this point either err != nil or t.check will pass for the rune at p. + p = n + size + } + return n, err +} + +func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + p := 0 + for nSrc < len(src) && err == nil { + // Don't process too much at a time, as the work might be wasted if the + // destination buffer isn't large enough to hold the result or a + // transform returns an error early. + const maxChunk = 4096 + max := len(src) + if n := nSrc + maxChunk; n < len(src) { + max = n + } + atEnd := false + size := 0 + current := t.t + for ; p < max; p += size { + r := rune(src[p]) + if r < utf8.RuneSelf { + size = 1 + } else if r, size = utf8.DecodeRune(src[p:]); size == 1 { + if !atEOF && !utf8.FullRune(src[p:]) { + err = transform.ErrShortSrc + break + } + } + if !t.check(r) { + // The next rune will be the start of a new run. + atEnd = true + break + } + } + nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src))) + nDst += nDst2 + nSrc += nSrc2 + if err2 != nil { + return nDst, nSrc, err2 + } + // At this point either err != nil or t.check will pass for the rune at p. + p = nSrc + size + } + return nDst, nSrc, err +} diff --git a/vendor/golang.org/x/text/runes/runes.go b/vendor/golang.org/x/text/runes/runes.go new file mode 100644 index 000000000..71933696f --- /dev/null +++ b/vendor/golang.org/x/text/runes/runes.go @@ -0,0 +1,355 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package runes provide transforms for UTF-8 encoded text. +package runes // import "golang.org/x/text/runes" + +import ( + "unicode" + "unicode/utf8" + + "golang.org/x/text/transform" +) + +// A Set is a collection of runes. +type Set interface { + // Contains returns true if r is contained in the set. + Contains(r rune) bool +} + +type setFunc func(rune) bool + +func (s setFunc) Contains(r rune) bool { + return s(r) +} + +// Note: using funcs here instead of wrapping types result in cleaner +// documentation and a smaller API. + +// In creates a Set with a Contains method that returns true for all runes in +// the given RangeTable. +func In(rt *unicode.RangeTable) Set { + return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) +} + +// In creates a Set with a Contains method that returns true for all runes not +// in the given RangeTable. +func NotIn(rt *unicode.RangeTable) Set { + return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) +} + +// Predicate creates a Set with a Contains method that returns f(r). +func Predicate(f func(rune) bool) Set { + return setFunc(f) +} + +// Transformer implements the transform.Transformer interface. +type Transformer struct { + t transform.SpanningTransformer +} + +func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + return t.t.Transform(dst, src, atEOF) +} + +func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { + return t.t.Span(b, atEOF) +} + +func (t Transformer) Reset() { t.t.Reset() } + +// Bytes returns a new byte slice with the result of converting b using t. It +// calls Reset on t. It returns nil if any error was found. This can only happen +// if an error-producing Transformer is passed to If. +func (t Transformer) Bytes(b []byte) []byte { + b, _, err := transform.Bytes(t, b) + if err != nil { + return nil + } + return b +} + +// String returns a string with the result of converting s using t. It calls +// Reset on t. It returns the empty string if any error was found. This can only +// happen if an error-producing Transformer is passed to If. +func (t Transformer) String(s string) string { + s, _, err := transform.String(t, s) + if err != nil { + return "" + } + return s +} + +// TODO: +// - Copy: copying strings and bytes in whole-rune units. +// - Validation (maybe) +// - Well-formed-ness (maybe) + +const runeErrorString = string(utf8.RuneError) + +// Remove returns a Transformer that removes runes r for which s.Contains(r). +// Illegal input bytes are replaced by RuneError before being passed to f. +func Remove(s Set) Transformer { + if f, ok := s.(setFunc); ok { + // This little trick cuts the running time of BenchmarkRemove for sets + // created by Predicate roughly in half. + // TODO: special-case RangeTables as well. + return Transformer{remove(f)} + } + return Transformer{remove(s.Contains)} +} + +// TODO: remove transform.RemoveFunc. + +type remove func(r rune) bool + +func (remove) Reset() {} + +// Span implements transform.Spanner. +func (t remove) Span(src []byte, atEOF bool) (n int, err error) { + for r, size := rune(0), 0; n < len(src); { + if r = rune(src[n]); r < utf8.RuneSelf { + size = 1 + } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[n:]) { + err = transform.ErrShortSrc + } else { + err = transform.ErrEndOfSpan + } + break + } + if t(r) { + err = transform.ErrEndOfSpan + break + } + n += size + } + return +} + +// Transform implements transform.Transformer. +func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + for r, size := rune(0), 0; nSrc < len(src); { + if r = rune(src[nSrc]); r < utf8.RuneSelf { + size = 1 + } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + // We replace illegal bytes with RuneError. Not doing so might + // otherwise turn a sequence of invalid UTF-8 into valid UTF-8. + // The resulting byte sequence may subsequently contain runes + // for which t(r) is true that were passed unnoticed. + if !t(utf8.RuneError) { + if nDst+3 > len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst+0] = runeErrorString[0] + dst[nDst+1] = runeErrorString[1] + dst[nDst+2] = runeErrorString[2] + nDst += 3 + } + nSrc++ + continue + } + if t(r) { + nSrc += size + continue + } + if nDst+size > len(dst) { + err = transform.ErrShortDst + break + } + for i := 0; i < size; i++ { + dst[nDst] = src[nSrc] + nDst++ + nSrc++ + } + } + return +} + +// Map returns a Transformer that maps the runes in the input using the given +// mapping. Illegal bytes in the input are converted to utf8.RuneError before +// being passed to the mapping func. +func Map(mapping func(rune) rune) Transformer { + return Transformer{mapper(mapping)} +} + +type mapper func(rune) rune + +func (mapper) Reset() {} + +// Span implements transform.Spanner. +func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { + for r, size := rune(0), 0; n < len(src); n += size { + if r = rune(src[n]); r < utf8.RuneSelf { + size = 1 + } else if r, size = utf8.DecodeRune(src[n:]); size == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[n:]) { + err = transform.ErrShortSrc + } else { + err = transform.ErrEndOfSpan + } + break + } + if t(r) != r { + err = transform.ErrEndOfSpan + break + } + } + return n, err +} + +// Transform implements transform.Transformer. +func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + var replacement rune + var b [utf8.UTFMax]byte + + for r, size := rune(0), 0; nSrc < len(src); { + if r = rune(src[nSrc]); r < utf8.RuneSelf { + if replacement = t(r); replacement < utf8.RuneSelf { + if nDst == len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst] = byte(replacement) + nDst++ + nSrc++ + continue + } + size = 1 + } else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { + // Invalid rune. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + + if replacement = t(utf8.RuneError); replacement == utf8.RuneError { + if nDst+3 > len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst+0] = runeErrorString[0] + dst[nDst+1] = runeErrorString[1] + dst[nDst+2] = runeErrorString[2] + nDst += 3 + nSrc++ + continue + } + } else if replacement = t(r); replacement == r { + if nDst+size > len(dst) { + err = transform.ErrShortDst + break + } + for i := 0; i < size; i++ { + dst[nDst] = src[nSrc] + nDst++ + nSrc++ + } + continue + } + + n := utf8.EncodeRune(b[:], replacement) + + if nDst+n > len(dst) { + err = transform.ErrShortDst + break + } + for i := 0; i < n; i++ { + dst[nDst] = b[i] + nDst++ + } + nSrc += size + } + return +} + +// ReplaceIllFormed returns a transformer that replaces all input bytes that are +// not part of a well-formed UTF-8 code sequence with utf8.RuneError. +func ReplaceIllFormed() Transformer { + return Transformer{&replaceIllFormed{}} +} + +type replaceIllFormed struct{ transform.NopResetter } + +func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { + for n < len(src) { + // ASCII fast path. + if src[n] < utf8.RuneSelf { + n++ + continue + } + + r, size := utf8.DecodeRune(src[n:]) + + // Look for a valid non-ASCII rune. + if r != utf8.RuneError || size != 1 { + n += size + continue + } + + // Look for short source data. + if !atEOF && !utf8.FullRune(src[n:]) { + err = transform.ErrShortSrc + break + } + + // We have an invalid rune. + err = transform.ErrEndOfSpan + break + } + return n, err +} + +func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { + for nSrc < len(src) { + // ASCII fast path. + if r := src[nSrc]; r < utf8.RuneSelf { + if nDst == len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst] = r + nDst++ + nSrc++ + continue + } + + // Look for a valid non-ASCII rune. + if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { + if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { + err = transform.ErrShortDst + break + } + nDst += size + nSrc += size + continue + } + + // Look for short source data. + if !atEOF && !utf8.FullRune(src[nSrc:]) { + err = transform.ErrShortSrc + break + } + + // We have an invalid rune. + if nDst+3 > len(dst) { + err = transform.ErrShortDst + break + } + dst[nDst+0] = runeErrorString[0] + dst[nDst+1] = runeErrorString[1] + dst[nDst+2] = runeErrorString[2] + nDst += 3 + nSrc++ + } + return nDst, nSrc, err +} -- cgit v1.2.3-1-g7c22