summaryrefslogtreecommitdiffstats
path: root/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go')
-rw-r--r--vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go270
1 files changed, 270 insertions, 0 deletions
diff --git a/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go b/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go
new file mode 100644
index 000000000..605e01c10
--- /dev/null
+++ b/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go
@@ -0,0 +1,270 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Copy of code.google.com/p/codesearch/regexp/utf.go.
+
+package main
+
+import (
+ "regexp/syntax"
+ "unicode"
+ "unicode/utf8"
+)
+
+const (
+ instFail = syntax.InstFail
+ instAlt = syntax.InstAlt
+ instByteRange = syntax.InstRune | 0x80 // local opcode
+
+ argFold = 1 << 16
+)
+
+func toByteProg(prog *syntax.Prog) error {
+ var b runeBuilder
+ for pc := range prog.Inst {
+ i := &prog.Inst[pc]
+ switch i.Op {
+ case syntax.InstRune, syntax.InstRune1:
+ // General rune range. PIA.
+ // TODO: Pick off single-byte case.
+ if lo, hi, fold, ok := oneByteRange(i); ok {
+ i.Op = instByteRange
+ i.Arg = uint32(lo)<<8 | uint32(hi)
+ if fold {
+ i.Arg |= argFold
+ }
+ break
+ }
+
+ r := i.Rune
+ if syntax.Flags(i.Arg)&syntax.FoldCase != 0 {
+ // Build folded list.
+ var rr []rune
+ if len(r) == 1 {
+ rr = appendFoldedRange(rr, r[0], r[0])
+ } else {
+ for j := 0; j < len(r); j += 2 {
+ rr = appendFoldedRange(rr, r[j], r[j+1])
+ }
+ }
+ r = rr
+ }
+
+ b.init(prog, uint32(pc), i.Out)
+ if len(r) == 1 {
+ b.addRange(r[0], r[0], false)
+ } else {
+ for j := 0; j < len(r); j += 2 {
+ b.addRange(r[j], r[j+1], false)
+ }
+ }
+
+ case syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+ // All runes.
+ // AnyNotNL should exclude \n but the line-at-a-time
+ // execution takes care of that for us.
+ b.init(prog, uint32(pc), i.Out)
+ b.addRange(0, unicode.MaxRune, false)
+ }
+ }
+ return nil
+}
+
+func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) {
+ if i.Op == syntax.InstRune1 {
+ r := i.Rune[0]
+ if r < utf8.RuneSelf {
+ return byte(r), byte(r), false, true
+ }
+ }
+ if i.Op != syntax.InstRune {
+ return
+ }
+ fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0
+ if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] {
+ r := i.Rune[0]
+ if r >= utf8.RuneSelf {
+ return
+ }
+ if fold && !asciiFold(r) {
+ return
+ }
+ return byte(r), byte(r), fold, true
+ }
+ if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf {
+ if fold {
+ for r := i.Rune[0]; r <= i.Rune[1]; r++ {
+ if asciiFold(r) {
+ return
+ }
+ }
+ }
+ return byte(i.Rune[0]), byte(i.Rune[1]), fold, true
+ }
+ if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] {
+ return byte(i.Rune[0]), byte(i.Rune[0]), true, true
+ }
+
+ return
+}
+
+func asciiFold(r rune) bool {
+ if r >= utf8.RuneSelf {
+ return false
+ }
+ r1 := unicode.SimpleFold(r)
+ if r1 >= utf8.RuneSelf {
+ return false
+ }
+ if r1 == r {
+ return true
+ }
+ return unicode.SimpleFold(r1) == r
+}
+
+func maxRune(n int) rune {
+ b := 0
+ if n == 1 {
+ b = 7
+ } else {
+ b = 8 - (n + 1) + 6*(n-1)
+ }
+ return 1<<uint(b) - 1
+}
+
+type cacheKey struct {
+ lo, hi uint8
+ fold bool
+ next uint32
+}
+
+type runeBuilder struct {
+ begin uint32
+ out uint32
+ cache map[cacheKey]uint32
+ p *syntax.Prog
+}
+
+func (b *runeBuilder) init(p *syntax.Prog, begin, out uint32) {
+ // We will rewrite p.Inst[begin] to hold the accumulated
+ // machine. For now, there is no match.
+ p.Inst[begin].Op = instFail
+
+ b.begin = begin
+ b.out = out
+ if b.cache == nil {
+ b.cache = make(map[cacheKey]uint32)
+ }
+ for k := range b.cache {
+ delete(b.cache, k)
+ }
+ b.p = p
+}
+
+func (b *runeBuilder) uncachedSuffix(lo, hi byte, fold bool, next uint32) uint32 {
+ if next == 0 {
+ next = b.out
+ }
+ pc := len(b.p.Inst)
+ i := syntax.Inst{Op: instByteRange, Arg: uint32(lo)<<8 | uint32(hi), Out: next}
+ if fold {
+ i.Arg |= argFold
+ }
+ b.p.Inst = append(b.p.Inst, i)
+ return uint32(pc)
+}
+
+func (b *runeBuilder) suffix(lo, hi byte, fold bool, next uint32) uint32 {
+ if lo < 0x80 || hi > 0xbf {
+ // Not a continuation byte, no need to cache.
+ return b.uncachedSuffix(lo, hi, fold, next)
+ }
+
+ key := cacheKey{lo, hi, fold, next}
+ if pc, ok := b.cache[key]; ok {
+ return pc
+ }
+
+ pc := b.uncachedSuffix(lo, hi, fold, next)
+ b.cache[key] = pc
+ return pc
+}
+
+func (b *runeBuilder) addBranch(pc uint32) {
+ // Add pc to the branch at the beginning.
+ i := &b.p.Inst[b.begin]
+ switch i.Op {
+ case syntax.InstFail:
+ i.Op = syntax.InstNop
+ i.Out = pc
+ return
+ case syntax.InstNop:
+ i.Op = syntax.InstAlt
+ i.Arg = pc
+ return
+ case syntax.InstAlt:
+ apc := uint32(len(b.p.Inst))
+ b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc})
+ i = &b.p.Inst[b.begin]
+ i.Arg = apc
+ b.begin = apc
+ }
+}
+
+func (b *runeBuilder) addRange(lo, hi rune, fold bool) {
+ if lo > hi {
+ return
+ }
+
+ // TODO: Pick off 80-10FFFF for special handling?
+ if lo == 0x80 && hi == 0x10FFFF {
+ }
+
+ // Split range into same-length sized ranges.
+ for i := 1; i < utf8.UTFMax; i++ {
+ max := maxRune(i)
+ if lo <= max && max < hi {
+ b.addRange(lo, max, fold)
+ b.addRange(max+1, hi, fold)
+ return
+ }
+ }
+
+ // ASCII range is special.
+ if hi < utf8.RuneSelf {
+ b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0))
+ return
+ }
+
+ // Split range into sections that agree on leading bytes.
+ for i := 1; i < utf8.UTFMax; i++ {
+ m := rune(1)<<uint(6*i) - 1 // last i bytes of UTF-8 sequence
+ if lo&^m != hi&^m {
+ if lo&m != 0 {
+ b.addRange(lo, lo|m, fold)
+ b.addRange((lo|m)+1, hi, fold)
+ return
+ }
+ if hi&m != m {
+ b.addRange(lo, hi&^m-1, fold)
+ b.addRange(hi&^m, hi, fold)
+ return
+ }
+ }
+ }
+
+ // Finally. Generate byte matching equivalent for lo-hi.
+ var ulo, uhi [utf8.UTFMax]byte
+ n := utf8.EncodeRune(ulo[:], lo)
+ m := utf8.EncodeRune(uhi[:], hi)
+ if n != m {
+ panic("codesearch/regexp: bad utf-8 math")
+ }
+
+ pc := uint32(0)
+ for i := n - 1; i >= 0; i-- {
+ pc = b.suffix(ulo[i], uhi[i], false, pc)
+ }
+ b.addBranch(pc)
+}