diff options
author | Christopher Speller <crspeller@gmail.com> | 2016-05-12 23:56:07 -0400 |
---|---|---|
committer | Christopher Speller <crspeller@gmail.com> | 2016-05-12 23:56:07 -0400 |
commit | 38ee83e45b4de7edf89bf9f0ef629eb4c6ad0fa8 (patch) | |
tree | a4fde09672192b97d453ad605b030bd5a10c5a45 /vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go | |
parent | 84d2482ddbff9564c9ad75b2d30af66e3ddfd44d (diff) | |
download | chat-38ee83e45b4de7edf89bf9f0ef629eb4c6ad0fa8.tar.gz chat-38ee83e45b4de7edf89bf9f0ef629eb4c6ad0fa8.tar.bz2 chat-38ee83e45b4de7edf89bf9f0ef629eb4c6ad0fa8.zip |
Moving to glide
Diffstat (limited to 'vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go')
-rw-r--r-- | vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go | 270 |
1 files changed, 270 insertions, 0 deletions
diff --git a/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go b/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go new file mode 100644 index 000000000..605e01c10 --- /dev/null +++ b/vendor/github.com/mattermost/rsc/regexp/regmerge/utf.go @@ -0,0 +1,270 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copy of code.google.com/p/codesearch/regexp/utf.go. + +package main + +import ( + "regexp/syntax" + "unicode" + "unicode/utf8" +) + +const ( + instFail = syntax.InstFail + instAlt = syntax.InstAlt + instByteRange = syntax.InstRune | 0x80 // local opcode + + argFold = 1 << 16 +) + +func toByteProg(prog *syntax.Prog) error { + var b runeBuilder + for pc := range prog.Inst { + i := &prog.Inst[pc] + switch i.Op { + case syntax.InstRune, syntax.InstRune1: + // General rune range. PIA. + // TODO: Pick off single-byte case. + if lo, hi, fold, ok := oneByteRange(i); ok { + i.Op = instByteRange + i.Arg = uint32(lo)<<8 | uint32(hi) + if fold { + i.Arg |= argFold + } + break + } + + r := i.Rune + if syntax.Flags(i.Arg)&syntax.FoldCase != 0 { + // Build folded list. + var rr []rune + if len(r) == 1 { + rr = appendFoldedRange(rr, r[0], r[0]) + } else { + for j := 0; j < len(r); j += 2 { + rr = appendFoldedRange(rr, r[j], r[j+1]) + } + } + r = rr + } + + b.init(prog, uint32(pc), i.Out) + if len(r) == 1 { + b.addRange(r[0], r[0], false) + } else { + for j := 0; j < len(r); j += 2 { + b.addRange(r[j], r[j+1], false) + } + } + + case syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + // All runes. + // AnyNotNL should exclude \n but the line-at-a-time + // execution takes care of that for us. + b.init(prog, uint32(pc), i.Out) + b.addRange(0, unicode.MaxRune, false) + } + } + return nil +} + +func oneByteRange(i *syntax.Inst) (lo, hi byte, fold, ok bool) { + if i.Op == syntax.InstRune1 { + r := i.Rune[0] + if r < utf8.RuneSelf { + return byte(r), byte(r), false, true + } + } + if i.Op != syntax.InstRune { + return + } + fold = syntax.Flags(i.Arg)&syntax.FoldCase != 0 + if len(i.Rune) == 1 || len(i.Rune) == 2 && i.Rune[0] == i.Rune[1] { + r := i.Rune[0] + if r >= utf8.RuneSelf { + return + } + if fold && !asciiFold(r) { + return + } + return byte(r), byte(r), fold, true + } + if len(i.Rune) == 2 && i.Rune[1] < utf8.RuneSelf { + if fold { + for r := i.Rune[0]; r <= i.Rune[1]; r++ { + if asciiFold(r) { + return + } + } + } + return byte(i.Rune[0]), byte(i.Rune[1]), fold, true + } + if len(i.Rune) == 4 && i.Rune[0] == i.Rune[1] && i.Rune[2] == i.Rune[3] && unicode.SimpleFold(i.Rune[0]) == i.Rune[2] && unicode.SimpleFold(i.Rune[2]) == i.Rune[0] { + return byte(i.Rune[0]), byte(i.Rune[0]), true, true + } + + return +} + +func asciiFold(r rune) bool { + if r >= utf8.RuneSelf { + return false + } + r1 := unicode.SimpleFold(r) + if r1 >= utf8.RuneSelf { + return false + } + if r1 == r { + return true + } + return unicode.SimpleFold(r1) == r +} + +func maxRune(n int) rune { + b := 0 + if n == 1 { + b = 7 + } else { + b = 8 - (n + 1) + 6*(n-1) + } + return 1<<uint(b) - 1 +} + +type cacheKey struct { + lo, hi uint8 + fold bool + next uint32 +} + +type runeBuilder struct { + begin uint32 + out uint32 + cache map[cacheKey]uint32 + p *syntax.Prog +} + +func (b *runeBuilder) init(p *syntax.Prog, begin, out uint32) { + // We will rewrite p.Inst[begin] to hold the accumulated + // machine. For now, there is no match. + p.Inst[begin].Op = instFail + + b.begin = begin + b.out = out + if b.cache == nil { + b.cache = make(map[cacheKey]uint32) + } + for k := range b.cache { + delete(b.cache, k) + } + b.p = p +} + +func (b *runeBuilder) uncachedSuffix(lo, hi byte, fold bool, next uint32) uint32 { + if next == 0 { + next = b.out + } + pc := len(b.p.Inst) + i := syntax.Inst{Op: instByteRange, Arg: uint32(lo)<<8 | uint32(hi), Out: next} + if fold { + i.Arg |= argFold + } + b.p.Inst = append(b.p.Inst, i) + return uint32(pc) +} + +func (b *runeBuilder) suffix(lo, hi byte, fold bool, next uint32) uint32 { + if lo < 0x80 || hi > 0xbf { + // Not a continuation byte, no need to cache. + return b.uncachedSuffix(lo, hi, fold, next) + } + + key := cacheKey{lo, hi, fold, next} + if pc, ok := b.cache[key]; ok { + return pc + } + + pc := b.uncachedSuffix(lo, hi, fold, next) + b.cache[key] = pc + return pc +} + +func (b *runeBuilder) addBranch(pc uint32) { + // Add pc to the branch at the beginning. + i := &b.p.Inst[b.begin] + switch i.Op { + case syntax.InstFail: + i.Op = syntax.InstNop + i.Out = pc + return + case syntax.InstNop: + i.Op = syntax.InstAlt + i.Arg = pc + return + case syntax.InstAlt: + apc := uint32(len(b.p.Inst)) + b.p.Inst = append(b.p.Inst, syntax.Inst{Op: instAlt, Out: i.Arg, Arg: pc}) + i = &b.p.Inst[b.begin] + i.Arg = apc + b.begin = apc + } +} + +func (b *runeBuilder) addRange(lo, hi rune, fold bool) { + if lo > hi { + return + } + + // TODO: Pick off 80-10FFFF for special handling? + if lo == 0x80 && hi == 0x10FFFF { + } + + // Split range into same-length sized ranges. + for i := 1; i < utf8.UTFMax; i++ { + max := maxRune(i) + if lo <= max && max < hi { + b.addRange(lo, max, fold) + b.addRange(max+1, hi, fold) + return + } + } + + // ASCII range is special. + if hi < utf8.RuneSelf { + b.addBranch(b.suffix(byte(lo), byte(hi), fold, 0)) + return + } + + // Split range into sections that agree on leading bytes. + for i := 1; i < utf8.UTFMax; i++ { + m := rune(1)<<uint(6*i) - 1 // last i bytes of UTF-8 sequence + if lo&^m != hi&^m { + if lo&m != 0 { + b.addRange(lo, lo|m, fold) + b.addRange((lo|m)+1, hi, fold) + return + } + if hi&m != m { + b.addRange(lo, hi&^m-1, fold) + b.addRange(hi&^m, hi, fold) + return + } + } + } + + // Finally. Generate byte matching equivalent for lo-hi. + var ulo, uhi [utf8.UTFMax]byte + n := utf8.EncodeRune(ulo[:], lo) + m := utf8.EncodeRune(uhi[:], hi) + if n != m { + panic("codesearch/regexp: bad utf-8 math") + } + + pc := uint32(0) + for i := n - 1; i >= 0; i-- { + pc = b.suffix(ulo[i], uhi[i], false, pc) + } + b.addBranch(pc) +} |