diff options
Diffstat (limited to 'vendor/github.com/mattermost/rsc/regexp/regmerge/match.go')
-rw-r--r-- | vendor/github.com/mattermost/rsc/regexp/regmerge/match.go | 406 |
1 files changed, 406 insertions, 0 deletions
diff --git a/vendor/github.com/mattermost/rsc/regexp/regmerge/match.go b/vendor/github.com/mattermost/rsc/regexp/regmerge/match.go new file mode 100644 index 000000000..6a4b1f967 --- /dev/null +++ b/vendor/github.com/mattermost/rsc/regexp/regmerge/match.go @@ -0,0 +1,406 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Copied from code.google.com/p/codesearch/regexp/copy.go +// and adapted for the problem of finding a matching string, not +// testing whether a particular string matches. + +package main + +import ( + "encoding/binary" + "errors" + "fmt" + "hash/fnv" + "hash" + "log" + "regexp/syntax" +) + +// A matcher holds the state for running regular expression search. +type matcher struct { + buf []byte + prog *syntax.Prog // compiled program + dstate map[uint32]*dstate // dstate cache + start *dstate // start state + startLine *dstate // start state for beginning of line + z1, z2, z3 nstate // three temporary nstates + ids []int + numState int + maxState int + numByte int + numMatch int + undo [256]byte + all *dstate + allTail **dstate + h hash.Hash32 +} + +// An nstate corresponds to an NFA state. +type nstate struct { + q Set // queue of program instructions + flag flags // flags (TODO) + needFlag syntax.EmptyOp +} + +// The flags record state about a position between bytes in the text. +type flags uint32 + +const ( + flagBOL flags = 1 << iota // beginning of line + flagEOL // end of line + flagBOT // beginning of text + flagEOT // end of text + flagWord // last byte was word byte +) + +// A dstate corresponds to a DFA state. +type dstate struct { + enc string // encoded nstate + nextAll *dstate + nextHash *dstate + prev *dstate + prevByte int + done bool +} + +func (z *nstate) String() string { + return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.needFlag) +} + +// enc encodes z as a string. +func (m *matcher) enc(z *nstate) []byte { + buf := m.buf[:0] + buf = append(buf, byte(z.needFlag), byte(z.flag)) + ids := m.ids[:0] + for _, id := range z.q.Dense() { + ids = append(ids, int(id)) + } + sortInts(ids) + last := ^uint32(0) + for _, id := range ids { + x := uint32(id)-last + last = uint32(id) + for x >= 0x80 { + buf = append(buf, byte(x)|0x80) + x >>= 7 + } + buf = append(buf, byte(x)) + } + m.buf = buf + return buf +} + +// dec decodes the encoding s into z. +func (m *matcher) dec(z *nstate, s string) { + b := append(m.buf[:0], s...) + m.buf = b + z.needFlag = syntax.EmptyOp(b[0]) + b = b[1:] + i, n := binary.Uvarint(b) + if n <= 0 { + bug() + } + b = b[n:] + z.flag = flags(i) + z.q.Reset() + last := ^uint32(0) + for len(b) > 0 { + i, n = binary.Uvarint(b) + if n <= 0 { + bug() + } + b = b[n:] + last += uint32(i) + z.q.Add(last, 0, 0) + } +} + +// init initializes the matcher. +func (m *matcher) init(prog *syntax.Prog, n int) error { + m.prog = prog + m.dstate = make(map[uint32]*dstate) + m.numMatch = n + m.maxState = 10 + m.allTail = &m.all + m.numByte = 256 + for i := range m.undo { + m.undo[i] = byte(i) + } + m.h = fnv.New32() + + m.z1.q.Init(uint32(len(prog.Inst))) + m.z2.q.Init(uint32(len(prog.Inst))) + m.z3.q.Init(uint32(len(prog.Inst))) + m.ids = make([]int, 0, len(prog.Inst)) + + m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText) + m.z1.flag = flagBOL | flagBOT + m.start = m.cache(&m.z1, nil, 0) + + m.z1.q.Reset() + m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine) + m.z1.flag = flagBOL + m.startLine = m.cache(&m.z1, nil, 0) + + m.crunchProg() + + return nil +} + +// stepEmpty steps runq to nextq expanding according to flag. +func (m *matcher) stepEmpty(runq, nextq *Set, flag syntax.EmptyOp) { + nextq.Reset() + for _, id := range runq.Dense() { + m.addq(nextq, id, flag) + } +} + +// stepByte steps runq to nextq consuming c and then expanding according to flag. +// It returns true if a match ends immediately before c. +// c is either an input byte or endText. +func (m *matcher) stepByte(runq, nextq *Set, c int, flag syntax.EmptyOp) (match bool) { + nextq.Reset() + m.addq(nextq, uint32(m.prog.Start), flag) + + nmatch := 0 + for _, id := range runq.Dense() { + i := &m.prog.Inst[id] + switch i.Op { + default: + continue + case syntax.InstMatch: + nmatch++ + continue + case instByteRange: + if c == endText { + break + } + lo := int((i.Arg >> 8) & 0xFF) + hi := int(i.Arg & 0xFF) + if i.Arg&argFold != 0 && 'a' <= c && c <= 'z' { + c += 'A' - 'a' + } + if lo <= c && c <= hi { + m.addq(nextq, i.Out, flag) + } + } + } + return nmatch == m.numMatch +} + +// addq adds id to the queue, expanding according to flag. +func (m *matcher) addq(q *Set, id uint32, flag syntax.EmptyOp) { + if q.Has(id, 0) { + return + } + q.MustAdd(id) + i := &m.prog.Inst[id] + switch i.Op { + case syntax.InstCapture, syntax.InstNop: + m.addq(q, i.Out, flag) + case syntax.InstAlt, syntax.InstAltMatch: + m.addq(q, i.Out, flag) + m.addq(q, i.Arg, flag) + case syntax.InstEmptyWidth: + if syntax.EmptyOp(i.Arg)&^flag == 0 { + m.addq(q, i.Out, flag) + } + } +} + +const endText = -1 + +// computeNext computes the next DFA state if we're in d reading c (an input byte or endText). +func (m *matcher) computeNext(this, next *nstate, d *dstate, c int) bool { + // compute flags in effect before c + flag := syntax.EmptyOp(0) + if this.flag&flagBOL != 0 { + flag |= syntax.EmptyBeginLine + } + if this.flag&flagBOT != 0 { + flag |= syntax.EmptyBeginText + } + if this.flag&flagWord != 0 { + if !isWordByte(c) { + flag |= syntax.EmptyWordBoundary + } else { + flag |= syntax.EmptyNoWordBoundary + } + } else { + if isWordByte(c) { + flag |= syntax.EmptyWordBoundary + } else { + flag |= syntax.EmptyNoWordBoundary + } + } + if c == '\n' { + flag |= syntax.EmptyEndLine + } + if c == endText { + flag |= syntax.EmptyEndLine | syntax.EmptyEndText + } + + if flag &= this.needFlag; flag != 0 { + // re-expand queue using new flags. + // TODO: only do this when it matters + // (something is gating on word boundaries). + m.stepEmpty(&this.q, &next.q, flag) + this, next = next, &m.z3 + } + + // now compute flags after c. + flag = 0 + next.flag = 0 + if c == '\n' { + flag |= syntax.EmptyBeginLine + next.flag |= flagBOL + } + if isWordByte(c) { + next.flag |= flagWord + } + + // re-add start, process rune + expand according to flags. + if m.stepByte(&this.q, &next.q, c, flag) { + return true + } + next.needFlag = m.queueFlag(&next.q) + if next.needFlag&syntax.EmptyBeginLine == 0 { + next.flag &^= flagBOL + } + if next.needFlag&(syntax.EmptyWordBoundary|syntax.EmptyNoWordBoundary) == 0{ + next.flag &^= flagWord + } + + m.cache(next, d, c) + return false +} + +func (m *matcher) queueFlag(runq *Set) syntax.EmptyOp { + var e uint32 + for _, id := range runq.Dense() { + i := &m.prog.Inst[id] + if i.Op == syntax.InstEmptyWidth { + e |= i.Arg + } + } + return syntax.EmptyOp(e) +} + +func (m *matcher) hash(enc []byte) uint32 { + m.h.Reset() + m.h.Write(enc) + return m.h.Sum32() +} + +func (m *matcher) find(h uint32, enc []byte) *dstate { +Search: + for d := m.dstate[h]; d!=nil; d=d.nextHash { + s := d.enc + if len(s) != len(enc) { + continue Search + } + for i, b := range enc { + if s[i] != b { + continue Search + } + } + return d + } + return nil +} + +func (m *matcher) cache(z *nstate, prev *dstate, prevByte int) *dstate { + enc := m.enc(z) + h := m.hash(enc) + d := m.find(h, enc) + if d != nil { + return d + } + + if m.numState >= m.maxState { + panic(ErrMemory) + } + m.numState++ + d = &dstate{ + enc: string(enc), + prev: prev, + prevByte: prevByte, + nextHash: m.dstate[h], + } + m.dstate[h] = d + *m.allTail = d + m.allTail = &d.nextAll + + return d +} + +// isWordByte reports whether the byte c is a word character: ASCII only. +// This is used to implement \b and \B. This is not right for Unicode, but: +// - it's hard to get right in a byte-at-a-time matching world +// (the DFA has only one-byte lookahead) +// - this crude approximation is the same one PCRE uses +func isWordByte(c int) bool { + return 'A' <= c && c <= 'Z' || + 'a' <= c && c <= 'z' || + '0' <= c && c <= '9' || + c == '_' +} + +var ErrNoMatch = errors.New("no matching strings") +var ErrMemory = errors.New("exhausted memory") + +func (m *matcher) findMatch(maxState int) (s string, err error) { + defer func() { + switch r := recover().(type) { + case nil: + return + case error: + err = r + return + default: + panic(r) + } + }() + + m.maxState = maxState + numState := 0 + var d *dstate + var c int + for d = m.all; d != nil; d = d.nextAll { + numState++ + if d.done { + continue + } + this, next := &m.z1, &m.z2 + m.dec(this, d.enc) + if m.computeNext(this, next, d, endText) { + c = endText + goto Found + } + for _, cb := range m.undo[:m.numByte] { + if m.computeNext(this, next, d, int(cb)) { + c = int(cb) + goto Found + } + } + d.done = true + } + log.Printf("searched %d states; queued %d states", numState, m.numState) + return "", ErrNoMatch + +Found: + var buf []byte + if c >= 0 { + buf = append(buf, byte(c)) + } + for d1 := d; d1.prev != nil; d1= d1.prev { + buf = append(buf, byte(d1.prevByte)) + } + for i, j := 0, len(buf)-1; i < j; i, j = i+1, j-1 { + buf[i], buf[j] = buf[j], buf[i] + } + log.Printf("searched %d states; queued %d states", numState, m.numState) + return string(buf), nil +} |