summaryrefslogtreecommitdiffstats
path: root/utils/markdown/autolink.go
diff options
context:
space:
mode:
Diffstat (limited to 'utils/markdown/autolink.go')
-rw-r--r--utils/markdown/autolink.go253
1 files changed, 253 insertions, 0 deletions
diff --git a/utils/markdown/autolink.go b/utils/markdown/autolink.go
new file mode 100644
index 000000000..16c40e609
--- /dev/null
+++ b/utils/markdown/autolink.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
+// See License.txt for license information.
+
+package markdown
+
+import (
+ "regexp"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+// Based off of extensions/autolink.c from https://github.com/github/cmark
+
// DefaultUrlSchemes lists the URL schemes that isSchemeAllowed accepts (compared
// case-insensitively) when deciding whether "scheme://..." text may be autolinked.
var DefaultUrlSchemes = []string{
	"http",
	"https",
	"ftp",
	"mailto",
	"tel",
}
+
+// Given a string with a w at the given position, tries to parse and return a link starting with "www."
+// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
+// www_match from the reference code.
+func parseWWWAutolink(data string, position int) string {
+ // Check that this isn't part of another word
+ if position > 1 {
+ prevChar := data[position-1]
+
+ if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
+ return ""
+ }
+ }
+
+ // Check that this starts with www
+ if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
+ return ""
+ }
+
+ end := checkDomain(data[position:], false)
+ if end == 0 {
+ return ""
+ }
+
+ end += position
+
+ // Grab all text until the end of the string or the next whitespace character
+ for end < len(data) && !isWhitespaceByte(data[end]) {
+ end += 1
+ }
+
+ // Trim trailing punctuation
+ link := trimTrailingCharactersFromLink(data[position:end])
+ if link == "" {
+ return ""
+ }
+
+ return link
+}
+
// isAllowedBeforeWWWLink reports whether the non-whitespace byte c may appear immediately before
// a "www." autolink without the link being considered part of another word.
//
// The reference implementation (www_match) permits '*', '_', '~' and '(' here so that links
// wrapped in emphasis markers or opening brackets, such as "(www.example.com)", are still linked.
// ')' is also accepted to stay compatible with the previous behavior of this function.
func isAllowedBeforeWWWLink(c byte) bool {
	switch c {
	case '*', '_', '~', '(', ')':
		return true
	default:
		return false
	}
}
+
+// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme
+// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
+// url_match from the reference code.
+func parseURLAutolink(data string, position int) string {
+ // Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
+ if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
+ return ""
+ }
+
+ start := position - 1
+ for start > 0 && isAlphanumericByte(data[start-1]) {
+ start -= 1
+ }
+
+ // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
+ scheme := data[start:position]
+ if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
+ return ""
+ }
+
+ end := checkDomain(data[position+3:], true)
+ if end == 0 {
+ return ""
+ }
+
+ end += position
+
+ // Grab all text until the end of the string or the next whitespace character
+ for end < len(data) && !isWhitespaceByte(data[end]) {
+ end += 1
+ }
+
+ // Trim trailing punctuation
+ link := trimTrailingCharactersFromLink(data[start:end])
+ if link == "" {
+ return ""
+ }
+
+ return link
+}
+
+func isSchemeAllowed(scheme string) bool {
+ // Note that this doesn't support the custom URL schemes implemented by the client
+ for _, allowed := range DefaultUrlSchemes {
+ if strings.EqualFold(allowed, scheme) {
+ return true
+ }
+ }
+
+ return false
+}
+
+// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
+// Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
+// needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
+func checkDomain(data string, allowShort bool) int {
+ foundUnderscore := false
+ foundPeriod := false
+
+ i := 1
+ for ; i < len(data)-1; i++ {
+ if data[i] == '_' {
+ foundUnderscore = true
+ break
+ } else if data[i] == '.' {
+ foundPeriod = true
+ } else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
+ break
+ }
+ }
+
+ if foundUnderscore {
+ return 0
+ }
+
+ if allowShort {
+ // If allowShort is set, accept any string of valid domain characters
+ return i
+ }
+
+ // If allowShort isn't set, a valid domain just requires at least a single period. Note that this
+ // logic isn't entirely necessary because we already know the string starts with "www." when
+ // this is called from parseWWWAutolink
+ if foundPeriod {
+ return i
+ } else {
+ return 0
+ }
+}
+
// isValidHostCharacter reports whether the first rune of link may appear in a domain name, meaning
// that it decodes successfully and is neither whitespace nor punctuation. An empty or invalidly
// encoded string yields false. Equivalent to is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
	first, _ := utf8.DecodeRuneInString(link)

	switch {
	case first == utf8.RuneError, unicode.IsSpace(first), unicode.IsPunct(first):
		return false
	}

	return true
}
+
// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
// Equivalent to autolink_delim from the reference code.
//
// The link is first cut off before the first '<' that it contains. After that, characters are removed
// from the end, one step at a time, until the link ends with a character that's allowed to end it:
//   - characters rejected by canEndAutolink (quotes, periods, etc) are removed,
//   - a trailing HTML entity such as "&amp;" is removed as a whole, and a ';' that doesn't
//     terminate an entity is removed on its own,
//   - a trailing ')' is removed only while the link contains more closing brackets than opening ones.
//
// Returns the trimmed link, which may be empty.
func trimTrailingCharactersFromLink(link string) string {
	runes := []rune(link)
	linkEnd := len(runes)

	// Cut off the link before an open angle bracket if it contains one
	for i, c := range runes {
		if c == '<' {
			linkEnd = i
			break
		}
	}

	// Count the brackets in the link once up front. None of the trimming steps below ever removes
	// an opening bracket, and a closing bracket is only removed by the ')' case, so these counts can
	// be kept up to date while trimming instead of rescanning the whole link for every trailing ')'.
	numOpening := 0
	numClosing := 0
	for _, c := range runes[:linkEnd] {
		switch c {
		case '(':
			numOpening++
		case ')':
			numClosing++
		}
	}

	for linkEnd > 0 {
		c := runes[linkEnd-1]

		if !canEndAutolink(c) {
			// Trim trailing quotes, periods, etc
			linkEnd--
			continue
		}

		if c == ';' {
			// Trim a trailing HTML entity by walking backwards over the letters that precede the
			// semicolon and checking that they're introduced by an ampersand
			newEnd := linkEnd - 2

			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
				newEnd--
			}

			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
				linkEnd = newEnd
			} else {
				// This isn't actually an HTML entity, so just trim the semicolon
				linkEnd--
			}
			continue
		}

		if c == ')' && numClosing > numOpening {
			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair
			// of brackets. There are more closing brackets than opening ones, so remove the extra one.
			//
			// Examples (input text => output linked portion):
			//
			//  http://www.pokemon.com/Pikachu_(Electric)
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric)
			//    => http://www.pokemon.com/Pikachu_((Electric)
			//
			//  http://www.pokemon.com/Pikachu_(Electric))
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric))
			//    => http://www.pokemon.com/Pikachu_((Electric))
			linkEnd--
			numClosing--
			continue
		}

		// Either the brackets are balanced or there's no special character at the end of the link,
		// so we've found the end of the link
		break
	}

	return string(runes[:linkEnd])
}

// canEndAutolink reports whether c is allowed to be the last character of an autolink. Trailing
// punctuation, emphasis markers and quotes are not, so that they're left outside of the link.
func canEndAutolink(c rune) bool {
	switch c {
	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
		return false
	default:
		return true
	}
}