// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. // See License.txt for license information. package markdown import ( "regexp" "strings" "unicode" "unicode/utf8" ) // Based off of extensions/autolink.c from https://github.com/github/cmark var ( DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"} ) // Given a string with a w at the given position, tries to parse and return a range containing a www link. // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to // www_match from the reference code. func parseWWWAutolink(data string, position int) (Range, bool) { // Check that this isn't part of another word if position > 1 { prevChar := data[position-1] if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { return Range{}, false } } // Check that this starts with www if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { return Range{}, false } end := checkDomain(data[position:], false) if end == 0 { return Range{}, false } end += position // Grab all text until the end of the string or the next whitespace character for end < len(data) && !isWhitespaceByte(data[end]) { end += 1 } // Trim trailing punctuation end = trimTrailingCharactersFromLink(data, position, end) if position == end { return Range{}, false } return Range{position, end}, true } func isAllowedBeforeWWWLink(c byte) bool { switch c { case '*', '_', '~', ')': return true default: return false } } // Given a string with a : at the given position, tried to parse and return a range containing a URL scheme // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to // url_match from the reference code. func parseURLAutolink(data string, position int) (Range, bool) { // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { return Range{}, false } start := position - 1 for start > 0 && isAlphanumericByte(data[start-1]) { start -= 1 } // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. scheme := data[start:position] if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { return Range{}, false } end := checkDomain(data[position+3:], true) if end == 0 { return Range{}, false } end += position // Grab all text until the end of the string or the next whitespace character for end < len(data) && !isWhitespaceByte(data[end]) { end += 1 } // Trim trailing punctuation end = trimTrailingCharactersFromLink(data, start, end) if start == end { return Range{}, false } return Range{start, end}, true } func isSchemeAllowed(scheme string) bool { // Note that this doesn't support the custom URL schemes implemented by the client for _, allowed := range DefaultUrlSchemes { if strings.EqualFold(allowed, scheme) { return true } } return false } // Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. // Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain // needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. func checkDomain(data string, allowShort bool) int { foundUnderscore := false foundPeriod := false i := 1 for ; i < len(data)-1; i++ { if data[i] == '_' { foundUnderscore = true break } else if data[i] == '.' { foundPeriod = true } else if !isValidHostCharacter(data[i:]) && data[i] != '-' { break } } if foundUnderscore { return 0 } if allowShort { // If allowShort is set, accept any string of valid domain characters return i } // If allowShort isn't set, a valid domain just requires at least a single period. Note that this // logic isn't entirely necessary because we already know the string starts with "www." when // this is called from parseWWWAutolink if foundPeriod { return i } else { return 0 } } // Returns true if the provided link starts with a valid character for a domain name. Equivalent to // is_valid_hostchar from the reference code. func isValidHostCharacter(link string) bool { c, _ := utf8.DecodeRuneInString(link) if c == utf8.RuneError { return false } return !unicode.IsSpace(c) && !unicode.IsPunct(c) } // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link. // Returns a new end position for the link. Equivalent to autolink_delim from the reference code. func trimTrailingCharactersFromLink(markdown string, start int, end int) int { runes := []rune(markdown[start:end]) linkEnd := len(runes) // Cut off the link before an open angle bracket if it contains one for i, c := range runes { if c == '<' { linkEnd = i break } } for linkEnd > 0 { c := runes[linkEnd-1] if !canEndAutolink(c) { // Trim trailing quotes, periods, etc linkEnd = linkEnd - 1 } else if c == ';' { // Trim a trailing HTML entity newEnd := linkEnd - 2 for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) { newEnd -= 1 } if newEnd < linkEnd-2 && runes[newEnd] == '&' { linkEnd = newEnd } else { // This isn't actually an HTML entity, so just trim the semicolon linkEnd = linkEnd - 1 } } else if c == ')' { // Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets. // If there are more closing brackets than opening ones, remove the extra bracket numClosing := 0 numOpening := 0 // Examples (input text => output linked portion): // // http://www.pokemon.com/Pikachu_(Electric) // => http://www.pokemon.com/Pikachu_(Electric) // // http://www.pokemon.com/Pikachu_((Electric) // => http://www.pokemon.com/Pikachu_((Electric) // // http://www.pokemon.com/Pikachu_(Electric)) // => http://www.pokemon.com/Pikachu_(Electric) // // http://www.pokemon.com/Pikachu_((Electric)) // => http://www.pokemon.com/Pikachu_((Electric)) for i := 0; i < linkEnd; i++ { if runes[i] == '(' { numOpening += 1 } else if runes[i] == ')' { numClosing += 1 } } if numClosing <= numOpening { // There's fewer or equal closing brackets, so we've found the end of the link break } linkEnd -= 1 } else { // There's no special characters at the end of the link, so we're at the end break } } return start + len(string(runes[:linkEnd])) } func canEndAutolink(c rune) bool { switch c { case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"': return false default: return true } }