summaryrefslogtreecommitdiffstats
path: root/utils/markdown/autolink.go
blob: 7f7d1117f1af0d8a4f4a9a629f0b974c8f5e197b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved.
// See License.txt for license information.

package markdown

import (
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// Based off of extensions/autolink.c from https://github.com/github/cmark

var (
	DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
)

// Given a string with a w at the given position, tries to parse and return a range containing a www link.
// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
// www_match from the reference code.
func parseWWWAutolink(data string, position int) (Range, bool) {
	// Check that this isn't part of another word
	if position > 1 {
		prevChar := data[position-1]

		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
			return Range{}, false
		}
	}

	// Check that this starts with www
	if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
		return Range{}, false
	}

	end := checkDomain(data[position:], false)
	if end == 0 {
		return Range{}, false
	}

	end += position

	// Grab all text until the end of the string or the next whitespace character
	for end < len(data) && !isWhitespaceByte(data[end]) {
		end += 1
	}

	// Trim trailing punctuation
	end = trimTrailingCharactersFromLink(data, position, end)
	if position == end {
		return Range{}, false
	}

	return Range{position, end}, true
}

func isAllowedBeforeWWWLink(c byte) bool {
	switch c {
	case '*', '_', '~', ')':
		return true
	default:
		return false
	}
}

// Given a string with a : at the given position, tried to parse and return a range containing a URL scheme
// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
// url_match from the reference code.
func parseURLAutolink(data string, position int) (Range, bool) {
	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
		return Range{}, false
	}

	start := position - 1
	for start > 0 && isAlphanumericByte(data[start-1]) {
		start -= 1
	}

	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
	scheme := data[start:position]
	if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
		return Range{}, false
	}

	end := checkDomain(data[position+3:], true)
	if end == 0 {
		return Range{}, false
	}

	end += position

	// Grab all text until the end of the string or the next whitespace character
	for end < len(data) && !isWhitespaceByte(data[end]) {
		end += 1
	}

	// Trim trailing punctuation
	end = trimTrailingCharactersFromLink(data, start, end)
	if start == end {
		return Range{}, false
	}

	return Range{start, end}, true
}

func isSchemeAllowed(scheme string) bool {
	// Note that this doesn't support the custom URL schemes implemented by the client
	for _, allowed := range DefaultUrlSchemes {
		if strings.EqualFold(allowed, scheme) {
			return true
		}
	}

	return false
}

// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain.
// Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain
// needs to contain a period to be considered valid. Equivalent to check_domain from the reference code.
func checkDomain(data string, allowShort bool) int {
	foundUnderscore := false
	foundPeriod := false

	i := 1
	for ; i < len(data)-1; i++ {
		if data[i] == '_' {
			foundUnderscore = true
			break
		} else if data[i] == '.' {
			foundPeriod = true
		} else if !isValidHostCharacter(data[i:]) && data[i] != '-' {
			break
		}
	}

	if foundUnderscore {
		return 0
	}

	if allowShort {
		// If allowShort is set, accept any string of valid domain characters
		return i
	}

	// If allowShort isn't set, a valid domain just requires at least a single period. Note that this
	// logic isn't entirely necessary because we already know the string starts with "www." when
	// this is called from parseWWWAutolink
	if foundPeriod {
		return i
	} else {
		return 0
	}
}

// Returns true if the provided link starts with a valid character for a domain name. Equivalent to
// is_valid_hostchar from the reference code.
func isValidHostCharacter(link string) bool {
	c, _ := utf8.DecodeRuneInString(link)
	if c == utf8.RuneError {
		return false
	}

	return !unicode.IsSpace(c) && !unicode.IsPunct(c)
}

// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
// Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
	runes := []rune(markdown[start:end])
	linkEnd := len(runes)

	// Cut off the link before an open angle bracket if it contains one
	for i, c := range runes {
		if c == '<' {
			linkEnd = i
			break
		}
	}

	for linkEnd > 0 {
		c := runes[linkEnd-1]

		if !canEndAutolink(c) {
			// Trim trailing quotes, periods, etc
			linkEnd = linkEnd - 1
		} else if c == ';' {
			// Trim a trailing HTML entity
			newEnd := linkEnd - 2

			for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) {
				newEnd -= 1
			}

			if newEnd < linkEnd-2 && runes[newEnd] == '&' {
				linkEnd = newEnd
			} else {
				// This isn't actually an HTML entity, so just trim the semicolon
				linkEnd = linkEnd - 1
			}
		} else if c == ')' {
			// Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets.
			// If there are more closing brackets than opening ones, remove the extra bracket

			numClosing := 0
			numOpening := 0

			// Examples (input text => output linked portion):
			//
			//  http://www.pokemon.com/Pikachu_(Electric)
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric)
			//    => http://www.pokemon.com/Pikachu_((Electric)
			//
			//  http://www.pokemon.com/Pikachu_(Electric))
			//    => http://www.pokemon.com/Pikachu_(Electric)
			//
			//  http://www.pokemon.com/Pikachu_((Electric))
			//    => http://www.pokemon.com/Pikachu_((Electric))

			for i := 0; i < linkEnd; i++ {
				if runes[i] == '(' {
					numOpening += 1
				} else if runes[i] == ')' {
					numClosing += 1
				}
			}

			if numClosing <= numOpening {
				// There's fewer or equal closing brackets, so we've found the end of the link
				break
			}

			linkEnd -= 1
		} else {
			// There's no special characters at the end of the link, so we're at the end
			break
		}
	}

	return start + len(string(runes[:linkEnd]))
}

func canEndAutolink(c rune) bool {
	switch c {
	case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"':
		return false
	default:
		return true
	}
}