Update serverside markdown parser to respect unicode and capitalized links (#9235)

author: Harrison Healey <harrisonmhealey@gmail.com> 2018-08-08 06:13:04 -0400
committer: Jesús Espino <jespinog@gmail.com> 2018-08-08 12:13:04 +0200
commit: 99cf15b56eb561dc30def86ad7e3cd97af4c338c (patch)
tree: 18636fea7eee2674ae70ce8adf151e9be8cf826c /utils
parent: 935f0c5ff9a7e3dea3db70f4df59d0db52543f29 (diff)
download: chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.tar.gz
chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.tar.bz2
chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.zip
5 files changed, 173 insertions, 78 deletions
diff --git a/utils/markdown/autolink.go b/utils/markdown/autolink.go
index 16c40e609..7f7d1117f 100644
--- a/utils/markdown/autolink.go
+++ b/utils/markdown/autolink.go
@@ -16,27 +16,27 @@ var (
 	DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"}
 )
 
-// Given a string with a w at the given position, tries to parse and return a link starting with "www."
+// Given a string with a w at the given position, tries to parse and return a range containing a www link
 // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to
 // www_match from the reference code.
-func parseWWWAutolink(data string, position int) string {
+func parseWWWAutolink(data string, position int) (Range, bool) {
 	// Check that this isn't part of another word
 	if position > 1 {
 		prevChar := data[position-1]
 
 		if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) {
-			return ""
+			return Range{}, false
 		}
 	}
 
 	// Check that this starts with www
 	if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) {
-		return ""
+		return Range{}, false
 	}
 
 	end := checkDomain(data[position:], false)
 	if end == 0 {
-		return ""
+		return Range{}, false
 	}
 
 	end += position
@@ -47,12 +47,12 @@ func parseWWWAutolink(data string, position int) string {
 	}
 
 	// Trim trailing punctuation
-	link := trimTrailingCharactersFromLink(data[position:end])
-	if link == "" {
-		return ""
+	end = trimTrailingCharactersFromLink(data, position, end)
+	if position == end {
+		return Range{}, false
 	}
 
-	return link
+	return Range{position, end}, true
 }
 
 func isAllowedBeforeWWWLink(c byte) bool {
@@ -64,13 +64,13 @@ func isAllowedBeforeWWWLink(c byte) bool {
 	}
 }
 
-// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme
+// Given a string with a : at the given position, tries to parse and return a range containing a link starting with a URL scheme
 // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to
 // url_match from the reference code.
-func parseURLAutolink(data string, position int) string {
+func parseURLAutolink(data string, position int) (Range, bool) {
 	// Check that a :// exists. This doesn't match the clients that treat the slashes as optional.
 	if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' {
-		return ""
+		return Range{}, false
 	}
 
 	start := position - 1
@@ -81,12 +81,12 @@ func parseURLAutolink(data string, position int) string {
 	// Ensure that the URL scheme is allowed and that at least one character after the scheme is valid.
 	scheme := data[start:position]
 	if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) {
-		return ""
+		return Range{}, false
 	}
 
 	end := checkDomain(data[position+3:], true)
 	if end == 0 {
-		return ""
+		return Range{}, false
 	}
 
 	end += position
@@ -97,12 +97,12 @@ func parseURLAutolink(data string, position int) string {
 	}
 
 	// Trim trailing punctuation
-	link := trimTrailingCharactersFromLink(data[start:end])
-	if link == "" {
-		return ""
+	end = trimTrailingCharactersFromLink(data, start, end)
+	if start == end {
+		return Range{}, false
 	}
 
-	return link
+	return Range{start, end}, true
 }
 
 func isSchemeAllowed(scheme string) bool {
@@ -166,9 +166,9 @@ func isValidHostCharacter(link string) bool {
 }
 
 // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link.
-// Equivalent to autolink_delim from the reference code.
-func trimTrailingCharactersFromLink(link string) string {
-	runes := []rune(link)
+// Returns a new end position for the link. Equivalent to autolink_delim from the reference code.
+func trimTrailingCharactersFromLink(markdown string, start int, end int) int {
+	runes := []rune(markdown[start:end])
 	linkEnd := len(runes)
 
 	// Cut off the link before an open angle bracket if it contains one
@@ -240,7 +240,7 @@ func trimTrailingCharactersFromLink(link string) string {
 		}
 	}
 
-	return string(runes[:linkEnd])
+	return start + len(string(runes[:linkEnd]))
 }
 
 func canEndAutolink(c rune) bool {
diff --git a/utils/markdown/autolink_test.go b/utils/markdown/autolink_test.go
index d0ea53fa4..997124338 100644
--- a/utils/markdown/autolink_test.go
+++ b/utils/markdown/autolink_test.go
@@ -134,7 +134,15 @@ func TestParseURLAutolink(t *testing.T) {
 
 	for _, testCase := range testCases {
 		t.Run(testCase.Description, func(t *testing.T) {
-			assert.Equal(t, testCase.Expected, parseURLAutolink(testCase.Input, testCase.Position))
+			rawRange, ok := parseURLAutolink(testCase.Input, testCase.Position)
+
+			if testCase.Expected == "" {
+				assert.False(t, ok)
+				assert.Equal(t, Range{0, 0}, rawRange)
+			} else {
+				assert.True(t, ok)
+				assert.Equal(t, testCase.Expected, testCase.Input[rawRange.Position:rawRange.End])
+			}
 		})
 	}
 }
@@ -264,89 +272,153 @@ func TestParseWWWAutolink(t *testing.T) {
 
 	for _, testCase := range testCases {
 		t.Run(testCase.Description, func(t *testing.T) {
-			assert.Equal(t, testCase.Expected, parseWWWAutolink(testCase.Input, testCase.Position))
+			rawRange, ok := parseWWWAutolink(testCase.Input, testCase.Position)
+
+			if testCase.Expected == "" {
+				assert.False(t, ok)
+				assert.Equal(t, Range{0, 0}, rawRange)
+			} else {
+				assert.True(t, ok)
+				assert.Equal(t, testCase.Expected, testCase.Input[rawRange.Position:rawRange.End])
+			}
 		})
 	}
 }
 
 func TestTrimTrailingCharactersFromLink(t *testing.T) {
 	testCases := []struct {
-		Input    string
-		Expected string
+		Input       string
+		Start       int
+		End         int
+		ExpectedEnd int
 	}{
 		{
-			Input:    "http://www.example.com",
-			Expected: "http://www.example.com",
+			Input:       "http://www.example.com",
+			ExpectedEnd: 22,
+		},
+		{
+			Input:       "http://www.example.com/abcd",
+			ExpectedEnd: 27,
+		},
+		{
+			Input:       "http://www.example.com/abcd/",
+			ExpectedEnd: 28,
+		},
+		{
+			Input:       "http://www.example.com/1234",
+			ExpectedEnd: 27,
+		},
+		{
+			Input:       "http://www.example.com/abcd?foo=bar",
+			ExpectedEnd: 35,
 		},
 		{
-			Input:    "http://www.example.com/abcd",
-			Expected: "http://www.example.com/abcd",
+			Input:       "http://www.example.com/abcd#heading",
+			ExpectedEnd: 35,
 		},
 		{
-			Input:    "http://www.example.com/abcd/",
-			Expected: "http://www.example.com/abcd/",
+			Input:       "http://www.example.com.",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "http://www.example.com/1234",
-			Expected: "http://www.example.com/1234",
+			Input:       "http://www.example.com,",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "http://www.example.com/abcd?foo=bar",
-			Expected: "http://www.example.com/abcd?foo=bar",
+			Input:       "http://www.example.com?",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "http://www.example.com/abcd#heading",
-			Expected: "http://www.example.com/abcd#heading",
+			Input:       "http://www.example.com)",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "http://www.example.com.",
-			Expected: "http://www.example.com",
+			Input:       "http://www.example.com",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "http://www.example.com,",
-			Expected: "http://www.example.com",
+			Input:       "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)",
+			ExpectedEnd: 54,
 		},
 		{
-			Input:    "http://www.example.com?",
-			Expected: "http://www.example.com",
+			Input:       "https://en.wikipedia.org/wiki/Dolphin_(disambiguation",
+			ExpectedEnd: 53,
 		},
 		{
-			Input:    "http://www.example.com)",
-			Expected: "http://www.example.com",
+			Input:       "https://en.wikipedia.org/wiki/Dolphin_(disambiguation))",
+			ExpectedEnd: 54,
 		},
 		{
-			Input:    "http://www.example.com",
-			Expected: "http://www.example.com",
+			Input:       "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)",
+			ExpectedEnd: 71,
 		},
 		{
-			Input:    "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)",
-			Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)",
+			Input:       "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))",
+			ExpectedEnd: 71,
 		},
 		{
-			Input:    "https://en.wikipedia.org/wiki/Dolphin_(disambiguation",
-			Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation",
+			Input:       "http://www.example.com&quot;",
+			ExpectedEnd: 22,
 		},
 		{
-			Input:    "https://en.wikipedia.org/wiki/Dolphin_(disambiguation))",
-			Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)",
+			Input:       "this is a sentence containing http://www.example.com in it",
+			Start:       30,
+			End:         52,
+			ExpectedEnd: 52,
 		},
 		{
-			Input:    "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)",
-			Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)",
+			Input:       "this is a sentence containing http://www.example.com???",
+			Start:       30,
+			End:         55,
+			ExpectedEnd: 52,
 		},
 		{
-			Input:    "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))",
-			Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))",
+			Input:       "http://google.com/å",
+			ExpectedEnd: len("http://google.com/å"),
 		},
 		{
-			Input:    "http://www.example.com&quot;",
-			Expected: "http://www.example.com",
+			Input:       "http://google.com/å...",
+			ExpectedEnd: len("http://google.com/å"),
+		},
+		{
+			Input:       "This is http://google.com/å, a link, and http://google.com/å",
+			Start:       8,
+			End:         len("This is http://google.com/å,"),
+			ExpectedEnd: len("This is http://google.com/å"),
+		},
+		{
+			Input:       "This is http://google.com/å, a link, and http://google.com/å",
+			Start:       41,
+			End:         len("This is http://google.com/å, a link, and http://google.com/å"),
+			ExpectedEnd: len("This is http://google.com/å, a link, and http://google.com/å"),
+		},
+		{
+			Input:       "This is http://google.com/å, a link, and http://google.com/å.",
+			Start:       41,
+			End:         len("This is http://google.com/å, a link, and http://google.com/å."),
+			ExpectedEnd: len("This is http://google.com/å, a link, and http://google.com/å"),
+		},
+		{
+			Input:       "http://🍄.ga/ http://x🍄.ga/",
+			Start:       0,
+			End:         len("http://🍄.ga/"),
+			ExpectedEnd: len("http://🍄.ga/"),
+		},
+		{
+			Input:       "http://🍄.ga/ http://x🍄.ga/",
+			Start:       len("http://🍄.ga/ "),
+			End:         len("http://🍄.ga/ http://x🍄.ga/"),
+			ExpectedEnd: len("http://🍄.ga/ http://x🍄.ga/"),
 		},
 	}
 
 	for _, testCase := range testCases {
 		t.Run(testCase.Input, func(t *testing.T) {
-			assert.Equal(t, testCase.Expected, trimTrailingCharactersFromLink(testCase.Input))
+			if testCase.End == 0 {
+				testCase.End = len(testCase.Input) - testCase.Start
+			}
+
+			assert.Equal(t, testCase.ExpectedEnd, trimTrailingCharactersFromLink(testCase.Input, testCase.Start, testCase.End))
 		})
 	}
 }
diff --git a/utils/markdown/commonmark_test.go b/utils/markdown/commonmark_test.go
index 13e61f52d..d1381cee5 100644
--- a/utils/markdown/commonmark_test.go
+++ b/utils/markdown/commonmark_test.go
@@ -1000,7 +1000,7 @@ func TestCommonMarkReferenceStrings(t *testing.T) {
 	}
 }
 
-func TestCommonMarkRefernceAutolinks(t *testing.T) {
+func TestCommonMarkReferenceAutolinks(t *testing.T) {
 	// These tests are adapted from the GitHub-flavoured CommonMark extension tests located at
 	// https://github.com/github/cmark/blob/master/test/extensions.txt
 	for name, tc := range map[string]struct {
diff --git a/utils/markdown/html.go b/utils/markdown/html.go
index 1a857afed..afb72bff3 100644
--- a/utils/markdown/html.go
+++ b/utils/markdown/html.go
@@ -157,7 +157,7 @@ func RenderInlineHTML(inline Inline) (result string) {
 		}
 		result += "</a>"
 	case *Autolink:
-		result += `<a href="` + htmlEscaper.Replace(escapeURL(v.Link)) + `">`
+		result += `<a href="` + htmlEscaper.Replace(escapeURL(v.Destination())) + `">`
 		for _, inline := range v.Children {
 			result += RenderInlineHTML(inline)
 		}
diff --git a/utils/markdown/inlines.go b/utils/markdown/inlines.go
index 453f4bbe5..a3abccef3 100644
--- a/utils/markdown/inlines.go
+++ b/utils/markdown/inlines.go
@@ -86,7 +86,19 @@ type Autolink struct {
 
 	Children []Inline
 
-	Link string
+	RawDestination Range
+
+	markdown string
+}
+
+func (i *Autolink) Destination() string {
+	destination := Unescape(i.markdown[i.RawDestination.Position:i.RawDestination.End])
+
+	if strings.HasPrefix(destination, "www") {
+		destination = "http://" + destination
+	}
+
+	return destination
 }
 
 type delimiterType int
@@ -486,15 +498,18 @@ func (p *inlineParser) parseAutolink(c rune) bool {
 		}
 	}
 
-	link := ""
-	text := ""
+	var link Range
 	if c == ':' {
-		text = parseURLAutolink(p.raw, p.position)
-		link = text
+		var ok bool
+		link, ok = parseURLAutolink(p.raw, p.position)
+
+		if !ok {
+			return false
+		}
 
 		// Since the current position is at the colon, we have to rewind the parsing slightly so that
 		// we don't duplicate the URL scheme
-		rewind := strings.Index(text, ":")
+		rewind := strings.Index(p.raw[link.Position:link.End], ":")
 		if rewind != -1 {
 			lastInline := p.inlines[len(p.inlines)-1]
 			lastText, ok := lastInline.(*Text)
@@ -512,22 +527,30 @@ func (p *inlineParser) parseAutolink(c rune) bool {
 				Range: Range{lastText.Range.Position, lastText.Range.End - rewind},
 			})
 			p.position -= rewind
+		}
+	} else if c == 'w' || c == 'W' {
+		var ok bool
+		link, ok = parseWWWAutolink(p.raw, p.position)
 
+		if !ok {
+			return false
 		}
-	} else if c == 'w' {
-		text = parseWWWAutolink(p.raw, p.position)
-		link = "http://" + text
 	}
 
-	if text == "" {
-		return false
-	}
+	linkMarkdownPosition := relativeToAbsolutePosition(p.ranges, link.Position)
+	linkRange := Range{linkMarkdownPosition, linkMarkdownPosition + link.End - link.Position}
 
 	p.inlines = append(p.inlines, &Autolink{
-		Link:     link,
-		Children: []Inline{&Text{Text: text}},
+		Children: []Inline{
+			&Text{
+				Text:  p.raw[link.Position:link.End],
+				Range: linkRange,
+			},
+		},
+		RawDestination: linkRange,
+		markdown:       p.markdown,
 	})
-	p.position += len(text)
+	p.position += (link.End - link.Position)
 
 	return true
 }
author	Harrison Healey <harrisonmhealey@gmail.com>	2018-08-08 06:13:04 -0400
committer	Jesús Espino <jespinog@gmail.com>	2018-08-08 12:13:04 +0200
commit	99cf15b56eb561dc30def86ad7e3cd97af4c338c (patch)
tree	18636fea7eee2674ae70ce8adf151e9be8cf826c /utils
parent	935f0c5ff9a7e3dea3db70f4df59d0db52543f29 (diff)
download	chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.tar.gz chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.tar.bz2 chat-99cf15b56eb561dc30def86ad7e3cd97af4c338c.zip