From 99cf15b56eb561dc30def86ad7e3cd97af4c338c Mon Sep 17 00:00:00 2001 From: Harrison Healey Date: Wed, 8 Aug 2018 06:13:04 -0400 Subject: Update serverside markdown parser to respect unicode and capitalized links (#9235) --- utils/markdown/autolink.go | 44 +++++------ utils/markdown/autolink_test.go | 150 ++++++++++++++++++++++++++++---------- utils/markdown/commonmark_test.go | 2 +- utils/markdown/html.go | 2 +- utils/markdown/inlines.go | 53 ++++++++++---- 5 files changed, 173 insertions(+), 78 deletions(-) (limited to 'utils') diff --git a/utils/markdown/autolink.go b/utils/markdown/autolink.go index 16c40e609..7f7d1117f 100644 --- a/utils/markdown/autolink.go +++ b/utils/markdown/autolink.go @@ -16,27 +16,27 @@ var ( DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"} ) -// Given a string with a w at the given position, tries to parse and return a link starting with "www." +// Given a string with a w at the given position, tries to parse and return a range containing a www link. // if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to // www_match from the reference code. -func parseWWWAutolink(data string, position int) string { +func parseWWWAutolink(data string, position int) (Range, bool) { // Check that this isn't part of another word if position > 1 { prevChar := data[position-1] if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { - return "" + return Range{}, false } } // Check that this starts with www if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { - return "" + return Range{}, false } end := checkDomain(data[position:], false) if end == 0 { - return "" + return Range{}, false } end += position @@ -47,12 +47,12 @@ func parseWWWAutolink(data string, position int) string { } // Trim trailing punctuation - link := trimTrailingCharactersFromLink(data[position:end]) - if link == "" { - return "" + end = trimTrailingCharactersFromLink(data, position, end) + if position == end { + return Range{}, false } - return link + return Range{position, end}, true } func isAllowedBeforeWWWLink(c byte) bool { @@ -64,13 +64,13 @@ func isAllowedBeforeWWWLink(c byte) bool { } } -// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme +// Given a string with a : at the given position, tried to parse and return a range containing a URL scheme // if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to // url_match from the reference code. -func parseURLAutolink(data string, position int) string { +func parseURLAutolink(data string, position int) (Range, bool) { // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { - return "" + return Range{}, false } start := position - 1 @@ -81,12 +81,12 @@ func parseURLAutolink(data string, position int) string { // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. scheme := data[start:position] if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { - return "" + return Range{}, false } end := checkDomain(data[position+3:], true) if end == 0 { - return "" + return Range{}, false } end += position @@ -97,12 +97,12 @@ func parseURLAutolink(data string, position int) string { } // Trim trailing punctuation - link := trimTrailingCharactersFromLink(data[start:end]) - if link == "" { - return "" + end = trimTrailingCharactersFromLink(data, start, end) + if start == end { + return Range{}, false } - return link + return Range{start, end}, true } func isSchemeAllowed(scheme string) bool { @@ -166,9 +166,9 @@ func isValidHostCharacter(link string) bool { } // Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link. -// Equivalent to autolink_delim from the reference code. -func trimTrailingCharactersFromLink(link string) string { - runes := []rune(link) +// Returns a new end position for the link. Equivalent to autolink_delim from the reference code. +func trimTrailingCharactersFromLink(markdown string, start int, end int) int { + runes := []rune(markdown[start:end]) linkEnd := len(runes) // Cut off the link before an open angle bracket if it contains one @@ -240,7 +240,7 @@ func trimTrailingCharactersFromLink(link string) string { } } - return string(runes[:linkEnd]) + return start + len(string(runes[:linkEnd])) } func canEndAutolink(c rune) bool { diff --git a/utils/markdown/autolink_test.go b/utils/markdown/autolink_test.go index d0ea53fa4..997124338 100644 --- a/utils/markdown/autolink_test.go +++ b/utils/markdown/autolink_test.go @@ -134,7 +134,15 @@ func TestParseURLAutolink(t *testing.T) { for _, testCase := range testCases { t.Run(testCase.Description, func(t *testing.T) { - assert.Equal(t, testCase.Expected, parseURLAutolink(testCase.Input, testCase.Position)) + rawRange, ok := parseURLAutolink(testCase.Input, testCase.Position) + + if testCase.Expected == "" { + assert.False(t, ok) + assert.Equal(t, Range{0, 0}, rawRange) + } else { + assert.True(t, ok) + assert.Equal(t, testCase.Expected, testCase.Input[rawRange.Position:rawRange.End]) + } }) } } @@ -264,89 +272,153 @@ func TestParseWWWAutolink(t *testing.T) { for _, testCase := range testCases { t.Run(testCase.Description, func(t *testing.T) { - assert.Equal(t, testCase.Expected, parseWWWAutolink(testCase.Input, testCase.Position)) + rawRange, ok := parseWWWAutolink(testCase.Input, testCase.Position) + + if testCase.Expected == "" { + assert.False(t, ok) + assert.Equal(t, Range{0, 0}, rawRange) + } else { + assert.True(t, ok) + assert.Equal(t, testCase.Expected, testCase.Input[rawRange.Position:rawRange.End]) + } }) } } func TestTrimTrailingCharactersFromLink(t *testing.T) { testCases := []struct { - Input string - Expected string + Input string + Start int + End int + ExpectedEnd int }{ { - Input: "http://www.example.com", - Expected: "http://www.example.com", + Input: "http://www.example.com", + ExpectedEnd: 22, + }, + { + Input: "http://www.example.com/abcd", + ExpectedEnd: 27, + }, + { + Input: "http://www.example.com/abcd/", + ExpectedEnd: 28, + }, + { + Input: "http://www.example.com/1234", + ExpectedEnd: 27, + }, + { + Input: "http://www.example.com/abcd?foo=bar", + ExpectedEnd: 35, }, { - Input: "http://www.example.com/abcd", - Expected: "http://www.example.com/abcd", + Input: "http://www.example.com/abcd#heading", + ExpectedEnd: 35, }, { - Input: "http://www.example.com/abcd/", - Expected: "http://www.example.com/abcd/", + Input: "http://www.example.com.", + ExpectedEnd: 22, }, { - Input: "http://www.example.com/1234", - Expected: "http://www.example.com/1234", + Input: "http://www.example.com,", + ExpectedEnd: 22, }, { - Input: "http://www.example.com/abcd?foo=bar", - Expected: "http://www.example.com/abcd?foo=bar", + Input: "http://www.example.com?", + ExpectedEnd: 22, }, { - Input: "http://www.example.com/abcd#heading", - Expected: "http://www.example.com/abcd#heading", + Input: "http://www.example.com)", + ExpectedEnd: 22, }, { - Input: "http://www.example.com.", - Expected: "http://www.example.com", + Input: "http://www.example.com", + ExpectedEnd: 22, }, { - Input: "http://www.example.com,", - Expected: "http://www.example.com", + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + ExpectedEnd: 54, }, { - Input: "http://www.example.com?", - Expected: "http://www.example.com", + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation", + ExpectedEnd: 53, }, { - Input: "http://www.example.com)", - Expected: "http://www.example.com", + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation))", + ExpectedEnd: 54, }, { - Input: "http://www.example.com", - Expected: "http://www.example.com", + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)", + ExpectedEnd: 71, }, { - Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", - Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))", + ExpectedEnd: 71, }, { - Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation", - Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation", + Input: "http://www.example.com"", + ExpectedEnd: 22, }, { - Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation))", - Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + Input: "this is a sentence containing http://www.example.com in it", + Start: 30, + End: 52, + ExpectedEnd: 52, }, { - Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)", - Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)", + Input: "this is a sentence containing http://www.example.com???", + Start: 30, + End: 55, + ExpectedEnd: 52, }, { - Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))", - Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))", + Input: "http://google.com/å", + ExpectedEnd: len("http://google.com/å"), }, { - Input: "http://www.example.com"", - Expected: "http://www.example.com", + Input: "http://google.com/å...", + ExpectedEnd: len("http://google.com/å"), + }, + { + Input: "This is http://google.com/å, a link, and http://google.com/å", + Start: 8, + End: len("This is http://google.com/å,"), + ExpectedEnd: len("This is http://google.com/å"), + }, + { + Input: "This is http://google.com/å, a link, and http://google.com/å", + Start: 41, + End: len("This is http://google.com/å, a link, and http://google.com/å"), + ExpectedEnd: len("This is http://google.com/å, a link, and http://google.com/å"), + }, + { + Input: "This is http://google.com/å, a link, and http://google.com/å.", + Start: 41, + End: len("This is http://google.com/å, a link, and http://google.com/å."), + ExpectedEnd: len("This is http://google.com/å, a link, and http://google.com/å"), + }, + { + Input: "http://🍄.ga/ http://x🍄.ga/", + Start: 0, + End: len("http://🍄.ga/"), + ExpectedEnd: len("http://🍄.ga/"), + }, + { + Input: "http://🍄.ga/ http://x🍄.ga/", + Start: len("http://🍄.ga/ "), + End: len("http://🍄.ga/ http://x🍄.ga/"), + ExpectedEnd: len("http://🍄.ga/ http://x🍄.ga/"), }, } for _, testCase := range testCases { t.Run(testCase.Input, func(t *testing.T) { - assert.Equal(t, testCase.Expected, trimTrailingCharactersFromLink(testCase.Input)) + if testCase.End == 0 { + testCase.End = len(testCase.Input) - testCase.Start + } + + assert.Equal(t, testCase.ExpectedEnd, trimTrailingCharactersFromLink(testCase.Input, testCase.Start, testCase.End)) }) } } diff --git a/utils/markdown/commonmark_test.go b/utils/markdown/commonmark_test.go index 13e61f52d..d1381cee5 100644 --- a/utils/markdown/commonmark_test.go +++ b/utils/markdown/commonmark_test.go @@ -1000,7 +1000,7 @@ func TestCommonMarkReferenceStrings(t *testing.T) { } } -func TestCommonMarkRefernceAutolinks(t *testing.T) { +func TestCommonMarkReferenceAutolinks(t *testing.T) { // These tests are adapted from the GitHub-flavoured CommonMark extension tests located at // https://github.com/github/cmark/blob/master/test/extensions.txt for name, tc := range map[string]struct { diff --git a/utils/markdown/html.go b/utils/markdown/html.go index 1a857afed..afb72bff3 100644 --- a/utils/markdown/html.go +++ b/utils/markdown/html.go @@ -157,7 +157,7 @@ func RenderInlineHTML(inline Inline) (result string) { } result += "" case *Autolink: - result += `` + result += `` for _, inline := range v.Children { result += RenderInlineHTML(inline) } diff --git a/utils/markdown/inlines.go b/utils/markdown/inlines.go index 453f4bbe5..a3abccef3 100644 --- a/utils/markdown/inlines.go +++ b/utils/markdown/inlines.go @@ -86,7 +86,19 @@ type Autolink struct { Children []Inline - Link string + RawDestination Range + + markdown string +} + +func (i *Autolink) Destination() string { + destination := Unescape(i.markdown[i.RawDestination.Position:i.RawDestination.End]) + + if strings.HasPrefix(destination, "www") { + destination = "http://" + destination + } + + return destination } type delimiterType int @@ -486,15 +498,18 @@ func (p *inlineParser) parseAutolink(c rune) bool { } } - link := "" - text := "" + var link Range if c == ':' { - text = parseURLAutolink(p.raw, p.position) - link = text + var ok bool + link, ok = parseURLAutolink(p.raw, p.position) + + if !ok { + return false + } // Since the current position is at the colon, we have to rewind the parsing slightly so that // we don't duplicate the URL scheme - rewind := strings.Index(text, ":") + rewind := strings.Index(p.raw[link.Position:link.End], ":") if rewind != -1 { lastInline := p.inlines[len(p.inlines)-1] lastText, ok := lastInline.(*Text) @@ -512,22 +527,30 @@ func (p *inlineParser) parseAutolink(c rune) bool { Range: Range{lastText.Range.Position, lastText.Range.End - rewind}, }) p.position -= rewind + } + } else if c == 'w' || c == 'W' { + var ok bool + link, ok = parseWWWAutolink(p.raw, p.position) + if !ok { + return false } - } else if c == 'w' { - text = parseWWWAutolink(p.raw, p.position) - link = "http://" + text } - if text == "" { - return false - } + linkMarkdownPosition := relativeToAbsolutePosition(p.ranges, link.Position) + linkRange := Range{linkMarkdownPosition, linkMarkdownPosition + link.End - link.Position} p.inlines = append(p.inlines, &Autolink{ - Link: link, - Children: []Inline{&Text{Text: text}}, + Children: []Inline{ + &Text{ + Text: p.raw[link.Position:link.End], + Range: linkRange, + }, + }, + RawDestination: linkRange, + markdown: p.markdown, }) - p.position += len(text) + p.position += (link.End - link.Position) return true } -- cgit v1.2.3-1-g7c22