From c8d3e421391520af45d8e0f60a884fee139652a0 Mon Sep 17 00:00:00 2001 From: Harrison Healey Date: Tue, 24 Jul 2018 10:40:01 -0400 Subject: MM-11451 Added autolinking to Markdown parser (#9151) * MM-11451 Added autolinking to Markdown parser * Added missing headers * Added mailto and tel links --- utils/markdown/autolink.go | 253 ++++++++++++++++ utils/markdown/autolink_test.go | 617 ++++++++++++++++++++++++++++++++++++++ utils/markdown/commonmark_test.go | 43 +++ utils/markdown/html.go | 6 + utils/markdown/inlines.go | 76 ++++- utils/markdown/markdown.go | 8 + 6 files changed, 1002 insertions(+), 1 deletion(-) create mode 100644 utils/markdown/autolink.go create mode 100644 utils/markdown/autolink_test.go (limited to 'utils') diff --git a/utils/markdown/autolink.go b/utils/markdown/autolink.go new file mode 100644 index 000000000..16c40e609 --- /dev/null +++ b/utils/markdown/autolink.go @@ -0,0 +1,253 @@ +// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. +// See License.txt for license information. + +package markdown + +import ( + "regexp" + "strings" + "unicode" + "unicode/utf8" +) + +// Based off of extensions/autolink.c from https://github.com/github/cmark + +var ( + DefaultUrlSchemes = []string{"http", "https", "ftp", "mailto", "tel"} +) + +// Given a string with a w at the given position, tries to parse and return a link starting with "www." +// if one exists. If the text at the given position isn't a link, returns an empty string. Equivalent to +// www_match from the reference code. +func parseWWWAutolink(data string, position int) string { + // Check that this isn't part of another word + if position > 1 { + prevChar := data[position-1] + + if !isWhitespaceByte(prevChar) && !isAllowedBeforeWWWLink(prevChar) { + return "" + } + } + + // Check that this starts with www + if len(data)-position < 4 || !regexp.MustCompile(`^www\d{0,3}\.`).MatchString(data[position:]) { + return "" + } + + end := checkDomain(data[position:], false) + if end == 0 { + return "" + } + + end += position + + // Grab all text until the end of the string or the next whitespace character + for end < len(data) && !isWhitespaceByte(data[end]) { + end += 1 + } + + // Trim trailing punctuation + link := trimTrailingCharactersFromLink(data[position:end]) + if link == "" { + return "" + } + + return link +} + +func isAllowedBeforeWWWLink(c byte) bool { + switch c { + case '*', '_', '~', ')': + return true + default: + return false + } +} + +// Given a string with a : at the given position, tried to parse and return a link starting with a URL scheme +// if one exists. If the text around the given position isn't a link, returns an empty string. Equivalent to +// url_match from the reference code. +func parseURLAutolink(data string, position int) string { + // Check that a :// exists. This doesn't match the clients that treat the slashes as optional. + if len(data)-position < 4 || data[position+1] != '/' || data[position+2] != '/' { + return "" + } + + start := position - 1 + for start > 0 && isAlphanumericByte(data[start-1]) { + start -= 1 + } + + // Ensure that the URL scheme is allowed and that at least one character after the scheme is valid. + scheme := data[start:position] + if !isSchemeAllowed(scheme) || !isValidHostCharacter(data[position+3:]) { + return "" + } + + end := checkDomain(data[position+3:], true) + if end == 0 { + return "" + } + + end += position + + // Grab all text until the end of the string or the next whitespace character + for end < len(data) && !isWhitespaceByte(data[end]) { + end += 1 + } + + // Trim trailing punctuation + link := trimTrailingCharactersFromLink(data[start:end]) + if link == "" { + return "" + } + + return link +} + +func isSchemeAllowed(scheme string) bool { + // Note that this doesn't support the custom URL schemes implemented by the client + for _, allowed := range DefaultUrlSchemes { + if strings.EqualFold(allowed, scheme) { + return true + } + } + + return false +} + +// Given a string starting with a URL, returns the number of valid characters that make up the URL's domain. +// Returns 0 if the string doesn't start with a domain name. allowShort determines whether or not the domain +// needs to contain a period to be considered valid. Equivalent to check_domain from the reference code. +func checkDomain(data string, allowShort bool) int { + foundUnderscore := false + foundPeriod := false + + i := 1 + for ; i < len(data)-1; i++ { + if data[i] == '_' { + foundUnderscore = true + break + } else if data[i] == '.' { + foundPeriod = true + } else if !isValidHostCharacter(data[i:]) && data[i] != '-' { + break + } + } + + if foundUnderscore { + return 0 + } + + if allowShort { + // If allowShort is set, accept any string of valid domain characters + return i + } + + // If allowShort isn't set, a valid domain just requires at least a single period. Note that this + // logic isn't entirely necessary because we already know the string starts with "www." when + // this is called from parseWWWAutolink + if foundPeriod { + return i + } else { + return 0 + } +} + +// Returns true if the provided link starts with a valid character for a domain name. Equivalent to +// is_valid_hostchar from the reference code. +func isValidHostCharacter(link string) bool { + c, _ := utf8.DecodeRuneInString(link) + if c == utf8.RuneError { + return false + } + + return !unicode.IsSpace(c) && !unicode.IsPunct(c) +} + +// Removes any trailing characters such as punctuation or stray brackets that shouldn't be part of the link. +// Equivalent to autolink_delim from the reference code. +func trimTrailingCharactersFromLink(link string) string { + runes := []rune(link) + linkEnd := len(runes) + + // Cut off the link before an open angle bracket if it contains one + for i, c := range runes { + if c == '<' { + linkEnd = i + break + } + } + + for linkEnd > 0 { + c := runes[linkEnd-1] + + if !canEndAutolink(c) { + // Trim trailing quotes, periods, etc + linkEnd = linkEnd - 1 + } else if c == ';' { + // Trim a trailing HTML entity + newEnd := linkEnd - 2 + + for newEnd > 0 && ((runes[newEnd] >= 'a' && runes[newEnd] <= 'z') || (runes[newEnd] >= 'A' && runes[newEnd] <= 'Z')) { + newEnd -= 1 + } + + if newEnd < linkEnd-2 && runes[newEnd] == '&' { + linkEnd = newEnd + } else { + // This isn't actually an HTML entity, so just trim the semicolon + linkEnd = linkEnd - 1 + } + } else if c == ')' { + // Only allow an autolink ending with a bracket if that bracket is part of a matching pair of brackets. + // If there are more closing brackets than opening ones, remove the extra bracket + + numClosing := 0 + numOpening := 0 + + // Examples (input text => output linked portion): + // + // http://www.pokemon.com/Pikachu_(Electric) + // => http://www.pokemon.com/Pikachu_(Electric) + // + // http://www.pokemon.com/Pikachu_((Electric) + // => http://www.pokemon.com/Pikachu_((Electric) + // + // http://www.pokemon.com/Pikachu_(Electric)) + // => http://www.pokemon.com/Pikachu_(Electric) + // + // http://www.pokemon.com/Pikachu_((Electric)) + // => http://www.pokemon.com/Pikachu_((Electric)) + + for i := 0; i < linkEnd; i++ { + if runes[i] == '(' { + numOpening += 1 + } else if runes[i] == ')' { + numClosing += 1 + } + } + + if numClosing <= numOpening { + // There's fewer or equal closing brackets, so we've found the end of the link + break + } + + linkEnd -= 1 + } else { + // There's no special characters at the end of the link, so we're at the end + break + } + } + + return string(runes[:linkEnd]) +} + +func canEndAutolink(c rune) bool { + switch c { + case '?', '!', '.', ',', ':', '*', '_', '~', '\'', '"': + return false + default: + return true + } +} diff --git a/utils/markdown/autolink_test.go b/utils/markdown/autolink_test.go new file mode 100644 index 000000000..d0ea53fa4 --- /dev/null +++ b/utils/markdown/autolink_test.go @@ -0,0 +1,617 @@ +// Copyright (c) 2017-present Mattermost, Inc. All Rights Reserved. +// See License.txt for license information. + +package markdown + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseURLAutolink(t *testing.T) { + testCases := []struct { + Description string + Input string + Position int + Expected string + }{ + { + Description: "no link", + Input: "This is an :emoji:", + Position: 11, + Expected: "", + }, + { + Description: "no link 2", + Input: "These are two things: apple and orange", + Position: 20, + Expected: "", + }, + { + Description: "link with http", + Input: "http://example.com and some text", + Position: 4, + Expected: "http://example.com", + }, + { + Description: "link with https", + Input: "https://example.com and some text", + Position: 5, + Expected: "https://example.com", + }, + { + Description: "link with ftp", + Input: "ftp://example.com and some text", + Position: 3, + Expected: "ftp://example.com", + }, + { + Description: "link with a path", + Input: "https://example.com/abcd and some text", + Position: 5, + Expected: "https://example.com/abcd", + }, + { + Description: "link with parameters", + Input: "ftp://example.com/abcd?foo=bar and some text", + Position: 3, + Expected: "ftp://example.com/abcd?foo=bar", + }, + { + Description: "link, not at start", + Input: "This is https://example.com and some text", + Position: 13, + Expected: "https://example.com", + }, + { + Description: "link with a path, not at start", + Input: "This is also http://www.example.com/abcd and some text", + Position: 17, + Expected: "http://www.example.com/abcd", + }, + { + Description: "link with parameters, not at start", + Input: "These are https://www.example.com/abcd?foo=bar and some text", + Position: 15, + Expected: "https://www.example.com/abcd?foo=bar", + }, + { + Description: "link with trailing characters", + Input: "This is ftp://www.example.com??", + Position: 11, + Expected: "ftp://www.example.com", + }, + { + Description: "multiple links", + Input: "This is https://example.com/abcd and ftp://www.example.com/1234", + Position: 13, + Expected: "https://example.com/abcd", + }, + { + Description: "second of multiple links", + Input: "This is https://example.com/abcd and ftp://www.example.com/1234", + Position: 40, + Expected: "ftp://www.example.com/1234", + }, + { + Description: "link with brackets", + Input: "Go to ftp://www.example.com/my/page_(disambiguation) and some text", + Position: 9, + Expected: "ftp://www.example.com/my/page_(disambiguation)", + }, + { + Description: "link in brackets", + Input: "(https://www.example.com/foo/bar)", + Position: 6, + Expected: "https://www.example.com/foo/bar", + }, + { + Description: "link in underscores", + Input: "_http://www.example.com_", + Position: 5, + Expected: "http://www.example.com", + }, + { + Description: "link in asterisks", + Input: "This is **ftp://example.com**", + Position: 13, + Expected: "ftp://example.com", + }, + { + Description: "link in strikethrough", + Input: "Those were ~~https://example.com~~", + Position: 18, + Expected: "https://example.com", + }, + { + Description: "link with angle brackets", + Input: "We use http://example.com", + Position: 14, + Expected: "http://example.com", + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Description, func(t *testing.T) { + assert.Equal(t, testCase.Expected, parseURLAutolink(testCase.Input, testCase.Position)) + }) + } +} + +func TestParseWWWAutolink(t *testing.T) { + testCases := []struct { + Description string + Input string + Position int + Expected string + }{ + { + Description: "no link", + Input: "This is some text", + Position: 0, + Expected: "", + }, + { + Description: "link", + Input: "www.example.com and some text", + Position: 0, + Expected: "www.example.com", + }, + { + Description: "link with a path", + Input: "www.example.com/abcd and some text", + Position: 0, + Expected: "www.example.com/abcd", + }, + { + Description: "link with parameters", + Input: "www.example.com/abcd?foo=bar and some text", + Position: 0, + Expected: "www.example.com/abcd?foo=bar", + }, + { + Description: "link, not at start", + Input: "This is www.example.com and some text", + Position: 8, + Expected: "www.example.com", + }, + { + Description: "link with a path, not at start", + Input: "This is also www.example.com/abcd and some text", + Position: 13, + Expected: "www.example.com/abcd", + }, + { + Description: "link with parameters, not at start", + Input: "These are www.example.com/abcd?foo=bar and some text", + Position: 10, + Expected: "www.example.com/abcd?foo=bar", + }, + { + Description: "link with trailing characters", + Input: "This is www.example.com??", + Position: 8, + Expected: "www.example.com", + }, + { + Description: "link after current position", + Input: "This is some text and www.example.com", + Position: 0, + Expected: "", + }, + { + Description: "multiple links", + Input: "This is www.example.com/abcd and www.example.com/1234", + Position: 8, + Expected: "www.example.com/abcd", + }, + { + Description: "multiple links 2", + Input: "This is www.example.com/abcd and www.example.com/1234", + Position: 33, + Expected: "www.example.com/1234", + }, + { + Description: "link with brackets", + Input: "Go to www.example.com/my/page_(disambiguation) and some text", + Position: 6, + Expected: "www.example.com/my/page_(disambiguation)", + }, + { + Description: "link following other letters", + Input: "aaawww.example.com and some text", + Position: 3, + Expected: "", + }, + { + Description: "link in brackets", + Input: "(www.example.com)", + Position: 1, + Expected: "www.example.com", + }, + { + Description: "link in underscores", + Input: "_www.example.com_", + Position: 1, + Expected: "www.example.com", + }, + { + Description: "link in asterisks", + Input: "This is **www.example.com**", + Position: 10, + Expected: "www.example.com", + }, + { + Description: "link in strikethrough", + Input: "Those were ~~www.example.com~~", + Position: 13, + Expected: "www.example.com", + }, + { + Description: "using www1", + Input: "Our backup site is at www1.example.com/foo", + Position: 22, + Expected: "www1.example.com/foo", + }, + { + Description: "link with angle brackets", + Input: "We use www2.example.com", + Position: 10, + Expected: "www2.example.com", + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Description, func(t *testing.T) { + assert.Equal(t, testCase.Expected, parseWWWAutolink(testCase.Input, testCase.Position)) + }) + } +} + +func TestTrimTrailingCharactersFromLink(t *testing.T) { + testCases := []struct { + Input string + Expected string + }{ + { + Input: "http://www.example.com", + Expected: "http://www.example.com", + }, + { + Input: "http://www.example.com/abcd", + Expected: "http://www.example.com/abcd", + }, + { + Input: "http://www.example.com/abcd/", + Expected: "http://www.example.com/abcd/", + }, + { + Input: "http://www.example.com/1234", + Expected: "http://www.example.com/1234", + }, + { + Input: "http://www.example.com/abcd?foo=bar", + Expected: "http://www.example.com/abcd?foo=bar", + }, + { + Input: "http://www.example.com/abcd#heading", + Expected: "http://www.example.com/abcd#heading", + }, + { + Input: "http://www.example.com.", + Expected: "http://www.example.com", + }, + { + Input: "http://www.example.com,", + Expected: "http://www.example.com", + }, + { + Input: "http://www.example.com?", + Expected: "http://www.example.com", + }, + { + Input: "http://www.example.com)", + Expected: "http://www.example.com", + }, + { + Input: "http://www.example.com", + Expected: "http://www.example.com", + }, + { + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + }, + { + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation", + Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation", + }, + { + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation))", + Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)", + }, + { + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)", + Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation)_(disambiguation)", + }, + { + Input: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))", + Expected: "https://en.wikipedia.org/wiki/Dolphin_(disambiguation_(disambiguation))", + }, + { + Input: "http://www.example.com"", + Expected: "http://www.example.com", + }, + } + + for _, testCase := range testCases { + t.Run(testCase.Input, func(t *testing.T) { + assert.Equal(t, testCase.Expected, trimTrailingCharactersFromLink(testCase.Input)) + }) + } +} + +func TestAutolinking(t *testing.T) { + // These tests are adapted from https://github.com/mattermost/commonmark.js/test/mattermost.txt. + // It is missing tests for: + // 1. Links surrounded by emphasis (emphasis not implemented on the server) + // 2. IPv6 addresses (not implemented on the server or by GitHub) + // 3. Custom URL schemes (not implemented) + + for name, tc := range map[string]struct { + Markdown string + ExpectedHTML string + }{ + "valid-link-1": { + Markdown: `http://example.com`, + ExpectedHTML: `

http://example.com

`, + }, + "valid-link-2": { + Markdown: `https://example.com`, + ExpectedHTML: `

https://example.com

`, + }, + "valid-link-3": { + Markdown: `ftp://example.com`, + ExpectedHTML: `

ftp://example.com

`, + }, + // "valid-link-4": { + // Markdown: `ts3server://example.com?port=9001`, + // ExpectedHTML: `

ts3server://example.com?port=9001

`, + // }, + "valid-link-5": { + Markdown: `www.example.com`, + ExpectedHTML: `

www.example.com

`, + }, + "valid-link-6": { + Markdown: `www.example.com/index`, + ExpectedHTML: `

www.example.com/index

`, + }, + "valid-link-7": { + Markdown: `www.example.com/index.html`, + ExpectedHTML: `

www.example.com/index.html

`, + }, + "valid-link-8": { + Markdown: `http://example.com/index/sub`, + ExpectedHTML: `

http://example.com/index/sub

`, + }, + "valid-link-9": { + Markdown: `www1.example.com`, + ExpectedHTML: `

www1.example.com

`, + }, + "valid-link-10": { + Markdown: `https://en.wikipedia.org/wiki/URLs#Syntax`, + ExpectedHTML: `

https://en.wikipedia.org/wiki/URLs#Syntax

`, + }, + "valid-link-11": { + Markdown: `https://groups.google.com/forum/#!msg`, + ExpectedHTML: `

https://groups.google.com/forum/#!msg

`, + }, + "valid-link-12": { + Markdown: `www.example.com/index?params=1`, + ExpectedHTML: `

www.example.com/index?params=1

`, + }, + "valid-link-13": { + Markdown: `www.example.com/index?params=1&other=2`, + ExpectedHTML: `

www.example.com/index?params=1&other=2

`, + }, + "valid-link-14": { + Markdown: `www.example.com/index?params=1;other=2`, + ExpectedHTML: `

www.example.com/index?params=1;other=2

`, + }, + "valid-link-15": { + Markdown: `http://www.example.com/_/page`, + ExpectedHTML: `

http://www.example.com/_/page

`, + }, + "valid-link-16": { + Markdown: `https://en.wikipedia.org/wiki/🐬`, + ExpectedHTML: `

https://en.wikipedia.org/wiki/🐬

`, + }, + "valid-link-17": { + Markdown: `http://✪df.ws/1234`, + ExpectedHTML: `

http://✪df.ws/1234

`, + }, + "valid-link-18": { + Markdown: `https://groups.google.com/forum/#!msg`, + ExpectedHTML: `

https://groups.google.com/forum/#!msg

`, + }, + "valid-link-19": { + Markdown: `https://пример.срб/пример-26/`, + ExpectedHTML: `

https://пример.срб/пример-26/

`, + }, + "valid-link-20": { + Markdown: `mailto://test@example.com`, + ExpectedHTML: `

mailto://test@example.com

`, + }, + "valid-link-21": { + Markdown: `tel://555-123-4567`, + ExpectedHTML: `

tel://555-123-4567

`, + }, + + "ip-address-1": { + Markdown: `http://127.0.0.1`, + ExpectedHTML: `

http://127.0.0.1

`, + }, + "ip-address-2": { + Markdown: `http://192.168.1.1:4040`, + ExpectedHTML: `

http://192.168.1.1:4040

`, + }, + "ip-address-3": { + Markdown: `http://username:password@127.0.0.1`, + ExpectedHTML: `

http://username:password@127.0.0.1

`, + }, + "ip-address-4": { + Markdown: `http://username:password@[2001:0:5ef5:79fb:303a:62d5:3312:ff42]:80`, + ExpectedHTML: `

http://username:password@[2001:0:5ef5:79fb:303a:62d5:3312:ff42]:80

`, + }, + + "link-with-brackets-1": { + Markdown: `https://en.wikipedia.org/wiki/Rendering_(computer_graphics)`, + ExpectedHTML: `

https://en.wikipedia.org/wiki/Rendering_(computer_graphics)

`, + }, + "link-with-brackets-2": { + Markdown: `http://example.com/more_(than)_one_(parens)`, + ExpectedHTML: `

http://example.com/more_(than)_one_(parens)

`, + }, + "link-with-brackets-3": { + Markdown: `http://example.com/(something)?after=parens`, + ExpectedHTML: `

http://example.com/(something)?after=parens

`, + }, + "link-with-brackets-4": { + Markdown: `http://foo.com/unicode_(✪)_in_parens`, + ExpectedHTML: `

http://foo.com/unicode_(✪)_in_parens

`, + }, + + "inside-another-link-1": { + Markdown: `[www.example.com](https://example.com)`, + ExpectedHTML: `

www.example.com

`, + }, + "inside-another-link-2": { + Markdown: `[http://www.example.com](https://example.com)`, + ExpectedHTML: `

http://www.example.com

`, + }, + + "link-in-sentence-1": { + Markdown: `(http://example.com)`, + ExpectedHTML: `

(http://example.com)

`, + }, + "link-in-sentence-2": { + Markdown: `(see http://example.com)`, + ExpectedHTML: `

(see http://example.com)

`, + }, + "link-in-sentence-3": { + Markdown: `(http://example.com watch this)`, + ExpectedHTML: `

(http://example.com watch this)

`, + }, + "link-in-sentence-4": { + Markdown: `This is a sentence with a http://example.com in it.`, + ExpectedHTML: `

This is a sentence with a http://example.com in it.

`, + }, + "link-in-sentence-5": { + Markdown: `This is a sentence with a [link](http://example.com) in it.`, + ExpectedHTML: `

This is a sentence with a link in it.

`, + }, + "link-in-sentence-6": { + Markdown: `This is a sentence with a http://example.com/_/underscore in it.`, + ExpectedHTML: `

This is a sentence with a http://example.com/_/underscore in it.

`, + }, + "link-in-sentence-7": { + Markdown: `This is a sentence with a link (http://example.com) in it.`, + ExpectedHTML: `

This is a sentence with a link (http://example.com) in it.

`, + }, + "link-in-sentence-8": { + Markdown: `This is a sentence with a (https://en.wikipedia.org/wiki/Rendering_(computer_graphics)) in it.`, + ExpectedHTML: `

This is a sentence with a (https://en.wikipedia.org/wiki/Rendering_(computer_graphics)) in it.

`, + }, + "link-in-sentence-9": { + Markdown: `This is a sentence with a http://192.168.1.1:4040 in it.`, + ExpectedHTML: `

This is a sentence with a http://192.168.1.1:4040 in it.

`, + }, + "link-in-sentence-10": { + Markdown: `This is a link to http://example.com.`, + ExpectedHTML: `

This is a link to http://example.com.

`, + }, + "link-in-sentence-11": { + Markdown: `This is a link to http://example.com*`, + ExpectedHTML: `

This is a link to http://example.com*

`, + }, + "link-in-sentence-12": { + Markdown: `This is a link to http://example.com_`, + ExpectedHTML: `

This is a link to http://example.com_

`, + }, + "link-in-sentence-13": { + Markdown: `This is a link containing http://example.com/something?with,commas,in,url, but not at the end`, + ExpectedHTML: `

This is a link containing http://example.com/something?with,commas,in,url, but not at the end

`, + }, + "link-in-sentence-14": { + Markdown: `This is a question about a link http://example.com?`, + ExpectedHTML: `

This is a question about a link http://example.com?

`, + }, + + "plt-7250-link-with-trailing-periods-1": { + Markdown: `http://example.com.`, + ExpectedHTML: `

http://example.com.

`, + }, + "plt-7250-link-with-trailing-periods-2": { + Markdown: `http://example.com...`, + ExpectedHTML: `

http://example.com...

`, + }, + "plt-7250-link-with-trailing-periods-3": { + Markdown: `http://example.com/foo.`, + ExpectedHTML: `

http://example.com/foo.

`, + }, + "plt-7250-link-with-trailing-periods-4": { + Markdown: `http://example.com/foo...`, + ExpectedHTML: `

http://example.com/foo...

`, + }, + "plt-7250-link-with-trailing-periods-5": { + Markdown: `http://example.com/foo.bar`, + ExpectedHTML: `

http://example.com/foo.bar

`, + }, + "plt-7250-link-with-trailing-periods-6": { + Markdown: `http://example.com/foo...bar`, + ExpectedHTML: `

http://example.com/foo...bar

`, + }, + + "rn-319-www-link-as-part-of-word-1": { + Markdown: `testwww.example.com`, + ExpectedHTML: `

testwww.example.com

`, + }, + + "mm-10180-link-containing-period-followed-by-non-letter-1": { + Markdown: `https://example.com/123.+Pagetitle`, + ExpectedHTML: `

https://example.com/123.+Pagetitle

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-2": { + Markdown: `https://example.com/123.?Pagetitle`, + ExpectedHTML: `

https://example.com/123.?Pagetitle

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-3": { + Markdown: `https://example.com/123.-Pagetitle`, + ExpectedHTML: `

https://example.com/123.-Pagetitle

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-4": { + Markdown: `https://example.com/123._Pagetitle`, + ExpectedHTML: `

https://example.com/123._Pagetitle

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-5": { + Markdown: `https://example.com/123.+`, + ExpectedHTML: `

https://example.com/123.+

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-6": { + Markdown: `https://example.com/123.?`, + ExpectedHTML: `

https://example.com/123.?

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-7": { + Markdown: `https://example.com/123.-`, + ExpectedHTML: `

https://example.com/123.-

`, + }, + "mm-10180-link-containing-period-followed-by-non-letter-8": { + Markdown: `https://example.com/123._`, + ExpectedHTML: `

https://example.com/123._

`, + }, + } { + t.Run(name, func(t *testing.T) { + assert.Equal(t, tc.ExpectedHTML, RenderHTML(tc.Markdown)) + }) + } +} diff --git a/utils/markdown/commonmark_test.go b/utils/markdown/commonmark_test.go index 0a0959030..13e61f52d 100644 --- a/utils/markdown/commonmark_test.go +++ b/utils/markdown/commonmark_test.go @@ -999,3 +999,46 @@ func TestCommonMarkReferenceStrings(t *testing.T) { }) } } + +func TestCommonMarkRefernceAutolinks(t *testing.T) { + // These tests are adapted from the GitHub-flavoured CommonMark extension tests located at + // https://github.com/github/cmark/blob/master/test/extensions.txt + for name, tc := range map[string]struct { + Markdown string + ExpectedHTML string + }{ + "autolinks-1": { + Markdown: `: http://google.com https://google.com + +http://google.com/å + +www.github.com www.github.com/á + +www.google.com/a_b + +![http://inline.com/image](http://inline.com/image) + +Full stop outside parens shouldn't be included http://google.com/ok. + +(Full stop inside parens shouldn't be included http://google.com/ok.) + +"http://google.com" + +'http://google.com' + +http://🍄.ga/ http://x🍄.ga/`, + ExpectedHTML: `

: http://google.com https://google.com

http://google.com/å

www.github.com www.github.com/á

www.google.com/a_b

http://inline.com/image

Full stop outside parens shouldn't be included http://google.com/ok.

(Full stop inside parens shouldn't be included http://google.com/ok.)

"http://google.com"

'http://google.com'

http://🍄.ga/ http://x🍄.ga/

`, + }, + "autolinks-2": { + Markdown: `These should not link: + +* @a.b.c@. x +* n@. b`, + ExpectedHTML: `

These should not link:

`, + }, + } { + t.Run(name, func(t *testing.T) { + assert.Equal(t, tc.ExpectedHTML, RenderHTML(tc.Markdown)) + }) + } +} diff --git a/utils/markdown/html.go b/utils/markdown/html.go index 8d8e02c55..1a857afed 100644 --- a/utils/markdown/html.go +++ b/utils/markdown/html.go @@ -156,6 +156,12 @@ func RenderInlineHTML(inline Inline) (result string) { result += RenderInlineHTML(inline) } result += "" + case *Autolink: + result += `` + for _, inline := range v.Children { + result += RenderInlineHTML(inline) + } + result += "" default: panic(fmt.Sprintf("missing case for type %T", v)) } diff --git a/utils/markdown/inlines.go b/utils/markdown/inlines.go index 9198435ee..e6943a57d 100644 --- a/utils/markdown/inlines.go +++ b/utils/markdown/inlines.go @@ -81,6 +81,14 @@ type ReferenceImage struct { ReferenceLinkOrImage } +type Autolink struct { + inlineBase + + Children []Inline + + Link string +} + type delimiterType int const ( @@ -182,7 +190,7 @@ func (p *inlineParser) parseEscapeCharacter() { } func (p *inlineParser) parseText() { - if next := strings.IndexAny(p.raw[p.position:], "\r\n\\`&![]"); next == -1 { + if next := strings.IndexAny(p.raw[p.position:], "\r\n\\`&![]wW:"); next == -1 { absPos := relativeToAbsolutePosition(p.ranges, p.position) p.inlines = append(p.inlines, &Text{ Text: strings.TrimRightFunc(p.raw[p.position:], isWhitespace), @@ -198,6 +206,12 @@ func (p *inlineParser) parseText() { Range: Range{absPos, absPos + len(s)}, }) } else { + if next == 0 { + // Always read at least one character since 'w', 'W', and ':' may not actually match another + // type of node + next = 1 + } + p.inlines = append(p.inlines, &Text{ Text: p.raw[p.position : p.position+next], Range: Range{absPos, absPos + next}, @@ -443,6 +457,60 @@ func (p *inlineParser) parseCharacterReference() { } } +func (p *inlineParser) parseAutolink(c rune) bool { + for element := p.delimiterStack.Back(); element != nil; element = element.Prev() { + d := element.Value.(*delimiter) + if !d.IsInactive { + return false + } + } + + link := "" + text := "" + if c == ':' { + text = parseURLAutolink(p.raw, p.position) + link = text + + // Since the current position is at the colon, we have to rewind the parsing slightly so that + // we don't duplicate the URL scheme + rewind := strings.Index(text, ":") + if rewind != -1 { + lastInline := p.inlines[len(p.inlines)-1] + lastText, ok := lastInline.(*Text) + + if !ok { + // This should never occur since parseURLAutolink will only return a non-empty value + // when the previous text ends in a valid URL protocol which would mean that the previous + // node is a Text node + return false + } + + p.inlines = p.inlines[0 : len(p.inlines)-1] + p.inlines = append(p.inlines, &Text{ + Text: lastText.Text[:len(lastText.Text)-rewind], + Range: Range{lastText.Range.Position, lastText.Range.End - rewind}, + }) + p.position -= rewind + + } + } else if c == 'w' { + text = parseWWWAutolink(p.raw, p.position) + link = "http://" + text + } + + if text == "" { + return false + } + + p.inlines = append(p.inlines, &Autolink{ + Link: link, + Children: []Inline{&Text{Text: text}}, + }) + p.position += len(text) + + return true +} + func (p *inlineParser) Parse() []Inline { for _, r := range p.ranges { p.raw += p.markdown[r.Position:r.End] @@ -464,6 +532,12 @@ func (p *inlineParser) Parse() []Inline { p.parseLinkOrImageDelimiter() case ']': p.lookForLinkOrImage() + case 'w', 'W', ':': + matched := p.parseAutolink(c) + + if !matched { + p.parseText() + } default: p.parseText() } diff --git a/utils/markdown/markdown.go b/utils/markdown/markdown.go index 3061ba4bb..e0788d906 100644 --- a/utils/markdown/markdown.go +++ b/utils/markdown/markdown.go @@ -40,6 +40,14 @@ func isHexByte(c byte) bool { return isHex(rune(c)) } +func isAlphanumeric(c rune) bool { + return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} + +func isAlphanumericByte(c byte) bool { + return isAlphanumeric(rune(c)) +} + func nextNonWhitespace(markdown string, position int) int { for offset, c := range []byte(markdown[position:]) { if !isWhitespaceByte(c) { -- cgit v1.2.3-1-g7c22