diff options
Diffstat (limited to 'vendor/github.com/jaytaylor/html2text')
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/.gitignore | 24 | ||||
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/.travis.yml | 14 | ||||
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/LICENSE | 22 | ||||
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/README.md | 137 | ||||
-rw-r--r-- | vendor/github.com/jaytaylor/html2text/html2text.go | 473 |
5 files changed, 670 insertions, 0 deletions
diff --git a/vendor/github.com/jaytaylor/html2text/.gitignore b/vendor/github.com/jaytaylor/html2text/.gitignore new file mode 100644 index 000000000..daf913b1b --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/.gitignore @@ -0,0 +1,24 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +*.prof diff --git a/vendor/github.com/jaytaylor/html2text/.travis.yml b/vendor/github.com/jaytaylor/html2text/.travis.yml new file mode 100644 index 000000000..6c7f48efd --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/.travis.yml @@ -0,0 +1,14 @@ +language: go +go: + - tip + - 1.8 + - 1.7 + - 1.6 + - 1.5 + - 1.4 + - 1.3 + - 1.2 +notifications: + email: + on_success: change + on_failure: always diff --git a/vendor/github.com/jaytaylor/html2text/LICENSE b/vendor/github.com/jaytaylor/html2text/LICENSE new file mode 100644 index 000000000..24dc4abec --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/LICENSE @@ -0,0 +1,22 @@ +The MIT License (MIT) + +Copyright (c) 2015 Jay Taylor + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/vendor/github.com/jaytaylor/html2text/README.md b/vendor/github.com/jaytaylor/html2text/README.md new file mode 100644 index 000000000..34b87f7a7 --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/README.md @@ -0,0 +1,137 @@ +# html2text + +[![Documentation](https://godoc.org/github.com/jaytaylor/html2text?status.svg)](https://godoc.org/github.com/jaytaylor/html2text) +[![Build Status](https://travis-ci.org/jaytaylor/html2text.svg?branch=master)](https://travis-ci.org/jaytaylor/html2text) +[![Report Card](https://goreportcard.com/badge/github.com/jaytaylor/html2text)](https://goreportcard.com/report/github.com/jaytaylor/html2text) + +### Converts HTML into text of the markdown-flavored variety + + +## Introduction + +Ensure your emails are readable by all! + +Turns HTML into raw text, useful for sending fancy HTML emails with an equivalently nicely formatted TXT document as a fallback (e.g. for people who don't allow HTML emails or have other display issues). + +html2text is a simple golang package for rendering HTML into plaintext. + +There are still lots of improvements to be had, but FWIW this has worked fine for my [basic] HTML-2-text needs. + +It requires go 1.x or newer ;) + + +## Download the package + +```bash +go get github.com/jaytaylor/html2text +``` + +## Example usage + +```go +package main + +import ( + "fmt" + + "github.com/jaytaylor/html2text" +) + +func main() { + inputHTML := ` +<html> + <head> + <title>My Mega Service</title> + <link rel=\"stylesheet\" href=\"main.css\"> + <style type=\"text/css\">body { color: #fff; }</style> + </head> + + <body> + <div class="logo"> + <a href="http://jaytaylor.com/"><img src="/logo-image.jpg" alt="Mega Service"/></a> + </div> + + <h1>Welcome to your new account on my service!</h1> + + <p> + Here is some more information: + + <ul> + <li>Link 1: <a href="https://example.com">Example.com</a></li> + <li>Link 2: <a href="https://example2.com">Example2.com</a></li> + <li>Something else</li> + </ul> + </p> + + <table> + <thead> + <tr><th>Header 1</th><th>Header 2</th></tr> + </thead> + <tfoot> + <tr><td>Footer 1</td><td>Footer 2</td></tr> + </tfoot> + <tbody> + <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr> + <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr> + </tbody> + </table> + </body> +</html>` + + text, err := html2text.FromString(inputHTML, html2text.Options{PrettyTables: true}) + if err != nil { + panic(err) + } + fmt.Println(text) +} +``` + +Output: +``` +Mega Service ( http://jaytaylor.com/ ) + +****************************************** +Welcome to your new account on my service! +****************************************** + +Here is some more information: + +* Link 1: Example.com ( https://example.com ) +* Link 2: Example2.com ( https://example2.com ) +* Something else + ++-------------+-------------+ +| HEADER 1 | HEADER 2 | ++-------------+-------------+ +| Row 1 Col 1 | Row 1 Col 2 | +| Row 2 Col 1 | Row 2 Col 2 | ++-------------+-------------+ +| FOOTER 1 | FOOTER 2 | ++-------------+-------------+ +``` + + +## Unit-tests + +Running the unit-tests is straightforward and standard: + +```bash +go test +``` + + +# License + +Permissive MIT license. + + +## Contact + +You are more than welcome to open issues and send pull requests if you find a bug or want a new feature. + +If you appreciate this library please feel free to drop me a line and tell me! It's always nice to hear from people who have benefitted from my work. + +Email: jay at (my github username).com + +Twitter: [@jtaylor](https://twitter.com/jtaylor) + diff --git a/vendor/github.com/jaytaylor/html2text/html2text.go b/vendor/github.com/jaytaylor/html2text/html2text.go new file mode 100644 index 000000000..fa3699097 --- /dev/null +++ b/vendor/github.com/jaytaylor/html2text/html2text.go @@ -0,0 +1,473 @@ +package html2text + +import ( + "bytes" + "io" + "regexp" + "strings" + "unicode" + + "github.com/olekukonko/tablewriter" + "github.com/ssor/bom" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +// Options provide toggles and overrides to control specific rendering behaviors. +type Options struct { + PrettyTables bool // Turns on pretty ASCII rendering for table elements. + OmitLinks bool // Turns on omitting links +} + +// FromHTMLNode renders text output from a pre-parsed HTML document. +func FromHTMLNode(doc *html.Node, o ...Options) (string, error) { + var options Options + if len(o) > 0 { + options = o[0] + } + + ctx := textifyTraverseContext{ + buf: bytes.Buffer{}, + options: options, + } + if err := ctx.traverse(doc); err != nil { + return "", err + } + + text := strings.TrimSpace(newlineRe.ReplaceAllString( + strings.Replace(ctx.buf.String(), "\n ", "\n", -1), "\n\n"), + ) + return text, nil +} + +// FromReader renders text output after parsing HTML for the specified +// io.Reader. +func FromReader(reader io.Reader, options ...Options) (string, error) { + newReader, err := bom.NewReaderWithoutBom(reader) + if err != nil { + return "", err + } + doc, err := html.Parse(newReader) + if err != nil { + return "", err + } + return FromHTMLNode(doc, options...) +} + +// FromString parses HTML from the input string, then renders the text form. +func FromString(input string, options ...Options) (string, error) { + bs := bom.CleanBom([]byte(input)) + text, err := FromReader(bytes.NewReader(bs), options...) + if err != nil { + return "", err + } + return text, nil +} + +var ( + spacingRe = regexp.MustCompile(`[ \r\n\t]+`) + newlineRe = regexp.MustCompile(`\n\n+`) +) + +// traverseTableCtx holds text-related context. +type textifyTraverseContext struct { + buf bytes.Buffer + + prefix string + tableCtx tableTraverseContext + options Options + endsWithSpace bool + justClosedDiv bool + blockquoteLevel int + lineLength int + isPre bool +} + +// tableTraverseContext holds table ASCII-form related context. +type tableTraverseContext struct { + header []string + body [][]string + footer []string + tmpRow int + isInFooter bool +} + +func (tableCtx *tableTraverseContext) init() { + tableCtx.body = [][]string{} + tableCtx.header = []string{} + tableCtx.footer = []string{} + tableCtx.isInFooter = false + tableCtx.tmpRow = 0 +} + +func (ctx *textifyTraverseContext) handleElement(node *html.Node) error { + ctx.justClosedDiv = false + + switch node.DataAtom { + case atom.Br: + return ctx.emit("\n") + + case atom.H1, atom.H2, atom.H3: + subCtx := textifyTraverseContext{} + if err := subCtx.traverseChildren(node); err != nil { + return err + } + + str := subCtx.buf.String() + dividerLen := 0 + for _, line := range strings.Split(str, "\n") { + if lineLen := len([]rune(line)); lineLen-1 > dividerLen { + dividerLen = lineLen - 1 + } + } + var divider string + if node.DataAtom == atom.H1 { + divider = strings.Repeat("*", dividerLen) + } else { + divider = strings.Repeat("-", dividerLen) + } + + if node.DataAtom == atom.H3 { + return ctx.emit("\n\n" + str + "\n" + divider + "\n\n") + } + return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n") + + case atom.Blockquote: + ctx.blockquoteLevel++ + ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " " + if err := ctx.emit("\n"); err != nil { + return err + } + if ctx.blockquoteLevel == 1 { + if err := ctx.emit("\n"); err != nil { + return err + } + } + if err := ctx.traverseChildren(node); err != nil { + return err + } + ctx.blockquoteLevel-- + ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + if ctx.blockquoteLevel > 0 { + ctx.prefix += " " + } + return ctx.emit("\n\n") + + case atom.Div: + if ctx.lineLength > 0 { + if err := ctx.emit("\n"); err != nil { + return err + } + } + if err := ctx.traverseChildren(node); err != nil { + return err + } + var err error + if !ctx.justClosedDiv { + err = ctx.emit("\n") + } + ctx.justClosedDiv = true + return err + + case atom.Li: + if err := ctx.emit("* "); err != nil { + return err + } + + if err := ctx.traverseChildren(node); err != nil { + return err + } + + return ctx.emit("\n") + + case atom.B, atom.Strong: + subCtx := textifyTraverseContext{} + subCtx.endsWithSpace = true + if err := subCtx.traverseChildren(node); err != nil { + return err + } + str := subCtx.buf.String() + return ctx.emit("*" + str + "*") + + case atom.A: + linkText := "" + // For simple link element content with single text node only, peek at the link text. + if node.FirstChild != nil && node.FirstChild.NextSibling == nil && node.FirstChild.Type == html.TextNode { + linkText = node.FirstChild.Data + } + + // If image is the only child, take its alt text as the link text. + if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img { + if altText := getAttrVal(img, "alt"); altText != "" { + if err := ctx.emit(altText); err != nil { + return err + } + } + } else if err := ctx.traverseChildren(node); err != nil { + return err + } + + hrefLink := "" + if attrVal := getAttrVal(node, "href"); attrVal != "" { + attrVal = ctx.normalizeHrefLink(attrVal) + // Don't print link href if it matches link element content or if the link is empty. + if !ctx.options.OmitLinks && attrVal != "" && linkText != attrVal { + hrefLink = "( " + attrVal + " )" + } + } + + return ctx.emit(hrefLink) + + case atom.P, atom.Ul: + return ctx.paragraphHandler(node) + + case atom.Table, atom.Tfoot, atom.Th, atom.Tr, atom.Td: + if ctx.options.PrettyTables { + return ctx.handleTableElement(node) + } else if node.DataAtom == atom.Table { + return ctx.paragraphHandler(node) + } + return ctx.traverseChildren(node) + + case atom.Pre: + ctx.isPre = true + err := ctx.traverseChildren(node) + ctx.isPre = false + return err + + case atom.Style, atom.Script, atom.Head: + // Ignore the subtree. + return nil + + default: + return ctx.traverseChildren(node) + } +} + +// paragraphHandler renders node children surrounded by double newlines. +func (ctx *textifyTraverseContext) paragraphHandler(node *html.Node) error { + if err := ctx.emit("\n\n"); err != nil { + return err + } + if err := ctx.traverseChildren(node); err != nil { + return err + } + return ctx.emit("\n\n") +} + +// handleTableElement is only to be invoked when options.PrettyTables is active. +func (ctx *textifyTraverseContext) handleTableElement(node *html.Node) error { + if !ctx.options.PrettyTables { + panic("handleTableElement invoked when PrettyTables not active") + } + + switch node.DataAtom { + case atom.Table: + if err := ctx.emit("\n\n"); err != nil { + return err + } + + // Re-intialize all table context. + ctx.tableCtx.init() + + // Browse children, enriching context with table data. + if err := ctx.traverseChildren(node); err != nil { + return err + } + + buf := &bytes.Buffer{} + table := tablewriter.NewWriter(buf) + table.SetHeader(ctx.tableCtx.header) + table.SetFooter(ctx.tableCtx.footer) + table.AppendBulk(ctx.tableCtx.body) + + // Render the table using ASCII. + table.Render() + if err := ctx.emit(buf.String()); err != nil { + return err + } + + return ctx.emit("\n\n") + + case atom.Tfoot: + ctx.tableCtx.isInFooter = true + if err := ctx.traverseChildren(node); err != nil { + return err + } + ctx.tableCtx.isInFooter = false + + case atom.Tr: + ctx.tableCtx.body = append(ctx.tableCtx.body, []string{}) + if err := ctx.traverseChildren(node); err != nil { + return err + } + ctx.tableCtx.tmpRow++ + + case atom.Th: + res, err := ctx.renderEachChild(node) + if err != nil { + return err + } + + ctx.tableCtx.header = append(ctx.tableCtx.header, res) + + case atom.Td: + res, err := ctx.renderEachChild(node) + if err != nil { + return err + } + + if ctx.tableCtx.isInFooter { + ctx.tableCtx.footer = append(ctx.tableCtx.footer, res) + } else { + ctx.tableCtx.body[ctx.tableCtx.tmpRow] = append(ctx.tableCtx.body[ctx.tableCtx.tmpRow], res) + } + + } + return nil +} + +func (ctx *textifyTraverseContext) traverse(node *html.Node) error { + switch node.Type { + default: + return ctx.traverseChildren(node) + + case html.TextNode: + var data string + if ctx.isPre { + data = node.Data + } else { + data = strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ") + } + return ctx.emit(data) + + case html.ElementNode: + return ctx.handleElement(node) + } +} + +func (ctx *textifyTraverseContext) traverseChildren(node *html.Node) error { + for c := node.FirstChild; c != nil; c = c.NextSibling { + if err := ctx.traverse(c); err != nil { + return err + } + } + + return nil +} + +func (ctx *textifyTraverseContext) emit(data string) error { + if data == "" { + return nil + } + var ( + lines = ctx.breakLongLines(data) + err error + ) + for _, line := range lines { + runes := []rune(line) + startsWithSpace := unicode.IsSpace(runes[0]) + if !startsWithSpace && !ctx.endsWithSpace && !strings.HasPrefix(data, ".") { + if err = ctx.buf.WriteByte(' '); err != nil { + return err + } + ctx.lineLength++ + } + ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1]) + for _, c := range line { + if _, err = ctx.buf.WriteString(string(c)); err != nil { + return err + } + ctx.lineLength++ + if c == '\n' { + ctx.lineLength = 0 + if ctx.prefix != "" { + if _, err = ctx.buf.WriteString(ctx.prefix); err != nil { + return err + } + } + } + } + } + return nil +} + +const maxLineLen = 74 + +func (ctx *textifyTraverseContext) breakLongLines(data string) []string { + // Only break lines when in blockquotes. + if ctx.blockquoteLevel == 0 { + return []string{data} + } + var ( + ret = []string{} + runes = []rune(data) + l = len(runes) + existing = ctx.lineLength + ) + if existing >= maxLineLen { + ret = append(ret, "\n") + existing = 0 + } + for l+existing > maxLineLen { + i := maxLineLen - existing + for i >= 0 && !unicode.IsSpace(runes[i]) { + i-- + } + if i == -1 { + // No spaces, so go the other way. + i = maxLineLen - existing + for i < l && !unicode.IsSpace(runes[i]) { + i++ + } + } + ret = append(ret, string(runes[:i])+"\n") + for i < l && unicode.IsSpace(runes[i]) { + i++ + } + runes = runes[i:] + l = len(runes) + existing = 0 + } + if len(runes) > 0 { + ret = append(ret, string(runes)) + } + return ret +} + +func (ctx *textifyTraverseContext) normalizeHrefLink(link string) string { + link = strings.TrimSpace(link) + link = strings.TrimPrefix(link, "mailto:") + return link +} + +// renderEachChild visits each direct child of a node and collects the sequence of +// textuual representaitons separated by a single newline. +func (ctx *textifyTraverseContext) renderEachChild(node *html.Node) (string, error) { + buf := &bytes.Buffer{} + for c := node.FirstChild; c != nil; c = c.NextSibling { + s, err := FromHTMLNode(c, ctx.options) + if err != nil { + return "", err + } + if _, err = buf.WriteString(s); err != nil { + return "", err + } + if c.NextSibling != nil { + if err = buf.WriteByte('\n'); err != nil { + return "", err + } + } + } + return buf.String(), nil +} + +func getAttrVal(node *html.Node, attrName string) string { + for _, attr := range node.Attr { + if attr.Key == attrName { + return attr.Val + } + } + + return "" +} |