// Package html2text converts parsed HTML documents into plain text.
package html2text

import (
	"bytes"
	"io"
	"io/ioutil"
	"regexp"
	"strings"
	"unicode"

	"github.com/dimchansky/utfbom"
	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

var (
	// spacingRe matches runs of spaces, carriage returns, newlines and tabs
	// inside a text node; each run is collapsed to a single space.
	spacingRe = regexp.MustCompile(`[ \r\n\t]+`)
	// newlineRe matches two or more consecutive newlines in the final output.
	newlineRe = regexp.MustCompile(`\n\n+`)
)

// textifyTraverseCtx carries the output buffer and the formatting state
// used while walking an HTML node tree.
type textifyTraverseCtx struct {
	// Buf accumulates the generated text.
	Buf bytes.Buffer

	// prefix is written right after every newline that emit outputs.
	prefix string
	// blockquoteLevel is the current <blockquote> nesting depth.
	blockquoteLevel int
	// lineLength counts what emit has written since the last newline.
	lineLength int
	// endsWithSpace reports whether the last emitted chunk ended in whitespace.
	endsWithSpace bool
	// endsWithNewline is not referenced anywhere in this file.
	endsWithNewline bool
	// justClosedDiv is set when a <div> has just been closed and cleared
	// whenever another element node is entered.
	justClosedDiv bool
}

// traverse renders node into the context buffer, dispatching on node type.
// Text nodes have their whitespace collapsed and trimmed before being
// emitted; element nodes go to handleElementNode; for any other node type
// only the children are visited.
func (ctx *textifyTraverseCtx) traverse(node *html.Node) error {
	switch node.Type {
	default:
		return ctx.traverseChildren(node)

	case html.TextNode:
		data := strings.Trim(spacingRe.ReplaceAllString(node.Data, " "), " ")
		return ctx.emit(data)

	case html.ElementNode:
		return ctx.handleElementNode(node)
	}
}

// handleElementNode renders a single element node according to its tag.
// It returns the first error produced by emit or by traversing children.
func (ctx *textifyTraverseCtx) handleElementNode(node *html.Node) error {
	ctx.justClosedDiv = false

	switch node.DataAtom {
	case atom.Br:
		return ctx.emit("\n")

	case atom.H1, atom.H2, atom.H3:
		// Render the heading text in a fresh context so its width can be
		// measured before the divider lines are built.
		subCtx := textifyTraverseCtx{}
		if err := subCtx.traverseChildren(node); err != nil {
			return err
		}

		str := subCtx.Buf.String()
		dividerLen := 0
		for _, line := range strings.Split(str, "\n") {
			// One rune is subtracted from each line's length when sizing
			// the divider.
			if lineLen := len([]rune(line)); lineLen-1 > dividerLen {
				dividerLen = lineLen - 1
			}
		}

		var divider string
		if node.DataAtom == atom.H1 {
			divider = strings.Repeat("*", dividerLen)
		} else {
			divider = strings.Repeat("-", dividerLen)
		}

		// H3 is only underlined; H1 and H2 get a divider above and below.
		if node.DataAtom == atom.H3 {
			return ctx.emit("\n\n" + str + "\n" + divider + "\n\n")
		}
		return ctx.emit("\n\n" + divider + "\n" + str + "\n" + divider + "\n\n")

	case atom.Blockquote:
		ctx.blockquoteLevel++
		ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel) + " "
		if err := ctx.emit("\n"); err != nil {
			return err
		}
		// The outermost blockquote is preceded by an extra newline.
		if ctx.blockquoteLevel == 1 {
			if err := ctx.emit("\n"); err != nil {
				return err
			}
		}
		if err := ctx.traverseChildren(node); err != nil {
			return err
		}
		ctx.blockquoteLevel--
		ctx.prefix = strings.Repeat(">", ctx.blockquoteLevel)
		if ctx.blockquoteLevel > 0 {
			ctx.prefix += " "
		}
		return ctx.emit("\n\n")

	case atom.Div:
		if ctx.lineLength > 0 {
			if err := ctx.emit("\n"); err != nil {
				return err
			}
		}
		if err := ctx.traverseChildren(node); err != nil {
			return err
		}
		// Skip the trailing newline when the last thing handled inside this
		// div was itself a closing div, which already emitted one.
		var err error
		if !ctx.justClosedDiv {
			err = ctx.emit("\n")
		}
		ctx.justClosedDiv = true
		return err

	case atom.Li:
		if err := ctx.emit("* "); err != nil {
			return err
		}
		if err := ctx.traverseChildren(node); err != nil {
			return err
		}
		return ctx.emit("\n")

	case atom.B, atom.Strong:
		// Starting with endsWithSpace set keeps emit from inserting a
		// separator space at the start of the sub-buffer.
		subCtx := textifyTraverseCtx{}
		subCtx.endsWithSpace = true
		if err := subCtx.traverseChildren(node); err != nil {
			return err
		}
		str := subCtx.Buf.String()
		return ctx.emit("*" + str + "*")

	case atom.A:
		// If image is the only child, take its alt text as the link text.
		if img := node.FirstChild; img != nil && node.LastChild == img && img.DataAtom == atom.Img {
			if altText := getAttrVal(img, "alt"); altText != "" {
				if err := ctx.emit(altText); err != nil {
					return err
				}
			}
		} else if err := ctx.traverseChildren(node); err != nil {
			return err
		}

		hrefLink := ""
		if attrVal := getAttrVal(node, "href"); attrVal != "" {
			attrVal = ctx.normalizeHrefLink(attrVal)
			if attrVal != "" {
				hrefLink = "( " + attrVal + " )"
			}
		}
		return ctx.emit(hrefLink)

	case atom.P, atom.Ul, atom.Table:
		if err := ctx.emit("\n\n"); err != nil {
			return err
		}
		if err := ctx.traverseChildren(node); err != nil {
			return err
		}
		return ctx.emit("\n\n")

	case atom.Tr:
		if err := ctx.traverseChildren(node); err != nil {
			return err
		}
		return ctx.emit("\n")

	case atom.Style, atom.Script, atom.Head:
		// Ignore the subtree.
		return nil

	default:
		return ctx.traverseChildren(node)
	}
}

// traverseChildren renders every child of node in document order and stops
// at the first error.
func (ctx *textifyTraverseCtx) traverseChildren(node *html.Node) error {
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if err := ctx.traverse(c); err != nil {
			return err
		}
	}
	return nil
}

// emit appends data to the buffer. Empty data is a no-op. The data is first
// split by breakLongLines; before each resulting chunk a single space is
// inserted unless the chunk starts with whitespace or the previous output
// ended with whitespace. After every newline written, the current prefix is
// written as well.
func (ctx *textifyTraverseCtx) emit(data string) error {
	if len(data) == 0 {
		return nil
	}

	for _, line := range ctx.breakLongLines(data) {
		// Chunks are never empty here: data is non-empty and breakLongLines
		// terminates every broken chunk with "\n".
		runes := []rune(line)
		startsWithSpace := unicode.IsSpace(runes[0])
		if !startsWithSpace && !ctx.endsWithSpace {
			ctx.Buf.WriteByte(' ')
			ctx.lineLength++
		}
		ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])

		for _, c := range line {
			if _, err := ctx.Buf.WriteRune(c); err != nil {
				return err
			}
			ctx.lineLength++
			if c == '\n' {
				ctx.lineLength = 0
				if ctx.prefix != "" {
					if _, err := ctx.Buf.WriteString(ctx.prefix); err != nil {
						return err
					}
				}
			}
		}
	}
	return nil
}

// breakLongLines splits data into chunks for emit. Outside of blockquotes
// the data is returned unchanged as a single chunk. Inside a blockquote the
// data is broken at whitespace so that, counting what is already on the
// current line, chunks aim to stay within maxLineLen runes; every chunk
// except possibly the last ends with "\n".
func (ctx *textifyTraverseCtx) breakLongLines(data string) []string {
	const maxLineLen = 74

	// Only break lines when we are in blockquotes.
	if ctx.blockquoteLevel == 0 {
		return []string{data}
	}

	var ret []string
	runes := []rune(data)
	l := len(runes)
	existing := ctx.lineLength
	if existing >= maxLineLen {
		// The current line is already full: start a new one first.
		ret = append(ret, "\n")
		existing = 0
	}

	for l+existing > maxLineLen {
		// Search backwards from the limit for a whitespace rune to break at.
		i := maxLineLen - existing
		for i >= 0 && !unicode.IsSpace(runes[i]) {
			i--
		}
		if i == -1 {
			// No spaces, so go the other way.
			i = maxLineLen - existing
			for i < l && !unicode.IsSpace(runes[i]) {
				i++
			}
		}
		ret = append(ret, string(runes[:i])+"\n")

		// Drop the whitespace run at the break point.
		for i < l && unicode.IsSpace(runes[i]) {
			i++
		}
		runes = runes[i:]
		l = len(runes)
		existing = 0
	}

	if len(runes) > 0 {
		ret = append(ret, string(runes))
	}
	return ret
}

// normalizeHrefLink trims surrounding whitespace from link and strips a
// leading "mailto:" prefix.
func (ctx *textifyTraverseCtx) normalizeHrefLink(link string) string {
	link = strings.TrimSpace(link)
	link = strings.TrimPrefix(link, "mailto:")
	return link
}

// getAttrVal returns the value of the first attribute of node whose key
// equals attrName, or "" when there is no such attribute.
func getAttrVal(node *html.Node, attrName string) string {
	for _, attr := range node.Attr {
		if attr.Key == attrName {
			return attr.Val
		}
	}
	return ""
}

// FromHtmlNode renders the node tree rooted at doc as plain text. In the
// result, a space directly after a newline is removed, runs of two or more
// newlines are collapsed to exactly two, and surrounding whitespace is
// trimmed.
func FromHtmlNode(doc *html.Node) (string, error) {
	ctx := textifyTraverseCtx{
		Buf: bytes.Buffer{},
	}
	if err := ctx.traverse(doc); err != nil {
		return "", err
	}

	text := strings.TrimSpace(newlineRe.ReplaceAllString(
		strings.ReplaceAll(ctx.Buf.String(), "\n ", "\n"), "\n\n"))
	return text, nil
}

// FromReader reads all of reader, skips a leading byte order mark, parses
// the remainder as HTML and renders it as plain text. It returns an error
// if reading or parsing fails.
func FromReader(reader io.Reader) (string, error) {
	bs, err := ioutil.ReadAll(reader)
	if err != nil {
		return "", err
	}

	// The second result of utfbom.Skip is the detected encoding, which is
	// not needed here.
	newReader, _ := utfbom.Skip(bytes.NewReader(bs))
	doc, err := html.Parse(newReader)
	if err != nil {
		return "", err
	}
	return FromHtmlNode(doc)
}

// FromString renders the HTML document held in input as plain text.
func FromString(input string) (string, error) {
	bs := utfbom.SkipOnly(bytes.NewReader([]byte(input)))
	return FromReader(bs)
}