package html2rst import ( "bufio" "fmt" "golang.org/x/net/html" "io" "os" "strings" ) func StringToLines(s string) []string { var lines []string scanner := bufio.NewScanner(strings.NewReader(s)) for scanner.Scan() { lines = append(lines, scanner.Text()) } if err := scanner.Err(); err != nil { fmt.Fprintln(os.Stderr, "reading standard input:", err) } return lines } func indentEachLine(s string) string { lines := StringToLines(s) var indentedLines []string for _, line := range lines { indentedLines = append(indentedLines, " "+line) } return strings.Join(indentedLines, "\n") } func isAnchorElement(n *html.Node) bool { return n.Type == html.ElementNode && n.Data == "a" } func isUlElement(n *html.Node) bool { return n.Type == html.ElementNode && n.Data == "ul" } func isLiElement(n *html.Node) bool { return n.Type == html.ElementNode && n.Data == "li" } func isScriptElement(n *html.Node) bool { return n.Type == html.ElementNode && n.Data == "script" } func isImgElement(n *html.Node) bool { return n.Type == html.ElementNode && n.Data == "img" } func isTextNode(n *html.Node) bool { return n.Type == html.TextNode } func getAttribute(n *html.Node, key string) (string, bool) { for _, attr := range n.Attr { if attr.Key == key { return attr.Val, true } } return "", false } func textNode2rst(n *html.Node) string { text := strings.TrimSpace(n.Data) if text == "" { return "\n" } return n.Data } func a2rst(n *html.Node) string { if isImgElement(n.FirstChild) { rstText := img2rst(n.FirstChild) href, ok := getAttribute(n, "href") if ok { rstText += " :target: " rstText += href rstText += "\n" } return rstText } text := strings.TrimSpace(n.FirstChild.Data) href, ok := getAttribute(n, "href") if ok { return "`" + text + " <" + href + ">`__" } return "" } func li2rst(n *html.Node) string { rstText := "" for c := n.FirstChild; c != nil; c = c.NextSibling { if isTextNode(c) { rstText += textNode2rst(c) } if isAnchorElement(c) { rstText += a2rst(c) } if isUlElement(c) { rstText += "\n" rstText += indentEachLine(ul2rst(c)) } } return "- " + rstText + "\n" } func ul2rst(n *html.Node) string { rstText := "" for c := n.FirstChild; c != nil; c = c.NextSibling { if isLiElement(c) { rstText += li2rst(c) } } return rstText } func img2rst(n *html.Node) string { rstText := ".. image:: " src, ok := getAttribute(n, "src") if ok { rstText += src rstText += "\n" } else { rstText += "\n" } alt, ok := getAttribute(n, "alt") if ok { rstText += " :alt: " rstText += alt rstText += "\n" } class, ok := getAttribute(n, "class") if ok { if class == "align-center" { rstText += " :align: center\n" } } return rstText } func traverse(n *html.Node) string { if isTextNode(n) { return textNode2rst(n) } if isAnchorElement(n) { return a2rst(n) } if isImgElement(n) { return img2rst(n) } if isUlElement(n) { return ul2rst(n) } if isScriptElement(n) { return "" } rstText := "" for c := n.FirstChild; c != nil; c = c.NextSibling { rstText += traverse(c) } return rstText } func HtmlToRst(r io.Reader) string { doc, err := html.Parse(r) if err != nil { panic("Fail to parse html") } return traverse(doc) }