package main
import (
"context"
"flag"
"fmt"
"os"
"runtime/pprof"
"strings"
"sync"
"time"
"github.com/zoomio/tagify"
"github.com/zoomio/tagify/config"
)
var (
version = "tip"
source = flag.String("s", "", "source, could be URL (e.g. http://... and https://...) or file path")
lang = flag.String("lang", "", "language of the source, e.g. \"en\"")
// headless
query = flag.String("q", "", "DOM CSS query, e.g. `-q p` will fetch contents of all
tags from the given source")
ready = flag.String("r", "", "DOM CSS query, waits until certain element available, but fetches contents of the whole HTML document")
until = flag.Duration("u", 0, "duration to wait before getting HTML contents, handy for SPAs, because they keep loading in browsers for some time")
img = flag.String("i", "", "enables capturing screenshot in the provided path")
ua = flag.String("ua", "", "provide a custom user agent for headless HTTP calls")
limit = flag.Int("l", 5, "number of tags to return")
verbose = flag.Bool("v", false, "enables verbose mode")
contentType = flag.String("t", tagify.Unknown.String(), fmt.Sprintf("content type of the source, allowed values: %s", strings.Join(config.ContentTypes[:], ", ")))
noStopWords = flag.Bool("no-stop", true, "removes stop-words from results (see https://github.com/zoomio/stopwords)")
contentOnly = flag.Bool("content", true, "tagify only content")
// weighing
tagWeights = flag.String("tag-weights", "", "string with the custom tag weights for HTML & Markdown tagging in the form of :|:")
tagWeightsJSON = flag.String("tag-weights-json", "", "JSON file with the custom tag weights for HTML & Markdown tagging in the form of { \"\": , \"\": }")
adjustScores = flag.Bool("adjust-scores", false, "adjusts tags score to the interval 0.0 to 1.0")
extraTagWeights = flag.String("extra-tag-weights", "", "string with the additional tag weights for HTML & Markdown tagging in the form of :|:")
extraTagWeightsJSON = flag.String("extra-tag-weights-json", "", "JSON file with the additional tag weights for HTML & Markdown tagging in the form of { \"\": , \"\": }")
// EXPERIMENTAL
fullSite = flag.Bool("site", false, "[EXPERIMENTAL] might not be included in next releases: allows to tagify full site (HTML only)")
// Utility
cpuprofile = flag.String("cpuprofile", "", "write cpu profile to file")
ver = flag.Bool("version", false, "prints version of Tagify")
)
func main() {
flag.Parse()
if *ver {
fmt.Println(version)
return
}
if *cpuprofile != "" {
f, err := os.Create(*cpuprofile)
if err != nil {
fmt.Fprintf(os.Stderr, "%v\n", err)
os.Exit(3)
}
err = pprof.StartCPUProfile(f)
if err != nil {
fmt.Fprintf(os.Stderr, "error in profiling: %v\n", err)
os.Exit(3)
}
defer pprof.StopCPUProfile()
}
options := []tagify.Option{
tagify.TargetType(tagify.ContentTypeOf(*contentType)),
tagify.Limit(*limit),
}
if *source != "" {
options = append(options, tagify.Source(*source))
}
if *lang != "" {
options = append(options, tagify.Language(*lang))
}
// headless
if len(*query) > 0 {
options = append(options, tagify.Query(*query))
}
if len(*ready) > 0 {
options = append(options, tagify.WaitFor(*ready))
}
if *until > 0 {
options = append(options, tagify.WaitUntil(*until))
}
if len(*img) > 0 {
options = append(options, tagify.Screenshot(true))
}
if *ua != "" {
options = append(options, tagify.UserAgent(*ua))
}
if *verbose {
options = append(options, tagify.Verbose(*verbose))
}
if *noStopWords {
options = append(options, tagify.NoStopWords(*noStopWords))
}
if *contentOnly {
options = append(options, tagify.ContentOnly(*contentOnly))
}
if *fullSite {
options = append(options, tagify.FullSite(*fullSite))
}
if *tagWeights != "" {
options = append(options, tagify.TagWeightsString(*tagWeights))
} else if *tagWeightsJSON != "" {
options = append(options, tagify.TagWeightsJSON(*tagWeightsJSON))
}
if *adjustScores {
options = append(options, tagify.AdjustScores(*adjustScores))
}
if *extraTagWeights != "" {
options = append(options, tagify.ExtraTagWeightsString(*extraTagWeights))
} else if *extraTagWeightsJSON != "" {
options = append(options, tagify.ExtraTagWeightsJSON(*extraTagWeightsJSON))
}
// print progress spinner to terminal
stopCh := make(chan struct{})
var wg sync.WaitGroup
if !*verbose {
wg.Add(1)
go shellSpinner(stopCh, &wg)
}
res, err := tagify.Run(context.Background(), options...)
close(stopCh)
wg.Wait()
if err != nil {
if *verbose {
fmt.Fprintf(os.Stderr, "failed to get tags: %v\n", err)
}
os.Exit(2)
}
if len(*img) > 0 && len(res.Meta.Screenshot) > 0 {
err = os.WriteFile(*img, res.Meta.Screenshot, 0644)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to store captured screenshot at %s: %v\n", *img, err)
os.Exit(3)
}
}
if res.RawLen() == 0 {
fmt.Println("found 0 tags")
return
}
if *verbose {
fmt.Printf("title: %s\n", res.Meta.DocTitle)
fmt.Printf("hash: %s\n", res.Meta.DocHash)
fmt.Printf("content-type: %s\n", res.Meta.ContentType)
println()
}
prfx := ""
if !*verbose {
prfx = "\r"
}
fmt.Fprintf(os.Stdout, "%s%s\n", prfx, strings.Join(res.TagsStrings(), " "))
}
// shellSpinner draws a rotating progress indicator on stdout until stopCh is
// closed or receives a value.
//
// Every 80ms it rewrites the current terminal line with "processing... "
// followed by the next of the frames |, \, -, /. It calls wg.Done exactly once
// when it returns, so the caller is expected to have called wg.Add(1) before
// starting it.
func shellSpinner(stopCh chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()

	ticker := time.NewTicker(80 * time.Millisecond)
	// Release the ticker once the spinner is no longer needed.
	defer ticker.Stop()

	frames := []string{"|", "\\", "-", "/"}
	for i := 0; ; i = (i + 1) % len(frames) {
		select {
		case <-stopCh:
			return
		case <-ticker.C:
			// The leading carriage return redraws the spinner in place.
			fmt.Fprintf(os.Stdout, "\rprocessing... %s", frames[i])
		}
	}
}