/* * Summarize images in a corpus of PDF files. For each PDF file, passes through each page, goes * through the content stream and finds instances of both XObject Images and inline images. Also * handles images referred within XObject Form content streams. * Outputs a summary of the images found. * * Run as: go run pdf_summarize_images.go ~/testdata/*.pdf */ package main import ( "encoding/csv" "flag" "fmt" "os" "path/filepath" "sort" "strings" "time" "github.com/unidoc/unipdf/v4/common" "github.com/unidoc/unipdf/v4/common/license" "github.com/unidoc/unipdf/v4/contentstream" "github.com/unidoc/unipdf/v4/core" "github.com/unidoc/unipdf/v4/model" ) func init() { // Make sure to load your metered License API key prior to using the library. // If you need a key, you can sign up and create a free one at https://cloud.unidoc.io err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)) if err != nil { panic(err) } } const usage = "Usage: go run pdf_summarize_images.go testdata/*.pdf\n" func main() { var debug, trace bool flag.BoolVar(&debug, "d", false, "Print debugging information.") flag.BoolVar(&trace, "e", false, "Print detailed debugging information.") doSort := true var byDoc, noDims bool var csvPath string flag.StringVar(&csvPath, "o", "results.csv", "CSV results file.") flag.BoolVar(&byDoc, "p", false, "No page numbers specified in CSV file rows.") flag.BoolVar(&noDims, "w", false, "No widths and heights specified in CSV file rows.") makeUsage(usage) flag.Parse() args := flag.Args() if len(args) < 1 { flag.Usage() os.Exit(1) } if trace { common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace)) } else if debug { common.SetLogger(common.NewConsoleLogger(common.LogLevelDebug)) } else { common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo)) } corpus := args[:] if len(corpus) > 1000 { corpus = corpus[:1000] } sort.Slice(corpus, func(i, j int) bool { fi, fj := corpus[i], corpus[j] si, sj := fileSizeMB(fi), fileSizeMB(fj) if si != sj { return si < sj } return fi < fj }) corpusInfo := map[string][]imageInfo{} for i, inputPath := range corpus { fmt.Fprintf(os.Stderr, "%4d of %d %q %.1f MB,", i, len(corpus), filepath.Base(inputPath), fileSizeMB(inputPath)) t0 := time.Now() fileInfo, err := fileImages(inputPath) if err != nil { fmt.Fprintf(os.Stderr, " ERROR: %v\n", err) continue } dt := time.Now().Sub(t0) corpusInfo[inputPath] = fileInfo fmt.Fprintf(os.Stderr, ", %.1f sec\n", dt.Seconds()) } showSummary(corpus, corpusInfo) saveAsCsv(csvPath, corpus, corpusInfo, doSort, byDoc, noDims) } // fileImages returns a list of imageInfo entries for the images in the PDF file `inputPath`. func fileImages(inputPath string) ([]imageInfo, error) { pdfReader, f, err := model.NewPdfReaderFromFile(inputPath, nil) if err != nil { return nil, err } defer f.Close() numPages, err := pdfReader.GetNumPages() if err != nil { return nil, err } fmt.Fprintf(os.Stderr, " %d pages,", numPages) var fileInfo []imageInfo for pageNum := 1; pageNum <= numPages; pageNum++ { page, err := pdfReader.GetPage(pageNum) showError(nil, err, "pdfReader.GetPage failed: page %d", pageNum) if err != nil { continue } // List images on the page. pageInfo, err := pageImages(page) if err != nil || len(pageInfo) == 0 { continue } for i := range pageInfo { pageInfo[i].path = inputPath pageInfo[i].page = pageNum } fileInfo = append(fileInfo, pageInfo...) } fmt.Fprintf(os.Stderr, " %d images", len(fileInfo)) return fileInfo, nil } // pageImages returns a list of imageInfo entries for the images in the PDF page `page`. func pageImages(page *model.PdfPage) ([]imageInfo, error) { contents, err := page.GetAllContentStreams() if err != nil { return nil, err } return contentStreamImages(contents, page.Resources) } // errors records the errors seen so far. It is used to display each error only once. var errors = map[error]bool{nil: true} // contentStreamImages returns a list of imageInfo entries for the images in the content stream `contents`. func contentStreamImages(contents string, resources *model.PdfPageResources) ([]imageInfo, error) { cstreamParser := contentstream.NewContentStreamParser(contents) operations, err := cstreamParser.Parse() showError(errors, err, "cstreamParser.Parse failed") if err != nil { return nil, err } var infoList []imageInfo processedXObjects := map[string]bool{} for _, op := range *operations { if op.Operand == "BI" && len(op.Params) == 1 { // Inline image. iimg, ok := op.Params[0].(*contentstream.ContentStreamInlineImage) if !ok { continue } var width, height, cpts, bpc int img, err := iimg.ToImage(resources) showError(errors, err, "ToImage failed") if err == nil { width = int(img.Width) height = int(img.Height) cpts = img.ColorComponents bpc = int(img.BitsPerComponent) } var filter, colorspace string cs, err := iimg.GetColorSpace(resources) showError(errors, err, "GetColorSpace failed") if err == nil { colorspace = cs.String() } encoder, err := iimg.GetEncoder() showError(errors, err, "GetEncoder failed") if err == nil { filter = encoder.GetFilterName() } info := imageInfo{ inline: true, filter: filter, width: width, height: height, cpts: cpts, colorspace: colorspace, bpc: bpc, } infoList = append(infoList, info) } else if op.Operand == "Do" && len(op.Params) == 1 { // XObject. name := op.Params[0].(*core.PdfObjectName) // Only process each one once. if _, has := processedXObjects[string(*name)]; has { continue } processedXObjects[string(*name)] = true _, xtype := resources.GetXObjectByName(*name) if xtype == model.XObjectTypeImage { ximg, err := resources.GetXObjectImageByName(*name) showError(errors, err, "GetXObjectImageByName failed: %q ", *name) if err != nil { continue } var width, height, cpts, bpc int img, err := ximg.ToImage() showError(errors, err, "ximg.ToImage failed: %q ", *name) if err == nil { cpts = img.ColorComponents } if ximg.Width != nil { width = int(*ximg.Width) } if ximg.Height != nil { height = int(*ximg.Height) } if ximg.BitsPerComponent != nil { bpc = int(*ximg.BitsPerComponent) } info := imageInfo{ inline: false, filter: ximg.Filter.GetFilterName(), width: width, height: height, cpts: cpts, colorspace: ximg.ColorSpace.String(), bpc: bpc, } infoList = append(infoList, info) } else if xtype == model.XObjectTypeForm { // Go through the XObject Form content stream. xform, err := resources.GetXObjectFormByName(*name) showError(errors, err, "GetXObjectFormByName failed: %q", *name) if err != nil { continue } formContent, err := xform.GetContentStream() showError(errors, err, "GetContentStream failed: %q", *name) if err != nil { continue } // Process the content stream in the Form object too. formResources := xform.Resources if formResources == nil { formResources = resources } formDescs, err := contentStreamImages(string(formContent), formResources) showError(errors, err, "contentStreamImages failed: %q", *name) if err != nil { continue } infoList = append(infoList, formDescs...) } } } return infoList, nil } type imageInfo struct { path string page int inline bool filter string width int height int cpts int colorspace string bpc int count int } func (info imageInfo) String() string { kind := "XObject" if info.inline { kind = "Inline image" } return strings.Join([]string{ fmt.Sprintf("%q:%d", filepath.Base(info.path), info.page), fmt.Sprintf(" %s", kind), fmt.Sprintf(" Filter: %s", info.filter), fmt.Sprintf(" Width: %d", info.width), fmt.Sprintf(" Height: %d", info.height), fmt.Sprintf(" Color components: %d", info.cpts), fmt.Sprintf(" ColorSpace: %s", info.colorspace), fmt.Sprintf(" BPC: %d", info.bpc), }, "\n") } func (info imageInfo) asStrings() []string { kind := "XObject" if info.inline { kind = "Inline image" } parts := []string{ info.path, fmt.Sprintf("%d", info.page), fmt.Sprintf("%d", info.count), kind, info.filter, fmt.Sprintf("%d", info.width), fmt.Sprintf("%d", info.height), fmt.Sprintf("%d", info.cpts), info.colorspace, fmt.Sprintf("%d", info.bpc), } if len(parts) != len(header) { panic("csv") } return parts } var header = []string{ "Path", "Page number", "Count", "Type", "Filter", "Width", "Height", "Cpts", "Colors Space", "BPC", } func asList(corpus []string, corpusInfo map[string][]imageInfo, doSort, byDoc, noRes bool) []imageInfo { var infoList []imageInfo for _, fn := range corpus { partList, ok := corpusInfo[fn] if !ok { continue } infoList = append(infoList, partList...) } infoList = coallesce(infoList, byDoc, noRes) if doSort { sort.Slice(infoList, func(i, j int) bool { oi, oj := infoList[i], infoList[j] ai := oi.width * oi.height aj := oj.width * oj.height if ai != aj { return ai > aj } if oi.count != oj.count { return oi.count > oj.count } if oi.page != oj.page { return oi.page < oj.page } return oi.String() < oj.String() }) } return infoList } func coallesce(infoList []imageInfo, byDoc, noRes bool) []imageInfo { uniques := map[string]imageInfo{} for _, info := range infoList { if byDoc { info.page = 0 } if noRes { info.width = 0 info.height = 0 } k := info.String() if _, ok := uniques[k]; !ok { uniques[k] = info } info := uniques[k] info.count++ uniques[k] = info } var coallesced []imageInfo for _, info := range uniques { coallesced = append(coallesced, info) } return coallesced } // saveAsCsv saves `fileInfo` as a CSV file. func saveAsCsv(csvPath string, corpus []string, corpusInfo map[string][]imageInfo, doSort, byDoc, noRes bool) error { f, err := os.Create(csvPath) if err != nil { fmt.Fprintf(os.Stderr, "Couldn't create %q. %v\n", csvPath, err) return err } defer f.Close() w := csv.NewWriter(f) defer w.Flush() err = w.Write(header) if err != nil { fmt.Fprintf(os.Stderr, "Couldn't write header %q. %v\n", csvPath, err) return err } infoList := asList(corpus, corpusInfo, doSort, byDoc, noRes) for i, info := range infoList { err := w.Write(info.asStrings()) if err != nil { fmt.Fprintf(os.Stderr, "Couldn't write %q line %d. %v\n", csvPath, i+1, err) return err } } return nil } func showSummary(corpus []string, corpusInfo map[string][]imageInfo) { numFiles := len(corpusInfo) numImages := sumVals(corpusInfo) fmt.Println("=================================================") fmt.Printf("Totals: %d of %d files contain images. %6d images\n", numFiles, len(corpus), numImages) boolSummary("inline", corpusInfo, func(info imageInfo) bool { return info.inline }) stringSummary("filter", corpusInfo, func(info imageInfo) string { return info.filter }) stringSummary("color", corpusInfo, func(info imageInfo) string { return info.colorspace }) intSummary("cpts", corpusInfo, func(info imageInfo) int { return info.cpts }) intSummary("bpc", corpusInfo, func(info imageInfo) int { return info.bpc }) // intSummary("width", corpusInfo, func(info imageInfo) int { return info.width }) // intSummary("height", corpusInfo, func(info imageInfo) int { return info.height }) } func boolSummary(title string, corpusInfo map[string][]imageInfo, selector func(imageInfo) bool) { numFiles := len(corpusInfo) numImages := sumVals(corpusInfo) byImage, byFile := boolCounts(corpusInfo, selector) imageKeys, fileKeys := boolKeys(byImage), boolKeys(byFile) fmt.Println("-----------------------------------------") fmt.Printf("%s\n", title) fmt.Printf("By image: %d\n", len(byImage)) for _, k := range imageKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byImage[k], numImages)) } fmt.Printf("By file: %d\n", len(byFile)) for _, k := range fileKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byFile[k], numFiles)) } } func intSummary(title string, corpusInfo map[string][]imageInfo, selector func(imageInfo) int) { numFiles := len(corpusInfo) numImages := sumVals(corpusInfo) byImage, byFile := intCounts(corpusInfo, selector) imageKeys, fileKeys := intKeys(byImage), intKeys(byFile) fmt.Println("-----------------------------------------") fmt.Printf("%s\n", title) fmt.Printf("By image: %d\n", len(byImage)) for _, k := range imageKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byImage[k], numImages)) } fmt.Printf("By file: %d\n", len(byFile)) for _, k := range fileKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byFile[k], numFiles)) } } func stringSummary(title string, corpusInfo map[string][]imageInfo, selector func(imageInfo) string) { numFiles := len(corpusInfo) numImages := sumVals(corpusInfo) byImage, byFile := stringCounts(corpusInfo, selector) imageKeys, fileKeys := stringKeys(byImage), stringKeys(byFile) fmt.Println("-----------------------------------------") fmt.Printf("%s\n", title) fmt.Printf("By image: %d\n", len(byImage)) for _, k := range imageKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byImage[k], numImages)) } fmt.Printf("By file: %d\n", len(byFile)) for _, k := range fileKeys { fmt.Printf("\t%+15v\t%s\n", k, percentage(byFile[k], numFiles)) } } func boolKeys(counts map[bool]int) []bool { var keys []bool for k := range counts { keys = append(keys, k) } sort.Slice(keys, func(i, j int) bool { ki, kj := keys[i], keys[j] ni, nj := counts[ki], counts[kj] if ni != nj { return ni > nj } return kj }) return keys } func intKeys(counts map[int]int) []int { var keys []int for k := range counts { keys = append(keys, k) } sort.Slice(keys, func(i, j int) bool { ki, kj := keys[i], keys[j] ni, nj := counts[ki], counts[kj] if ni != nj { return ni > nj } return ki < kj }) return keys } func stringKeys(counts map[string]int) []string { var keys []string for k := range counts { keys = append(keys, k) } sort.Slice(keys, func(i, j int) bool { ki, kj := keys[i], keys[j] ni, nj := counts[ki], counts[kj] if ni != nj { return ni > nj } return ki < kj }) return keys } func boolCounts(corpusInfo map[string][]imageInfo, selector func(imageInfo) bool) ( map[bool]int, map[bool]int) { byImage := map[bool]int{} byFile := map[bool]int{} for _, infoList := range corpusInfo { vals := map[bool]bool{} for _, info := range infoList { byImage[selector(info)] += 1 vals[selector(info)] = true } for v := range vals { byFile[v] += 1 } } return byImage, byFile } func intCounts(corpusInfo map[string][]imageInfo, selector func(imageInfo) int) ( map[int]int, map[int]int) { byImage := map[int]int{} byFile := map[int]int{} for _, infoList := range corpusInfo { vals := map[int]bool{} for _, info := range infoList { byImage[selector(info)] += 1 vals[selector(info)] = true } for v := range vals { byFile[v] += 1 } } return byImage, byFile } func stringCounts(corpusInfo map[string][]imageInfo, selector func(imageInfo) string) ( map[string]int, map[string]int) { byImage := map[string]int{} byFile := map[string]int{} for _, infoList := range corpusInfo { vals := map[string]bool{} for _, info := range infoList { byImage[selector(info)] += 1 vals[selector(info)] = true } for v := range vals { byFile[v] += 1 } } return byImage, byFile } // showError prints an error message `format` for error `err` if `err` has not been reported before. // `errors` tracks errors seen so far. The caller can make `errors` per-page, per-file or global. func showError(errors map[error]bool, err error, format string, args ...interface{}) bool { seen := false if errors != nil { _, seen = errors[err] } if !seen && err != nil { msg := fmt.Sprintf(format, args...) fmt.Printf("%s. err=%v\n", msg, err) } if errors != nil { errors[err] = true } return err != nil } func sumVals(corpusInfo map[string][]imageInfo) int { n := 0 for _, info := range corpusInfo { n += len(info) } return n } func percentage(n, total int) string { perc := 0.0 if total > 0 { perc = 100.0 * float64(n) / float64(total) } return fmt.Sprintf("%6d of %d (%4.1f%%)", n, total, perc) } // fileSizeMB returns the size of file `path` in megabytes. func fileSizeMB(path string) float64 { fi, err := os.Stat(path) if err != nil { return -1.0 } return float64(fi.Size()) / 1024.0 / 1024.0 } // makeUsage updates flag.Usage to include usage message `msg`. func makeUsage(msg string) { usage := flag.Usage flag.Usage = func() { fmt.Fprintln(os.Stderr, msg) usage() } }