#define DUMMY /* \ set -ex; \ CFLAGS="-O3 -s -DNDEBUG=1"; [ "$1" ] && CFLAGS="-g"; \ CC=gcc; [ _"$1" = _-c ] && CC=checkergcc; \ $CC $CFLAGS -ansi -pedantic -Wunused \ -Wall -W -Wstrict-prototypes -Wnested-externs -Winline \ -Wpointer-arith -Wbad-function-cast -Wcast-qual -Wmissing-prototypes \ -Wmissing-declarations "$0" -o pdfconcat; \ exit */ /* pdfconcat.c: C89 (ANSI C) program to concatenate PDF files * by pts@fazekas.hu at Sat Nov 1 10:19:37 CET 2003 * -- Sun Nov 2 00:30:25 CET 2003 * * pdfconcat is a small and fast command-line utility written in C89 (ANSI C) * that can concatenate (merge) several PDF files into a long PDF document. * External libraries are not required, only ANSI C functions are used. * Several features of the output file are taken from the first input file * only. For example, outlines (also known as hierarchical bookmarks) in * subsequent input files are ignored. pdfconcat compresses its input a * little bit by removing whitespace and unused file parts. * * The license of pdfconcat is GPL v2 or later: * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* * Imp: optional safe mode, emitting more '\0' * Imp: true generation handling * Imp: extensive documentation * Dat: output must be seekable, so it cannot be a pipe * Dat: ungetc() destroys value of ftell(), even after getc()... */ #ifdef __TINYC__ /* pts-tcc, tcc (Tiny C Compiler) by Fabrice Bellard. https://bellard.org/tcc/ */ #ifndef __SIZEOF_INT__ #define __SIZEOF_INT__ 4 #endif #ifndef INT_FAST32_MAX #define INT_FAST32_MAX 2147483647 #endif #define NULL ((void *)0) typedef unsigned int size_t; /* TODO(pts): 64-bit tcc. */ /* errno.h */ extern int errno; /* stdlib.h */ void exit(int status); void *malloc(size_t size); void free(void *ptr); void *calloc(size_t nmemb, size_t size); void *realloc(void *ptr, size_t size); /* string.h */ int memcmp(const void *s1, const void *s2, size_t n); void *memcpy(void *dest, const void *src, size_t n); void *memset(void *s, int c, size_t n); int strcmp(const char *s1, const char *s2); size_t strlen(const char *s); char *strerror(int errnum); /* stdio.h */ #define SEEK_SET 0 #define SEEK_CUR 1 #define SEEK_END 2 typedef struct FILE FILE; extern FILE* stdout; extern FILE* stderr; FILE *fopen(const char *path, const char *mode); int fprintf(FILE *stream, const char *format, ...); int sprintf(char *str, const char *format, ...); int putc(int c, FILE *stream); int getc(FILE *stream); int ungetc(int c, FILE *stream); size_t fread(void *ptr, size_t size, size_t nmemb, FILE *stream); int fseek(FILE *stream, long offset, int whence); size_t fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); int fflush(FILE *stream); int ferror(FILE *stream); int fclose(FILE *stream); long ftell(FILE *stream); int sscanf(const char *str, const char *format, ...); /* assert.h */ #define assert(x) do {} while (0) #else # include # include # include /* exit() */ # include /* errno */ # include # include /* defines INT_FAST32_MAX */ #endif #if INT_FAST32_MAX >= 2147483647 || __SIZEOF_INT__ >= 4 typedef unsigned slen_t; typedef int slendiff_t; # define SLEN_P "" #else /* 16-bit integers -- old */ typedef unsigned long slen_t; typedef long slendiff_t; # define SLEN_P "l" #endif typedef char sbool; #if ';'!=59 || 'a'!=97 # error ASCII system is required to compile this program #endif #if __cplusplus >= 201700 #undef register #define register /* Pacify g++ -std=c++17 warning -Wregister.*/ #endif #undef TRUE #define TRUE 1 #undef FALSE #define FALSE 0 #ifdef NDEBUG # define ASSERT_SE(x,y) (y) /* assert with side effect */ #else # define ASSERT_SE(x,y) assert(x(y)) #endif #define ULE(a,b) (((a)+0U)<=((b)+0U)) #define ISWSPACE(i,j) ((i j)==32 || ULE(i-9,13-9) || i==0) #define PROGNAME "pdfconcat" #define VERSION "0.02" /* --- Data */ #if 0 #define SBUFSIZE 4096 /** Data area holding strings; */ char sbuf[SBUFSIZE], *sbufb; #define WORDSSIZE 127 /** Pointers inside sbuf, indexed by chars, [0..31] are special */ char const* words[WORDSSIZE]; static void sbuff(void) { unsigned i; sbuf[0]='\0'; sbufb=sbuf+1; for (i=0;i='!' && c<='~' && c!='/' && c!='%' && c!='{' && c!='}' && c!='<' && c!='>' && c!='[' && c!=']' && c!='(' && c!=')'; /* Dat: PS avoids: /{}<>()[]% \n\r\t\000\f\040 */ } #if 0 /** Definition chosen rather arbitrarily by pts */ static sbool is_wordx(char const *s) { if (!ULE(*s-'A','Z'-'A') && !ULE(*s-'a','z'-'a') && *s!='.') return 0; /* && !ULE(*s-'0','9'-'0') && *s!='-') return 0; */ while (*++s!='\0') if (!is_ps_name(*s)) return 0; return 1; } #endif /** @param b: assume null-terminated @return true on error */ static /*inline*/ sbool toInteger(char *s, pdfint_t *ret) { /* Dat: for both toInteger() and PDF `-5' and `+5' is OK, `--5' isn't */ int n=0; /* BUGFIX?? found by __CHECKER__ */ return sscanf(s, "%" SLEN_P"i%n", ret, &n)<1 || s[n]!='\0'; } /** @param b: assume null-terminated @return true on error */ static /*inline*/ sbool toReal(char *s, double *ret) { int n; char c; /* Dat: glibc accepts "12e", "12E", "12e+" and "12E-" */ return sscanf(s, "%lf%n", ret, &n)<1 || (c=s[n-1])=='e' || c=='E' || c=='+' || c=='-' || s[n]!='\0'; } static void r_seek(slen_t begofs) { if (0!=fseek(currs.file, begofs, SEEK_SET)) { fprintf(stderr, "%s: unseekable %s: %s\n", PROGNAME, currs.filename, strerror(errno)); exit(6); } } /** Returns a PostScript token ID, puts token into buf */ static char gettok(void) { /* Derived from MiniPS::Tokenizer::yylex() of sam2p-0.37 */ int c=0, d; /* dummy initialization */ sbool hi; unsigned hv=0; /* =0: pacify G++ 2.91 */ slen_t nest; char *ibufend=ibuf+IBUFSIZE; ibufb=ibuf; #if 0 if (ungot==EOFF) return EOFF; if (ungot!=NO_UNGOT) { c=ungot; ungot=NO_UNGOT; goto again; } #endif again_getcc: c=getc(currs.file); /* again: */ switch (c) { case -1: eof: return 0; /*ungot=EOFF */; case '\n': case '\r': case '\t': case ' ': case '\f': case '\0': goto again_getcc; case '%': /* one-line comment */ #if 0 /* XMLish tag from ps_tiny.c */ if ((c=getc(currs.file))=='<') { char ret='<'; if ((c=getc(currs.file))=='/') { ret='>'; c=getc(currs.file); } /* close tag */ if (!ULE(c-'A','Z'-'A')) erri("invalid tag",0); /* catch EOF */ (ibufb=ibuf)[0]=c; ibufb++; while (ULE((c=getc(currs.file))-'A','Z'-'A') || ULE(c-'a','z'-'a')) { if (ibufb==ibufend-1) erri("tag too long",0); *ibufb++=c; } if (c<0) erri("unfinished tag",0); *ibufb='\0'; ungetc(c,currs.file); return ret; } #endif while (c!='\n' && c!='\r' && c!=-1) c=getc(currs.file); if (c==-1) goto eof; goto again_getcc; case '[': *ibufb++=c; return '['; case ']': *ibufb++=c; return ']'; case '{': case '}': erri("proc arrays disallowed",0); /* allowed in PS, but not in PDF */ break; /* unreached */ case '>': if (getc(currs.file)!='>') goto err; *ibufb++='>'; *ibufb++='>'; return '>'; case '<': if ((c=getc(currs.file))==-1) { uf_hex: erri("unfinished hexstr",0); } if (c=='<') { *ibufb++='<'; *ibufb++='<'; return '<'; } if (c=='~') erri("a85str disallowed",0); /* allowed in PS, but not in PDF */ hi=1; while (c!='>') { if (ULE(c-'0','9'-'0')) hv=c-'0'; else if (ULE(c-'a','f'-'a')) hv=c-'a'+10; else if (ULE(c-'A','F'-'A')) hv=c-'A'+10; else if (is_ps_white(c)) hv=16; else erri("syntax error in hexstr",0); if (hv==16) ; else if (!hi) { ibufb[-1]|=hv; hi=1; } else if (ibufb==ibufend) erri("hexstr literal too long",0); else { *ibufb++=(char)(hv<<4); hi=0; } if ((c=getc(currs.file))==-1) goto uf_hex; } /* This is correct even if an odd number of hex digits have arrived */ return '('; case '(': nest=1; c=getc(currs.file); while (c!=-1) { if (c==')' && --nest==0) return '('; if (c=='\r') { if ((c=getc(currs.file))=='\n') {} /* convert "\r\n" -> "\n", as specified in subsection 3.2.3 of PDFRef.pdf */ else { d='\n'; dcont: if (ibufb==ibufend) erri("str literal too long",0); *ibufb++=d; continue; } } else if (c!='\\') { if (c=='(') nest++; } else switch (c=getc(currs.file)) { /* read a backslash escape */ case -1: goto uf_str; case 'n': c='\n'; break; case 'r': c='\r'; break; case 't': c='\t'; break; case 'b': c='\010'; break; /* \b and \a conflict between -ansi and -traditional */ case 'f': c='\f'; break; default: if (!ULE(c-'0','7'-'0')) break; hv=c-'0'; /* read at most 3 octal chars */ if ((c=getc(currs.file))==-1) goto uf_str; if (c<'0' || c>'7') { d=hv; goto dcont; } else { hv=8*hv+(c-'0'); if ((c=getc(currs.file))==-1) goto uf_str; if (c<'0' || c>'7') { d=hv; goto dcont; } else c=(char)(8*hv+(c-'0')); } } /* SWITCH */ if (ibufb==ibufend) erri("str literal too long",0); /* putchar(c); */ *ibufb++=c; c=getc(currs.file); } /* WHILE */ /* if (c==')') return '('; */ uf_str: erri("unfinished str",0); case ')': goto err; case '/': *ibufb++='/'; while (ISWSPACE(c,=getc(currs.file))) {} /* ^^^ `/ x' are two token in PostScript, but here we overcome the C * preprocessor's feature of including whitespace. */ /* fallthrough */ /* b will begin with '/' */ default: /* /nametype, /integertype or /realtype */ *ibufb++=c; while ((c=getc(currs.file))!=-1 && is_ps_name(c)) { *ibufb++=c; if (ibufb==ibufend) erri("token too long",0); } *ibufb='\0'; /* ensure null-termination */ currs.lastofs=ftell(currs.file)-1; r_seek(currs.lastofs); /* Dat: ungetc(c,currs.file) would destroy ftell() return value */ if (ibuf[0]=='/') return '/'; /* Imp: optimise numbers?? */ if (ibufb!=ibufend) { double d; /* Dat: PDF doesn't support (but PS does) base-n number such as `16#100' == 256; nor exponential notation (6e7) */ if (!toInteger(ibuf, &ibuf_int)) { sprintf(ibuf, "%" SLEN_P"d", ibuf_int); ibufb=ibuf+strlen(ibuf); /* compress it */ return '1'; } /* Dat: call toInteger _before_ toReal */ if (!toReal(ibuf, &d)) { /* Dat: `.5' and `6.' are valid PDF reals */ char *p; p=ibuf; while (*p!='\0' && *p!='e' && *p!='E') p++; if (*p!='\0') erri("exponential notation disallowed in PDF",0); /* Imp: convert to a name token instead */ p=ibufb=ibuf; while (*p=='0') p++; /* strip heading zeros */ while (*p!='\0') *ibufb++=*p++; while (ibufb!=ibuf && ibufb[-1]=='0') ibufb--; /* strip trailing zeros */ /* *ibufb='\0'; -- not required */ } } switch (*ibuf) { case 'R': if (0==strcmp(ibuf,"R")) return 'R'; break; case 't': if (0==strcmp(ibuf,"true")) return 'b'; break; case 'f': if (0==strcmp(ibuf,"false")) return 'b'; break; case 'n': if (0==strcmp(ibuf,"null")) return 'n'; break; } return 'E'; /* -endstream obj endobj stream trailer xref startxref */ } err: erri("syntax error, token expected",0); goto again_getcc; /* unreached */ } static void r_check_pdf_header(void) { int c; r_seek(0); if (9>fread(ibuf, 1, 9, currs.file) || 0!=memcmp(ibuf, "%PDF-", 5) || !ULE(ibuf[5]-'0','9'-'0') || ibuf[6]!='.' || !ULE(ibuf[7]-'0','9'-'0') || !is_ps_white(ibuf[8]) ) { erri("invalid PDF header", 0); } ibuf[8]='\n'; ibuf[9]='\0'; memcpy(currs.pdf_header, ibuf, sizeof(currs.pdf_header)); currs.is_binary=FALSE; /* Imp: should we count >=4 high chars? */ r_seek(0); /* vvv Seek binary bytes in the first few comment lines, see subsection 3.4.1 in PDFRef.pdf */ while (1) { while ((c=getc(currs.file))=='\n' || c=='\r') {} if (c!='%') break; while ((c=getc(currs.file))!='\n' && c!='\r' && c!=-1) if ((c&0x80)!=0) { currs.is_binary=TRUE; break; } } } /** Minimum offset in the PDF file that an object may start */ #define OBJ_MIN_OFS 9 static void r_seek_xref(void) { pdfint_t xrefofs; char *p; int n=0; /* BUGFIX?? found by __CHECKER__ */ slen_t got; r_seek(currs.filesize > 256 ? currs.filesize-256 : 0); if (0==(got=fread(ibuf, 1, 256, currs.file))) erri("cannot read startxref",0); p=ibuf+got; while (p!=ibuf && (p[-1]!='s' || 1!=sscanf(p,"tartxref%" SLEN_P"i%n",&xrefofs,&n))) p--; if (p==ibuf) erri("cannot find startxref",0); p[n]='\0'; if (xrefofs=currs.xrefc) { emsg="obj num out of bounds: "; err: { char tmp[64]; sprintf(tmp, "%" SLEN_P"d %" SLEN_P"d obj", num, gennum); erri(emsg, tmp); } } if (gennum<0 || gennum+0U>=65535U) { emsg="obj gennum bounds: "; goto err; } #if DEBUG fprintf(stderr, "resolving num=%ld\n", num); #endif /* Dat: pdftex creates unused objects like `0000000083 00000 f ' * when certain fonts are not subsetted (e.g. `<type!='n' && e->type!='f') { emsg="bad type for obj: "; goto err; } if (e->gennum!=gennum) { emsg="gennum mismatch: "; goto err; } return e; } static void r_seek_obj(pdfint_t num, pdfint_t gennum) { char const *emsg; struct XrefEntry *e=objentry(num, gennum); r_seek(e->ofs); if ('1'!=gettok() || ibuf_int!=num) { emsg="inobj num mismatch: "; err: { char tmp[64]; sprintf(tmp, "%" SLEN_P"d %" SLEN_P"d obj", num, gennum); erri(emsg, tmp); } } if ('1'!=gettok() || ibuf_int!=gennum) { emsg="inobj gennum mismatch: "; goto err; } if ('E'!=gettok() || 0!=strcmp(ibuf,"obj")) { emsg="inobj `obj' missing: "; goto err; } } static sbool is_digits(char const *p, char const *pend) { while (p!=pend && ULE(*p-'0','9'-'0')) p++; return p==pend; } #if 0 static void getotag(char const*tag) { #if 0 /* This code segment cannot ignore comments */ char const *p=tag; int c; while (ISWSPACE(c,=getcc())) ; if (c!='%' || (c=getcc())!='<') erri("tag expected: ", tag); while (ISWSPACE(c,=getcc())) ; while (p[0]!='\0') { if (c!=*p++) erri("this tag expected: ", tag); c=getcc(); } ungetc(c,currs.file); #else if (gettok()!='<' || 0!=strcmp(ibuf,tag)) erri("tag expected: ", tag); #endif } static void gettagbeg(void) { int c; while (ISWSPACE(c,=getcc())) ; if (c!='>') erri("`>' expected",0); } static void gettagend(void) { int c; while (ISWSPACE(c,=getcc())) ; if (c!='/') erri("`/>' expected",0); while (ISWSPACE(c,=getcc())) ; if (c!='>') erri("`/>' expected",0); } static void getkey(char const *key) { char const *p=key; int c; while (ISWSPACE(c,=getcc())) ; while (p[0]!='\0') { if (c!=*p++) erri("this key expected: ", key); c=getcc(); } while (ISWSPACE(c,)) c=getcc(); if (c!='=') erri("key `=' expected", 0); } /** Loads a value into ibuf, '\0'-terminated. */ static void getval(void) { sbool g=1; char *ibufend1=ibuf+IBUFSIZE-1, c; ibufb=ibuf; while (ISWSPACE(c,=getcc())) ; while (1) { if (c=='"') g=!g; else if (g && (ISWSPACE(c,) || c=='/' || c=='>')) { ungetcc(c); break; } else if (c<0) erri("unfinished tag val",0); else if (c==0) erri("\\0 disallowed in tag val",0); else if (ibufb==ibufend1) erri("tag val too long",0); else *ibufb++=c; c=getcc(); } /* WHILE */ *ibufb='\0'; } static pdfint_t getuintval(void) { pdfint_t ret; getval(); /* fprintf(stderr, "[%s]\n", ibuf); */ if (toInteger(ibuf, &ret) || ret<0) erri("tag val must be nonnegative integer",0); return ret; } #endif /* --- Writing */ /** Maximum number of characters in a line. */ #define MAXLINE 78 static struct WriteState { /** Number of characters already written into this line. (from 0) */ slen_t colc; /** Last token was a self-closing one */ sbool lastclosed, is_binary; FILE *wf; char const* filename; char* trailer; slen_t outobjc; /* # assigned objs */ slen_t trailerlen; slen_t *txrefs; /* txrefs[I] is the target file offset for `I 0 obj' */ slen_t txrefc; /* number of items used in txrefs */ slen_t txrefa; /* number of items allocated in txrefs */ slen_t startxrefofs; slen_t lastsrcpages_num; slen_t pagetotal; slen_t *srcpages_nums; slen_t srcpages_numc; /* number of subfiles */ } curws; #if 0 static void init_out(void) { curws.wf=stdout; curws.colc=0; curws.lastclosed=TRUE; } #endif static void newline(void) { if (curws.colc!=0) { putc('\n',curws.wf); curws.colc=0; curws.lastclosed=TRUE; } else assert(curws.lastclosed); } /** @return the byte length of a string as a quoted PostScript ASCII string * literal */ static slen_t pstrqlen(register char const* p, char const* pend) { slen_t olen=2+pend-p; /* '(' and ')' */ /* Close parens after this */ char const *q=pend; /* Number of parens opened so far */ slen_t nest=0; char c; p=ibuf; pend=ibufb; while (p!=pend) { if ((c=*p++)=='\r') { olen++; continue; } else if (c=='(') { while (q>p && *--q!=')') {} if (q<=p) { olen++; continue; } nest++; } else if (c==')') { if (nest!=0) { assert(q!=pend); while (*q++!=')') {} nest--; } else { olen++; continue; } } } assert(nest==0); return olen; } /** Prints the specified string as a quoted PostScript ASCII string literal. * Does not modify curws.colc etc. */ static void pstrqput(register char const* p, char const* pend) { /* Close parens after this */ char const *q=pend; /* Number of parens opened so far */ slen_t nest=0; char c; putc('(',curws.wf); curws.colc++; p=ibuf; pend=ibufb; while (p!=pend) { if ((c=*p++)=='\n') { putc('\n',curws.wf); curws.colc=0; continue; } else if (c=='\r' || c=='\\') { put2: putc('\\',curws.wf); putc(c,curws.wf); curws.colc+=2; continue; } else if (c=='(') { while (q>p && *--q!=')') {} if (q<=p) goto put2; nest++; } else if (c==')') { if (nest!=0) { assert(q!=pend); while (*q++!=')') {} nest--; } else goto put2; } putc(c,curws.wf); curws.colc++; continue; #if 0 else if (ULE(c-32, 126-32)) { putc(c); curws.colc++; continue; } curws.colc+=2; putc('\\'); if (c=='\r') putc('r'); #if 0 /* literal newline allowed in strings */ else if (c=='\n') putc('n'); #endif else if (c=='\t') putc('t'); else if (c=='\010') putc('b'); else if (c=='\f') putc('f'); else if (c>=64 || p==pend || ULE(*p-'0','7'-'0')) { putc((c>>6&7)+'0'); putc((c>>3&7)+'0'); putc(( c&7)+'0'); } else if (c>=8) { putc((c>>3) +'0'); putc(( c&7)+'0'); } else putc(c+'0'); #endif } assert(nest==0); putc(')',curws.wf); curws.colc++; } /** @return the byte length of a string as a quoted PostScript hex string * literal */ static slen_t pstrhlen(register char const* p, char const* pend) { slen_t olen=2+2*(pend-p); /* '<' and '>' */ if (p!=pend && (pend[-1]&15)==0) olen--; return olen; } /** Prints the specified string as a quoted PostScript hex string literal. * Does not modify curws.colc etc. */ static void pstrhput(register char const* p, char const* pend) { static char const hextable[]="0123456789abcdef"; char c; curws.colc+=2+2*(pend-p); putc('<',curws.wf); if (p!=pend--) { while (p!=pend) { c=hextable[*(unsigned char const*)p>>4]; putc(c,curws.wf); c=hextable[*(unsigned char const*)p&15]; putc(c,curws.wf); p++; } c=hextable[*(unsigned char const*)p>>4]; putc(c,curws.wf); c=hextable[*(unsigned char const*)p&15]; if (c!='0') putc(c,curws.wf); else curws.colc--; } putc('>',curws.wf); } static void copy_token(char tok) { slen_t len, qlen, hlen; switch (tok) { case 0: erri("eof in copy", 0); break; /* unreached */ case '[': case ']': case '<': case '>': len=ibufb-ibuf; #if 0 if (curws.colc+(len=ibufb-ibuf)>MAXLINE) newline(); #endif curws.lastclosed=TRUE; write: #if 0 if (len>MAXLINE) fprintf(stderr, "%s: warning: output line too long\n", PROGNAME); #endif fwrite(ibuf, 1, len, curws.wf); curws.colc+=len; break; case '/': len=ibufb-ibuf; #if 0 if ((len=ibufb-ibuf)=0) { ibuf[0]='/'; ibuf[1]=c; ibufb=ibuf+2; len=2; } #endif #if 0 if (curws.colc+len>MAXLINE) newline(); #endif curws.lastclosed=FALSE; goto write; #if 0 /* tags from ps_tiny.c */ case '<': case '>': #endif case '(': qlen=pstrqlen(ibuf,ibufb); hlen=pstrhlen(ibuf,ibufb); #if 0 if (curws.colc+qlen>MAXLINE) newline(); if (qlen>MAXLINE) fprintf(stderr, "%s: warning: output string too long\n", PROGNAME); #endif /* putc(ibuf[1]); */ if (hlen=0) { ibuf[0]=c; ibufb=ibuf+1; len=1; } #endif #if 0 if (curws.colc+len+!curws.lastclosed>MAXLINE) newline(); #else if (0) {} #endif else if (curws.lastclosed) {} else if (curws.colc+len': if (nest--==0) erri("too many array/dict closes",0); break; default: ; } if (nest==0) break; tok=gettok(); } } static pdfint_t gettok_int(char const* for_) { char tok; slen_t lastofs; pdfint_t a, b; if ('1'!=(tok=gettok())) erri("int expected for ", for_); a=ibuf_int; lastofs=currs.lastofs; if ('1'==(tok=gettok()) && (b=ibuf_int, TRUE) && 'R'==(tok=gettok())) { r_seek_obj(a,b); /* Imp: test this */ if ('1'!=(tok=gettok())) erri("int expected (R) for ", for_); a=ibuf_int; r_seek(lastofs); ASSERT_SE('1'==, gettok()); ASSERT_SE('R'==, gettok()); } else { r_seek(lastofs); } return a; } static void r_seek_ref(void) { slen_t lastofs=ftell(currs.file); pdfint_t a, b; if ('1'==gettok() && (a=ibuf_int, TRUE) && '1'==gettok() && (b=ibuf_int, TRUE) && 'R'==gettok() ) { r_seek_obj(a,b); } else { r_seek(lastofs); /* seek back */ } } /** @return file offset of previous xref table in file; or 0 */ static slen_t r_copy_trailer(void) { char tok; pdfint_t prev=0; if (gettok()!='E' || 0!=strcmp(ibuf,"trailer")) erri("trailer expected",0); if (gettok()!='<') erri("trailer dict expected",0); while (1) { if ('>'==(tok=gettok())) break; if ('/'!=tok) erri("trailer dict key expected",0); /* Dat: first trailer usually has: /Size /Info /Root /Prev /ID */ /* Dat: prev trailers usually have: /Size /ID */ /* Dat: thus our merged trailer will have many /ID fields */ #if DEBUG fprintf(stderr,"trailer_key=(%s)\n",ibuf); #endif if (0==strcmp(ibuf,"/Prev")) { prev=gettok_int("trailer /Prev"); if (prev=currs.filesize) erri("invalid prev ofs",0); } else if (0==strcmp(ibuf,"/Size")) { skipstruct(gettok(), FALSE); } else { copy_token(tok); skipstruct(gettok(), FALSE); /* copy() */ } } return prev; } /** * currs.file must be positioned just before `<<'. After this function, * currs.file will be positioned just after the dict key (i.e just before * the value). If the key isn't found, the file position is unchanged. * @param key dict key, e.g "/Root" * @return true iff found */ static sbool r_seek_dictval(char const* key) { char tok; pdfint_t prev=0; slen_t oldofs=ftell(currs.file); if (gettok()!='<') erri("dict expected",0); while (1) { if ('>'==(tok=gettok())) { r_seek(oldofs); return FALSE; } if ('/'!=tok) erri("dict key expected",0); /* ^^^ Dat: PDF keys must be names (PS allows others) */ if (0==strcmp(ibuf,key)) return TRUE; skipstruct(gettok(), FALSE); } return prev; } static void r_seek_dictval_must(char const* key) { if (!r_seek_dictval(key)) erri("missing dict key", key); } /** @param typenam e.g "/Pages" */ static void r_checktype(char const* typenam) { slen_t oldofs=ftell(currs.file); if (!r_seek_dictval("/Type")) erri("missing /Type for dict", 0); r_seek_ref(); if ('/'!=gettok() || 0!=strcmp(ibuf, typenam)) { if ((ibufb-ibuf)+strlen(typenam)+20>=IBUFSIZE) erri("expected type", typenam); sprintf(ibufb, ", needed %s", typenam); erri("dict type mismatch: got ", ibuf); } r_seek(oldofs); } static slen_t r_copy_trailer(void); static void r_read_xref(void) { struct XrefEntry *e; char tok, xbuf[21]; unsigned long dummy; pdfint_t xzero, xcount; slen_t prevofs; int n; currs.xreftc=1; currs.trailer1ofs=-1U; while (1) { if ((tok=gettok())!='E' || 0!=strcmp(ibuf,"xref")) { erri("expected xref",0); return; } if ((tok=gettok())!='1' || (xzero =ibuf_int)<0) { erri("expected xref base offset",0); return; } if ((tok=gettok())!='1' || (xcount=ibuf_int)<0) { erri("expected xref count",0); return; } #if DEBUG fprintf(stderr,"xref=(%lu+%lu)\n", xzero, xcount); #endif while ((n=getc(currs.file))>=0 && is_ps_white(n)) {} if (n>=0) ungetc(n,currs.file); if (xzero+xcount+(slen_t)0>currs.xrefc) { if (NULL==(currs.xrefs=(struct XrefEntry*)realloc(currs.xrefs, sizeof(currs.xrefs[0])*(xzero+xcount)))) erri("out of memory for xref",0); memset(currs.xrefs+currs.xrefc, '\0', (xzero+xcount-currs.xrefc)*sizeof(currs.xrefs[0])); /* ^^^ Dat: initialize .type with '\0' */ currs.xrefc=xzero+xcount; } e=currs.xrefs+xzero; xbuf[20]='\0'; while (xcount--!=0) { if (20!=fread(xbuf, 1, 20, currs.file) || !is_digits(xbuf, xbuf+10) || !is_ps_white(xbuf[10]) || !is_digits(xbuf+11, xbuf+16) || !is_ps_white(xbuf[16]) || ((e->type=xbuf[17])!='n' && xbuf[17]!='f') || 2!=sscanf(xbuf, "%" SLEN_P"u%hu", &(e->ofs), &(e->gennum)) || (dummy=e->gennum)>65535UL /* Dat: tmp= to pacify gcc-3.4 warning: comparison is always false due to limited range of data type */ || (xbuf[17]=='n' && e->ofs != 0 && (e->ofsofs>=currs.filesize)) ) erri("invalid xref entry",0); if (e->ofs == 0) e->type = 'f'; e++; } if (currs.trailer1ofs==-1U) currs.trailer1ofs=ftell(currs.file); if (0==(prevofs=r_copy_trailer())) break; r_seek(prevofs); currs.xreftc++; } /* Now find currs.catalogofs */ r_seek(currs.trailer1ofs); ASSERT_SE('E'==,gettok()); /* skip `trailer' */ r_seek_dictval_must("/Root"); r_seek_ref(); currs.catalogofs=ftell(currs.file); #if DEBUG fprintf(stdout, "Input PDF (%s): filesize=%" SLEN_P"u, xrefc=%" SLEN_P"u, xreftc=%u, catalogofs=%" SLEN_P"d\n", currs.filename, currs.filesize, currs.xrefc, currs.xreftc, currs.catalogofs); #endif r_seek(currs.catalogofs); r_checktype("/Catalog"); r_seek_dictval_must("/Pages"); r_seek_ref(); #if DEBUG fprintf(stderr, "/Pages at=%ld\n", ftell(currs.file)); #endif currs.uppagesofs=ftell(currs.file); r_checktype("/Pages"); r_seek_dictval_must("/Count"); if (0>(xcount=gettok_int("pagecount"))) erri("page count <0",ibuf); curws.pagetotal+=currs.pagecount=xcount; } static struct XrefEntry *enq_first=NULL, **enq_lastp=&enq_first; #define ENQ_PUT(xe) (*enq_lastp=(xe), (xe)->next=NULL, enq_lastp=&((xe)->next)) #define ENQ_RESET() (enq_first=NULL, enq_lastp=&enq_first) /** Skips a whole recursive structure starting with `tok'. Works with `R' */ static void wr_enqueue_struct(sbool copy_p) { struct XrefEntry *e; char tok; slen_t nest=0, lastofs; pdfint_t a, b; /* enqueue_stream_length=-1; */ while (1) { if (0==(tok=gettok())) erri("eof in e_s", 0); #if DEBUG ibufb[0]='\n'; ibufb[1]='\0'; fputs(ibuf,stderr); #endif if (copy_p && tok!='1') copy_token(tok); switch (tok) { case '1': /* Skip a possible `R' */ a=ibuf_int; lastofs=currs.lastofs; /* Dat: copy_token() already called */ if ('1'==(tok=gettok()) && (b=ibuf_int, TRUE) && 'R'==(tok=gettok())) { e=objentry(a,b); #if DEBUG fprintf(stderr,"XUT %ld (%ld %ld obj)\n", e->target_num, a, b); #endif if (e->target_num==0) { e->target_num=curws.outobjc++; #if DEBUG fprintf(stderr, "PUT\n"); #endif ENQ_PUT(e); } if (copy_p) { sprintf(ibuf, "%" SLEN_P"d 0 R", e->target_num); ibufb=ibuf+strlen(ibuf); copy_token('1'); } } else { if (copy_p) { sprintf(ibuf, "%" SLEN_P"d", a); ibufb=ibuf+strlen(ibuf); copy_token('1'); } r_seek(lastofs); } break; case '[': case '<': /* Imp: treat dicts and arrays differently, create nest stack */ nest++; break; case ']': case '>': if (nest--==0) erri("too many array/dict closes in e_s",0); break; default: ; } if (nest==0) break; } } static void w_dump_start(void) { if (0!=fseek(curws.wf, 0, SEEK_SET)) errn("cannot begin dump",curws.filename); fprintf(curws.wf, "%s%s", currs.pdf_header, currs.is_binary ? "%\xE1\xE9\xF3\xFA\n" : ""); curws.is_binary=currs.is_binary; /* Imp: pre-look other inputs */ curws.outobjc=2; curws.txrefa=0; curws.txrefc=0; curws.txrefs=NULL; curws.lastclosed=TRUE; } static void w_xref_aset(slen_t num, slen_t ofs) { if (num>=curws.txrefa) { #ifdef __CHECKER__Z slen_t oa=curws.txrefa; #endif if (curws.txrefa<16) curws.txrefa=16; while (curws.txrefa<=num) curws.txrefa<<=1; if (NULL==(curws.txrefs=(slen_t*)realloc(curws.txrefs, curws.txrefa*sizeof(curws.txrefs[0])))) errn("out of memory for xref_aset",0); #ifdef __CHECKER__Z memset(curws.txrefs+oa, '\0' #endif } if (num>=curws.txrefc) { memset(curws.txrefs+curws.txrefc, '\0', sizeof(curws.txrefs[0])*(num-curws.txrefc)); curws.txrefc=num+1; } curws.txrefs[num]=ofs; } static void w_dump_xref(void) { slen_t const *p=curws.txrefs, *pend=p+curws.txrefc; if (!curws.lastclosed) putc('\n',curws.wf); curws.startxrefofs=ftell(curws.wf); fprintf(curws.wf, "xref\n0 %" SLEN_P"u\n", curws.txrefc); /* Dat: must be "\n" */ while (p!=pend) { if (*p!=0) { if (*p/1000000U>=10000U) errn("offset overflow",0); /* Dat: works with 32 bit arithmetic */ fprintf(curws.wf, "%010" SLEN_P"u 00000 n \n", *p++); } else { fprintf(curws.wf, "0000000000 65535 f \n"); p++; } } curws.lastclosed=TRUE; curws.colc=0; } static void wr_enqueue_catalog(void) { char tok; if (gettok()!='<') erri("catalog dict expected",0); copy_token('<'); while (1) { copy_token(tok=gettok()); if ('>'==tok) break; if ('/'!=tok) erri("catalog dict key expected",0); if (0==strcmp(ibuf,"/Pages")) { /* must be an indirect reference */ struct XrefEntry *e; slen_t lastofs=ftell(currs.file); pdfint_t a, b; if ('1'==gettok() && (a=ibuf_int, TRUE) && '1'==gettok() && (b=ibuf_int, TRUE) && 'R'==gettok() ) {} else { r_seek(lastofs); erri("/Pages of /Catalog must be indirect", 0); return; } r_seek(lastofs); e=objentry(a,b); wr_enqueue_struct(FALSE); curws.lastsrcpages_num=e->target_num; sprintf(ibuf, "1 0 R"); ibufb=ibuf+strlen(ibuf); copy_token('1'); } else { wr_enqueue_struct(TRUE); } } } static void wr_enqueue_uppages(void) { char tok; if (gettok()!='<') erri("uppages dict expected",0); copy_token('<'); sprintf(ibuf,"/Parent"); ibufb=ibuf+strlen(ibuf); copy_token('/'); sprintf(ibuf,"1 0 R"); ibufb=ibuf+strlen(ibuf); copy_token('1'); while (1) { copy_token(tok=gettok()); if ('>'==tok) break; if ('/'!=tok) erri("uppages dict key expected",0); if (0==strcmp(ibuf,"/Parent")) { /* must be an indirect reference */ /* Dat: top /Pages doesn't have /Parent, but ensure */ skipstruct(gettok(), FALSE); } else { wr_enqueue_struct(TRUE); } } } /** Reads all objs reachable from currs, and dumps them to curws in order */ static void r_dump_reachable(void) { struct XrefEntry *e; pdfint_t streamlen; slen_t lastofs; char tok; ENQ_RESET(); r_seek(currs.trailer1ofs); skipstruct(gettok(), FALSE); /* `trailer' */ wr_enqueue_struct(FALSE); while (enq_first!=NULL) { e=enq_first; #if DEBUG fprintf(stderr,"dumping_src=(%u)\n", e-currs.xrefs); #endif if (!curws.lastclosed) putc('\n',curws.wf); w_xref_aset(e->target_num, ftell(curws.wf)); #if 0 fprintf(stderr, "%" SLEN_P"u 0 obj # from %lu\n", e->target_num, e->ofs); #endif fprintf(curws.wf, "%" SLEN_P"u 0 obj\n", e->target_num); curws.lastclosed=TRUE; curws.colc=0; r_seek(e->ofs); if ('1'!=gettok() || '1'!=gettok() || 'E'!=gettok() || 0!=strcmp(ibuf,"obj") ) erri("obj start expected",0); lastofs=ftell(currs.file); #if DEBUG fprintf(stderr, "CMP %ld %ld\n", lastofs, currs.catalogofs); #endif if (lastofs==currs.catalogofs) wr_enqueue_catalog(); else if (lastofs==currs.uppagesofs) wr_enqueue_uppages(); else wr_enqueue_struct(TRUE); if ('E'!=(tok=gettok())) erri("name expected after obj",0); if (0==strcmp(ibuf,"stream")) { int i; slen_t afterofs=ftell(currs.file); r_seek(lastofs); r_seek_dictval_must("/Length"); /* BUGFIX at Sun Mar 7 18:37:23 CET 2004 */ streamlen=gettok_int("dump"); if (streamlen<0) erri("negative stream length",0); r_seek(afterofs); if (!curws.lastclosed) putc('\n',curws.wf); fprintf(curws.wf, "stream\n"); /* no "\r", to avoid confusion */ while (1) { /* Imp: why this while(1)? */ /* Dat: PDFRef.pdf subsection 3.2.7 says that "\r\n" mustn't follow `stream' -- but in the file PDFRef.pdf, it does */ if ((i=getc(currs.file))=='\r') { i=getc(currs.file); if (i!='\n' && i!=-1) r_seek(ftell(currs.file)-1); break; } else if (is_ps_white(i)) { break; } else { r_seek(ftell(currs.file)-1); break; } } while (streamlen!=0) { if (0==(afterofs=fread(ibuf, 1, streamlen>IBUFSIZE ? IBUFSIZE : streamlen, currs.file))) erri("stream too short",0); fwrite(ibuf, 1, afterofs, curws.wf); streamlen-=afterofs; } curws.lastclosed=TRUE; curws.colc=0; if ('E'!=gettok() || 0!=strcmp(ibuf,"endstream")) erri("endstream expected",0); copy_token('E'); tok=gettok(); } if ('E'!=tok || 0!=strcmp(ibuf,"endobj")) erri("endobj expected",0); copy_token('E'); enq_first=e->next; /* this must be done as late as possible (afte ENQ_PUT()s) */ } } static void w_make_trailer(void) { char tok; slen_t pretofs; r_seek(currs.trailer1ofs); if (gettok()!='E' || 0!=strcmp(ibuf,"trailer")) erri("trailer expected for dump",0); newline(); pretofs=ftell(curws.wf); copy_token('E'); newline(); if (gettok()!='<') erri("trailer dict expected",0); copy_token('<'); while (1) { if ('>'==(tok=gettok())) break; if ('/'!=tok) erri("trailer dict key expected",0); /* Dat: first trailer usually has: /Size /Info /Root /Prev /ID */ /* Dat: prev trailers usually have: /Size /ID -- will be ignored */ /* Dat: thus our merged trailer will have many /ID fields -- but we don't merge prev trailers */ #if DEBUG fprintf(stderr,"trailer_key=(%s)\n",ibuf); #endif if (0==strcmp(ibuf,"/Prev") || 0==strcmp(ibuf,"/Size")) { skipstruct(gettok(), FALSE); } else { copy_token(tok); wr_enqueue_struct(TRUE); /* renumbering */ } } if (0!=fseek(curws.wf, pretofs, SEEK_SET)) errn("cannot seek pretofs: ",curws.filename); curws.lastclosed=TRUE; curws.colc=0; } static void w_dump_trailer(void) { newline(); fwrite(curws.trailer, 1, curws.trailerlen, curws.wf); fprintf(curws.wf,"/Size %" SLEN_P"u>>\nstartxref\n%" SLEN_P"u\n%%%%EOF\n", curws.txrefc, curws.startxrefofs); /* Dat: must end by "%%EOF\n" */ fflush(curws.wf); curws.lastclosed=TRUE; curws.colc=0; } /** curws now contains a trailer dict. Read it to memory. ftruncate() isn't * necessary, because final output will be longer than the trailer */ static void w_pull_trailer(void) { slen_t ofs=ftell(curws.wf); if (0!=fseek(curws.wf, 0, SEEK_END)) { seekerr: errn("cannot seek: ", curws.filename); } if (NULL==(curws.trailer=(char*)malloc(1+(curws.trailerlen=ftell(curws.wf)-ofs)))) errn("out of memory for trailer",0); if (0!=fseek(curws.wf, ofs, SEEK_SET)) goto seekerr; if (curws.trailerlen!=fread(curws.trailer, 1, curws.trailerlen, curws.wf)) errn("cannot read trailer: ", curws.filename); if (0!=fseek(curws.wf, ofs, SEEK_SET)) goto seekerr; /* superfluous, but ANSI needs it */ } static void w_dump_toppages(void) { /* Dat: we must say `1 0 obj' for (data flow to) /Parent of /Pages */ slen_t srci; newline(); w_xref_aset(1, ftell(curws.wf)); sprintf(ibuf, "1 0 obj\n<>"); ibufb=ibuf+strlen(ibuf); copy_token(']'); sprintf(ibuf, "endobj"); ibufb=ibuf+strlen(ibuf); copy_token('E'); } static void r_open(char const *filename) { currs.xrefs=NULL; currs.xrefc=0; currs.lastofs=0; currs.filename=filename; if (!(currs.file=fopen(currs.filename,"rb"))) { fprintf(stderr, "%s: open %s: %s\n", PROGNAME, currs.filename, strerror(errno)); exit(3); } if (0!=fseek(currs.file, 0, SEEK_END)) { fprintf(stderr, "%s: unseekable %s: %s\n", PROGNAME, currs.filename, strerror(errno)); exit(6); } { long l=ftell(currs.file); currs.filesize=l; if (l<32 || currs.filesize!=l+0UL) { fprintf(stderr, "%s: invalid filesize for %s: %ld\n", PROGNAME, currs.filename, l); exit(7); } } } static void r_input_status(void) { fprintf(stdout, "Input PDF (%s): filesize=%" SLEN_P"u, xrefc=%" SLEN_P"u, xreftc=%" SLEN_P"u, catalogofs=%" SLEN_P"u, #pages=%" SLEN_P"u, is_binary=%d\n", currs.filename, currs.filesize, currs.xrefc, currs.xreftc, currs.catalogofs, currs.pagecount, currs.is_binary); } static void r_close(void) { free(currs.xrefs); currs.xrefs=NULL; if (ferror(currs.file)) erri("error reading file: ", currs.filename); fclose(currs.file); currs.file=NULL; currs.filename=NULL; } static void w_output_status(void) { fprintf(stdout, "Output PDF (%s): filesize=%lu, xrefc=%" SLEN_P"u, subfiles=%" SLEN_P"u, #pages=%" SLEN_P"u, is_binary=%d\n", curws.filename, (long)ftell(curws.wf), curws.txrefc, curws.srcpages_numc, curws.pagetotal, curws.is_binary); } /* --- Main */ int main(int argc, char const* const*argv) { char const*const* ap; slen_t srci; (void)argc; (void)argv; if (argc<3 || 0!=strcmp(argv[1],"-o")) { fprintf(stderr, "Usage: %s -o [...]\n", argv[0]); exit(2); } curws.colc=0; curws.lastclosed=TRUE; curws.pagetotal=0; curws.filename=argv[2]; { ap=argv+3; while (*ap) if (0==strcmp(curws.filename, *ap++)) { fprintf(stderr, "%s: may not append to existing PDF: %s\n", PROGNAME, curws.filename); exit(4); } curws.srcpages_numc=ap-argv-3; /* fprintf(stderr,"%d\n", curws.srcpages_numc); */ } if (!(curws.wf=fopen(curws.filename,"wb+"))) { fprintf(stderr, "%s: open4write %s: %s\n", PROGNAME, curws.filename, strerror(errno)); exit(5); } if (NULL==(curws.srcpages_nums=(slen_t*)malloc(sizeof(curws.srcpages_nums[0])*curws.srcpages_numc))) errn("out of memory for srcpages_nums",0); r_open(argv[3]); r_check_pdf_header(); r_seek_xref(); r_read_xref(); r_input_status(); w_dump_start(); r_dump_reachable(); w_make_trailer(); w_pull_trailer(); r_close(); curws.srcpages_nums[0]=curws.lastsrcpages_num; ap=argv+4; srci=1; while (*ap) { r_open(*ap++); r_check_pdf_header(); r_seek_xref(); r_read_xref(); r_input_status(); r_dump_reachable(); r_close(); curws.srcpages_nums[srci++]=curws.lastsrcpages_num; } w_dump_toppages(); w_dump_xref(); w_dump_trailer(); fflush(curws.wf); w_output_status(); if (ferror(curws.wf)) errn("error writing output file: ", curws.filename); if (fclose(curws.wf)) errn("error closing output file: ", curws.filename); free(curws.trailer); free(curws.srcpages_nums); if (curws.txrefs!=NULL) free(curws.txrefs); return 0; }