#!/usr/bin/env python3 r""" rtfuc v1.1 (c) 2024 Silas S. Brown. License: Apache 2 Removes unnecessary \uc1 codes from RTF files (to work around catdoc issue described by Ubuntu bug 2076244) and provides better ASCII fallback characters when possible (e.g. for unrtf --text) Also available on PyPI: https://pypi.org/project/rtfuc/ """ def rtfuc(in_rtf: bytes) -> bytes: r"""Converts input RTF data (provided as a byte string of the RTF file's contents) into a version with unnecessary \uc1 removed, and better fall-back ASCII characters provided when possible. Returns resulting RTF data as another byte string.""" class UCMatch: def __init__(self): self.uc = 1 def __call__(self,m): if re.match(br"\\uc%d$" % self.uc, m.group()): # unneeded \uc return b"" elif re.match(br"\\uc", m.group()): self.uc = int(m.group[3:]) return m.group() codepoint = int(m.group()[2:-1]) if self.uc == 1: if ascii_ok(codepoint): return chr(codepoint).encode('ascii').replace(b" ",b"{} ") elif codepoint==169: return br"\uc3\u169(c)\uc1{}" elif codepoint==174: return br"\uc3\u174(R)\uc1{}" elif codepoint==0x2122: return br"\uc4\u8482(TM)\uc1{}" subst = m.group()[-1:] subst2 = get_subst(codepoint) if subst==b"?" and subst2: subst = subst2 # use ours instead return br"\u%d%s" % (codepoint,subst) elif ascii_ok(codepoint): return chr(codepoint).encode('ascii') + m.group()[-1:] else: return m.group() ucMatch = UCMatch() return re.sub(br"(? out.rtf") else: sys.stdout.buffer.write(rtfuc(sys.stdin.buffer.read())) if __name__ == "__main__": main()
#
# NOTE(review): the line above holds the whole module collapsed onto a single
# physical line.  Because that line starts with "#", Python treats all of it
# as one comment, so none of the code in it runs in this form; the original
# line breaks and indentation need to be restored before it can be used.
#
# What the visible text shows:
#   * rtfuc(in_rtf) takes the bytes of an RTF file and returns bytes; it
#     builds a UCMatch instance and passes work to re.sub().
#   * UCMatch is a callable that tracks the current \uc value in self.uc
#     (initially 1).  For a match that is exactly "\uc<current value>" it
#     returns b"" (dropping the redundant code); for any other "\uc..." match
#     it is meant to record the new value and return the match unchanged.
#     Otherwise it parses m.group()[2:-1] as the code point and treats the
#     last byte of the match as the existing fallback character.
#   * With self.uc == 1, code points accepted by ascii_ok() are emitted as
#     the plain ASCII byte (a space becomes b"{} "), code points 169, 174 and
#     0x2122 get multi-character fallbacks "(c)", "(R)" and "(TM)" wrapped in
#     a temporary \uc3 / \uc4, and a b"?" fallback is replaced by the result
#     of get_subst() when that result is truthy.
#
# NOTE(review): "self.uc = int(m.group[3:])" subscripts the bound method
# m.group instead of calling it, which would raise TypeError when that branch
# is reached; "m.group()[3:]" is presumably what was intended - confirm.
#
# NOTE(review): text appears to be missing between 're.sub(br"(?' and
# ' out.rtf")'.  The regular expression, the remaining re.sub() arguments and
# the beginning of main() are not present here, and no definition or import
# is visible for ascii_ok, get_subst, re or sys.  Restore these from the
# upstream copy (see the PyPI link in the module docstring) rather than
# reconstructing them by hand.