In [2]:
from pathlib import Path
import re, sqlite3
from collections import defaultdict

In [3]:
# ~4800 commonly used chars from 常用國字標準字體表
common = Path('common.txt').read_text('utf-8').strip()

# ~6300 less common chars from 次常用國字標準字體表
lessCommon = Path('lessCommon.txt').read_text('utf-8').strip()

# your custom add-on chars, eg. 粵語用字
addon = Path('add_on_char.txt').read_text('utf-8').splitlines()

# combine ordered list from above
chars = list(common) + addon + list(lessCommon)

# rare chars with same input code with others, to be removed from the final character list
duplicated_code = Path('duplicated_code.txt').read_text('utf-8').splitlines()

# chars from 教育部 - 字頻表排序
char_rank = Path('char_rank.txt').read_text('utf-8').splitlines()


In [4]:
# dict with input code for all chars
with open('char_db.csv') as f:
 input_dict = dict(line.strip().split(',') for line in f)

In [5]:
# pioritize charaters with 教育部 - 字頻表排序
for char in char_rank:
 if char not in chars:
 print(char, 'not exsit')
 chars.remove(char)

ranked_chars = char_rank + chars 

In [6]:
# remove rare chars having duplicated input code with others
for char in duplicated_code:
 if char in ranked_chars:
 ranked_chars.remove(char)
 else:
 print(char, ' not in ranked_chars')

In [7]:
# make sure all chars have input code
if chars_not_found := [char for char in ranked_chars if not input_dict.get(char)]:
 raise f"chars not having input code: {chars_not_found}"

In [8]:
# header of final character list
header = '''%gen_inp
%ename cangjie
%cname 倉頡輸入法
%selkey 1234567890
%keyname begin
a 日
b 月
c 金
d 木
e 水
f 火
g 土
h 竹
i 戈
j 十
k 大
l 中
m 一
n 弓
o 人
p 心
q 手
r 口
s 尸
t 廿
u 山
v 女
w 田
x 難
y 卜
z 重
[ 「
] 」
; ;
' 、
\ \n, ,
. 。
! !
: :
/ ?
%keyname end
%chardef begin
, ,
! !
: :
. 。
/ ?
// /
.. .
... …
' 、
; ;
\ \n[ 「
[ 『
[ (
[ 〈
[ 《
] 」
] 』
] )
] 〉
] 》
'''

In [9]:
footer = '\n%chardef end'

In [10]:
# create main body of the final character list from `ranked_chars`
body = '\n'.join(input_dict[char].lower() + ' ' + char for char in ranked_chars)

In [11]:
# generate final character list
with open('dime_cangjie.txt', mode='w', encoding='utf-8') as f:
 f.writelines(header + body +footer)

In [12]:
# total number of chars in the final character list
len(ranked_chars)

11047

## show chars with more then one output

base on the follow results, you may:

- adjust `duplicated_code.txt` to remove unwanted chars
- adjust `char_rank.txt` to re-order the output when more then one candidates

In [13]:
tmp = defaultdict(list)
for char in ranked_chars:
 code = input_dict[char]
 exist_char = tmp.get(code)
 arr = tmp[code]
 arr.append(char)
 if len(arr) > 1:
 print(f'{code} {" ".join(arr)}')

SU 己 已
YOLN 刻 劇
DHE 皮 板
OJ 什 午
OGE 雙 隻
MNP 死 恐
JD 未 宋
WD 果 困
EHSK 激 淚
YRBU 亮 毫
YHMBC 顏 頻
TW 曲 苗
DTMC 橫 棋
OPBUC 貨 貸
GRTR 喜 嘉
EMHF 源 鴻
RRIK 哭 獸
QOMR 拾 捨
SIP 忍 慰
GIKS 勢 劫
MGOK 致 玫
NO 久 欠
A 日 曰
MBUC 貢 頁
KN 九 夷
HND 朵 梨
QHLO 抓 掀
TCNO 歉 欺
EJMC 演 濱
RC 只 叭
PA 旨 旬
HPA 昏 筍
ANAU 晚 冕
HS 戶 乍
MRNO 歌 砍
TWK 奠 茵
NBG 角 墮
OKR 知 佑
DBDB 棘 棗
IPP 態 庇
FBOK 敞 敝
EOMN 汽 渝
NL 引 弔
ABJJ 暈 暉
HOUFK 徽 黴
BT 冊 皿
AYRF 景 晾
YCK 交 奕
HUP 息 憩
HMNL 郵 邸
TMD 某 芋
SRNL 郡 邵
HTMC 簧 箕
SEB 腎 臀
NI 夕 弘
ARF 照 煦
BM 且 肛
HDLN 利 剁
IRP 感 怠
SHOE 履 屐
HI 鬼 么
SHI 刃 戮
THJD 茱 孽
RAU 吧 邑
ENI 汐 泓
VID 樂 槳
ETMC 淇 潢
BHN 肌 冗
FBR 尚 炯
QSMG 握 擢
PI 勾 勺
OLOK 攸 倏
DYTJ 樟 梓
YVVV 巡 邋
IT 戒 弁
YRU 訕 乩
TVID 藥 孳
ESMG 濯 渥
NBKS 勇 觔
GKLMI 螯 螫
VFHAF 鷥 鸞
YMP 此 忐
HKP 懲 忝
YRPA 詢 詣
KJCC 痲 癲
IPP 態 庇 忒
ROMR 哈 啥
RKS 另 叻
EPD 池 柒
LWB 冑 胄
BUOG 瞿 睢
MGTMC 琪 璜
AFMBC 顯 顥
OM 丘 仝
EMCW 酒 洒
TYTR 菩 蒟
RJBF 嗦 嚓
THON 荇 蘅
TT 井 并
IFP 憑 慼
RJI 戰 戢
DWD 棵 梱
KB 有 冇
K 大 乂
RKI 吰 呔
RSJ 咡 咠
RMCW 哂 唒
RBUC 員 唄
ROIR 嗆 唅
RRRD 噪 喿
RHSK 唳 噭
RTWI 噂 囆
GTWI 墫 壿
SHOD 髹 屧
UOIN 岑 岒
UNMU 峗 峞
IP 弋 庀
PWD 悃 惈
PMRW 匐 愊
WOP