# see details in vocab_export.py

# this imports from csv to fake vocab.000

import csv
import shutil

INPUT_FILE = r"vocab.csv"
OUTPUT_FILE1 = r"vocab.000"
OUTPUT_FILE2 = r"vocab.900"
ENCODING = 'windows-1255'

# Bit assigned to each word-class name accepted in the CSV "class" column.
classes = {
    'CONJUNCTION': 0x004,
    'ASSOCIATION': 0x008,
    'PREPOSITION': 0x010,
    'ARTICLE': 0x020,
    'ADJECTIVE': 0x040,
    'PRONOUN': 0x080,
    'NOUN': 0x100,
    'INDICATIVE_VERB': 0x200,
    'ADVERB': 0x400,
    'IMPERATIVE_VERB': 0x800
}

# write_vocab_file() packs the class mask and the group number into 12 bits
# each (three bytes per word), so neither value may exceed this.
_MAX_12_BIT = 0xfff


def get_class(l):
    """Convert a '|'-separated list of class names into a class bit mask.

    A blank string yields 0.  Each name is stripped and looked up in
    ``classes``; an unknown (or empty) name raises ``KeyError``.  The bits
    are OR-ed together, so repeating a name does not carry into another bit.
    """
    if l.strip() == '':
        return 0
    result = 0
    for c in l.split('|'):
        result |= classes[c.strip()]
    return result


def get_entry_with_word(entries, word):
    """Return the first entry whose 'words' list contains `word`, else None."""
    for entry in entries:
        if word in entry['words']:
            return entry
    return None


def _parse_group(row):
    """Return the row's group number as an int, or None when the cell is blank."""
    text = row['group'].strip()
    return int(text) if text else None


def read_csv_file():
    """Read INPUT_FILE and build the vocabulary entries.

    The CSV must have the columns 'words', 'class', 'group' and 'rooms'.
    Each resulting entry is a dict with the keys 'cls' (class bit mask),
    'group' (int) and 'words' (list of str).  Rows without a group number
    get fresh numbers starting after the largest explicit one (or at 1 when
    no row has a group).

    A row that reuses a word of an earlier entry, and has a non-empty
    'rooms' cell, is merged into that earlier entry instead of creating a
    new one; the rooms listed for such a row are reported for recompilation.

    Returns:
        (entries, rooms_to_recompile) where rooms_to_recompile is a sorted
        list of unique room strings.
    """
    # NOTE(review): the file is opened with the platform default encoding,
    # not ENCODING - confirm this matches what vocab_export.py writes.
    with open(INPUT_FILE, newline='') as csvfile:
        vocab = [dict(row)
                 for row in csv.DictReader(csvfile, skipinitialspace=True)]

    # for duplicate word checking (a set keeps the lookup O(1))
    seen_words = set()

    # group number to be used for new entries without group number,
    # start after current maximum
    explicit_groups = [g for g in map(_parse_group, vocab) if g is not None]
    next_group = max(explicit_groups, default=0) + 1

    entries = []
    rooms_to_recompile = set()
    for entry in vocab:
        new_entry = {}
        new_entry['cls'] = get_class(entry['class'])
        group = _parse_group(entry)
        if group is None:
            new_entry['group'] = next_group
            next_group += 1
        else:
            new_entry['group'] = group
        new_entry['words'] = []

        duplicated = None
        for w in entry['words'].split('|'):
            word = w.strip()
            if word == '':
                # empty word, it's OK - just a redundant |
                pass
            elif len(word.split()) > 1:
                print("Warning: multy word '%s' at: " % word.strip())
                print(entry)
            elif word in new_entry['words']:
                # the same word repeated inside one row: it is already part
                # of this entry, and this entry is not in `entries` yet, so
                # it must not be treated as a duplicate of an older entry
                pass
            elif word not in seen_words:
                seen_words.add(word)
                new_entry['words'].append(word)
            elif entry['rooms']:
                # ignore duplicated that aren't used in any room
                duplicated = word

        if duplicated:
            # this word already appears in an existing entry
            # we combine the two entries
            # note - `duplicated` itself is not appended, because it already
            # exists in existing_entry
            existing_entry = get_entry_with_word(entries, duplicated)
            existing_entry['cls'] |= new_entry['cls']
            existing_entry['words'].extend(new_entry['words'])
            # NOTE(review): takes the text between the first and second
            # occurrence of "in" - presumably the cell looks like
            # "<count> in <room>, <room>"; confirm against vocab_export.py
            rooms = [r.strip()
                     for r in entry['rooms'].split('in')[1].split(',')]
            rooms_to_recompile.update(rooms)
        else:
            entries.append(new_entry)

    return (entries, sorted(rooms_to_recompile))


def write_vocab_file(entries):
    """Serialize `entries` to OUTPUT_FILE1 and copy the result to OUTPUT_FILE2.

    Layout: a two-byte signature, 255 zeroed 16-bit pointers, then for every
    word of every entry: a zero prefix byte, the word encoded with ENCODING,
    a terminating zero, and three bytes holding the 12-bit class mask
    followed by the 12-bit group number.

    Raises:
        ValueError: if an entry's class mask or group number does not fit
            in 12 bits (it would otherwise corrupt the neighbouring field).
        UnicodeEncodeError: if a word cannot be encoded with ENCODING.
    """
    binary_vocab = bytearray([0x86, 0x00])  # vocab signature

    # vocab.900 starts with 255 16-bit pointers
    # they aren't interesting...
    binary_vocab.extend(bytes(255 * 2))

    for entry in entries:
        cls = entry['cls']
        group = entry['group']
        if not 0 <= cls <= _MAX_12_BIT:
            raise ValueError(
                "class mask %r does not fit in 12 bits: %r" % (cls, entry))
        if not 0 <= group <= _MAX_12_BIT:
            raise ValueError(
                "group %r does not fit in 12 bits: %r" % (group, entry))

        # class: bits 23..12, group: bits 11..0 of the three trailer bytes
        trailer = bytes([
            cls >> 4,
            ((cls & 0x0f) << 4) | (group >> 8),
            group & 0xff,
        ])

        for word in entry['words']:
            # don't bother with the useless compression
            binary_vocab.append(0)
            binary_vocab.extend(word.encode(ENCODING))
            binary_vocab.append(0)  # end of string (only on newer format!)
            binary_vocab.extend(trailer)

    with open(OUTPUT_FILE1, "wb") as out_file:
        out_file.write(binary_vocab)
    shutil.copyfile(OUTPUT_FILE1, OUTPUT_FILE2)


def vocab_import():
    """Convert INPUT_FILE into the vocab files.

    Returns the sorted list of rooms that read_csv_file() reported for
    recompilation.
    """
    (entries, rooms_to_recompile) = read_csv_file()
    write_vocab_file(entries)
    return rooms_to_recompile


if __name__ == "__main__":
    print(vocab_import())