"""Translate whitespace-tokenised documents into word-id sequences.

Reads a text file in which each line is one document ("word word ..."),
assigns every distinct word an integer id in order of first appearance,
writes the documents back out as id sequences, and writes the resulting
vocabulary ("wordId<TAB>word" per line).
"""
import sys

# Global vocabulary: word -> integer id. Ids are handed out sequentially
# (0, 1, 2, ...) the first time a word is seen, and the mapping persists
# across calls to indexFile() within the same process.
w2id = {}


def indexFile(pt, res_pt):
    """Index the documents in *pt* and write their word-id form to *res_pt*.

    Each input line is split on whitespace; every previously unseen word is
    added to the module-level ``w2id`` mapping with the next free id. One
    output line of space-separated ids is written per input line (an empty
    input line yields an empty output line).

    Args:
        pt: Path of the UTF-8 input file, one document per line.
        res_pt: Path of the UTF-8 output file to create or overwrite.
    """
    print('index file: ' + str(pt))
    # Open the input first so that a missing/unreadable input does not
    # truncate an existing output file. Both handles are closed (and the
    # output flushed) deterministically when the ``with`` block exits.
    with open(pt, encoding='utf-8') as rf, \
            open(res_pt, 'w', encoding='utf-8') as wf:
        for line in rf:
            words = line.strip().split()
            # setdefault() registers a new word with the next id, or
            # returns the id it already has.
            wids = [w2id.setdefault(w, len(w2id)) for w in words]
            print(' '.join(map(str, wids)), file=wf)

    print('write file: ' + str(res_pt))


def write_w2id(res_pt):
    """Write the current vocabulary to *res_pt*, ordered by word id.

    Each line has the format ``"<wordId>\\t<word>"``.

    Args:
        res_pt: Path of the UTF-8 vocabulary file to create or overwrite.
    """
    print('write:' + str(res_pt))
    with open(res_pt, 'w', encoding='utf-8') as wf:
        for w, wid in sorted(w2id.items(), key=lambda d: d[1]):
            print('%d\t%s' % (wid, w), file=wf)


def run_indexDocs(argv):
    """Command-line style entry point: index a corpus and dump the vocabulary.

    Args:
        argv: Argument list shaped like ``sys.argv``:
            ``[prog, doc_pt, dwid_pt, voca_pt]``.

    Returns:
        The number of distinct words in the vocabulary after indexing.

    Raises:
        SystemExit: With status 1 (after printing usage) when fewer than
            three path arguments are supplied.
    """
    if len(argv) < 4:
        print('Usage: python %s <doc_pt> <dwid_pt> <voca_pt>' % argv[0])
        print('\tdoc_pt    input docs to be indexed, each line is a doc with the format "word word ..."')
        print('\tdwid_pt   output docs after indexing, each line is a doc with the format "wordId wordId..."')
        print('\tvoca_pt   output vocabulary file, each line is a word with the format "wordId    word"')
        # sys.exit is always available; the bare exit() helper is injected
        # by the ``site`` module and may be missing (e.g. ``python -S``).
        sys.exit(1)

    doc_pt = argv[1]
    dwid_pt = argv[2]
    voca_pt = argv[3]
    indexFile(doc_pt, dwid_pt)
    print('n(w)=' + str(len(w2id)))
    write_w2id(voca_pt)
    return len(w2id)