#!/usr/bin/env python # -*- coding: utf-8 -*- # script to create test-sets for evaluation of word embeddings # saves logged results in additional file # # @author: Andreas Mueller # @see: Bachelor Thesis 'Analyse von Wort-Vektoren deutscher Textkorpora' # # Contributors: # Michael Egger # # @example: python evaluation.py test.model -cu import gensim import random import argparse import logging from pathlib import Path # configuration parser = argparse.ArgumentParser(description='Script for creating testsets and evaluating word vector models') parser.add_argument('model', type=str, help='source file with trained model') parser.add_argument('-c', '--create', action='store_true', help='if set, create testsets before evaluating') parser.add_argument('-u', '--umlauts', action='store_true', help='if set, create additional testsets with transformed umlauts and use them instead') parser.add_argument('-t', '--topn', type=int, default=10, help='check the top n result (correct answer under top n answeres)') args = parser.parse_args() TARGET_SYN = 'data/syntactic.questions' TARGET_SEM_OP = 'data/semantic_op.questions' TARGET_SEM_BM = 'data/semantic_bm.questions' TARGET_SEM_DF = 'data/semantic_df.questions' SRC_NOUNS = 'src/nouns.txt' SRC_ADJECTIVES = 'src/adjectives.txt' SRC_VERBS = 'src/verbs.txt' SRC_BESTMATCH = 'src/bestmatch.txt' SRC_DOESNTFIT = 'src/doesntfit.txt' SRC_OPPOSITE = 'src/opposite.txt' PATTERN_SYN = [ ('nouns', 'SI/PL', SRC_NOUNS, 0, 1), ('nouns', 'PL/SI', SRC_NOUNS, 1, 0), ('adjectives', 'GR/KOM', SRC_ADJECTIVES, 0, 1), ('adjectives', 'KOM/GR', SRC_ADJECTIVES, 1, 0), ('adjectives', 'GR/SUP', SRC_ADJECTIVES, 0, 2), ('adjectives', 'SUP/GR', SRC_ADJECTIVES, 2, 0), ('adjectives', 'KOM/SUP', SRC_ADJECTIVES, 1, 2), ('adjectives', 'SUP/KOM', SRC_ADJECTIVES, 2, 1), ('verbs (pres)', 'INF/1SP', SRC_VERBS, 0, 1), ('verbs (pres)', '1SP/INF', SRC_VERBS, 1, 0), ('verbs (pres)', 'INF/2PP', SRC_VERBS, 0, 2), ('verbs (pres)', '2PP/INF', SRC_VERBS, 2, 0), ('verbs (pres)', '1SP/2PP', SRC_VERBS, 1, 2), ('verbs (pres)', '2PP/1SP', SRC_VERBS, 2, 1), ('verbs (past)', 'INF/3SV', SRC_VERBS, 0, 3), ('verbs (past)', '3SV/INF', SRC_VERBS, 3, 0), ('verbs (past)', 'INF/3PV', SRC_VERBS, 0, 4), ('verbs (past)', '3PV/INF', SRC_VERBS, 4, 0), ('verbs (past)', '3SV/3PV', SRC_VERBS, 3, 4), ('verbs (past)', '3PV/3SV', SRC_VERBS, 4, 3) ] logging.basicConfig(filename=args.model.strip() + '.result', format='%(asctime)s : %(message)s', level=logging.INFO) consoleHandler = logging.StreamHandler() logging.getLogger().addHandler(consoleHandler) def replace_umlauts(text): """ Replaces german umlauts and sharp s in given text. :param text: text as str :return: manipulated text as str """ res = text res = res.replace('ä', 'ae') res = res.replace('ö', 'oe') res = res.replace('ü', 'ue') res = res.replace('Ä', 'Ae') res = res.replace('Ö', 'Oe') res = res.replace('Ü', 'Ue') res = res.replace('ß', 'ss') return res def create_syntactic_testset(): """ Creates syntactic test set and writes it into a file. :return: None """ if args.umlauts: u = open(TARGET_SYN + '.nouml', 'w') with open(TARGET_SYN, 'w') as t: for label, short, src, index1, index2 in PATTERN_SYN: t.write(': {}: {}\n'.format(label, short)) if args.umlauts: u.write(': {}: {}\n'.format(label, short)) for q in create_questions(src, index1, index2): t.write(q + '\n') if args.umlauts: u.write(replace_umlauts(q) + '\n') logging.info('created pattern ' + short) def create_semantic_testset(): """ Creates semantic test set and writes it into a file. :return: None """ # opposite with open(TARGET_SEM_OP, 'w') as t: for q in create_questions(SRC_OPPOSITE, combinate=10): t.write(q + '\n') if args.umlauts: with open(TARGET_SEM_OP + '.nouml', 'w') as u: u.write(replace_umlauts(q) + '\n') logging.info('created opposite questions') # best match with open(TARGET_SEM_BM, 'w') as t: groups = open(SRC_BESTMATCH).read().split(':') groups.pop(0) # remove first empty group for group in groups: questions = group.splitlines() _ = questions.pop(0) while questions: for i in range(1, len(questions)): question = questions[0].split('-') + questions[i].split('-') t.write(' '.join(question) + '\n') if args.umlauts: with open(TARGET_SEM_BM + '.nouml', 'w') as u: u.write(replace_umlauts(' '.join(question)) + '\n') questions.pop(0) logging.info('created best-match questions') # doesn't fit with open(TARGET_SEM_DF, 'w') as t: for line in open(SRC_DOESNTFIT): words = line.split() for wrongword in words[-1].split('-'): question = ' '.join(words[:3] + [wrongword]) t.write(question + '\n') if args.umlauts: with open(TARGET_SEM_DF + '.nouml', 'w') as u: u.write(replace_umlauts(question) + '\n') logging.info('created doesn\'t-fit questions') def create_questions(src, index1=0, index2=1, combinate=5): """ Creates single questions from given source. :param src: source file to load words from :param index1: index of first word in a line to focus on :param index2: index of second word in a line to focus on :param combinate: combinate number of combinations with random other lines :return: list of question words """ # get source content with open(src) as f: content = f.readlines() content = [x.strip() for x in content] questions = [] for line in content: for i in range(0, combinate): # get current word pair question = list(line.split('-')[i] for i in [index1, index2]) # get random word pair that is not the current random_line = random.choice(list(set(content) - {line})) random_word = list(random_line.split('-')[i] for i in [index1, index2]) # merge both word pairs to one question question.extend(random_word) questions.append(' '.join(question)) return questions def test_most_similar(model, src, label='most similar', topn=10): """ Tests given model to most similar word. :param model: model to test :param src: source file to load words from :param label: label to print current test case :param topn: number of top matches :return: """ num_lines = sum(1 for _ in open(src)) num_questions = 0 num_right = 0 num_topn = 0 # get questions with open(src) as f: questions = f.readlines() questions = [x.strip() for x in questions] # test each question for question in questions: words = question.split() # check if all words exist in vocabulary if all(x in model.index2word for x in words): num_questions += 1 best_matches = model.most_similar(positive=[words[1], words[2]], negative=[words[0]], topn=topn) # best match if words[3] in best_matches[0]: num_right += 1 # topn match for match in best_matches[:topn]: if words[3] in match: num_topn += 1 break # calculate result correct_matches = round(num_right/float(num_questions)*100, 1) if num_questions > 0 else 0.0 topn_matches = round(num_topn/float(num_questions)*100, 1) if num_questions > 0 else 0.0 coverage = round(num_questions/float(num_lines)*100, 1) if num_lines > 0 else 0.0 # log result logging.info(label + ' correct: {0}% ({1}/{2})'.format(correct_matches, num_right, num_questions)) logging.info(label + ' top {0}: {1}% ({2}/{3})'.format(topn, topn_matches, num_topn, num_questions)) logging.info(label + ' coverage: {0}% ({1}/{2})'.format(coverage, num_questions, num_lines)) def test_most_similar_groups(model, src, topn=10): """ Tests given model to most similar word. :param model: model to test :param src: source file to load words from :param topn: number of top matches :return: None """ num_lines = 0 num_questions = 0 num_right = 0 num_topn = 0 # test each group with open(src) as groups_fp: groups = groups_fp.read().split('\n: ') for group in groups: questions = group.splitlines() label = questions.pop(0) label = label[2:] if label.startswith(': ') else label # handle first group num_group_lines = len(questions) num_group_questions = 0 num_group_right = 0 num_group_topn = 0 # test each question of current group for question in questions: words = question.split() # check if all words exist in vocabulary if all(x in model.index2word for x in words): num_group_questions += 1 best_matches = model.most_similar(positive=[words[1], words[2]], negative=[words[0]], topn=topn) # best match if words[3] in best_matches[0]: num_group_right += 1 # topn match for match in best_matches[:topn]: if words[3] in match: num_group_topn += 1 break # calculate result correct_group_matches = round(num_group_right/float(num_group_questions)*100, 1) if num_group_questions > 0 else 0.0 topn_group_matches = round(num_group_topn/float(num_group_questions)*100, 1) if num_group_questions > 0 else 0.0 group_coverage = round(num_group_questions/float(num_group_lines)*100, 1) if num_group_lines > 0 else 0.0 # log result logging.info(label + ': {0}% ({1}/{2}), {3}% ({4}/{5}), {6}% ({7}/{8})'.format( correct_group_matches, num_group_right, num_group_questions, topn_group_matches, num_group_topn, num_group_questions, group_coverage, num_group_questions, num_group_lines )) # total numbers num_lines += num_group_lines num_questions += num_group_questions num_right += num_group_right num_topn += num_group_topn # calculate result correct_matches = round(num_right/float(num_questions)*100, 1) if num_questions > 0 else 0.0 topn_matches = round(num_topn/float(num_questions)*100, 1) if num_questions > 0 else 0.0 coverage = round(num_questions/float(num_lines)*100, 1) if num_lines > 0 else 0.0 # log result logging.info('total correct: {0}% ({1}/{2})'.format(correct_matches, num_right, num_questions)) logging.info('total top {0}: {1}% ({2}/{3})'.format(topn, topn_matches, num_topn, num_questions)) logging.info('total coverage: {0}% ({1}/{2})'.format(coverage, num_questions, num_lines)) def test_doesnt_fit(model, src): """ Tests given model to most not fitting word. :param model: model to test :param src: source file to load words from :return: """ num_lines = sum(1 for _ in open(src)) num_questions = 0 num_right = 0 # get questions with open(src) as f: questions = f.readlines() questions = [x.strip() for x in questions] # test each question for question in questions: words = question.split() # check if all words exist in vocabulary if all(x in model.index2word for x in words): num_questions += 1 if model.doesnt_match(words) == words[3]: num_right += 1 # calculate result correct_matches = round(num_right/float(num_questions)*100, 1) if num_questions > 0 else 0.0 coverage = round(num_questions/float(num_lines)*100, 1) if num_lines > 0 else 0.0 # log result logging.info('doesn\'t fit correct: {0}% ({1}/{2})'.format(correct_matches, num_right, num_questions)) logging.info('doesn\'t fit coverage: {0}% ({1}/{2})'.format(coverage, num_questions, num_lines)) if args.create: logging.info('> CREATING SYNTACTIC TESTSET') create_syntactic_testset() logging.info('> CREATING SEMANTIC TESTSET') create_semantic_testset() # get trained model, files without a suffix, .bin or .model are treated as binary files binary_filetypes = ['', '.bin','.model'] is_binary = Path(args.model.strip()).suffix in binary_filetypes trained_model = gensim.models.KeyedVectors.load_word2vec_format(args.model.strip(), binary=is_binary) # remove original vectors to free up memory trained_model.init_sims(replace=True) # execute evaluation logging.info('> EVALUATING SYNTACTIC FEATURES') test_most_similar_groups(trained_model, TARGET_SYN + '.nouml' if args.umlauts else TARGET_SYN, args.topn) logging.info('> EVALUATING SEMANTIC FEATURES') test_most_similar(trained_model, TARGET_SEM_OP + '.nouml' if args.umlauts else TARGET_SEM_OP, 'opposite', args.topn) test_most_similar(trained_model, TARGET_SEM_BM + '.nouml' if args.umlauts else TARGET_SEM_BM, 'best match', args.topn) test_doesnt_fit(trained_model, TARGET_SEM_DF + '.nouml' if args.umlauts else TARGET_SEM_DF)