from pickle import load

from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def load_doc(filename):
    """Read a text file into a string.

    Args:
        filename: path of the text file

    Returns:
        string, the contents of the text file
    """
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text


def to_list(captions):
    """Flatten a dict (key: image filename, value: list of captions) into one list of captions.

    Args:
        captions: dict, key is an image filename, value is the list of captions
            for that image

    Returns:
        list containing all captions
    """
    all_desc = list()
    for key in captions.keys():
        all_desc.extend(captions[key])
    return all_desc


def get_max_length(captions):
    """Compute the length of the longest caption in the caption dict.

    Args:
        captions: dict, key is an image filename (without the .jpg suffix),
            value is the list of captions for that image

    Returns:
        length (in words) of the longest caption
    """
    lines = to_list(captions)
    return max(len(d.split()) for d in lines)


def load_set(filename):
    """Load a set of image names from a text file.

    Args:
        filename: text file in which every line contains one image filename
            (including the .jpg suffix)

    Returns:
        set of image names with the .jpg suffix removed
    """
    doc = load_doc(filename)
    dataset = list()
    # process line by line
    for line in doc.split('\n'):
        # skip empty lines
        if len(line) < 1:
            continue
        # get the image identifier
        identifier = line.split('.')[0]
        dataset.append(identifier)
    return set(dataset)


def load_clean_captions(filename, dataset):
    """Wrap every caption with 'startseq ' and ' endseq', marking the start and
    end points for automatic caption generation.

    Args:
        filename: text file in which every line consists of an image name
            followed by an already-cleaned caption
        dataset: list of image names

    Returns:
        dict, key is the image name, value is the list of captions wrapped
        with 'startseq' and 'endseq'
    """
    # load document
    doc = load_doc(filename)
    descriptions = dict()
    for line in doc.split('\n'):
        # split line by white space
        tokens = line.split()
        # split id from description
        image_id, image_desc = tokens[0], tokens[1:]
        # skip images not in the set
        if image_id in dataset:
            # create list
            if image_id not in descriptions:
                descriptions[image_id] = list()
            # wrap description in tokens
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            # store
            descriptions[image_id].append(desc)
    return descriptions


def load_photo_features(filename, dataset):
    """Load the image features for the given image names from a pickled feature file.

    Args:
        filename: pickle file holding a dict that maps an image name to the
            features extracted from that image
        dataset: list of image names

    Returns:
        dict of image features, key is the image name, value is the features
        of that image
    """
    # load all features
    all_features = load(open(filename, 'rb'))
    # filter features
    features = {k: all_features[k] for k in dataset}
    return features


# Train the model from the data.
def load_ids(fn):
    """Read a set of image ids; same behaviour as load_set, kept under its original name."""
    return load_set(fn)


def create_sequences(tokenizer, max_length, descriptions, photos_features, vocab_size):
    """Build one set of LSTM training inputs from captions and image features.

    Args:
        tokenizer: keras.preprocessing.text.Tokenizer that converts between
            English words and integers
        max_length: length of the longest caption in the training set
        descriptions: dict, key is the image name (without the .jpg suffix),
            value is a list of several different captions for that image
        photos_features: dict, key is the image name (without the .jpg suffix),
            value is a numpy array with the image features
        vocab_size: number of words in the training-set vocabulary

    Returns:
        tuple:
            first element: numpy array whose elements are the image features,
                each itself a numpy.array
            second element: numpy array whose elements are caption prefixes,
                each itself a numpy.array
            third element: numpy array whose elements are the next word of a
                caption (to be predicted from the image features and the
                caption prefix), each itself a numpy.array

    Examples:
        from pickle import load
        tokenizer = load(open('tokenizer.pkl', 'rb'))
        max_length = 6
        descriptions = {'1235345': ['startseq one bird on tree endseq',
                                    'startseq red bird on tree endseq'],
                        '1234546': ['startseq one boy play water endseq',
                                    'startseq one boy run across water endseq']}
        # note the extra nesting: the function reads photo_features[key][0]
        photo_features = {'1235345': [[0.434, 0.534, 0.212, 0.98]],
                          '1234546': [[0.534, 0.634, 0.712, 0.28]]}
        vocab_size = 7378
        print(create_sequences(tokenizer, max_length, descriptions, photo_features, vocab_size))
        (array([[ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.434, 0.534, 0.212, 0.98 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ],
                [ 0.534, 0.634, 0.712, 0.28 ]]),
         array([[  0,   0,   0,   0,   0,   2],
                [  0,   0,   0,   0,   2,  59],
                [  0,   0,   0,   2,  59, 254],
                [  0,   0,   2,  59, 254,   6],
                [  0,   2,  59, 254,   6, 134],
                [  0,   0,   0,   0,   0,   2],
                [  0,   0,   0,   0,   2,  26],
                [  0,   0,   0,   2,  26, 254],
                [  0,   0,   2,  26, 254,   6],
                [  0,   2,  26, 254,   6, 134],
                [  0,   0,   0,   0,   0,   2],
                [  0,   0,   0,   0,   2,  59],
                [  0,   0,   0,   2,  59,  16],
                [  0,   0,   2,  59,  16,  82],
                [  0,   2,  59,  16,  82,  24],
                [  0,   0,   0,   0,   0,   2],
                [  0,   0,   0,   0,   2,  59],
                [  0,   0,   0,   2,  59,  16],
                [  0,   0,   2,  59,  16, 165],
                [  0,   2,  59,  16, 165, 127],
                [  2,  59,  16, 165, 127,  24]]),
         array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
                [ 0.,  0.,  0., ...,  0.,  0.,  0.],
                [ 0.,  0.,  0., ...,  0.,  0.,  0.],
                ...,
                [ 0.,  0.,  0., ...,  0.,  0.,  0.],
                [ 0.,  0.,  0., ...,  0.,  0.,  0.],
                [ 0.,  0.,  0., ...,  0.,  0.,  0.]]))
    """
    X1, X2, y = list(), list(), list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            # encode the caption into a sequence of integer word ids
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split the caption into (prefix, next word) training pairs
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                # pad in_seq so that its length equals max_length
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # one-hot encode the next word over the vocabulary
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(photos_features[key][0])
                X2.append(in_seq)
                y.append(out_seq)
    return array(X1), array(X2), array(y)
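

# A minimal sketch of how the helpers above chain together into training
# arrays. The file names used here ('Flickr_8k.trainImages.txt',
# 'descriptions.txt', 'features.pkl', 'tokenizer.pkl') are assumptions for
# illustration only; they are not defined anywhere in this module and should
# be replaced with the paths used when the captions, features and tokenizer
# were produced.
if __name__ == '__main__':
    # assumed: one image filename per line for the training split
    train = load_ids('Flickr_8k.trainImages.txt')
    # assumed: cleaned captions, one "<image id> <caption>" per line
    train_descriptions = load_clean_captions('descriptions.txt', train)
    # assumed: pickled dict of pre-extracted CNN features keyed by image id
    train_features = load_photo_features('features.pkl', train)
    # assumed: a Tokenizer already fitted on the training captions
    tokenizer = load(open('tokenizer.pkl', 'rb'))
    vocab_size = len(tokenizer.word_index) + 1
    max_length = get_max_length(train_descriptions)
    # build the (image feature, caption prefix) -> next word training arrays
    X1train, X2train, ytrain = create_sequences(
        tokenizer, max_length, train_descriptions, train_features, vocab_size)
    print(X1train.shape, X2train.shape, ytrain.shape)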