local NUM_GPUS = 2;
local NUM_GRAD_ACC = 4;
// Per-GPU, per-step batch size; with 2 GPUs and 4 accumulation steps the
// effective batch size per optimizer step is 512.
local BATCH_SIZE = 512 / NUM_GPUS / NUM_GRAD_ACC;

local BASE_READER = {
  "type": "simple_language_modeling",
  "tokenizer": {
    // The 1 Billion Word Language Model Benchmark dataset is
    // pre-tokenized. (Also, if you're running against an untokenized
    // dataset, be aware that there are serialization issues with Spacy.
    // These come into play in the multiprocess case.)
    "type": "just_spaces"
  },
  "token_indexers": {
    "tokens": {
      "type": "single_id"
    },
    "token_characters": {
      "type": "elmo_characters"
    }
  },
  "max_sequence_length": 400,
  "start_tokens": ["<S>"],
  "end_tokens": ["</S>"],
};

local BASE_LOADER = {
  "max_instances_in_memory": BATCH_SIZE * 100,
  "batch_sampler": {
    "type": "bucket",
    "batch_size": BATCH_SIZE,
  }
};

{
  "dataset_reader": {
    "type": "sharded",
    "base_reader": BASE_READER,
  },
  // Note: We don't set a validation_data_path because the softmax is only
  // sampled during training. Not sampling on GPUs results in a certain OOM
  // given our large vocabulary. We'll need to evaluate against the test set
  // (when we want the full softmax) on the CPU.
  "train_data_path": std.extVar("BIDIRECTIONAL_LM_TRAIN_PATH"),
  "vocabulary": {
    // Use a prespecified vocabulary for efficiency.
    "type": "from_files",
    "directory": std.extVar("BIDIRECTIONAL_LM_VOCAB_PATH"),
    // Plausible config for generating the vocabulary.
    // "tokens_to_add": {
    //   "tokens": ["<S>", "</S>"],
    //   "token_characters": ["<>/S"]
    // },
    // "min_count": {"tokens": 3}
  },
  "model": {
    "type": "language_model",
    "bidirectional": true,
    "num_samples": 8192,
    // Sparse embeddings don't work with DistributedDataParallel.
    "sparse_embeddings": false,
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "type": "empty"
        },
        "token_characters": {
          "type": "character_encoding",
          "embedding": {
            "num_embeddings": 262,
            // Same as the Transformer ELMo in Calypso. Matt reports that
            // this matches the original LSTM ELMo as well.
            "embedding_dim": 16
          },
          "encoder": {
            "type": "cnn-highway",
            "activation": "relu",
            "embedding_dim": 16,
            "filters": [
              [1, 32],
              [2, 32],
              [3, 64],
              [4, 128],
              [5, 256],
              [6, 512],
              [7, 1024]],
            "num_highway": 2,
            "projection_dim": 512,
            "projection_location": "after_highway",
            "do_layer_norm": true
          }
        }
      }
    },
    // TODO(brendanr): Consider the following.
    // remove_bos_eos: true,
    // Applies to the contextualized embeddings.
    "dropout": 0.1,
    "contextualizer": {
      "type": "bidirectional_language_model_transformer",
      "input_dim": 512,
      "hidden_dim": 2048,
      "num_layers": 6,
      "dropout": 0.1,
      "input_dropout": 0.1
    }
  },
  "data_loader": BASE_LOADER,
  "distributed": {
    "cuda_devices": if NUM_GPUS > 1 then std.range(0, NUM_GPUS - 1) else 0,
  },
  "trainer": {
    "num_epochs": 10,
    "optimizer": {
      // The gradient accumulators in Adam for the running stdev and mean for
      // words not used in the sampled softmax would be decayed to zero with the
      // standard "adam" optimizer.
      "type": "dense_sparse_adam"
    },
    // TODO(brendanr): Needed with transformer too?
    // "grad_norm": 10.0,
    "learning_rate_scheduler": {
      "type": "noam",
      // See https://github.com/allenai/calypso/blob/master/calypso/train.py#L401
      "model_size": 512,
      // See https://github.com/allenai/calypso/blob/master/bin/train_transformer_lm1b.py#L51.
      // Adjusted based on our sample size relative to Calypso's.
      "warmup_steps": 6000
    },
    "num_gradient_accumulation_steps": NUM_GRAD_ACC,
    "use_amp": true
  }
}
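
// A minimal sketch of how this config might be launched, assuming it is saved
// as bidirectional_language_model.jsonnet. The filename and the data/vocabulary
// paths below are hypothetical placeholders; only the two environment variable
// names are required by the std.extVar calls above.
//
//   export BIDIRECTIONAL_LM_TRAIN_PATH='/path/to/training/shards/*'
//   export BIDIRECTIONAL_LM_VOCAB_PATH=/path/to/vocabulary
//   allennlp train bidirectional_language_model.jsonnet \
//       --serialization-dir /path/to/output
//
// With the top-level "distributed" block set, the allennlp train command
// itself spawns one worker per entry in cuda_devices, so no external
// distributed launcher should be needed.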