/content/wandb/run-20230528_174740-fdsvtgrs"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"Step | \n", "Training Loss | \n", "
---|---|
1 | \n", "1.997600 | \n", "
2 | \n", "2.265900 | \n", "
3 | \n", "1.809600 | \n", "
4 | \n", "2.020400 | \n", "
5 | \n", "3.371100 | \n", "
6 | \n", "1.832700 | \n", "
7 | \n", "2.383400 | \n", "
8 | \n", "1.730600 | \n", "
9 | \n", "2.520400 | \n", "
10 | \n", "2.620500 | \n", "
11 | \n", "2.613400 | \n", "
12 | \n", "2.683500 | \n", "
13 | \n", "2.354000 | \n", "
14 | \n", "1.917100 | \n", "
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"\u001b[31m╭─\u001b[0m\u001b[31m──────────────────────────────\u001b[0m\u001b[31m \u001b[0m\u001b[1;31mTraceback \u001b[0m\u001b[1;2;31m(most recent call last)\u001b[0m\u001b[31m \u001b[0m\u001b[31m───────────────────────────────\u001b[0m\u001b[31m─╮\u001b[0m\n",
"\u001b[31m│\u001b[0m in \u001b[92m╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
"│ in <cell line: 29>:29 │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1696 in train │\n",
"│ │\n",
"│ 1693 │ │ inner_training_loop = find_executable_batch_size( │\n",
"│ 1694 │ │ │ self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size │\n",
"│ 1695 │ │ ) │\n",
"│ ❱ 1696 │ │ return inner_training_loop( │\n",
"│ 1697 │ │ │ args=args, │\n",
"│ 1698 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │\n",
"│ 1699 │ │ │ trial=trial, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:1973 in _inner_training_loop │\n",
"│ │\n",
"│ 1970 │ │ │ │ │ with model.no_sync(): │\n",
"│ 1971 │ │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │\n",
"│ 1972 │ │ │ │ else: │\n",
"│ ❱ 1973 │ │ │ │ │ tr_loss_step = self.training_step(model, inputs) │\n",
"│ 1974 │ │ │ │ │\n",
"│ 1975 │ │ │ │ if ( │\n",
"│ 1976 │ │ │ │ │ args.logging_nan_inf_filter │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2787 in training_step │\n",
"│ │\n",
"│ 2784 │ │ │ return loss_mb.reduce_mean().detach().to(self.args.device) │\n",
"│ 2785 │ │ │\n",
"│ 2786 │ │ with self.compute_loss_context_manager(): │\n",
"│ ❱ 2787 │ │ │ loss = self.compute_loss(model, inputs) │\n",
"│ 2788 │ │ │\n",
"│ 2789 │ │ if self.args.n_gpu > 1: │\n",
"│ 2790 │ │ │ loss = loss.mean() # mean() to average on multi-gpu parallel training │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/transformers/trainer.py:2819 in compute_loss │\n",
"│ │\n",
"│ 2816 │ │ │ labels = inputs.pop(\"labels\") │\n",
"│ 2817 │ │ else: │\n",
"│ 2818 │ │ │ labels = None │\n",
"│ ❱ 2819 │ │ outputs = model(**inputs) │\n",
"│ 2820 │ │ # Save past state if it exists │\n",
"│ 2821 │ │ # TODO: this needs to be fixed and made cleaner later. │\n",
"│ 2822 │ │ if self.args.past_index >= 0: │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501 in _call_impl │\n",
"│ │\n",
"│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
"│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
"│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
"│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │\n",
"│ 1502 │ │ # Do not call functions when jit is used │\n",
"│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │\n",
"│ 1504 │ │ backward_pre_hooks = [] │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/peft/peft_model.py:686 in forward │\n",
"│ │\n",
"│ 683 │ ): │\n",
"│ 684 │ │ peft_config = self.active_peft_config │\n",
"│ 685 │ │ if not isinstance(peft_config, PromptLearningConfig): │\n",
"│ ❱ 686 │ │ │ return self.base_model( │\n",
"│ 687 │ │ │ │ input_ids=input_ids, │\n",
"│ 688 │ │ │ │ attention_mask=attention_mask, │\n",
"│ 689 │ │ │ │ inputs_embeds=inputs_embeds, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501 in _call_impl │\n",
"│ │\n",
"│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
"│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
"│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
"│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │\n",
"│ 1502 │ │ # Do not call functions when jit is used │\n",
"│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │\n",
"│ 1504 │ │ backward_pre_hooks = [] │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165 in new_forward │\n",
"│ │\n",
"│ 162 │ │ │ with torch.no_grad(): │\n",
"│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 164 │ │ else: │\n",
"│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 166 │ │ return module._hf_hook.post_forward(module, output) │\n",
"│ 167 │ │\n",
"│ 168 │ module.forward = new_forward │\n",
"│ │\n",
"│ /root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-rw-1b/104655c0c067936f1ae2b4 │\n",
"│ 73625fe5161375591a/modelling_RW.py:753 in forward │\n",
"│ │\n",
"│ 750 │ │ │\n",
"│ 751 │ │ return_dict = return_dict if return_dict is not None else self.config.use_return │\n",
"│ 752 │ │ │\n",
"│ ❱ 753 │ │ transformer_outputs = self.transformer( │\n",
"│ 754 │ │ │ input_ids, │\n",
"│ 755 │ │ │ past_key_values=past_key_values, │\n",
"│ 756 │ │ │ attention_mask=attention_mask, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501 in _call_impl │\n",
"│ │\n",
"│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
"│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
"│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
"│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │\n",
"│ 1502 │ │ # Do not call functions when jit is used │\n",
"│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │\n",
"│ 1504 │ │ backward_pre_hooks = [] │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165 in new_forward │\n",
"│ │\n",
"│ 162 │ │ │ with torch.no_grad(): │\n",
"│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 164 │ │ else: │\n",
"│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 166 │ │ return module._hf_hook.post_forward(module, output) │\n",
"│ 167 │ │\n",
"│ 168 │ module.forward = new_forward │\n",
"│ │\n",
"│ /root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-rw-1b/104655c0c067936f1ae2b4 │\n",
"│ 73625fe5161375591a/modelling_RW.py:640 in forward │\n",
"│ │\n",
"│ 637 │ │ │ │ │ │\n",
"│ 638 │ │ │ │ │ return custom_forward │\n",
"│ 639 │ │ │ │ │\n",
"│ ❱ 640 │ │ │ │ outputs = torch.utils.checkpoint.checkpoint( │\n",
"│ 641 │ │ │ │ │ create_custom_forward(block), │\n",
"│ 642 │ │ │ │ │ hidden_states, │\n",
"│ 643 │ │ │ │ │ alibi, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:249 in checkpoint │\n",
"│ │\n",
"│ 246 │ │ raise ValueError(\"Unexpected keyword arguments: \" + \",\".join(arg for arg in kwar │\n",
"│ 247 │ │\n",
"│ 248 │ if use_reentrant: │\n",
"│ ❱ 249 │ │ return CheckpointFunction.apply(function, preserve, *args) │\n",
"│ 250 │ else: │\n",
"│ 251 │ │ return _checkpoint_without_reentrant( │\n",
"│ 252 │ │ │ function, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/autograd/function.py:506 in apply │\n",
"│ │\n",
"│ 503 │ │ if not torch._C._are_functorch_transforms_active(): │\n",
"│ 504 │ │ │ # See NOTE: [functorch vjp and autograd interaction] │\n",
"│ 505 │ │ │ args = _functorch.utils.unwrap_dead_wrappers(args) │\n",
"│ ❱ 506 │ │ │ return super().apply(*args, **kwargs) # type: ignore[misc] │\n",
"│ 507 │ │ │\n",
"│ 508 │ │ if cls.setup_context == _SingleLevelFunction.setup_context: │\n",
"│ 509 │ │ │ raise RuntimeError( │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/utils/checkpoint.py:107 in forward │\n",
"│ │\n",
"│ 104 │ │ ctx.save_for_backward(*tensor_inputs) │\n",
"│ 105 │ │ │\n",
"│ 106 │ │ with torch.no_grad(): │\n",
"│ ❱ 107 │ │ │ outputs = run_function(*args) │\n",
"│ 108 │ │ return outputs │\n",
"│ 109 │ │\n",
"│ 110 │ @staticmethod │\n",
"│ │\n",
"│ /root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-rw-1b/104655c0c067936f1ae2b4 │\n",
"│ 73625fe5161375591a/modelling_RW.py:636 in custom_forward │\n",
"│ │\n",
"│ 633 │ │ │ │ def create_custom_forward(module): │\n",
"│ 634 │ │ │ │ │ def custom_forward(*inputs): │\n",
"│ 635 │ │ │ │ │ │ # None for past_key_value │\n",
"│ ❱ 636 │ │ │ │ │ │ return module(*inputs, use_cache=use_cache, output_attentions=ou │\n",
"│ 637 │ │ │ │ │ │\n",
"│ 638 │ │ │ │ │ return custom_forward │\n",
"│ 639 │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501 in _call_impl │\n",
"│ │\n",
"│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
"│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
"│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
"│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │\n",
"│ 1502 │ │ # Do not call functions when jit is used │\n",
"│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │\n",
"│ 1504 │ │ backward_pre_hooks = [] │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165 in new_forward │\n",
"│ │\n",
"│ 162 │ │ │ with torch.no_grad(): │\n",
"│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 164 │ │ else: │\n",
"│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 166 │ │ return module._hf_hook.post_forward(module, output) │\n",
"│ 167 │ │\n",
"│ 168 │ module.forward = new_forward │\n",
"│ │\n",
"│ /root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-rw-1b/104655c0c067936f1ae2b4 │\n",
"│ 73625fe5161375591a/modelling_RW.py:385 in forward │\n",
"│ │\n",
"│ 382 │ │ residual = hidden_states │\n",
"│ 383 │ │ │\n",
"│ 384 │ │ # Self attention. │\n",
"│ ❱ 385 │ │ attn_outputs = self.self_attention( │\n",
"│ 386 │ │ │ layernorm_output, │\n",
"│ 387 │ │ │ layer_past=layer_past, │\n",
"│ 388 │ │ │ attention_mask=attention_mask, │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1501 in _call_impl │\n",
"│ │\n",
"│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │\n",
"│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │\n",
"│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │\n",
"│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │\n",
"│ 1502 │ │ # Do not call functions when jit is used │\n",
"│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │\n",
"│ 1504 │ │ backward_pre_hooks = [] │\n",
"│ │\n",
"│ /usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:165 in new_forward │\n",
"│ │\n",
"│ 162 │ │ │ with torch.no_grad(): │\n",
"│ 163 │ │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 164 │ │ else: │\n",
"│ ❱ 165 │ │ │ output = old_forward(*args, **kwargs) │\n",
"│ 166 │ │ return module._hf_hook.post_forward(module, output) │\n",
"│ 167 │ │\n",
"│ 168 │ module.forward = new_forward │\n",
"│ │\n",
"│ /root/.cache/huggingface/modules/transformers_modules/tiiuae/falcon-rw-1b/104655c0c067936f1ae2b4 │\n",
"│ 73625fe5161375591a/modelling_RW.py:306 in forward │\n",
"│ │\n",
"│ 303 │ │ │ │ attention_scores = attention_scores.to(torch.float32) │\n",
"│ 304 │ │ │ # attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.f │\n",
"│ 305 │ │ │ attention_probs = F.softmax( │\n",
"│ ❱ 306 │ │ │ │ (attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)) * sel │\n",
"│ 307 │ │ │ │ dim=-1, │\n",
"│ 308 │ │ │ │ dtype=hidden_states.dtype, │\n",
"│ 309 │ │ │ ) │\n",
"╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
"OutOfMemoryError: CUDA out of memory. Tried to allocate 2.74 GiB (GPU 0; 14.75 GiB total capacity; 8.16 GiB already\n",
"allocated; 2.63 GiB free; 11.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try \n",
"setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and \n",
"PYTORCH_CUDA_ALLOC_CONF\n",
"
\n"
]
},
"metadata": {}
}
]
},
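{
"cell_type": "markdown",
"source": [
"The run above failed with `OutOfMemoryError: CUDA out of memory` (about 2.6 GiB free of 14.75 GiB total, with roughly 11 GiB reserved by PyTorch). The error message itself suggests setting `max_split_size_mb` via `PYTORCH_CUDA_ALLOC_CONF` when reserved memory is much larger than allocated memory. The next cell is a minimal cleanup sketch and was not part of the original run: the `max_split_size_mb:128` value is an illustrative choice, and the allocator normally reads this variable when CUDA is first initialized, so it is safest to set it before any CUDA work (or restart the runtime)."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import gc\n",
"import os\n",
"\n",
"import torch\n",
"\n",
"# Hint from the OOM message: cap the size of free blocks the caching allocator\n",
"# keeps around, to reduce fragmentation. 128 MB is an illustrative value.\n",
"# Note: this setting is normally read when CUDA is first initialized, so set it\n",
"# before any CUDA work (or restart the runtime) for it to take effect.\n",
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"max_split_size_mb:128\"\n",
"\n",
"# Release Python-side garbage from the failed attempt and return cached blocks.\n",
"gc.collect()\n",
"torch.cuda.empty_cache()\n",
"\n",
"# Quick look at current GPU memory usage.\n",
"print(f\"allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB\")\n",
"print(f\"reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},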
{
"cell_type": "code",
"source": [
"trainer = transformers.Trainer(\n",
" model=model,\n",
" train_dataset=data[\"train\"],\n",
" args=transformers.TrainingArguments(\n",
" per_device_train_batch_size=1,\n",
" gradient_accumulation_steps=4,\n",
" warmup_steps=40,\n",
" max_steps=1000,\n",
" learning_rate=2e-4,\n",
" fp16=True,\n",
" save_strategy=\"steps\",\n",
" save_steps=100,\n",
" logging_steps=1,\n",
" output_dir=output_dir,\n",
" optim=\"paged_adamw_8bit\"\n",
" ),\n",
" data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
")\n",
"model.config.use_cache = False # silence the warnings. Please re-enable for inference!\n",
"trainer.train(resume_from_checkpoint=True)\n",
"model.save_pretrained(output_dir)"
],
"metadata": {
"id": "K_CcqVS424Gr"
},
"execution_count": null,
"outputs": []
},
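{
"cell_type": "markdown",
"source": [
"`use_cache` was disabled above only to avoid the warnings emitted while gradient checkpointing is active; for generation it should be switched back on. The cell below is a small inference sketch that was not part of the original run: it assumes the `model` and `tokenizer` objects from the earlier cells are still in scope, and the prompt and generation settings are illustrative."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"import torch\n",
"\n",
"# Re-enable the key/value cache that was turned off for training.\n",
"model.config.use_cache = True\n",
"model.eval()\n",
"\n",
"prompt = \"The tallest mountain in the world is\"  # illustrative prompt\n",
"inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"\n",
"with torch.no_grad():\n",
"    output_ids = model.generate(**inputs, max_new_tokens=64)\n",
"\n",
"print(tokenizer.decode(output_ids[0], skip_special_tokens=True))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},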
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "jY53rV1wcd7K"
},
"execution_count": null,
"outputs": []
}
]
}