{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "zJvyUmZdktu0",
"outputId": "25275dd8-4787-4b77-f7bf-4f2bbe66f0b8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/workspace/axolotl\n"
]
}
],
"source": [
"%cd /workspace/axolotl"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mC48y25Lkqa5",
"outputId": "2757a3c8-3790-4fd3-be39-03be8f533b35"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Setting ds_accelerator to cuda (auto detect)\n",
"accelerate configuration saved at /root/.cache/huggingface/accelerate/default_config.yaml\n"
]
}
],
"source": [
"!accelerate config --config_file configs/accelerate/default_config.yaml default"
]
},
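{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the generated accelerate config can be inspected at the path reported in the log above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!cat /root/.cache/huggingface/accelerate/default_config.yaml"
]
},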
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Based on https://gist.github.com/fearnworks/723709806cebc67bafe1eb8138e7efbd\n",
"base_model: openlm-research/open_llama_3b_600bt_preview\n",
"base_model_config: openlm-research/open_llama_3b_600bt_preview\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"push_dataset_to_hub:\n",
"datasets:\n",
" # - path: AtlasUnified/Code-Instruct-Sets\n",
" # data_files:\n",
" # - unmasked-set-1.jsonl\n",
" # - unmasked-set-2.jsonl\n",
" # - unmasked-set-3.jsonl\n",
" # - unmasked-set-4.jsonl\n",
" # type: alpaca_code_instruct\n",
" # - path: winglian/pygmalion-cleaned\n",
" # data_files:\n",
" # - v13_no_ai.cleaned.jsonl\n",
" # type: pygmalion\n",
" # shards: 4\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - hf/ARC-Challenge.jsonl\n",
" # - hf/ARC-Easy.jsonl\n",
" # - hf/riddle_sense.jsonl\n",
" # type: explainchoice:chat\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - hf/gsm8k.jsonl\n",
" # - custom/logic_inference_oa.jsonl\n",
" # type: alpaca_chat.load_qa\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - custom/in_context_qa.jsonl\n",
" # type: context_qa\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - custom/in_context_qa.jsonl\n",
" # type: context_qa.load_404\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - custom/jokes_explained_500up.jsonl\n",
" # type: sharegpt_jokes\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - custom/classify-self-chat.sharegpt.jsonl\n",
" # - custom/coding-self-chat.sharegpt.jsonl\n",
" # - custom/prose-gpt4.sharegpt.jsonl\n",
" # - custom/prose-rewrite-gpt4.sharegpt.jsonl\n",
" # type: sharegpt_simple\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - custom/guanaco-cleaned.en.jsonl\n",
" # type: sharegpt_simple.load_guanaco\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - openai/tldr.jsonl\n",
" # type: summarizetldr:chat\n",
" # - path: winglian/evals\n",
" # data_files:\n",
" # - hellaswag/hellaswag.jsonl\n",
" # type: explainchoice:chat\n",
" # shards: 60\n",
" # - path: metaeval/ScienceQA_text_only\n",
" # type: concisechoice:chat\n",
" # shards: 13\n",
" # - path: teknium/GPTeacher-General-Instruct\n",
" # data_files: \n",
" # - gpt4-instruct-similarity-0.6-dataset.json\n",
" # type: gpteacher:chat\n",
" - path: QingyiSi/Alpaca-CoT\n",
" data_files:\n",
" # - chain-of-thought/formatted_cot_data/aqua_train.jsonl\n",
" # - Chain-of-Thought/formatted_cot_data/creak_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/ecqa_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/esnli_train.json\n",
" - Chain-of-Thought/formatted_cot_data/gsm8k_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/qasc_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/qed_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/sensemaking_train.json\n",
" # - Chain-of-Thought/formatted_cot_data/strategyqa_train.json\n",
" # - GPTeacher/Roleplay/formatted_roleplay-similarity_0.6-instruct-dataset.json\n",
" type: \"alpaca:chat\"\n",
"dataset_prepared_path: last_run_prepared\n",
"val_set_size: 0.01\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"sequence_len: 2048\n",
"max_packed_sequence_len: 2048\n",
"lora_r: 64\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"wandb_project: openllama-7b-qlora-gsm8k\n",
"wandb_watch:\n",
"wandb_run_id:\n",
"wandb_log_model: checkpoint\n",
"output_dir: ./qlora-out\n",
"batch_size: 36\n",
"micro_batch_size: 9\n",
"num_epochs: 3\n",
"optimizer: paged_adamw_32bit\n",
"torchdistx_path:\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
"bf16: true\n",
"fp16: false\n",
"tf32: true\n",
"gradient_checkpointing: true\n",
"# stop training after this many evaluation losses have increased in a row\n",
"# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\n",
"early_stopping_patience: 3\n",
"resume_from_checkpoint:\n",
"auto_resume_from_checkpoints: true\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention: false\n",
"flash_attention:\n",
"gptq_groupsize:\n",
"gptq_model_v1:\n",
"warmup_steps: 10\n",
"eval_steps: 5\n",
"save_steps: 10\n",
"debug:\n",
"deepspeed:\n",
"weight_decay: 0.000001\n",
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
" bos_token: \"\"\n",
" eos_token: \"\"\n",
" unk_token: \"\"\n"
]
}
],
"source": [
"!cat examples/openllama/qlora.yml"
]
},
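{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The config above can also be tweaked programmatically before launching. A minimal sketch, assuming PyYAML is importable in this environment; the variant filename `qlora-custom.yml` is hypothetical:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Load the example config shown above, adjust a couple of hyperparameters,\n",
"# and write a variant (hypothetical filename) to pass to finetune.py instead.\n",
"with open('examples/openllama/qlora.yml') as f:\n",
"    cfg = yaml.safe_load(f)\n",
"\n",
"cfg['learning_rate'] = 1e-4  # half of the example's 0.0002\n",
"cfg['num_epochs'] = 1        # quick smoke-test run\n",
"\n",
"with open('examples/openllama/qlora-custom.yml', 'w') as f:\n",
"    yaml.safe_dump(cfg, f, sort_keys=False)"
]
},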
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "jICMPJuomFsx"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Setting ds_accelerator to cuda (auto detect)\n",
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please run\n",
"\n",
"python -m bitsandbytes\n",
"\n",
" and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n",
"bin /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}\n",
" warn(msg)\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
" warn(msg)\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
" warn(msg)\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIK1tFOFrWbmoa2ckCJYhzgBHKTSMeR/AeuScCCzugqlI utensilcandel@gmail.com')}\n",
" warn(msg)\n",
"CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: Found duplicate ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] files: {PosixPath('/usr/local/cuda/lib64/libcudart.so'), PosixPath('/usr/local/cuda/lib64/libcudart.so.11.0')}.. We'll flip a coin and try one of these, in order to fail forward.\n",
"Either way, this might cause trouble in the future:\n",
"If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.\n",
" warn(msg)\n",
"CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
"CUDA SETUP: Detected CUDA version 118\n",
"CUDA SETUP: Loading binary /root/miniconda3/envs/py3.9/lib/python3.9/site-packages/bitsandbytes-0.39.0-py3.9.egg/bitsandbytes/libbitsandbytes_cuda118.so...\n",
"Setting ds_accelerator to cuda (auto detect)\n",
"INFO:root:loading tokenizer...\n",
"Using pad_token, but it is not set yet.\n",
"INFO:root:Loading prepared packed dataset from disk at last_run_prepared/21a0611c6c2b67b31f00097fa2a91c26...\n",
"INFO:root:Prepared packed dataset loaded from disk...\n",
"INFO:root:loading model and peft_config...\n",
"INFO:root:converting PEFT model w/ prepare_model_for_int8_training\n",
"/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/peft/utils/other.py:76: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n",
" warnings.warn(\n",
"INFO:root:found linear modules: ['k_proj', 'gate_proj', 'q_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj']\n",
"trainable params: 101703680 || all params: 1917425280 || trainable%: 5.304179571472011\n",
"INFO:root:Compiling torch model\n",
"INFO:root:Pre-saving adapter config to ./qlora-out\n",
"INFO:root:Starting trainer...\n",
"INFO:root:Using Auto-resume functionality to start with checkpoint at qlora-out/checkpoint-40\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mutensil\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.3\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/workspace/axolotl/wandb/run-20230531_043745-ggfx5q40\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mpeach-feather-14\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/utensil/openllama-7b-qlora-gsm8k\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/utensil/openllama-7b-qlora-gsm8k/runs/ggfx5q40\u001b[0m\n",
"{'loss': 0.7336, 'learning_rate': 0.0001, 'epoch': 1.71} \n",
"{'loss': 0.7318, 'learning_rate': 9.493508311612874e-05, 'epoch': 1.75} \n",
"{'loss': 0.7294, 'learning_rate': 8.98831678012568e-05, 'epoch': 1.79} \n",
"{'loss': 0.7361, 'learning_rate': 8.485722224954237e-05, 'epoch': 1.83} \n",
"{'loss': 0.692, 'learning_rate': 7.987014799113397e-05, 'epoch': 1.88} \n",
" 62%|██████████████████████████▉ | 45/72 [04:57<06:00, 13.33s/it]\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.7622343897819519, 'eval_runtime': 4.0149, 'eval_samples_per_second': 1.993, 'eval_steps_per_second': 0.249, 'epoch': 1.88}\n",
" 62%|██████████████████████████▉ | 45/72 [05:01<06:00, 13.33s/it]\n",
"100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 7.40it/s]\u001b[A\n",
"{'loss': 0.7289, 'learning_rate': 7.493474677412794e-05, 'epoch': 1.92} \u001b[A\n",
"{'loss': 0.7027, 'learning_rate': 7.006368770266421e-05, 'epoch': 1.96} \n",
"{'loss': 0.7396, 'learning_rate': 6.526947471551798e-05, 'epoch': 2.0} \n",
" 67%|████████████████████████████▋ | 48/72 [07:49<11:07, 27.80s/it]"
]
}
],
"source": [
"!accelerate launch scripts/finetune.py examples/openllama/qlora.yml"
]
},
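{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"With `auto_resume_from_checkpoints: true` and `save_steps: 10`, checkpoints accumulate in `./qlora-out` (the run above resumed from `checkpoint-40`); listing the directory shows what a restart would pick up from:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!ls qlora-out"
]
},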
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Below are ad hoc cells handling issues during training"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading package lists... Done\n",
"Building dependency tree... Done\n",
"Reading state information... Done\n",
"The following NEW packages will be installed:\n",
" lsof\n",
"0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.\n",
"Need to get 253 kB of archives.\n",
"After this operation, 458 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 lsof amd64 4.93.2+dfsg-1.1build2 [253 kB]\n",
"Fetched 253 kB in 1s (364 kB/s)0m\u001b[33m\n",
"debconf: delaying package configuration, since apt-utils is not installed\n",
"\n",
"\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package lsof.\n",
"(Reading database ... 21634 files and directories currently installed.)\n",
"Preparing to unpack .../lsof_4.93.2+dfsg-1.1build2_amd64.deb ...\n",
"\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking lsof (4.93.2+dfsg-1.1build2) ...\n",
"\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up lsof (4.93.2+dfsg-1.1build2) ...\n",
"\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8\n",
"\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
]
}
],
"source": [
"!apt install lsof"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME\n",
"docker-in 1 root 0u CHR 1,3 0t0 6 /dev/null\n",
"bash 7 root 0u CHR 1,3 0t0 6 /dev/null\n",
"sshd 19 root 0u CHR 1,3 0t0 6 /dev/null\n",
"sshd 19 root 1u CHR 1,3 0t0 6 /dev/null\n",
"sshd 19 root 2u CHR 1,3 0t0 6 /dev/null\n",
"jupyter-l 2308 root 0r CHR 1,3 0t0 6 /dev/null\n",
"jupyter-l 2308 root 12r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 2541 root 4r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 2947 root mem CHR 195,255 472 /dev/nvidiactl\n",
"python3 2947 root mem CHR 195,0 473 /dev/nvidia0\n",
"python3 2947 root mem CHR 234,0 481 /dev/nvidia-uvm\n",
"python3 2947 root 3r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 2947 root 132u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 2947 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm\n",
"python3 2947 root 134u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 135u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 136u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 139u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 2947 root 140u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 141u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 142u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 145u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 147u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 148u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 149u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 151u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 152u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 153u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 2947 root 154u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 3545 root 4r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 4493 root mem CHR 195,255 472 /dev/nvidiactl\n",
"python3 4493 root mem CHR 195,0 473 /dev/nvidia0\n",
"python3 4493 root mem CHR 234,0 481 /dev/nvidia-uvm\n",
"python3 4493 root 3r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 4493 root 132u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 4493 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm\n",
"python3 4493 root 134u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 135u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 136u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 139u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 4493 root 140u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 141u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 142u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 145u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 146u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 147u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 148u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 150u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 151u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 152u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 4493 root 153u CHR 195,0 0t0 473 /dev/nvidia0\n",
"sh 4950 root 10u CHR 5,0 0t0 13 /dev/tty\n",
"python3 5051 root mem CHR 195,255 472 /dev/nvidiactl\n",
"python3 5051 root mem CHR 195,0 473 /dev/nvidia0\n",
"python3 5051 root mem CHR 234,0 481 /dev/nvidia-uvm\n",
"python3 5051 root 3r CHR 1,9 0t0 11 /dev/urandom\n",
"python3 5051 root 132u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 5051 root 133u CHR 234,0 0t0 481 /dev/nvidia-uvm\n",
"python3 5051 root 134u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 135u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 136u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 139u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"python3 5051 root 140u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 141u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 142u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 145u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 146u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 147u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 148u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 150u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 151u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 152u CHR 195,0 0t0 473 /dev/nvidia0\n",
"python3 5051 root 153u CHR 195,0 0t0 473 /dev/nvidia0\n",
"tmux:\\x20 5801 root 0u CHR 1,3 0t0 6 /dev/null\n",
"tmux:\\x20 5801 root 1u CHR 1,3 0t0 6 /dev/null\n",
"tmux:\\x20 5801 root 2u CHR 1,3 0t0 6 /dev/null\n",
"nvitop 5817 root 3u CHR 195,255 0t0 472 /dev/nvidiactl\n",
"nvitop 5817 root 4u CHR 195,0 0t0 473 /dev/nvidia0\n",
"nvitop 5817 root 5u CHR 195,0 0t0 473 /dev/nvidia0\n",
"nvitop 5817 root 6u CHR 195,0 0t0 473 /dev/nvidia0\n"
]
}
],
"source": [
"!lsof /dev/nvidia*"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"!ps aux|grep python|grep finetune|awk '{print $2}'|xargs kill -9"
]
},
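{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"An equivalent, shorter form, assuming `pkill` (procps) is available in the image, matches on the full command line:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pkill -9 -f finetune.py"
]
},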
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"!kill -9 2960 "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root 2353 0.7 0.0 576260 110108 ? Sl 12:51 0:05 /root/miniconda3/envs/py3.9/bin/python3 /root/miniconda3/envs/py3.9/bin/jupyter-lab --allow-root --no-browser --port=8888 --ip=* --ServerApp.token=sc --ServerApp.allow_origin=* --ServerApp.preferred_dir=/workspace/\n",
"root 2636 1.6 0.0 770824 63020 ? Ssl 12:52 0:12 /root/miniconda3/envs/py3.9/bin/python3 -m ipykernel_launcher -f /root/.local/share/jupyter/runtime/kernel-b2638c7c-467b-4866-a969-c97f1b037796.json\n",
"root 3776 3.5 0.0 316080 90152 pts/2 Sl+ 12:55 0:19 /root/miniconda3/envs/py3.9/bin/python3 /root/miniconda3/envs/py3.9/bin/nvitop -m full\n",
"root 5019 0.0 0.0 2880 952 pts/3 Ss+ 13:04 0:00 /usr/bin/sh -c ps aux|grep python\n",
"root 5022 0.0 0.0 3836 1968 pts/3 S+ 13:04 0:00 grep python\n"
]
}
],
"source": [
"!ps aux|grep python"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pwd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt install zip\n",
"!zip -r last_run_prepared.zip -xi last_run_prepared"
]
},
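{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"To reuse the prepared dataset cache on another machine, unzip it back into the working directory (a sketch; assumes the archive was created as above and installs `unzip` if missing):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt install unzip\n",
"!unzip -o last_run_prepared.zip"
]
},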
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"!pip install nvitop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!nvitop -m full"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"authorship_tag": "ABX9TyP0jHt4WNuLaC5ecmX0YtWl",
"gpuType": "T4",
"include_colab_link": true,
"provenance": [],
"toc_visible": true
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 4
}