{ "cells": [ { "cell_type": "markdown", "id": "7845a5bd-a65f-46b4-a775-6c2a5b842cc8", "metadata": { "tags": [] }, "source": [ "# Model Cards\n", "\n", "### This notebook showcases 5 LLMs with varying licenses. To run all models, follow these [GPU Configurations](https://github.com/jackfrost1411/HUST23-SC23-LLMs/tree/master/gpu-config).\n", "\n", "#### The layout is as follows:\n", "\n", "* **Initial set up (Important Environment Variable Discussion)**\n", "* **Falcon 7b/ 13b/ 40b Instruct (Apache 2.0 License)**\n", "* **LLaMa2 7b/ 13b/ 70b (Meta License)**\n", "* **C4AI Command R 35b (C4AI License)**\n", "* **Mistral 7b, Mixtral 8x7b (Apache)**\n", "* **Llama3 8b/70b (Llama3 License)**" ] }, { "cell_type": "markdown", "id": "238b0163-a438-4fbc-a244-914dedecd311", "metadata": { "tags": [] }, "source": [ "## Setting the path to institute's centralized database containing the models" ] }, { "cell_type": "code", "execution_count": 1, "id": "18815d70-04f5-439b-8a54-c281f3f141b1", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "# Two directories need to be specified. The HUGGINGFACE_HUB_CACHE is where the intermediate data from the running model are stored\n", "# which needs to be writeable by the user\n", "# the other path is where are the models, this is called LLM_CACHE_PATH and is set by the module (and used in the cells that load the model)\n", "os.environ['HUGGINGFACE_HUB_CACHE'] = f\"{os.environ['HOME']}/llm/cache\"" ] }, { "cell_type": "code", "execution_count": 2, "id": "c9357c1b-cf44-4bdc-a13f-6ba0c7906728", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking the number of GPUs requested\n", "import torch\n", "torch.cuda.device_count()" ] }, { "cell_type": "markdown", "id": "4c26610f-e51a-4acd-af64-f7b7d3c902fc", "metadata": { "id": "WXOZ_Un6e1oo", "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Falcon 40b\n", "#### It is made available under the Apache 2.0 license.\n", "##### For running 7b model change the model_id to \"tiiuae/falcon-7b\"\n", "##### https://huggingface.co/tiiuae/falcon-40b" ] }, { "cell_type": "code", "execution_count": 3, "id": "d458a6dd-8496-42d8-b7fa-347f500e67bf", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "WARNING: You are currently loading Falcon using legacy code contained in the model repository. Falcon has now been fully ported into the Hugging Face transformers library. For the most up-to-date and high-performance version of the Falcon model code, please update to the latest version of transformers and then load the model without the trust_remote_code=True argument.\n", "\n", "The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. 
Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "71eb9804833f42568ccd08c72aee474a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/9 [00:00 2\u001b[0m \u001b[43mchatbot\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_chat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[6], line 35\u001b[0m, in \u001b[0;36mChatBot.start_chat\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m user_response \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 34\u001b[0m user_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mChat here!\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 35\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[43muser_response\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[6], line 41\u001b[0m, in \u001b[0;36mChatBot.chat\u001b[0;34m(self, reply)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmake_exit(reply):\n\u001b[1;32m 40\u001b[0m input_ \u001b[38;5;241m=\u001b[39m reply\n\u001b[0;32m---> 41\u001b[0m reply \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43minput\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mconversation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;250;43m \u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;250;43m \u001b[39;49m\u001b[43minput_\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;130;43;01m\\n\u001b[39;49;00m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py:1282\u001b[0m, in \u001b[0;36mKernel.raw_input\u001b[0;34m(self, prompt)\u001b[0m\n\u001b[1;32m 1280\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mraw_input was called, but this frontend does not support input requests.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m StdinNotImplementedError(msg)\n\u001b[0;32m-> 1282\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_input_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1283\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mstr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mprompt\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1284\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_parent_ident\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1285\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_parent\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshell\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1286\u001b[0m \u001b[43m \u001b[49m\u001b[43mpassword\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1287\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py:1325\u001b[0m, in \u001b[0;36mKernel._input_request\u001b[0;34m(self, prompt, ident, parent, password)\u001b[0m\n\u001b[1;32m 1322\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m 1323\u001b[0m \u001b[38;5;66;03m# re-raise KeyboardInterrupt, to truncate traceback\u001b[39;00m\n\u001b[1;32m 1324\u001b[0m msg \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInterrupted by user\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1325\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m(msg) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1327\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mwarning(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInvalid Message:\u001b[39m\u001b[38;5;124m\"\u001b[39m, exc_info\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", "\u001b[0;31mKeyboardInterrupt\u001b[0m: Interrupted by user" ] } ], "source": [ "chatbot = ChatBot()\n", "chatbot.start_chat()" ] }, { "cell_type": "markdown", "id": "a4d163c4-593a-4b02-9418-de76c18dbdd7", "metadata": { "tags": [] }, "source": [ "## LLaMA2 7b Chat\n", "#### License: A custom commercial license is available at: https://ai.meta.com/resources/models-and-libraries/llama-downloads/\n", "##### To run the 70b / 13b models, change the model_id to `$LLM_CACHE_PATH/meta-llama/Llama-2-70b-chat-hf/` / `$LLM_CACHE_PATH/meta-llama/Llama-2-13b-chat-hf/`\n", "##### https://huggingface.co/meta-llama/Llama-2-70b-chat-hf" ] },
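{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e03", "metadata": { "tags": [] }, "source": [ "Here the weights come from the centralized cache rather than the Hub. A minimal sketch of loading the 7b chat model in 4 bits; the exact path and the quantization settings are assumptions based on the note above:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e04", "metadata": {}, "outputs": [], "source": [ "import os\n", "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n", "\n", "# Assumed layout: follows the $LLM_CACHE_PATH pattern described in the note above\n", "model_id = f\"{os.environ['LLM_CACHE_PATH']}/meta-llama/Llama-2-7b-chat-hf/\"\n", "\n", "bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = AutoModelForCausalLM.from_pretrained(\n", "    model_id,\n", "    quantization_config=bnb_config,  # replaces the deprecated load_in_4bit flag\n", "    device_map=\"auto\",\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "id": "51c27cb7-f7ca-418c-b81a-791a03acbf8c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/uufs/chpc.utah.edu/common/home/u0101881/llm/HUST23-SC23-LLMs/miniconda3/envs/genai/lib/python3.12/site-packages/transformers/models/auto/configuration_auto.py:1123: FutureWarning: The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.\n", " warnings.warn(\n", "The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "10d2ea5565fc49e0beceb2602d1adb2e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/2 [00:00>\n", "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. 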
Your answers should answer the question only once and should not include any text after the answer is done.\n", "\n", "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n", "<</SYS>>\n", "\n", "CONTEXT: \n", "\n", "{history}\n", "\n", "Question: {input}[/INST]'''\n", "\n", "class ChatBot:\n", " exit_commands = (\"quit\", \"pause\", \"exit\", \"goodbye\", \"bye\", \"later\", \"stop\")\n", "\n", " # Start the conversation: prompt until the user types a non-empty message\n", " def start_chat(self):\n", " user_response = input(\"Chat here!\\n\")\n", " while user_response == '':\n", " user_response = input(\"Chat here!\\n\")\n", " self.chat(user_response)\n", "\n", " # Main loop: send the message to the chain; input() both prints the model's reply and reads the next message\n", " def chat(self, reply):\n", " while not self.make_exit(reply):\n", " input_ = reply\n", " reply = input(f\"{conversation.predict(input = input_)}\\n\")\n", "\n", " # Check for exit keywords; on exit, clear the conversation memory and free cached GPU memory\n", " def make_exit(self, reply):\n", " for exit_command in self.exit_commands:\n", " if exit_command in reply.lower():\n", " memory.clear()\n", " torch.cuda.empty_cache()\n", " print(\"Ok, have a great day!\")\n", " return True\n", " return False" ] }, { "cell_type": "code", "execution_count": null, "id": "f6496318-1cfe-4d67-8753-8e80cca7fa1b", "metadata": {}, "outputs": [], "source": [ "chatbot = ChatBot()\n", "chatbot.start_chat()" ] }, { "cell_type": "markdown", "id": "f49ca1c6-1f6a-4808-b2a8-166cc7c3aaf3", "metadata": { "tags": [] }, "source": [ "## C4AI Command-R\n", "#### License: Creative Commons Attribution-NonCommercial 4.0 International Public License with Acceptable Use Addendum\n", "#### https://cohere.com/c4ai-cc-by-nc-license\n", "##### C4AI Command-R is a research release of a highly performant 35-billion-parameter generative model. Command-R is a large language model with open weights optimized for a variety of use cases including reasoning, summarization, and question answering. Command-R has the capability for multilingual generation evaluated in 10 languages and highly performant RAG capabilities.\n", "##### https://huggingface.co/CohereForAI/c4ai-command-r-v01-4bit" ] },
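{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e05", "metadata": { "tags": [] }, "source": [ "The repository above ships pre-quantized 4-bit weights, so no quantization config is needed at load time. A minimal sketch of the load; the device placement is an assumption:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e06", "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_id = \"CohereForAI/c4ai-command-r-v01-4bit\"  # pre-quantized 4-bit weights\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "# The checkpoint carries its own quantization config, so none is passed here\n", "model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"auto\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "13a7e5e7-f0e6-4a4f-9d3b-557ba3c6737d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "`low_cpu_mem_usage` was None, now set to True since model is quantized.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f9c2429931cb4c62a099259e6a265c1a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/5 [00:00<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hello! As an AI chatbot, I don't experience emotions or feelings, but I'm functioning properly and ready to assist you in any way I can. 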
How can I help you today?<|END_OF_TURN_TOKEN|>\n" ] } ], "source": [ "# Format the message with the Command-R chat template\n", "messages = [{\"role\": \"user\", \"content\": \"Hello, how are you?\"}]\n", "input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors=\"pt\")\n", "## <|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>\n", "\n", "# Sample up to 100 new tokens; the low temperature keeps the reply focused\n", "gen_tokens = model.generate(\n", " input_ids, \n", " max_new_tokens=100, \n", " do_sample=True, \n", " temperature=0.3,\n", " )\n", "\n", "gen_text = tokenizer.decode(gen_tokens[0])\n", "print(gen_text)" ] },
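{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e07", "metadata": { "tags": [] }, "source": [ "`generate` returns the prompt tokens followed by the completion, which is why the printed text above repeats the user turn. A minimal sketch of decoding only the newly generated part by slicing off the prompt length:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e08", "metadata": {}, "outputs": [], "source": [ "# Keep only the tokens generated after the prompt\n", "new_tokens = gen_tokens[0][input_ids.shape[-1]:]\n", "print(tokenizer.decode(new_tokens, skip_special_tokens=True))" ] }, { "cell_type": "code", "execution_count": 12, "id": "30b05f53-ca63-4ff7-aea3-8844b8eb9819", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "from langchain.llms import HuggingFacePipeline\n", "\n", "pipe = pipeline(\n", " \"text-generation\",\n", " model=model,\n", " tokenizer=tokenizer,\n", " # torch_dtype=torch.bfloat16,\n", " device_map=\"auto\",\n", " pad_token_id=tokenizer.eos_token_id, \n", " max_length=2048,\n", " # temperature=1,\n", " # top_p=0.95,\n", " # repetition_penalty=1.15\n", ")\n", "\n", "local_llm = HuggingFacePipeline(pipeline=pipe)" ] }, { "cell_type": "code", "execution_count": 13, "id": "cbd4f61a-aa95-41be-a8dd-4994da9f4e85", "metadata": {}, "outputs": [], "source": [ "from langchain.chains import ConversationChain\n", "from langchain.chains.conversation.memory import ConversationSummaryBufferMemory#Summary\n", "\n", "memory = ConversationSummaryBufferMemory(llm=local_llm, max_token_limit=512)\n", "memory.save_context({\"input\": \"Hello\"}, {\"output\": \"What's up\"})\n", "conversation = ConversationChain(\n", " llm=local_llm, \n", " memory = memory,\n", " verbose=False\n", ")\n", "\n", "conversation.prompt.template='''### System:\n", "You are Command-R, an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal. Below is the current conversation history and an input that provides further context. 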
Write a response that appropriately completes the request.\n", "\n", "### Current conversation:\n", "{history}\n", "\n", "### User:\n", "{input}\n", "\n", "### Assistant:'''\n", "\n", "class ChatBot:\n", " exit_commands = (\"quit\", \"pause\", \"exit\", \"goodbye\", \"bye\", \"later\", \"stop\")\n", "\n", " # Start the conversation: prompt until the user types a non-empty message\n", " def start_chat(self):\n", " user_response = input(\"Chat here!\\n\")\n", " while user_response == '':\n", " user_response = input(\"Chat here!\\n\")\n", " self.chat(user_response)\n", "\n", " # Main loop: send the message to the chain; input() both prints the model's reply and reads the next message\n", " def chat(self, reply):\n", " while not self.make_exit(reply):\n", " input_ = reply\n", " reply = input(f\"{conversation.predict(input = input_)}\\n\")\n", "\n", " # Check for exit keywords; on exit, clear the conversation memory and free cached GPU memory\n", " def make_exit(self, reply):\n", " for exit_command in self.exit_commands:\n", " if exit_command in reply.lower():\n", " memory.clear()\n", " torch.cuda.empty_cache()\n", " print(\"Ok, have a great day!\")\n", " return True\n", " return False" ] }, { "cell_type": "code", "execution_count": null, "id": "8c9f8060-c88d-4e8f-9fb1-f5963f85c73d", "metadata": {}, "outputs": [], "source": [ "chatbot = ChatBot()\n", "chatbot.start_chat()" ] }, { "cell_type": "markdown", "id": "172fcac3-ad33-4138-95a3-9e85f2b4b52b", "metadata": { "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ "## Mistral-7B-Instruct\n", "#### License: Apache 2.0 \n", "#### The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an instruct fine-tuned version of Mistral-7B-v0.2.\n", "##### To run the Mixtral 8x7b model, change the model_id to \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n", "##### https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2" ] },
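{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e09", "metadata": { "tags": [] }, "source": [ "A minimal sketch of loading the instruct model; the dtype and device placement are assumptions, and the Mixtral id can be substituted directly:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e10", "metadata": {}, "outputs": [], "source": [ "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_id = \"mistralai/Mistral-7B-Instruct-v0.2\"  # or \"mistralai/Mixtral-8x7B-Instruct-v0.1\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = AutoModelForCausalLM.from_pretrained(\n", "    model_id,\n", "    torch_dtype=torch.float16,  # assumption; bfloat16 also works on recent GPUs\n", "    device_map=\"auto\",\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "id": "09a5a1d2-0915-41e0-91f7-be9c899be404", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f648fbd86ee54214b7708481062ca72c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards: 0%| | 0/3 [00:00 [INST] What is your favourite condiment? [/INST]Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] Certainly! Here's a classic and simple mayonnaise recipe that you can make at home.\n", "\n", "**Homemade Mayonnaise**\n", "\n", "* 1 egg yolk\n", "* 1 tbsp Dijon mustard\n", "* 1 tbsp white wine vinegar\n", "* 1/2 cup neutral oil (like vegetable or canola oil)\n", "* 1/2 cup light olive oil or avocado oil\n", "* 1-2 tbsp water\n", "* Salt to taste\n", "\n", "Whisk together the egg yolk, mustard, and vinegar in a medium bowl until the mixture thickens slightly and turns pale yellow. With the whisk continually moving, add the neutral oil in a thin, steady stream until it is completely incorporated and the mixture has thickened to a mayonnaise consistency. Then, slowly drizzle in the olive or avocado oil, continuing to whisk continuously. Once all the oil has been added, whisk in the water, a tablespoon at a time, until the desired consistency is reached. Season with salt to taste.\n", "\n", "If you're concerned about using raw eggs, you can use a method called \"cooked mayonnaise\" instead. The classic French version, called \"AIoli,\" cooks the garlic with the egg yolk before adding the oil. 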
Here's a simple recipe:\n", "\n", "**Garlic Aioli**\n", "\n", "* 2 garlic cloves, minced\n", "* 1 egg yolk\n", "* 1/2 cup neutral oil (like vegetable or canola oil)\n", "* 1/2 cup olive oil\n", "* 1 tbsp white wine vinegar\n", "* Salt to taste\n", "\n", "Sauté the garlic in a small saucepan over medium heat until lightly golden brown. Remove from heat and let cool. In a blender, combine the garlic and egg yolk and process until the yolk turns pale yellow. With the blender running, slowly drizzle in the neutral oil in a thin, steady stream until thickened. Then, slowly drizzle in the olive oil. Once all the oil has been added, add the vinegar and season with salt. Process until smooth.\n", "\n", "Both of these homemade mayonnaise recipes can be stored in an airtight container in the refrigerator for up to 1 week. Enjoy your homemade mayonnaise as a sandwich spread, use it as a base for other sauces, or serve it alongside grilled vegetables and meats.\n" ] } ], "source": [ "device = \"cuda\" # the device to load the model onto\n", "\n", "messages = [\n", " {\"role\": \"user\", \"content\": \"What is your favourite condiment?\"},\n", " {\"role\": \"assistant\", \"content\": \"Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!\"},\n", " {\"role\": \"user\", \"content\": \"Do you have mayonnaise recipes?\"}\n", "]\n", "\n", "# Encode the multi-turn history with Mistral's [INST] chat template\n", "encodeds = tokenizer.apply_chat_template(messages, return_tensors=\"pt\")\n", "\n", "model_inputs = encodeds.to(device)\n", "model.to(device)\n", "\n", "# generated_ids holds the prompt tokens followed by up to 1000 sampled tokens\n", "generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)\n", "decoded = tokenizer.batch_decode(generated_ids)\n", "print(decoded[0])" ] },
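{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e11", "metadata": { "tags": [] }, "source": [ "The next two cells repeat the pipeline, `HuggingFacePipeline`, memory, and `ConversationChain` plumbing already used for the previous models. As a sketch, the pattern could be factored into a reusable helper; the `build_conversation` name is hypothetical and not part of this notebook:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e12", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "from langchain.llms import HuggingFacePipeline\n", "from langchain.chains import ConversationChain\n", "from langchain.chains.conversation.memory import ConversationSummaryBufferMemory\n", "\n", "def build_conversation(model, tokenizer, template, max_length=2048, max_token_limit=512):\n", "    # Wrap a loaded model/tokenizer in a ConversationChain with summary-buffer memory\n", "    pipe = pipeline(\n", "        \"text-generation\",\n", "        model=model,\n", "        tokenizer=tokenizer,\n", "        device_map=\"auto\",\n", "        pad_token_id=tokenizer.eos_token_id,\n", "        max_length=max_length,\n", "    )\n", "    llm = HuggingFacePipeline(pipeline=pipe)\n", "    memory = ConversationSummaryBufferMemory(llm=llm, max_token_limit=max_token_limit)\n", "    conversation = ConversationChain(llm=llm, memory=memory, verbose=False)\n", "    conversation.prompt.template = template\n", "    return conversation, memory" ] }, { "cell_type": "code", "execution_count": 17, "id": "a50f18f6-c347-4ee6-bc67-7c0b6d4930f3", "metadata": {}, "outputs": [], "source": [ "from transformers import pipeline\n", "from langchain.llms import HuggingFacePipeline\n", "\n", "pipe = pipeline(\n", " \"text-generation\",\n", " model=model,\n", " tokenizer=tokenizer,\n", " # torch_dtype=torch.bfloat16,\n", " device_map=\"auto\",\n", " pad_token_id=tokenizer.eos_token_id, \n", " max_length=2048,\n", " # temperature=1,\n", " # top_p=0.95,\n", " # repetition_penalty=1.15\n", ")\n", "\n", "local_llm = HuggingFacePipeline(pipeline=pipe)" ] }, { "cell_type": "code", "execution_count": 18, "id": "83dc09dd-43f3-4865-a83a-73c6b2eb0225", "metadata": {}, "outputs": [], "source": [ "from langchain.chains import ConversationChain\n", "from langchain.chains.conversation.memory import ConversationSummaryBufferMemory#Summary\n", "\n", "memory = ConversationSummaryBufferMemory(llm=local_llm, max_token_limit=512)\n", "memory.save_context({\"input\": \"Hello\"}, {\"output\": \"What's up\"})\n", "conversation = ConversationChain(\n", " llm=local_llm, \n", " memory = memory,\n", " verbose=False\n", ")\n", "\n", "conversation.prompt.template='''### System:\n", "You are an AI that follows instructions extremely well. Help as much as you can. Remember, be safe, and don't do anything illegal. Below is the current conversation history and an input that provides further context. 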
Write a response that appropriately completes the request.\n", "\n", "### Current conversation:\n", "{history}\n", "\n", "### User:\n", "{input}\n", "\n", "### Assistant:'''\n", "\n", "class ChatBot:\n", " exit_commands = (\"quit\", \"pause\", \"exit\", \"goodbye\", \"bye\", \"later\", \"stop\")\n", "\n", " # Start the conversation: prompt until the user types a non-empty message\n", " def start_chat(self):\n", " user_response = input(\"Chat here!\\n\")\n", " while user_response == '':\n", " user_response = input(\"Chat here!\\n\")\n", " self.chat(user_response)\n", "\n", " # Main loop: send the message to the chain; input() both prints the model's reply and reads the next message\n", " def chat(self, reply):\n", " while not self.make_exit(reply):\n", " input_ = reply\n", " reply = input(f\"{conversation.predict(input = input_)}\\n\")\n", "\n", " # Check for exit keywords; on exit, clear the conversation memory and free cached GPU memory\n", " def make_exit(self, reply):\n", " for exit_command in self.exit_commands:\n", " if exit_command in reply.lower():\n", " memory.clear()\n", " torch.cuda.empty_cache()\n", " print(\"Ok, have a great day!\")\n", " return True\n", " return False" ] },
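{ "cell_type": "markdown", "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e13", "metadata": { "tags": [] }, "source": [ "The chat below starts from the memory seeded two cells earlier. To see what the summary-buffer memory will inject into the prompt's `{history}` slot (turns beyond `max_token_limit=512` tokens get condensed into an LLM-written summary), a minimal check:" ] }, { "cell_type": "code", "execution_count": null, "id": "9b1c2d3e-4f5a-4b6c-8d7e-0a1b2c3d4e14", "metadata": {}, "outputs": [], "source": [ "# Inspect the buffered/summarized history that will fill {history}\n", "memory.load_memory_variables({})" ] }, { "cell_type": "code", "execution_count": 20, "id": "744ca4e8-f3cb-423a-a031-7bba760fbcc2", "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ "Chat here!\n", " What's the capital of France?\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[20], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m chatbot \u001b[38;5;241m=\u001b[39m ChatBot()\n\u001b[0;32m----> 2\u001b[0m \u001b[43mchatbot\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_chat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[18], line 31\u001b[0m, in \u001b[0;36mChatBot.start_chat\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m user_response \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m'\u001b[39m:\n\u001b[1;32m 30\u001b[0m user_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mChat here!\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 31\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[43muser_response\u001b[49m\u001b[43m)\u001b[49m\n", "Cell \u001b[0;32mIn[18], line 37\u001b[0m, in \u001b[0;36mChatBot.chat\u001b[0;34m(self, reply)\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmake_exit(reply):\n\u001b[1;32m 36\u001b[0m input_ \u001b[38;5;241m=\u001b[39m reply\n\u001b[0;32m---> 37\u001b[0m reply \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mconversation\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[38;5;250;43m \u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;250;43m \u001b[39;49m\u001b[43minput_\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/llm.py:293\u001b[0m, in 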
\u001b[0;36mLLMChain.predict\u001b[0;34m(self, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mpredict\u001b[39m(\u001b[38;5;28mself\u001b[39m, callbacks: Callbacks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 279\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Format prompt with kwargs and pass to LLM.\u001b[39;00m\n\u001b[1;32m 280\u001b[0m \n\u001b[1;32m 281\u001b[0m \u001b[38;5;124;03m Args:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;124;03m completion = llm.predict(adjective=\"funny\")\u001b[39;00m\n\u001b[1;32m 292\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_key]\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_core/_api/deprecation.py:148\u001b[0m, in \u001b[0;36mdeprecated..deprecate..warning_emitting_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 146\u001b[0m warned \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 147\u001b[0m emit_warning()\n\u001b[0;32m--> 148\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mwrapped\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/base.py:378\u001b[0m, in \u001b[0;36mChain.__call__\u001b[0;34m(self, inputs, return_only_outputs, callbacks, tags, metadata, run_name, include_run_info)\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Execute the chain.\u001b[39;00m\n\u001b[1;32m 347\u001b[0m \n\u001b[1;32m 348\u001b[0m \u001b[38;5;124;03mArgs:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 369\u001b[0m \u001b[38;5;124;03m `Chain.output_keys`.\u001b[39;00m\n\u001b[1;32m 370\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 371\u001b[0m config \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 372\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcallbacks\u001b[39m\u001b[38;5;124m\"\u001b[39m: callbacks,\n\u001b[1;32m 373\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtags\u001b[39m\u001b[38;5;124m\"\u001b[39m: tags,\n\u001b[1;32m 374\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata\u001b[39m\u001b[38;5;124m\"\u001b[39m: metadata,\n\u001b[1;32m 375\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrun_name\u001b[39m\u001b[38;5;124m\"\u001b[39m: run_name,\n\u001b[1;32m 376\u001b[0m }\n\u001b[0;32m--> 378\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minvoke\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 379\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcast\u001b[49m\u001b[43m(\u001b[49m\u001b[43mRunnableConfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[43mk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 381\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_only_outputs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_only_outputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_run_info\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_run_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/base.py:163\u001b[0m, in \u001b[0;36mChain.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 162\u001b[0m run_manager\u001b[38;5;241m.\u001b[39mon_chain_error(e)\n\u001b[0;32m--> 163\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 164\u001b[0m run_manager\u001b[38;5;241m.\u001b[39mon_chain_end(outputs)\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m include_run_info:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/base.py:153\u001b[0m, in \u001b[0;36mChain.invoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_inputs(inputs)\n\u001b[1;32m 152\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m new_arg_supported\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call(inputs)\n\u001b[1;32m 156\u001b[0m )\n\u001b[1;32m 158\u001b[0m final_outputs: Dict[\u001b[38;5;28mstr\u001b[39m, Any] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprep_outputs(\n\u001b[1;32m 159\u001b[0m inputs, outputs, return_only_outputs\n\u001b[1;32m 160\u001b[0m )\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/llm.py:103\u001b[0m, in 
\u001b[0;36mLLMChain._call\u001b[0;34m(self, inputs, run_manager)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_call\u001b[39m(\n\u001b[1;32m 99\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 100\u001b[0m inputs: Dict[\u001b[38;5;28mstr\u001b[39m, Any],\n\u001b[1;32m 101\u001b[0m run_manager: Optional[CallbackManagerForChainRun] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 102\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mstr\u001b[39m]:\n\u001b[0;32m--> 103\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_manager\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcreate_outputs(response)[\u001b[38;5;241m0\u001b[39m]\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain/chains/llm.py:115\u001b[0m, in \u001b[0;36mLLMChain.generate\u001b[0;34m(self, input_list, run_manager)\u001b[0m\n\u001b[1;32m 113\u001b[0m callbacks \u001b[38;5;241m=\u001b[39m run_manager\u001b[38;5;241m.\u001b[39mget_child() \u001b[38;5;28;01mif\u001b[39;00m run_manager \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mllm, BaseLanguageModel):\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mllm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate_prompt\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 116\u001b[0m \u001b[43m \u001b[49m\u001b[43mprompts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 117\u001b[0m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 118\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 119\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mllm_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 122\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mllm\u001b[38;5;241m.\u001b[39mbind(stop\u001b[38;5;241m=\u001b[39mstop, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mllm_kwargs)\u001b[38;5;241m.\u001b[39mbatch(\n\u001b[1;32m 123\u001b[0m cast(List, prompts), {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcallbacks\u001b[39m\u001b[38;5;124m\"\u001b[39m: callbacks}\n\u001b[1;32m 124\u001b[0m )\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_core/language_models/llms.py:633\u001b[0m, in \u001b[0;36mBaseLLM.generate_prompt\u001b[0;34m(self, prompts, stop, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21mgenerate_prompt\u001b[39m(\n\u001b[1;32m 626\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 627\u001b[0m prompts: List[PromptValue],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 630\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 631\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m LLMResult:\n\u001b[1;32m 632\u001b[0m prompt_strings \u001b[38;5;241m=\u001b[39m [p\u001b[38;5;241m.\u001b[39mto_string() \u001b[38;5;28;01mfor\u001b[39;00m p \u001b[38;5;129;01min\u001b[39;00m prompts]\n\u001b[0;32m--> 633\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt_strings\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_core/language_models/llms.py:803\u001b[0m, in \u001b[0;36mBaseLLM.generate\u001b[0;34m(self, prompts, stop, callbacks, tags, metadata, run_name, run_id, **kwargs)\u001b[0m\n\u001b[1;32m 788\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m get_llm_cache() \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcache \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m:\n\u001b[1;32m 789\u001b[0m run_managers \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 790\u001b[0m callback_manager\u001b[38;5;241m.\u001b[39mon_llm_start(\n\u001b[1;32m 791\u001b[0m dumpd(\u001b[38;5;28mself\u001b[39m),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 801\u001b[0m )\n\u001b[1;32m 802\u001b[0m ]\n\u001b[0;32m--> 803\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 804\u001b[0m \u001b[43m \u001b[49m\u001b[43mprompts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mbool\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mnew_arg_supported\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 805\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 806\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output\n\u001b[1;32m 807\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(missing_prompts) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_core/language_models/llms.py:670\u001b[0m, in \u001b[0;36mBaseLLM._generate_helper\u001b[0;34m(self, prompts, stop, run_managers, new_arg_supported, **kwargs)\u001b[0m\n\u001b[1;32m 668\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m run_manager 
\u001b[38;5;129;01min\u001b[39;00m run_managers:\n\u001b[1;32m 669\u001b[0m run_manager\u001b[38;5;241m.\u001b[39mon_llm_error(e, response\u001b[38;5;241m=\u001b[39mLLMResult(generations\u001b[38;5;241m=\u001b[39m[]))\n\u001b[0;32m--> 670\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 671\u001b[0m flattened_outputs \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39mflatten()\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m manager, flattened_output \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(run_managers, flattened_outputs):\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_core/language_models/llms.py:657\u001b[0m, in \u001b[0;36mBaseLLM._generate_helper\u001b[0;34m(self, prompts, stop, run_managers, new_arg_supported, **kwargs)\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_generate_helper\u001b[39m(\n\u001b[1;32m 648\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 649\u001b[0m prompts: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 653\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any,\n\u001b[1;32m 654\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m LLMResult:\n\u001b[1;32m 655\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 656\u001b[0m output \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m--> 657\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 658\u001b[0m \u001b[43m \u001b[49m\u001b[43mprompts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 659\u001b[0m \u001b[43m \u001b[49m\u001b[43mstop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstop\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 660\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# TODO: support multiple run managers\u001b[39;49;00m\n\u001b[1;32m 661\u001b[0m \u001b[43m \u001b[49m\u001b[43mrun_manager\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrun_managers\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mrun_managers\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 662\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 663\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 664\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m new_arg_supported\n\u001b[1;32m 665\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_generate(prompts, stop\u001b[38;5;241m=\u001b[39mstop)\n\u001b[1;32m 666\u001b[0m )\n\u001b[1;32m 667\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mBaseException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 668\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m run_manager \u001b[38;5;129;01min\u001b[39;00m run_managers:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/langchain_community/llms/huggingface_pipeline.py:267\u001b[0m, in \u001b[0;36mHuggingFacePipeline._generate\u001b[0;34m(self, prompts, stop, run_manager, **kwargs)\u001b[0m\n\u001b[1;32m 264\u001b[0m batch_prompts \u001b[38;5;241m=\u001b[39m prompts[i : i \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbatch_size]\n\u001b[1;32m 266\u001b[0m \u001b[38;5;66;03m# Process batch of prompts\u001b[39;00m\n\u001b[0;32m--> 267\u001b[0m responses \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpipeline\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mbatch_prompts\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mpipeline_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[38;5;66;03m# Process each response in the batch\u001b[39;00m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m j, response \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(responses):\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:240\u001b[0m, in \u001b[0;36mTextGenerationPipeline.__call__\u001b[0;34m(self, text_inputs, **kwargs)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m(chats, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 240\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__call__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtext_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/base.py:1187\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[0;34m(self, inputs, num_workers, batch_size, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1183\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m can_use_iterator:\n\u001b[1;32m 1184\u001b[0m final_iterator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_iterator(\n\u001b[1;32m 1185\u001b[0m inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params\n\u001b[1;32m 1186\u001b[0m )\n\u001b[0;32m-> 1187\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mfinal_iterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n\u001b[1;32m 1189\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py:124\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_item()\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[0;32m--> 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mnext\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miterator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minfer(item, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mparams)\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py:125\u001b[0m, in \u001b[0;36mPipelineIterator.__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[38;5;66;03m# We're out of items within a batch\u001b[39;00m\n\u001b[1;32m 124\u001b[0m item \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mnext\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39miterator)\n\u001b[0;32m--> 125\u001b[0m processed \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparams\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;66;03m# We now have a batch of \"inferred things\".\u001b[39;00m\n\u001b[1;32m 127\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloader_batch_size \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 128\u001b[0m \u001b[38;5;66;03m# Try to infer the size of the batch\u001b[39;00m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/base.py:1112\u001b[0m, in \u001b[0;36mPipeline.forward\u001b[0;34m(self, model_inputs, **forward_params)\u001b[0m\n\u001b[1;32m 1110\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m inference_context():\n\u001b[1;32m 1111\u001b[0m model_inputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_inputs, device\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[0;32m-> 1112\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel_inputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mforward_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1113\u001b[0m model_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_ensure_tensor_on_device(model_outputs, device\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mdevice(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcpu\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1114\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:327\u001b[0m, in \u001b[0;36mTextGenerationPipeline._forward\u001b[0;34m(self, model_inputs, **generate_kwargs)\u001b[0m\n\u001b[1;32m 324\u001b[0m generate_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmin_length\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m prefix_length\n\u001b[1;32m 326\u001b[0m \u001b[38;5;66;03m# BS x SL\u001b[39;00m\n\u001b[0;32m--> 327\u001b[0m generated_sequence \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgenerate_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 328\u001b[0m out_b \u001b[38;5;241m=\u001b[39m generated_sequence\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 329\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mframework \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/torch/utils/_contextlib.py:115\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/conda/lib/python3.11/site-packages/transformers/generation/utils.py:1527\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m 1509\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39massisted_decoding(\n\u001b[1;32m 1510\u001b[0m input_ids,\n\u001b[1;32m 1511\u001b[0m candidate_generator\u001b[38;5;241m=\u001b[39mcandidate_generator,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[1;32m 1524\u001b[0m )\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m generation_mode \u001b[38;5;241m==\u001b[39m GenerationMode\u001b[38;5;241m.\u001b[39mGREEDY_SEARCH:\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;66;03m# 11. 
run greedy search\u001b[39;00m\n\u001b[0;32m-> 1527\u001b[0m result = self._greedy_search(...)\n",
"\u001b[0;31m[... repeated torch nn.Module._call_impl / transformers Mistral forward frames elided ...]\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ "chatbot = ChatBot()\n", "chatbot.start_chat()" ] }, { "cell_type": "markdown", "id": "62283248-165c-4c46-8b15-c96049675240", "metadata": { "tags": [] }, "source": [ "## Llama 3\n", "#### It is made available under the Meta Llama 3 Community License: https://huggingface.co/meta-llama/Meta-Llama-3-8B/blob/main/LICENSE\n", "##### https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct" ] }, { "cell_type": "code", "execution_count": 4, "id": "8d728ce9-7e20-439c-91fd-4d41b7b73a8e", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "3819110006524529b27ea6a520dfa824", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Load Llama 3 8B Instruct from the institute's centralized model cache.\n",
"# NOTE: the loading and prompt lines below follow the standard Meta-Llama-3-8B-Instruct\n",
"# recipe and this notebook's LLM_CACHE_PATH convention; the exact path and the example\n",
"# messages are assumptions -- adjust them to your setup.\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"model_id = f\"{os.environ['LLM_CACHE_PATH']}/meta-llama/Meta-Llama-3-8B-Instruct\"\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
"    model_id,\n",
"    torch_dtype=torch.bfloat16,\n",
"    device_map=\"auto\",\n",
")\n",
"\n",
"messages = [\n",
"    {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
"    {\"role\": \"user\", \"content\": \"Who are you?\"},\n",
"]\n",
"\n",
"input_ids = tokenizer.apply_chat_template(\n",
"    messages,\n",
"    add_generation_prompt=True,\n",
"    return_tensors=\"pt\"\n",
").to(model.device)\n",
"\n",
"# Llama 3 marks the end of each turn with <|eot_id|>, so stop on it as well as on eos\n",
"terminators = [\n",
"    tokenizer.eos_token_id,\n",
"    tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")\n",
"]\n",
"\n",
"outputs = model.generate(\n",
"    input_ids,\n",
"    max_new_tokens=256,\n",
"    eos_token_id=terminators,\n",
"    do_sample=True,\n",
"    temperature=0.6,\n",
"    top_p=0.9,\n",
")\n",
"response = outputs[0][input_ids.shape[-1]:]\n",
"print(tokenizer.decode(response, skip_special_tokens=True))" ] },
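{ "cell_type": "markdown", "id": "9f1e2d3c-4b5a-4678-9abc-def012345678", "metadata": { "tags": [] }, "source": [ "##### The `terminators` list above stops generation at either the regular end-of-sequence token or Llama 3's end-of-turn token `<|eot_id|>`. The next cell is a minimal usage sketch, not part of the official model card: it wraps the same chat-template pattern in a small streaming `chat()` helper and assumes the `model`, `tokenizer`, and `terminators` objects defined in the cell above." ] }, { "cell_type": "code", "execution_count": null, "id": "0a1b2c3d-5e6f-4789-8abc-def012345679", "metadata": {}, "outputs": [], "source": [
"# Minimal sketch: a streaming chat helper built on the generation pattern above.\n",
"# Assumes `model`, `tokenizer`, and `terminators` from the previous cell; the\n",
"# helper name `chat` is our own, not part of the Llama 3 model card.\n",
"from transformers import TextStreamer\n",
"\n",
"def chat(prompt, system=\"You are a helpful assistant.\", max_new_tokens=256):\n",
"    messages = [\n",
"        {\"role\": \"system\", \"content\": system},\n",
"        {\"role\": \"user\", \"content\": prompt},\n",
"    ]\n",
"    input_ids = tokenizer.apply_chat_template(\n",
"        messages, add_generation_prompt=True, return_tensors=\"pt\"\n",
"    ).to(model.device)\n",
"    # TextStreamer prints tokens to stdout as they are generated;\n",
"    # skip_prompt avoids echoing the chat template back to the user.\n",
"    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"    model.generate(\n",
"        input_ids,\n",
"        max_new_tokens=max_new_tokens,\n",
"        eos_token_id=terminators,\n",
"        do_sample=True,\n",
"        temperature=0.6,\n",
"        top_p=0.9,\n",
"        streamer=streamer,\n",
"    )\n",
"\n",
"# chat(\"Summarize the Llama 3 license in one sentence.\")" ] } ], "metadata": { "kernelspec": { "display_name": "genai", "language": "python", "name": "genai" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }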