{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "3ca2b642-078a-4551-bb2e-aab8885a5b14", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "# The ?raw=1 query asks GitHub for the CSV file itself instead of the HTML blob page.\n", "url = (\n", "    'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/04-monitoring/data/results-gpt4o-mini.csv'\n", "    '?raw=1'\n", ")\n", "\n", "# Read the results and keep only the first 300 rows.\n", "df = pd.read_csv(url).iloc[:300]" ] }, { "cell_type": "code", "execution_count": 6, "id": "8a8a1d8f-4484-46f3-8154-23735c75648c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/python/3.10.13/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", " from tqdm.autonotebook import tqdm, trange\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4839ec40c3d645a79a1585dfd103be68", "version_major": 2, "version_minor": 0 }, "text/plain": [ "modules.json: 0%| | 0.00/229 [00:00 \u001b[0m\u001b[32;49m24.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "# pip install rouge" ] }, { "cell_type": "code", "execution_count": 17, "id": "83ce8954-cf66-4d9b-8956-d21bd392cc2b", "metadata": {}, "outputs": [], "source": [ "from rouge import Rouge" ] }, { "cell_type": "code", "execution_count": 19, "id": "80d29796-2226-4d9e-91f2-89f8c0b1c94d", "metadata": {}, "outputs": [], "source": [ "# Take the row at position 10 as a single example to score\n", "r = df.iloc[10]" ] }, { "cell_type": "code", "execution_count": 20, "id": "e1424d61-a6df-41e9-ad7c-3be1bd96f752", "metadata": {}, "outputs": [], "source": [ "# Score the example's LLM answer against its original answer; [0] takes the first entry of the result.\n", "rouge_scorer = Rouge()\n", "scores = rouge_scorer.get_scores(\n", "    r['answer_llm'],\n", "    r['answer_orig'],\n", ")[0]\n", "\n", 
"rouge_1_f1 = scores['rouge-1']['f']" ] }, { "cell_type": "code", "execution_count": 21, "id": "ed473c45-0278-41b2-a650-18f633a777b4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.45454544954545456" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rouge_1_f1" ] }, { "cell_type": "code", "execution_count": 23, "id": "c9220884-384b-4869-99b4-cc2a1b3fefac", "metadata": {}, "outputs": [], "source": [ "# F1 component of each ROUGE variant, read from the scores computed above.\n", "rouge_1_f1, rouge_2_f1, rouge_l_f1 = (\n", "    scores[metric]['f'] for metric in ('rouge-1', 'rouge-2', 'rouge-l')\n", ")\n", "\n", "# Unweighted mean of the three F1 values.\n", "average_f1 = (rouge_1_f1 + rouge_2_f1 + rouge_l_f1) / 3\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "c759e757-23c4-45ce-b58c-bf3020bce520", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.35490034990035496" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "average_f1" ] }, { "cell_type": "code", "execution_count": 25, "id": "89b3423f-c0f1-4cd8-b135-4cc922eef466", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1f854d1e0552480dbdb8f2de5dcd872c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/300 [00:00