{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "This example illustrates how to solve a typical side-by-side task with Crowdom.\n", "\n", "In our example, we ask workers which transcript is more suitable for the given audio.\n", "\n", "You may want to study the [image classification](../image_classification/image_classification.ipynb) example first, because it contains more detailed comments on the overall process." ] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Setup\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "%pip install crowdom\n", "%pip install pyyaml\n", "%pip install markdown2" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# All imports live in this one cell: standard library, third-party, then Crowdom.\n", "from datetime import timedelta\n", "import logging.config\n", "import os\n", "\n", "from IPython.display import clear_output, display\n", "import markdown2\n", "import pandas as pd\n", "import yaml\n", "\n", "from crowdom import base, datasource, client, objects, pricing, params as labeling_params" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Configure logging from the YAML file that sits next to this notebook.\n", "# NOTE(review): yaml.full_load is kept as is; prefer yaml.safe_load if logging.yaml can come from an untrusted source.\n", "with open('logging.yaml', encoding='utf-8') as f:\n", "    logging.config.dictConfig(yaml.full_load(f.read()))" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Take the token from the TOLOKA_TOKEN environment variable, or ask for it interactively.\n", "toloka_client = client.create_toloka_client(token=os.getenv('TOLOKA_TOKEN') or input('Enter your token: '))\n", "# Clear the cell output so that a token typed into the prompt is not kept in the saved notebook.\n", "clear_output()" ] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Task definition\n" ] },
{ "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "lang = 'EN'" ] },
{ "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Worker instructions are kept as Markdown files, one per language, and converted with markdown2.\n", "# The encoding is explicit because the RU file is non-ASCII and the platform default may not be UTF-8.\n", "instruction = {}\n", "for worker_lang in ['EN', 'RU']:\n", "    with open(f'instruction_{worker_lang}.md', encoding='utf-8') as f:\n", "        instruction[worker_lang] = markdown2.markdown(f.read())" ] },
{ "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Side-by-side function: two Text options are compared, with an Audio shown as a hint.\n", "task_spec = base.TaskSpec(\n", "    id='audio-transcript-sbs',\n", "    function=base.SbSFunction(inputs=(objects.Text,), hints=(objects.Audio,)),\n", "    name={\n", "        'EN': 'Audio transcripts comparison',\n", "        'RU': 'Сравнение расшифровок аудиозаписей',\n", "    },\n", "    description={\n", "        'EN': 'From the two transcripts, choose the more suitable for given audio recording',\n", "        'RU': 'Из двух расшифровок выберите более подходящую аудиозаписи',\n", "    },\n", "    instruction=instruction,\n", ")" ] },
{ "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "task_spec_en = client.PreparedTaskSpec(task_spec, lang)" ] },
{ "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "client.define_task(task_spec_en, toloka_client)" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } },
"outputs": [], "source": [ "task_duration_hint = timedelta(seconds=10) # audios are about 1-5 seconds each" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Data preparation\n" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "input_objects = datasource.read_tasks('tasks.json', task_spec_en.task_mapping)\n", "control_objects = datasource.read_tasks('control_tasks.json', task_spec_en.task_mapping, has_solutions=True)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<a href=\"https://tb.yandex.net/editor?config=N4Igxg9gdgZglgcxALgAQmAHRANzgUwHds0sQ4AXfAWwGcTUBtMgVwCcAbBsgBwEMKACwbY+LACZwIAfUFwoFbABpU2CgE8e+ESHEC+AOnk8WikAF8V2HHw5w9FKVG5rN2lKvDRJj6AZ4cfOr44gC0MCwcHOrYlp4aWjp4RAZikhCxKmSQ1NTQAMLQFGwQHPQeZJQ05WjM2A58LiD8QjpgghBwYO5WIAnuaPX6BhCmJmZx2BA8vlA1TGQ2HCwDno0gvYEARvhcHtgAMvgwFKjTs7EAur2StHxbHCEMMLa0+L39OvC74gZsfOlQggSiweJlUGQGk0WsJ9uAOl0evE3DoGiMxqZwVMZk55nVcLYVjotspPNtdjoAEqIQSnc5OK43OB3B5PDwvMrvZGJOHfDi-f6A4GjMEWa6eajyADq9laHgAjAAGAyK3o4fBsRxgWwAQTsCGccIo01J1ls9gEDIqQwo61I2BhbQR3VNfRRcLRowo4yxIDkCh0+Q6EDeZyg+DOMFQQgj9Og5Q+7sGXigPicf3wAEcWHA2GzJm6ecnAuovQZIMtqHNfVU6Ax8ZAFPgA9aU1QW-bmgJYcmqAAPCjSdaJoueNHGTEWEerawEQgGfsTOLZIrNsydxvt9cQh3dnSL6Qkjbcme6YYTibTpJzhf4Aexczi7CEez4Qq5ApFEplBjFFZXuESzLHJ+DzRkT2vFI7FoS9PACFgEHkPEyHlJooVbR04XaToXWPVxRxtQwvR9Kc4KCDgIABHRhwguE+DAWYDDeWD8NPeDEKgAwOgoABrfB1HKZdsAAJjQ-RoT3LDnSRVjUWGYjJwLfh1Aoqi4SPADk3oxjmN9T44XY+QuIgXj+MErJsCAydOx4+RxB0WgwG-PYPj4WgeJlcQ5TQAB2RVlUVAsoBMuB4G1WZkOwTc1x0AAhY4IDzVAtDYGBEslKAEFQPhozcniVGoPg+NQWh2AjIQBGyqJUGoEI4By3YaDXWhUEEPh1VQVTxBCAxXX05NknnRdwRXJsO08ABJKNKseNy6XDGq6oax5aoUVBmRq5laHkBAVDzLro0ECN+AQfBerwwtT0G297zFTS4OWDiF1KCAePWR9zGPcgoHGDwMFECQpFkeRt2wWlvVoZAAHooYoDgeK
YgBmAx1D4VM7wMcMKChu9BwaZioeC5l1GkWgtHwdpCc6WgSaoGCQmkQh2qhnhhIAVh86RFWEgA2AwmZwPrcaHHQNqZlr6LAdgBBkvphaPZMNqEDbJelqhXXsHQdUBiAAAp2A4ABeABycGeEhmG4YR2hkdR9G+0x-BseF-Gnap4nSfJymiZp6Q6aocRGeZ1mOa53n+fa42AEpUAAFVx3XFxNsW3OysApf+Kho7jhOk+NpW5Al9O1fwaPYgsIA\" target=\"_blank\" rel=\"noopener noreferrer\">task preview</a>" ], "text/plain": [ "<IPython.core.display.HTML object>" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client.TaskPreview(input_objects[0], task_spec=task_spec_en).display_link()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Launch configuration\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "params_form = labeling_params.get_interface(task_spec_en, task_duration_hint, toloka_client)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "params = params_form.get_params()" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Launch\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clear formula, which does not account edge cases like min commission and incomplete assignments\n" ] }, { "data": { "text/latex": [ "$\\displaystyle TotalPrice_{clear} = TaskCount * PricePerTask_\\$ * Overlap * (1 + TolokaCommission) = 5 * 0.0022\\$ * 1 * 1.3 = 0.01\\$.$" ], "text/plain": [ "<IPython.core.display.Math object>" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "more precise formula, which accounts more 
edge cases\n" ] }, { "data": { "text/latex": [ "$\\displaystyle TotalPrice_{precise} = AssignmentCount * PricePerAssignment_\\$ * Overlap = \\left \\lceil \\frac {TaskCount} {TasksOnAssignment} \\right \\rceil * (PricePerAssignment_\\$ + max(PricePerAssignment_\\$ * TolokaCommission, MinTolokaCommission_\\$) * Overlap = \\lceil 5 / 4 \\rceil * (0.009\\$ + max(0.009\\$ * 0.3, 0.001\\$) * 1 = 2 * 0.012 * 1 = 0.02\\$.$" ], "text/plain": [ "<IPython.core.display.Math object>" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdin", "output_type": "stream", "text": [ "run classification of 5 objects for 0.02$? [Y/n] Y\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5170d84932d74226a2e74a8aca177626", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2022-10-31 12:44:52,918 - crowdom.client.launch:_launch:193 - INFO: - classification has started\n" ] } ], "source": [ "artifacts = client.launch_sbs(\n", " task_spec_en,\n", " params,\n", " input_objects,\n", " control_objects,\n", " toloka_client,\n", " interactive=True,\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "results = artifacts.results" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## Results study\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "jupyter": { "outputs_hidden": false }, "pycharm": { "name": "#%%\n" } }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", 
"</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>audio_hint</th>\n", " <th>text_a</th>\n", " <th>text_b</th>\n", " <th>result</th>\n", " <th>confidence</th>\n", " <th>overlap</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p257_026.wav</td>\n", " <td>is was accurate</td>\n", " <td>is this accurate</td>\n", " <td>b</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_300.wav</td>\n", " <td>what has altered</td>\n", " <td>has altered</td>\n", " <td>a</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p257_120.wav</td>\n", " <td>does this</td>\n", " <td>does this mean</td>\n", " <td>b</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_044.wav</td>\n", " <td>we will push</td>\n", " <td>we will pull</td>\n", " <td>a</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_405.wav</td>\n", " <td>we have not received a letter from the danish</td>\n", " <td>we have not yet received a letter from the irish</td>\n", " <td>b</td>\n", " <td>1.0</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " audio_hint \\\n", "0 https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p257_026.wav \n", "1 https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_300.wav \n", "2 https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p257_120.wav \n", "3 
https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_044.wav \n", "4 https://tlk.s3.yandex.net/ext_dataset/noisy_speech/noisy_tested_wav/p232_405.wav \n", "\n", " text_a \\\n", "0 is was accurate \n", "1 what has altered \n", "2 does this \n", "3 we will push \n", "4 we have not received a letter from the danish \n", "\n", " text_b result confidence \\\n", "0 is this accurate b 1.0 \n", "1 has altered a 1.0 \n", "2 does this mean b 1.0 \n", "3 we will pull a 1.0 \n", "4 we have not yet received a letter from the irish b 1.0 \n", "\n", " overlap \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pd.option_context('max_colwidth', 100):\n", " display(results.predict())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 0 }