{ "cells": [ { "cell_type": "markdown", "id": "0cf1ab1c", "metadata": {}, "source": [ "This example illustrates how to score your audios on the [MOS](https://www.microsoft.com/en-us/research/wp-content/uploads/2011/05/0002416.pdf) scale with Crowdom.\n", "\n", "You may want to study the [image classification](../image_classification/image_classification.ipynb) example first, because it contains more detailed comments on the overall process." ] }, { "cell_type": "markdown", "id": "f7b75b1f", "metadata": {}, "source": [ "# Setup environment" ] }, { "cell_type": "code", "execution_count": 1, "id": "6c69b016", "metadata": {}, "outputs": [], "source": [ "%pip install crowdom" ] }, { "cell_type": "code", "execution_count": null, "id": "d662a042", "metadata": {}, "outputs": [], "source": [ "from datetime import timedelta\n", "import os\n", "import pandas as pd\n", "import json\n", "from typing import Dict\n", "\n", "import toloka.client as toloka\n", "\n", "from crowdom import base, datasource, classification, classification_loop, client, control, mos, objects, pricing, params as labeling_params, worker" ] }, { "cell_type": "code", "execution_count": 2, "id": "cc2c25be", "metadata": {}, "outputs": [], "source": [ "import yaml\n", "import logging.config" ] }, { "cell_type": "code", "execution_count": 3, "id": "fceaef01", "metadata": {}, "outputs": [], "source": [ "with open('logging.yaml') as f:\n", " logging.config.dictConfig(yaml.full_load(f.read()))" ] }, { "cell_type": "code", "execution_count": null, "id": "04201b9d", "metadata": {}, "outputs": [], "source": [ "toloka_client = client.create_toloka_client(token=os.getenv('TOLOKA_TOKEN') or input('Enter your token: '))" ] }, { "cell_type": "markdown", "id": "32e72cb3", "metadata": {}, "source": [ "## Task definition" ] }, { "cell_type": "code", "execution_count": 7, "id": "25849967", "metadata": {}, "outputs": [], "source": [ "class MOS(base.ScoreEvaluation):\n", " BAD = '1'\n", " POOR = '2'\n", " FAIR = '3'\n", " 
GOOD = '4'\n", " EXCELLENT = '5'\n", "\n", " @classmethod\n", " def labels(cls) -> Dict['MOS', Dict[str, str]]:\n", " return {\n", " cls.EXCELLENT: {'EN': 'Completely natural', 'RU': 'Абсолютно естественно'},\n", " cls.GOOD: {'EN': 'Mostly natural', 'RU': 'В основном естественно'},\n", " cls.FAIR: {'EN': 'Equally natural and unnatural', 'RU': 'В одинаковой степени естественно и неестественно'},\n", " cls.POOR: {'EN': 'Mostly unnatural', 'RU': 'В основном неестественно'},\n", " cls.BAD: {'EN': 'Completely unnatural', 'RU': 'Абсолютно неестественно'},\n", " }" ] }, { "cell_type": "code", "execution_count": 8, "id": "e7ae9486", "metadata": {}, "outputs": [], "source": [ "lang = 'EN'" ] }, { "cell_type": "code", "execution_count": 9, "id": "87bb05b5", "metadata": {}, "outputs": [], "source": [ "function = base.ClassificationFunction(inputs=(objects.Audio,), cls=MOS)" ] }, { "cell_type": "code", "execution_count": 10, "id": "b365dd65", "metadata": {}, "outputs": [ { "data": { "text/html": [ "task preview" ], "text/plain": [ "" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "example_url = 'https://storage.yandexcloud.net/crowdom-public/examples/mos/data/00fa8c05-e960-4088-bdc8-7e37b5416e9b.wav'\n", "\n", "client.TaskPreview((objects.Audio(url=example_url),), task_function=function, lang=lang).display_link()" ] }, { "cell_type": "code", "execution_count": 11, "id": "5cde0d9c", "metadata": {}, "outputs": [], "source": [ "from markdown2 import Markdown as _Markdown" ] }, { "cell_type": "markdown", "id": "80578eaf", "metadata": {}, "source": [ "If your markdown instruction includes complex objects, e.g. tables, some text processing might be needed to make it compatible with the Toloka instruction format." 
] }, { "cell_type": "code", "execution_count": 12, "id": "2722cf38", "metadata": {}, "outputs": [], "source": [ "class Markdown(_Markdown):\n", " def postprocess(self, text: str) -> str:\n", " for align in ['left', 'right', 'center']:\n", " text = text.replace(f'\"text-align:{align};\"', f'\"text-align:{align}\"')\n", " return text" ] }, { "cell_type": "code", "execution_count": 13, "id": "baf9f997", "metadata": {}, "outputs": [], "source": [ "instruction = {}\n", "for worker_lang in ['EN', 'RU']:\n", " with open(f'instruction_{worker_lang}.md') as f:\n", " instruction[worker_lang] = Markdown(extras=[\"tables\"]).convert(f.read())" ] }, { "cell_type": "code", "execution_count": 14, "id": "2add9e88", "metadata": {}, "outputs": [], "source": [ "name = {'EN': 'Speech quality evaluation', 'RU': 'Оценка качества речи'}\n", "\n", "description = {\n", " 'EN': 'Rate the quality of audio files on a scale of 1 to 5 (90 seconds to complete)',\n", " 'RU': 'Оцените качество аудиозаписи по шкале от 1 до 5 (можно выполнить за 90 секунд)',\n", "}" ] }, { "cell_type": "code", "execution_count": 15, "id": "cf44cd3f", "metadata": {}, "outputs": [], "source": [ "task_spec = base.TaskSpec(\n", " id='mos',\n", " function=function,\n", " name=name,\n", " description=description,\n", " instruction=instruction,\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "id": "9f0565d2", "metadata": { "scrolled": true }, "outputs": [], "source": [ "task_spec_en = client.PreparedTaskSpec(task_spec, lang)" ] }, { "cell_type": "code", "execution_count": null, "id": "15fa941a", "metadata": { "tags": [] }, "outputs": [], "source": [ "client.define_task(task_spec_en, toloka_client)" ] }, { "cell_type": "code", "execution_count": 18, "id": "10ac90f3", "metadata": {}, "outputs": [], "source": [ "task_duration_hint = timedelta(seconds=9) # audios are about 7-9 seconds each" ] }, { "cell_type": "markdown", "id": "d0714ef3", "metadata": {}, "source": [ "In the simplest case, MOS labeling is run on 
audios from the same source. \n", "\n", "Alternatively, MOS labeling can be run for multiple data sources at once - we can compare different synthesis models with each other, or a synthesis model with the speaker of its source data. \n", "\n", "When audios from multiple sources are labeled simultaneously, each data source must have the same set of texts spoken in its audios. Also, you have to provide a dict with metadata, so that the algorithms can distinguish audios from different sources and determine which text is spoken in each audio. " ] }, { "cell_type": "code", "execution_count": 19, "id": "2de6a4ff", "metadata": {}, "outputs": [], "source": [ "objects_metadata = {}\n", "\n", "speaker_objects = datasource.read_tasks('speaker.json', task_spec_en.task_mapping)\n", "\n", "with open('speaker.json') as file:\n", " for entry, input_objects in zip(json.load(file), speaker_objects):\n", " objects_metadata[input_objects] = mos.ObjectsMetadata(item_id=entry['text'], algorithm='speaker')\n", " \n", "synthesis_objects = datasource.read_tasks('synthesis.json', task_spec_en.task_mapping)\n", "\n", "with open('synthesis.json') as file:\n", " for entry, input_objects in zip(json.load(file), synthesis_objects):\n", " objects_metadata[input_objects] = mos.ObjectsMetadata(item_id=entry['text'], algorithm='synthesis')" ] }, { "cell_type": "code", "execution_count": 20, "id": "9377c2da", "metadata": {}, "outputs": [], "source": [ "input_objects = speaker_objects + synthesis_objects" ] }, { "cell_type": "markdown", "id": "2e7e0719", "metadata": {}, "source": [ "## Launch configuration" ] }, { "cell_type": "markdown", "id": "b9269565", "metadata": {}, "source": [ "For MOS, we define labeling parameters with code, because the interactive parameters form is unaware of some MOS pipeline specifics." 
] }, { "cell_type": "code", "execution_count": 22, "id": "a05502f9", "metadata": {}, "outputs": [], "source": [ "pricing_config = pricing.PoolPricingConfig(\n", " assignment_price=0.02,\n", " real_tasks_count=10,\n", " control_tasks_count=0,\n", ")\n", "\n", "params = client.Params(\n", " task_duration_hint=task_duration_hint,\n", " pricing_config=pricing_config,\n", " overlap=classification_loop.StaticOverlap(3),\n", " aggregation_algorithm=classification.AggregationAlgorithm.MAJORITY_VOTE,\n", " control=control.Control(\n", " rules=control.RuleBuilder().add_static_reward(0.5).add_complex_speed_control(\n", " [control.BlockTimePicker(0.1, '2d', True)]).build(),\n", " ),\n", " worker_filter=worker.WorkerFilter(\n", " filters=[\n", " worker.WorkerFilter.Params(\n", " langs={worker.LanguageRequirement(lang=lang)},\n", " regions=worker.lang_to_default_regions.get(lang, {}),\n", " age_range=(18, None),\n", " ),\n", " ],\n", " training_score=None,\n", " ),\n", ")" ] }, { "cell_type": "markdown", "id": "6eaa076f", "metadata": {}, "source": [ "## Launch" ] }, { "cell_type": "code", "execution_count": 24, "id": "ca2ea48d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "clear formula, which does not account edge cases like min commission and incomplete assignments\n" ] }, { "data": { "text/latex": [ "$\\displaystyle TotalPrice_{clear} = TaskCount * PricePerTask_\\$ * Overlap * (1 + TolokaCommission) = 10 * 0.0020\\$ * 2 * 1.3 = 0.05\\$.$" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "more precise formula, which accounts more edge cases\n" ] }, { "data": { "text/latex": [ "$\\displaystyle TotalPrice_{precise} = AssignmentCount * PricePerAssignment_\\$ * Overlap = \\left \\lceil \\frac {TaskCount} {TasksOnAssignment} \\right \\rceil * (PricePerAssignment_\\$ + max(PricePerAssignment_\\$ * TolokaCommission, MinTolokaCommission_\\$) * Overlap = 
\\lceil 10 / 10 \\rceil * (0.02\\$ + max(0.02\\$ * 0.3, 0.001\\$) * 2 = 1 * 0.026 * 2 = 0.05\\$.$" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "run classification of 10 objects for 0.05$? [Y/n] Y\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "453d552095f4423181d6f6aba5e885b2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Output()" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "2022-08-29 14:38:20,359 - crowdom.client.launch:_launch:187 - INFO: - classification has started\n" ] } ], "source": [ "artifacts = client.launch_mos(\n", " task_spec_en,\n", " params,\n", " input_objects,\n", " toloka_client,\n", " interactive=True,\n", " inputs_to_metadata=objects_metadata,\n", ")" ] }, { "cell_type": "code", "execution_count": 35, "id": "fc9270bb", "metadata": {}, "outputs": [], "source": [ "results = artifacts.results" ] }, { "cell_type": "markdown", "id": "bec08739", "metadata": {}, "source": [ "## Results study" ] }, { "cell_type": "code", "execution_count": 38, "id": "d9db1c0c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'speaker': MOSCI(mu=4.35, ci=0.67), 'synthesis': MOSCI(mu=4.21, ci=0.59)}" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "artifacts.ci" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }