{
    "Name": "MLADI",
    "Volume": 100000.0,
    "Unit": "sentences",
    "License": "unknown",
    "Link": "https://mohamedalaa9.github.io/lahjatbert/",
    "HF_Link": "",
    "Year": 2024,
    "Domain": [
        "social media"
    ],
    "Form": "text",
    "Collection_Style": [
        "crawling",
        "machine annotation",
        "LLM generated"
    ],
    "Description": "Multi-label dataset for Arabic Dialect Identification.",
    "Ethical_Risks": "Medium",
    "Provider": [
        "Mohamed bin Zayed University of Artificial Intelligence"
    ],
    "Derived_From": [
        "NADI2020",
        "NADI2021",
        "NADI2023"
    ],
    "Paper_Title": "Curriculum Learning and Pseudo-Labeling Improve the Generalization of Multi-Label Arabic Dialect Identification Models",
    "Paper_Link": "https://arxiv.org/pdf/2602.12937v2.pdf",
    "Tokenized": false,
    "Host": "GitHub",
    "Access": "Free",
    "Cost": "0",
    "Test_Split": true,
    "Tasks": [
        "dialect identification",
        "text classification"
    ],
    "Venue_Title": "ArabicNLP",
    "Venue_Type": "conference",
    "Venue_Name": "Second Arabic Natural Language Processing Conference",
    "Authors": [
        "Ali Mekky",
        "Mohamed El Zeftawy",
        "Lara Hassan",
        "Amr Keleg",
        "Preslav Nakov"
    ],
    "Affiliations": [
        "Mohamed bin Zayed University of Artificial Intelligence"
    ],
    "Abstract": "Dialects are often modeled as a single-label classification task for a long time, recent work has argued that Arabic Dialect Identification (ADI) should be framed as a multi-label classification task. However, ADI remains constrained by the availability of its training data. By analyzing models trained on single-label ADI datasets, we show that the false negatives in Multi-Label Arabic Dialect Identification (MLADI) lies in the selection of negative samples, as many sentences treated as negative could be acceptable in multiple dialects. To address these issues, we construct a multi-label dataset by generating automatic multi-label annotations using GPT-4o and binary dialect acceptability classifiers, with aggregation guided by the Arabic Level of Dialectness (ALDi). Afterward, we train a BERT-based multi-label classifier using curriculum learning strategies aligned with dialectal complexity and label cardinality. On the MLADI leaderboard, our best-performing LAHJATBERT model achieves a macro F1 of 0.69, compared to 0.55 for the strongest previously reported system. Code and data are available at https://mohamedalaa9.github.io/lahjatbert/.",
    "Subsets": [],
    "Dialect": "mixed",
    "Language": "ar",
    "Script": "Arab",
    "Added_By": "qwen/qwen3.6-35b-a3b"
}