{
    "Name": "turjuman",
    "Volume": 0.0,
    "Unit": "sentences",
    "License": "unknown",
    "Link": "https://github.com/UBC-NLP/turjuman",
    "HF_Link": "",
    "Year": 2022,
    "Domain": [
        "public datasets"
    ],
    "Form": "text",
    "Collection_Style": [
        "crawling"
    ],
    "Description": "Neural MT toolkit for Arabic",
    "Ethical_Risks": "Low",
    "Provider": [
        "University of British Columbia"
    ],
    "Derived_From": [
        "OPUS"
    ],
    "Paper_Title": "TURJUMAN: A Public Toolkit for Neural Arabic Machine Translation",
    "Paper_Link": "https://aclanthology.org/2022.osact-1.1.pdf",
    "Tokenized": false,
    "Host": "GitHub",
    "Access": "Free",
    "Cost": "",
    "Test_Split": true,
    "Tasks": [
        "machine translation"
    ],
    "Venue_Title": "ACL",
    "Venue_Type": "conference",
    "Venue_Name": "ACL",
    "Authors": [
        "ElmoatezBillahNagoudi",
        "AbdelRahimElmadany",
        "MuhammadAbdul-Mageed"
    ],
    "Affiliations": [
        "The University of British Columbia"
    ],
    "Abstract": "We present TURJUMAN, a neural toolkit for translating from 20 languages into Modern Standard Arabic (MSA). TURJUMAN exploits the recently-introduced text-to-text Transformer AraT5 model, endowing it with a powerful ability to decode into Arabic. The toolkit offers the possibility of employing a number of diverse decoding methods, making it suited for acquiring paraphrases for the MSA translations as an added value. To train TURJUMAN, we sample from publicly available parallel data employing a simple semantic similarity method to ensure data quality. This allows us to prepare and release AraOPUS-20, a new machine translation benchmark. We publicly release our translation toolkit (TURJUMAN) as well as our benchmark dataset (AraOPUS-20).",
    "Subsets": [],
    "Dialect": "Modern Standard Arabic",
    "Language": "ar",
    "Script": "Arab",
    "Added_By": "qwen/qwen3.6-35b-a3b"
}