% Publication database. Conventions used throughout this file:
%   - one field per line, lowercase field names, brace-delimited values;
%   - acronyms and proper nouns in titles are brace-protected so that
%     sentence-casing bibliography styles do not lowercase them;
%   - page and issue ranges use a double hyphen; months use the standard
%     three-letter macros (jan, jun, oct), unquoted;
%   - truncated author lists end in the literal token "and others".
% Citation keys are kept exactly as they were.

@techreport{FOLIA19,
  author      = {M. {van Gompel}},
  title       = {{FoLiA}: Format for Linguistic Annotation -- Documentation and Reference Guide},
  number      = {Language and Speech Technology Technical Report Series 19-01},
  institution = {Radboud University},
  location    = {Nijmegen, The Netherlands},
  year        = {2019},
  url         = {https://folia.readthedocs.io/en/latest/},
}

@techreport{CLAM18,
  author      = {M. {van Gompel}},
  title       = {{CLAM} Documentation},
  number      = {Language and Speech Technology Technical Report Series 18-03},
  institution = {Radboud University},
  location    = {Nijmegen, The Netherlands},
  year        = {2018},
  url         = {https://clam.readthedocs.io/en/latest/},
}

@techreport{FROG18,
  author      = {K. {van der Sloot} and I. Hendrickx and M. {van Gompel} and A. {van den Bosch} and W. Daelemans},
  title       = {{Frog}, A Natural Language Processing Suite for {Dutch}. {Reference Guide}},
  number      = {Language and Speech Technology Technical Report Series 18-02},
  institution = {Radboud University},
  location    = {Nijmegen, The Netherlands},
  year        = {2018},
  url         = {https://frognlp.readthedocs.io/en/latest/},
}

@techreport{UCTO18,
  author      = {M. {van Gompel} and K. {van der Sloot} and I. Hendrickx and A. {van den Bosch}},
  title       = {{Ucto}: {Unicode} Tokeniser. {Reference Guide}},
  number      = {Language and Speech Technology Technical Report Series 18-01},
  institution = {Radboud University},
  location    = {Nijmegen, The Netherlands},
  year        = {2018},
  url         = {https://ucto.readthedocs.io/en/latest/},
}

@article{CLIN28SHAREDTASK,
  author  = {Beeksma, Merijn and Van Gompel, Maarten and Kunneman, Florian and Onrust, Louis and Regnerus, Bouke and Vinke, Dennis and Brito, Eduardo and Bauckhage, Christian},
  title   = {Detecting and correcting spelling errors in high-quality {Dutch} {Wikipedia} text},
  journal = {Computational Linguistics in the Netherlands Journal},
  volume  = {8},
  pages   = {122--137},
  year    = {2018},
  month   = jan,
}

@incollection{FOLIACLARINBOOK,
  author    = {M. van Gompel and K. van der Sloot and M. Reynaert and A. van den Bosch},
  title     = {{FoLiA} in Practice: The Infrastructure of a Linguistic Annotation Format},
  booktitle = {{CLARIN} in the Low Countries},
  pages     = {71--82},
  publisher = {Ubiquity Press},
  year      = {2017},
  isbn      = {9781911529248},
  url       = {http://www.jstor.org/stable/j.ctv3t5qjk.13},
  abstract  = {We present an overview of the software and data infrastructure for FoLiA, a Format for Linguistic Annotation developed within the scope of the CLARIN-NL project and other projects. FoLiA aims to provide a single unified file format accommodating a wide variety of linguistic annotation types, preventing the proliferation of different formats for different annotation types. FoLiA is being developed in a bottom-up and practice-driven fashion. We have invested mainly in the creation of a rich infrastructure of tools that enable developers and end-users to work with the format. This work will present the current state of this infrastructure.},
}

@incollection{PARSEME,
  author    = {Agata Savary and Marie Candito and Barbu Mititelu, Verginica and Eduard Bejček and Fabienne Cap and Slavomír Čéplö and Silvio Ricardo Cordeiro and Gülşen Eryiğit and Voula Giouli and Maarten van Gompel and others},
  title     = {{PARSEME} multilingual corpus of verbal multiword expressions},
  booktitle = {Multiword expressions at length and in depth: Extended papers from the {MWE} 2017 workshop},
  pages     = {87--147},
  publisher = {Language Science Press},
  location  = {Berlin},
  year      = {2018},
  month     = oct,
  isbn      = {978-3-96110-123-8},
  doi       = {10.5281/zenodo.1471591},
  abstract  = {Multiword expressions (MWEs) are known as a 'pain in the neck' due to their idiosyncratic behaviour. While some categories of MWEs have been largely studied, verbal MWEs (VMWEs) such as to take a walk, to break one's heart or to turn off have been relatively rarely modelled. We describe an initiative meant to bring about substantial progress in understanding, modelling and processing VMWEs. In this joint effort carried out within a European research network we elaborated a universal terminology and annotation methodology for VMWEs. Its main outcomes, available under open licenses, are unified annotation guidelines, and a corpus of over 5.4 million words and 62 thousand annotated VMWEs in 18 languages.},
}

@incollection{TTNWW,
  author    = {Marc Kemps-Snijders and Ineke Schuurman and Walter Daelemans and Kris Demuynck and Brecht Desplanques and Véronique Hoste and Marijn Huijbregts and Jean-Pierre Martens and Hans Paulussen and Joris Pelemans and Martin Reynaert and Vincent Vandeghinste and Antal van den Bosch and Henk van den Heuvel and Maarten van Gompel and Gertjan van Noord and Patrick Wambacq},
  title     = {{TTNWW} to the Rescue: No Need to Know How to Handle Tools and Resources},
  booktitle = {{CLARIN} in the Low Countries},
  pages     = {83--94},
  publisher = {Ubiquity Press},
  year      = {2017},
  isbn      = {9781911529248},
  url       = {http://www.jstor.org/stable/j.ctv3t5qjk.14},
  abstract  = {The idea behind the Flemish/Dutch CLARIN project TTNWW¹ (’TST Tools voor het Nederlands als Webservices in een Workflow’, or ‘NLP Tools for Dutch as Web services in a Workflow’) was that many end users of resources and tools offered by CLARIN will not know how to use them, just as they will not know where they are located. With respect to the location, the CLARIN policy is that the Human and Social Sciences (HSS) researcher does not need to know this as the infrastructure will take care of that: the only thing the user needs to do is to indicate},
}

@techreport{SOFTWAREQUALITY,
  author      = {M. {van Gompel} and J. Noordzij and R. {de Valk} and A. Scharnhorst},
  title       = {Guidelines for Software Quality},
  institution = {CLARIAH},
  year        = {2018},
  url         = {https://github.com/CLARIAH/software-quality-guidelines/raw/v1.0/softwareguidelines.pdf},
}

@article{COLIBRICORE,
  author    = {van Gompel, Maarten and van den Bosch, Antal},
  title     = {Efficient n-gram, skipgram and flexgram modelling with {Colibri Core}},
  journal   = {Journal of Open Research Software},
  volume    = {4},
  number    = {1},
  year      = {2016},
  publisher = {Ubiquity Press},
  doi       = {10.5334/jors.105},
  url       = {https://openresearchsoftware.metajnl.com/articles/10.5334/jors.105/},
  abstract  = {Counting n-grams lies at the core of any frequentist corpus analysis and is often considered a trivial matter. Going beyond consecutive n-grams to patterns such as skipgrams and flexgrams increases the demand for efficient solutions. The need to operate on big corpus data does so even more. Lossless compression and non-trivial algorithms are needed to lower the memory demands, yet retain good speed. Colibri Core is software for the efficient computation and querying of n-grams, skipgrams and flexgrams from corpus data. The resulting pattern models can be analysed and compared in various ways. The software offers a programming library for C++ and Python, as well as command-line tools.},
}

@article{COLIBRITAFINAL,
  author    = {{van Gompel}, Maarten and {van den Bosch}, Antal},
  title     = {The role of context information in {L2} translation assistance},
  journal   = {International Journal of Translation},
  volume    = {28},
  number    = {1--2},
  year      = {2016},
  publisher = {Bahri Publications},
}

@article{F1000RESEARCH,
  author   = {Jiménez, R. C. and Kuzak, M. and Alhamdoosh, M. and Barker, M. and Batut, B. and Borg, M. and Capella-Gutierrez, S. and Chue Hong, N. and Cook, M. and Corpas, M. and Flannery, M. and Garcia, L. and Gelpí, J. L. and Gladman, S. and Goble, C. and González Ferreiro, M. and Gonzalez-Beltran, A. and Griffin, P. C. and Grüning, B. and Hagberg, J. and Holub, P. and Hooft, R. and Ison, J. and Katz, D. S. and Leskošek, B. and López Gómez, F. and Oliveira, L. J. and Mellor, D. and Mosbergen, R. and Mulder, N. and Perez-Riverol, Y. and Pergl, R. and Pichler, H. and Pope, B. and Sanz, F. and Schneider, M. V. and Stodden, V. and Suchecki, R. and Svobodová Vařeková, R. and Talvik, H. A. and Todorov, I. and Treloar, A. and Tyagi, S. and van Gompel, M. and Vaughan, D. and Via, A. and Wang, X. and Watson-Haigh, N. S. and Crouch, S.},
  title    = {Four simple recommendations to encourage best practices in research software},
  journal  = {F1000Research},
  volume   = {6},
  number   = {876},
  year     = {2017},
  doi      = {10.12688/f1000research.11407.1},
  url      = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5490478/},
  abstract = {Scientific research relies on computer software, yet software is not always developed following practices that ensure its quality and sustainability. This manuscript does not aim to propose new software development best practices, but rather to provide simple recommendations that encourage the adoption of existing best practices. Software development best practices promote better quality software, and better quality software improves the reproducibility and reusability of research. These recommendations are designed around Open Source values, and provide practical suggestions that contribute to making research software and its source code more discoverable, reusable and transparent. This manuscript is aimed at developers, but also at organisations, projects, journals and funders that can increase the quality and sustainability of research software by encouraging the adoption of these recommendations.},
}

@inproceedings{Reynaert+15,
  author    = {M. Reynaert and M. van Gompel and K. van der Sloot and A. van den Bosch},
  title     = {{PICCL}: {Philosophical} {Integrator} of {Computational} and {Corpus} {Libraries}},
  booktitle = {Proceedings of {CLARIN} {Annual} {Conference} 2015 -- {Book} of {Abstracts}},
  publisher = {CLARIN ERIC},
  year      = {2015},
  url       = {http://www.clarin.eu/sites/default/files/book%20of%20abstracts%202015.pdf},
}

@inproceedings{COLIBRITAPILOT,
  author    = {M. van Gompel and A. van den Bosch},
  title     = {Translation Assistance by Translation of {L1} Fragments in an {L2} Context},
  booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {871--880},
  publisher = {Association for Computational Linguistics},
  year      = {2014},
  month     = jun,
  url       = {http://www.aclweb.org/anthology/P14-1082},
  abstract  = {In this paper we present new research in translation assistance. We describe a system capable of translating native language (L1) fragments to foreign language (L2) fragments in an L2 context. Practical applications of this research can be framed in the context of second language learning. The type of translation assistance system under investigation here encourages language learners to write in their target language while allowing them to fall back to their native language in case the correct word or expression is not known. These code switches are subsequently translated to L2 given the L2 context. We study the feasibility of exploiting cross-lingual context to obtain high-quality translation suggestions that improve over statistical language modelling and word-sense disambiguation baselines. A classification-based approach is presented that is indeed found to improve significantly over these baselines by making use of a contextual window spanning a small number of neighbouring words.},
}

@article{TSCAN,
  author   = {{Pander Maat}, H. and R. Kraf and A. van den Bosch and N. Dekker and M. van Gompel and S. Kleijn and T. Sanders and K. van der Sloot},
  title    = {{T-Scan}: a new tool for analyzing {Dutch} text},
  journal  = {Computational Linguistics in the Netherlands Journal},
  volume   = {4},
  pages    = {53--74},
  year     = {2014},
  issn     = {2211-4009},
  url      = {https://www.clinjournal.org/sites/clinjournal.org/files/05-PanderMaat-etal-CLIN2014.pdf},
  abstract = {T-Scan is a new tool for analyzing Dutch text. It aims at extracting text features that are theoretically interesting, in that they relate to genre and text complexity, as well as practically interesting, in that they enable users and text producers to make text-specific diagnoses. T-Scan derives it features from tools such as Frog and Alpino, and resources such as SoNaR, SUBTLEX-NL and Referentie Bestand Nederlands. This paper offers a qualitative discussion of a number of T-Scan features, based on a minimal demonstration corpus of six texts, three of them scientific articles and three of them drawn from a women’s magazine. We discuss features concerning lexical complexity, sentence complexity, referential cohesion and lexical diversity, lexical semantics and personal style. For all these domains we examine the construct validity as well as the reliability of a number of important features. We conclude that T-Scan offers a number of promising lexical and syntactic features, while the interpretation of referential cohesion/ lexical diversity features and personal style features is less clear. Further developing the application and analyzing authentic text need to go hand in hand.},
}

@inproceedings{CLAMPAPER,
  author    = {M. van Gompel and M. Reynaert},
  title     = {{CLAM}: Quickly deploy {NLP} command-line tools on the web},
  booktitle = {Proceedings of {COLING} 2014, the 25th International Conference on Computational Linguistics: System Demonstrations},
  pages     = {71--75},
  publisher = {Dublin City University and Association for Computational Linguistics},
  location  = {Dublin, Ireland},
  year      = {2014},
  url       = {http://aclweb.org/anthology/C14-2016},
  abstract  = {In this paper we present the software CLAM; the Computational Linguistics Application Mediator. CLAM is a tool that allows you to quickly and transparently transform command-line NLP tools into fully-fledged RESTful webservices with which automated clients can communicate, as well as a generic webapplication interface for human end-users.},
}

@techreport{CLAMDOC,
  author      = {M. van Gompel},
  title       = {{CLAM}: Computational Linguistics Application Mediator},
  number      = {Language and Speech Technology Technical Report Series 14-02},
  institution = {Radboud University Nijmegen},
  year        = {2014},
  url         = {https://github.com/proycon/clam/raw/v2.3.6/docs/clam_manual.pdf},
}

@techreport{FOLIADOC,
  author      = {M. van Gompel},
  title       = {{FoLiA}: Format for Linguistic Annotation. Documentation},
  number      = {Language and Speech Technology Technical Report Series 14-01},
  institution = {Radboud University Nijmegen},
  year        = {2014},
  url         = {https://github.com/proycon/folia/raw/v1.5.1.60/docs/folia.pdf},
}

@inproceedings{SEMEVAL2014TASK5,
  author    = {M. van Gompel and I. Hendrickx and A. van den Bosch and E. Lefever and V. Hoste},
  title     = {{SemEval-2014} {Task} 5: {L2} writing assistant},
  booktitle = {Proceedings of the 8th International Workshop on Semantic Evaluation ({SemEval} 2014)},
  year      = {2014},
  url       = {http://aclweb.org/anthology/S14-2005},
  abstract  = {We present a new cross-lingual task for SemEval concerning the translation of L1 fragments in an L2 context. The task is at the boundary of Cross-Lingual Word Sense Disambiguation and Machine Translation. It finds its application in the field of computer-assisted translation, particularly in the context of second language learning. Translating L1 fragments in an L2 context allows language learners when writing in a target language (L2) to fall back to their native language (L1) whenever they are uncertain of the right word or phrase.},
}

@article{OERSETTER,
  author    = {M. van Gompel and A. van den Bosch and A. Dykstra},
  title     = {{Oersetter}: {Frisian-Dutch} statistical machine translation},
  journal   = {Philologia Frisica anno 2012},
  pages     = {287--296},
  publisher = {Fryske Akademy},
  year      = {2014},
  url       = {http://hdl.handle.net/2066/129749},
  abstract  = {In this paper we present a statistical machine translation (SMT) system for Frisian to Dutch and Dutch to Frisian. A parallel training corpus has been established, which has subsequently been used to automatically learn a phrase-based SMT model. The translation system is built around the open-source SMT software Moses. The resulting system, named Oersetter, is released as a website for human end users, as well as a web service for software to interact with. We here discuss the workings, setup and performance of our system, which to our knowledge is the very first Frisian-Dutch SMT system.},
}

@inproceedings{CMCTEI,
  author    = {M. Beißwenger and T. Chanier and I. Chiari and M. Ermakova and van Gompel, M. and I. Hendrickx and A. Herold and van den Heuvel, H. and L. Lemnitzer and A. Storrer and others},
  title     = {Computer-mediated communication in {TEI}: What lies ahead},
  booktitle = {The Linked {TEI}: Text Encoding in the Web. 2013 Annual Conference and Members' Meeting of the {TEI} Consortium},
  year      = {2013},
}

@article{FOLIAPAPER,
  author   = {M. van Gompel and M. Reynaert},
  title    = {{FoLiA}: A practical {XML} Format for Linguistic Annotation -- a descriptive and comparative study},
  journal  = {Computational Linguistics in the Netherlands Journal},
  volume   = {3},
  year     = {2013},
  url      = {http://clinjournal.org/sites/clinjournal.org/files/05-vanGompel-Reynaert-CLIN2013.pdf},
  abstract = {In this paper we present FoLiA, a Format for Linguistic Annotation, and conduct a comparative study with other annotation schemes, including the Linguistic Annotation Framework (LAF), the Text Encoding Initiative (TEI) and Text Corpus Format (TCF). An additional point of focus is the interoperability between FoLiA and metadata standards such as the Component MetaData Infrastructure (CMDI), as well as data category registries such as ISOcat. The aim of the paper is to present a clear image of the capabilities of FoLiA and how it relates to other formats. This should open discussion and aid users in their decision for a particular format. FoLiA is a practically-oriented XML-based annotation format for the representation of language resources, explicitly supporting a wide variety of annotation types. It introduces a flexible and uniform paradigm and a representation independent of language or label set. It is designed to be highly expressive, generic, and formalised, whilst at the same time focussing on being as practical as possible to ease its adoption and implementation. The aspiration is to offer a generic format for storage, exchange, and machine-processing of linguistically annotated documents, preventing users as well as software tools from having to cope with a wide variety of different formats, which in the field regularly causes convertibility issues and proliferation of ad-hoc formats. FoLiA emerged from such a practical need in the context of Computational Linguistics in the Netherlands and Flanders. It has been successfully adopted by numerous projects within this community. FoLiA was developed in a bottom-up fashion, with special emphasis on software libraries and tools to handle it.},
}

@inproceedings{WSD2,
  author    = {M. van Gompel and A. van den Bosch},
  title     = {{WSD2}: parameter optimisation for memory-based cross-lingual word-sense disambiguation},
  booktitle = {Proceedings of the 7th International Workshop on Semantic Evaluation ({SemEval} 2013), in conjunction with the Second Joint Conference on Lexical and Computational Semantics},
  publisher = {Association for Computational Linguistics},
  address   = {New Brunswick, NJ},
  year      = {2013},
  url       = {https://www.aclweb.org/anthology/S13-2033},
  abstract  = {We present our system WSD2 which participated in the Cross-Lingual Word-Sense Disambiguation task for SemEval 2013 (Lefever and Hoste, 2013). The system closely resembles our winning system for the same task in SemEval 2010. It is based on k-nearest neighbour classifiers which map words with local and global context features onto their translation, i.e. their cross-lingual sense. The system participated in the task for all five languages and obtained winning scores for four of them when asked to predict the best translation(s). We tested various configurations of our system, focusing on various levels of hyperparameter optimisation and feature selection. Our final results indicate that hyperparameter optimisation did not lead to the best results, indicating overfitting by our optimisation method in this aspect. Feature selection does have a modest positive impact.},
}

@inproceedings{SONAR,
  author    = {M. Reynaert and I. Schuurman and V. Hoste and N. Oostdijk and M. van Gompel},
  title     = {Beyond {SoNaR}: towards the facilitation of large corpus building efforts},
  booktitle = {Proceedings of the Eighth International conference on Language Resources and Evaluation ({LREC})},
  volume    = {8},
  year      = {2012},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/pdf/748_Paper.pdf},
  abstract  = {In this paper we report on the experiences gained in the recent construction of the SoNaR corpus, a 500 MW reference corpus of contemporary, written Dutch. It shows what can realistically be done within the confines of a project setting where there are limitations to the duration in time as well to the budget, employing current state-of-the-art tools, standards and best practices. By doing so we aim to pass on insights that may be beneficial for anyone considering to undertake an effort towards building a large, varied yet balanced corpus for use by the wider research community. Various issues are discussed that come into play while compiling a large corpus, including approaches to acquiring texts, the arrangement of IPR, the choice of text formats, and steps to be taken in the preprocessing of data from widely different origins. We describe FoLiA, a new XML format geared at rich linguistic annotations. We also explain the rationale behind the investment in the high-quality semi-automatic enrichment of a relatively small (1 MW) subset with very rich syntactic and semantic annotations. Finally, we present some ideas about future developments and the direction corpus development may take, such as setting up an integrated work flow between web services and the potential role for ISOcat. We list tips for potential corpus builders, tricks they may want to try and further recommendations regarding technical developments future corpus builders may wish to hope for.},
}

@techreport{UCTO12,
  author      = {M. {van Gompel} and K. {van der Sloot} and A. {van den Bosch}},
  title       = {{Ucto}: {Unicode} Tokeniser. Version 0.5.3. {Reference Guide}},
  number      = {ILK 12-05},
  institution = {ILK Research Group, Tilburg University},
  location    = {Tilburg, The Netherlands},
  year        = {2012},
  url         = {https://github.com/LanguageMachines/ucto/raw/v0.14.1/docs/ucto_manual.pdf},
}

@inproceedings{DUTCHSEMCOR,
  author    = {P. Vossen and A. Görög and F. Laan and M. van Gompel and R. Izquierdo-Bevia and A. van den Bosch},
  title     = {{DutchSemCor}: building a semantically annotated corpus for {Dutch}},
  booktitle = {Electronic lexicography in the 21st century: New Applications for New Users: Proceedings of {eLex} 2011, {Bled}, 10-12 November 2011},
  year      = {2011},
  url       = {https://repository.ubn.ru.nl/handle/2066/94383},
  abstract  = {State of the art Word Sense Disambiguation (WSD) systems require large sense-tagged corpora along with lexical databases to reach satisfactory results. The number of English language resources for developed WSD increased in the past years, while most other languages are still under-resourced. The situation is no different for Dutch. In order to overcome this data bottleneck, the DutchSemCor project will deliver a Dutch corpus that is sense-tagged with senses from the Cornetto lexical database. Part of this corpus (circa 300K examples) is manually tagged. The remainder is automatically tagged using different WSD systems and validated by human annotators. The project uses existing corpora compiled in other projects; these are extended with Internet examples for word senses that are less frequent and do not (sufficiently) appear in the corpora. We report on the status of the project and the evaluations of the WSD systems with the current training data.},
}

@inproceedings{UVTWSD1,
  author        = {M. {van Gompel}},
  title         = {{UvT-WSD1}: A cross-lingual word sense disambiguation system},
  booktitle     = {{SemEval} '10: Proceedings of the 5th International Workshop on Semantic Evaluation},
  pages         = {238--241},
  publisher     = {Association for Computational Linguistics},
  address       = {Morristown, NJ, USA},
  location      = {Los Angeles, California},
  year          = {2010},
  url           = {http://aclweb.org/anthology/S10-1053},
  keywords      = {ilk, vici, dutchsemcor, wsd, semeval, cross-lingual, word sense disambiguation},
  date-modified = {2011-02-01 22:27:37 +0100},
  abstract      = {This paper describes the Cross-Lingual Word Sense Disambiguation system UvT-WSD1, developed at Tilburg University, for participation in two SemEval-2 tasks: the Cross-Lingual Word Sense Disambiguation task and the Cross-Lingual Lexical Substitution task. The UvT-WSD1 system makes use of k-nearest neighbour classifiers, in the form of single-word experts for each target word to be disambiguated. These classifiers can be constructed using a variety of local and global context features, and these are mapped onto the translations, i.e. the senses, of the words. The system works for a given language-pair, either English-Dutch or English-Spanish in the current implementation, and takes a word-aligned parallel corpus as its input.},
}

@inproceedings{PBMBMTPAPER,
  author        = {M. {van Gompel} and A. {van den Bosch} and P. Berck},
  title         = {Extending memory-based machine translation to phrases},
  booktitle     = {Proceedings of the Third Workshop on Example-Based Machine Translation},
  editor        = {M. Forcada and A. Way},
  pages         = {79--86},
  address       = {Dublin, Ireland},
  year          = {2009},
  url           = {https://ilk.uvt.nl/mbmt/pbmbmt/pbmbmt-dublin.pdf},
  keywords      = {ilk, dutchsemcor, memory-based machine translation, vici, pbmbmt, mbmt},
  date-added    = {2010-01-02 19:47:11 +0100},
  date-modified = {2011-02-01 22:27:45 +0100},
  abstract      = {We present a phrase-based extension to memory-based machine translation. This form of example-based machine translation employs lazy-learning classifiers to translate fragments of the source sentence to fragments of the target sentence. Source-side fragments consist of variable-length phrases in a local context of neighboring words, translated by the classifier to a target-language phrase. We compare three methods of phrase extraction, and present a new decoder that reassembles the translated fragments into one final translation. Results show that one of the proposed phrase-extraction methods—the one used in Moses—leads to a translation system that outperforms context-sensitive word-based approaches. The differences, however, are small, arguably because the word-based approaches already capture phrasal context implicitly due to their source-side and target-side context sensitivity.},
}

@mastersthesis{MASTERSTHESIS,
  author  = {M. {van Gompel}},
  title   = {Phrase-based Memory-based Machine Translation},
  school  = {Tilburg University},
  address = {the Netherlands},
  number  = {HAIT Master Thesis series nr. 09-003},
  year    = {2009},
  url     = {https://proycon.anaproy.nl/pubs/pbmbmt_thesis.pdf},
}