@misc{BrzozowskiETAL2023:geniegraphs, author = {Lukasz Brzozowski and Grzegorz Siudem and Marek Gagolewski}, title = {Community detection in complex networks via node similarity, graph representation learning, and hierarchical clustering}, year = {2023}, publisher = {arXiv}, doi = {10.48550/arXiv.2303.12212}, preprint = {https://arxiv.org/pdf/2303.12212.pdf}, note = {under review (preprint)} } @article{BertoliBarsottiETAL2024:inequality3dsi, author = {Lucio Bertoli-Barsotti and Marek Gagolewski and Grzegorz Siudem and Barbara Żogała-Siudem}, title = {Equivalence of inequality indices in the three-dimensional model of informetric impact}, year = {2024}, doi = {10.1016/j.joi.2024.101566}, journal = {Journal of Informetrics}, volume = {18}, number = {4}, pages = {101566}, preprint = {https://arxiv.org/pdf/2304.07479.pdf}, abstract = { Inequality is an inherent part of our lives: we see it in the distribution of incomes, talents, citations, to name a few. However, its intensity varies across environments: there are systems where the available resources are relatively evenly distributed but also where a small group of items or agents controls the majority of assets. Numerous indices for quantifying the degree of inequality have been proposed but in general, they work quite differently. We recently observed (Siudem et al., 2020) that many rank-size distributions might be approximated by a time-dependent agent-based model involving a mixture of preferential (rich-get-richer) and accidental (sheer chance) attachment. In this paper, we point out its relationship to an iterative process that generates rank distributions of any length and a predefined level of inequality, as measured by the Gini index. We prove that, under our model, the Gini, Bonferroni, De Vergottini, and Hoover indices are equivalent for samples of similar sizes. Given one of them, we can recreate the value of another measure. 
Thanks to the obtained formulae, we can also understand how they depend on the sample size. An empirical analysis of a large database of citation records in economics (RePEc) yields a good match with our theoretical derivations. }, keywords = { Gini index; Bonferroni index; Power law; Rich-get-richer; Inequality; Sensitivity } } @inproceedings{BeliakovETAL2024:tractable-choquet, author = {Gleb Beliakov and Simon James and Jian-Zhang Wu and Marek Gagolewski}, booktitle = {Proc. 2024 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)}, doi = {10.1109/FUZZ-IEEE60900.2024.10611753}, pages = {10611753}, publisher = {IEEE}, title = {A Tractable Approach to Fitting the {C}hoquet Integral for Explainable Prediction and Analysis}, year = {2024}, abstract = { When it comes to explainable prediction, there is great potential for modeling data with high accuracy and flexibility using fuzzy integrals such as the Choquet integral. In this contribution, we investigate the trade-off between flexibility and tractability when learning fuzzy measures, and propose a method involving random subset selection for reducing the size of the fitting problem when datasets are too large for learning a general fuzzy measure. We conduct some numerical experiments to compare some existing simplification approaches and show that random subset selection, especially when based on partitions, could serve as a suitable compromise if we want to incorporate interaction between larger subsets. We note the savings in both the number of variables and number of constraints required depending on how the random subsets are chosen. 
} } @article{GagolewskiETAL2024:cvimst, author = {Marek Gagolewski and Anna Cena and Maciej Bartoszuk and Lukasz Brzozowski}, title = {Clustering with minimum spanning trees: {H}ow good can it be?}, journal = {Journal of Classification}, year = {2024}, preprint = {https://link.springer.com/content/pdf/10.1007/s00357-024-09483-1.pdf}, doi = {10.1007/s00357-024-09483-1}, note = {in press}, abstract = { Minimum spanning trees (MSTs) provide a convenient representation of datasets in numerous pattern recognition activities. Moreover, they are relatively fast to compute. In this paper, we quantify the extent to which they are meaningful in low-dimensional partitional data clustering tasks. By identifying the upper bounds for the agreement between the best (oracle) algorithm and the expert labels from a large battery of benchmark data, we discover that MST methods can be very competitive. Next, we review, study, extend, and generalise a few existing, state-of-the-art MST-based partitioning schemes. This leads to some new noteworthy approaches. Overall, the Genie and the information-theoretic methods often outperform the non-MST algorithms such as K-means, Gaussian mixtures, spectral clustering, Birch, density-based, and classical hierarchical agglomerative procedures. Nevertheless, we identify that there is still some room for improvement, and thus the development of novel algorithms is encouraged. 
}, keywords = { hierarchical partitional clustering; minimum spanning tree; MST; cluster validity measure; single linkage; Genie algorithm; mutual information } } @article{Gagolewski2024:nca, author = {Marek Gagolewski}, title = {Normalised clustering accuracy: {A}n asymmetric external cluster validity measure}, journal = {Journal of Classification}, year = {2024}, preprint = {https://link.springer.com/content/pdf/10.1007/s00357-024-09482-2.pdf}, doi = {10.1007/s00357-024-09482-2}, note = {in press}, abstract = { There is no, nor will there ever be, single best clustering algorithm. Nevertheless, we would still like to be able to distinguish between methods that work well on certain task types and those that systematically underperform. Clustering algorithms are traditionally evaluated using either internal or external validity measures. Internal measures quantify different aspects of the obtained partitions, e.g., the average degree of cluster compactness or point separability. However, their validity is questionable because the clusterings they endorse can sometimes be meaningless. External measures, on the other hand, compare the algorithms' outputs to fixed ground truth groupings provided by experts. In this paper, we argue that the commonly used classical partition similarity scores, such as the normalised mutual information, Fowlkes--Mallows, or adjusted Rand index, miss some desirable properties. In particular, they do not identify worst-case scenarios correctly, nor are they easily interpretable. As a consequence, the evaluation of clustering algorithms on diverse benchmark datasets can be difficult. To remedy these issues, we propose and analyse a new measure: a version of the optimal set-matching accuracy, which is normalised, monotonic with respect to some similarity relation, scale-invariant, and corrected for the imbalancedness of cluster sizes (but neither symmetric nor adjusted for chance). 
}, keywords = { clustering; external cluster validity; optimal set matching; normalisation; accuracy; adjusted Rand index; mutual information } } @article{BertoliBarsottiETAL2024:lorenz, author = {Lucio Bertoli-Barsotti and Marek Gagolewski and Grzegorz Siudem and Barbara Żogała-Siudem}, title = {{G}ini-stable {L}orenz curves and their relation to the generalised {P}areto distribution}, year = {2024}, journal = {Journal of Informetrics}, doi = {10.1016/j.joi.2024.101499}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2024-gini-stable-lorenz-curves.pdf}, volume = {18}, number = {2}, pages = {101499}, abstract = { We introduce an iterative discrete information production process where we can extend ordered normalised vectors by new elements based on a simple affine transformation, while preserving the predefined level of inequality, G, as measured by the Gini index. Then, we derive the family of empirical Lorenz curves of the corresponding vectors and prove that it is stochastically ordered with respect to both the sample size and G which plays the role of the uncertainty parameter. We prove that asymptotically, we obtain all, and only, Lorenz curves generated by a new, intuitive parametrisation of the finite-mean Pickands' Generalised Pareto Distribution (GPD) that unifies three other families, namely: the Pareto Type II, exponential, and scaled beta distributions. The family is not only totally ordered with respect to the parameter G, but also, thanks to our derivations, has a nice underlying interpretation. Our result may thus shed a new light on the genesis of this family of distributions. Our model fits bibliometric, informetric, socioeconomic, and environmental data reasonably well. It is quite user-friendly for it only depends on the sample size and its Gini index. 
}, keywords = { Gini index; Lorenz ordering; inequality; Generalised Pareto Distributions; information production process; econometrics and scientometrics } } @article{WuETAL2024:randomfm, author = {Jian-Zhang Wu and Gleb Beliakov and Simon James and Marek Gagolewski}, title = {Random generation of linearly constrained fuzzy measures and domain coverage performance evaluation}, journal = {Information Sciences}, year = {2024}, volume = {659}, pages = {120080}, doi = {10.1016/j.ins.2023.120080}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2024-random-linearly-constrained-fuzzy-measures.pdf}, abstract = { The random generation of fuzzy measures under complex linear constraints holds significance in various fields, including optimization solutions, machine learning, decision making, and property investigation. However, most existing random generation methods primarily focus on addressing the monotonicity and normalization conditions inherent in the construction of fuzzy measures, rather than the linear constraints that are crucial for representing special families of fuzzy measures and additional preference information. In this paper, we present two categories of methods to address the generation of linearly constrained fuzzy measures using linear programming models. These methods enable a comprehensive exploration and coverage of the entire feasible convex domain. The first category involves randomly selecting a subset and assigning measure values within the allowable range under given linear constraints. The second category utilizes convex combinations of constrained extreme fuzzy measures and vertex fuzzy measures. Then we employ some indices of fuzzy measures, objective functions, and distances to domain boundaries to evaluate the coverage performance of these methods across the entire feasible domain. We further provide enhancement techniques to improve the coverage ratios. 
Finally, we discuss and demonstrate potential applications of these generation methods in practical scenarios. }, keywords = {Fuzzy measure; random generation; linear programming; domain coverage; convex combination} } @article{GagolewskiETAL2023:owalink, author = {Marek Gagolewski and Anna Cena and Simon James and Gleb Beliakov}, title = {Hierarchical clustering with {OWA}-based linkages, the {L}ance--{W}illiams formula, and dendrogram inversions}, year = {2023}, journal = {Fuzzy Sets and Systems}, preprint = {https://arxiv.org/pdf/2303.05683.pdf}, doi = {10.1016/j.fss.2023.108740}, volume = {473}, pages = {108740}, abstract = {Agglomerative hierarchical clustering based on Ordered Weighted Averaging (OWA) operators not only generalises the single, complete, and average linkages, but also includes intercluster distances based on a few nearest or farthest neighbours, trimmed and winsorised means of pairwise point similarities, amongst many others. We explore the relationships between the famous Lance-Williams update formula and the extended OWA-based linkages with weights generated via infinite coefficient sequences. Furthermore, we provide some conditions for the weight generators to guarantee the resulting dendrograms to be free from unaesthetic inversions.}, keywords = {OWA operators; hierarchical clustering; dendrogram; inversion; the Lance-Williams formula} } @book{Gagolewski2023:deepr, author = {Marek Gagolewski}, title = {Deep {R} Programming}, address = {Melbourne}, doi = {10.5281/zenodo.7490464}, isbn = {978-0-6455719-2-9}, edition = {v1.0.0}, note = {🔓}, url = {https://deepr.gagolewski.com/}, year = {2023}, pages = {456}, abstract = { Deep R Programming is a comprehensive and in-depth introductory course on one of the most popular languages for data science. 
It equips ambitious students, professionals, and researchers with the knowledge and skills to become independent users of this potent environment so that they can tackle any problem related to data wrangling and analytics, numerical computing, statistics, and machine learning. This textbook is a non-profit project. Its online and PDF versions are freely available at https://deepr.gagolewski.com/. }, keywords = {R; S; programming; data wrangling; data science; statistics; machine learning; data frames; matrices; vectors; tensors; data cleansing; text processing; graphics}, preprint = {https://deepr.gagolewski.com/deepr.pdf} } @article{BoczekETAL2023:benchmarkint, author = {Michał Boczek and Marek Gagolewski and Marek Kaluszka and Andrzej Okolewski}, title = {A benchmark-type generalization of the {S}ugeno integral with applications in bibliometrics}, journal = {Fuzzy Sets and Systems}, year = {2023}, doi = {10.1016/j.fss.2023.01.014}, volume = {466}, pages = {108479}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2023benchmark-sugeno.pdf}, keywords = {scientometric indices; h-index; Sugeno integral; subadditivity; Jensen's inequality; monotone measure}, abstract = { We propose a new generalization of the classical Sugeno integral motivated by the Hirsch, Woeginger, and other geometrically-inspired indices of scientific impact. The new integral adapts to the rank-size curve better as it allows for putting more emphasis on highly-valued items and/or the tail of the distribution (level measure). We study its fundamental properties and give the conditions guaranteeing the fulfillment of subadditivity as well as the Jensen, Liapunov, Hardy, Markov, and Paley-Zygmund type inequalities. We discuss its applications in scientometrics. 
} } @misc{Gagolewski2022:clustering-data-v1.1.0, author = {Marek Gagolewski and others}, title = {A benchmark suite for clustering algorithms: Version 1.1.0}, year = {2022}, url = {https://github.com/gagolews/clustering-data-v1/releases/tag/v1.1.0}, doi = {10.5281/zenodo.7088171} } @article{Gagolewski2022:clustering-benchmarks, author = {Marek Gagolewski}, title = {A framework for benchmarking clustering algorithms}, journal = {SoftwareX}, year = {2022}, doi = {10.1016/j.softx.2022.101270}, volume = {20}, pages = {101270}, url = {https://clustering-benchmarks.gagolewski.com/}, abstract = { The evaluation of clustering algorithms can involve running them on a variety of benchmark problems, and comparing their outputs to the reference, ground-truth groupings provided by experts. Unfortunately, many research papers and graduate theses consider only a small number of datasets. Also, the fact that there can be many equally valid ways to cluster a given problem set is rarely taken into account. In order to overcome these limitations, we have developed a framework whose aim is to introduce a consistent methodology for testing clustering algorithms. Furthermore, we have aggregated, polished, and standardised many clustering benchmark dataset collections referred to across the machine learning and data mining literature, and included new datasets of different dimensionalities, sizes, and cluster types. An interactive datasets explorer, the documentation of the Python API, a description of the ways to interact with the framework from other programming languages such as R or MATLAB, and other details are all provided at https://clustering-benchmarks.gagolewski.com/. 
}, keywords = { clustering; machine learning; benchmark data; noise points; external cluster validity; partition similarity score }, preprint = {https://arxiv.org/pdf/2209.09493.pdf} } @article{ZogalaETAL2023:interpretable-citation-models, author = {Barbara Żogała-Siudem and Anna Cena and Grzegorz Siudem and Marek Gagolewski}, title = {Interpretable reparameterisations of citation models}, journal = {Journal of Informetrics}, year = {2023}, doi = {10.1016/j.joi.2022.101355}, pages = {101355}, volume = {17}, number = {1}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2023interpretable-citation-models.pdf}, abstract = { This paper aims to find the reasons why some citation models can predict a set of specific bibliometric indices extremely well. We show why fitting a model that preserves the total sum of a vector can be beneficial in the case of heavy-tailed data that are frequently observed in informetrics and similar disciplines. Based on this observation, we introduce the reparameterised versions of the discrete generalised beta distribution (DGBD) and power law models that preserve the total sum of elements in a citation vector and, as a byproduct, they enjoy much better predictive power when predicting many bibliometric indices as well as partial cumulative sums. This also results in the underlying model parameters' being easier to fit numerically. Moreover, they are also more interpretable. Namely, just like in our recently-introduced 3DSI (three dimensions of scientific impact) model, we have a clear distinction between the coefficients determining the total productivity (size), total impact (sum), and those that affect the shape of the resulting theoretical curve. 
}, keywords = {science of science; bibliometric indices; informetrics; citation models; interpretability }, } @book{Gagolewski2021:lmlcr, author = {Marek Gagolewski}, doi = {10.5281/zenodo.3679976}, note = {draft:v0.2.3 🔓}, year = {2022}, address = {Melbourne}, title = {Lightweight Machine Learning Classics with {R}}, url = {https://lmlcr.gagolewski.com/}, abstract = { Explore some of the most fundamental algorithms which have stood the test of time and provide the basis for innovative solutions in data-driven AI. Learn how to use the R language for implementing various stages of data processing and modelling activities. Appreciate mathematics as the universal language for formalising data-intense problems and communicating their solutions. The book is for you if you're yet to be fluent with university-level linear algebra, calculus and probability theory or you've forgotten all the maths you've ever learned, and are seeking a gentle, albeit thorough, introduction to the topic. This textbook is a non-profit project. Its online and PDF versions are freely available at https://lmlcr.gagolewski.com/. 
}, keywords = {machine learning; classification; regression; clustering; recommender systems; optimisation; R}, preprint = {https://lmlcr.gagolewski.com/lmlcr.pdf} } @article{BeliakovGagolewskiJames2022:antibuoyant, author = {Gleb Beliakov and Marek Gagolewski and Simon James}, title = {Reduction of variables and constraints in fitting antibuoyant fuzzy measures to data using linear programming}, journal = {Fuzzy Sets and Systems}, year = {2022}, volume = {451}, pages = {266--284}, doi = {10.1016/j.fss.2022.06.025}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022antibuoyant.pdf}, keywords = {fuzzy measures; Choquet integral; supermodularity; capacities; progressive transfers}, abstract = { The discrete Choquet integral with respect to various types of fuzzy measures serves as an important aggregation function which accounts for mutual dependencies between the inputs. The Choquet integral can be used as an objective (or constraint) in optimisation problems, and the type of fuzzy measure used determines its complexity. This paper examines the class of antibuoyant fuzzy measures, which restrict the supermodular (convex) measures and satisfy the Pigou–Dalton progressive transfers principle. We determine subsets of extreme points of the set of antibuoyant fuzzy measures, whose convex combinations form a basis of three proposed algorithms for random generation of fuzzy measures from that class, and also for fitting fuzzy measures to empirical data or solving best approximation problems. Potential applications of the proposed methods are envisaged in social welfare, ecology, and optimisation. 
} } @article{GerasETAL2022:timetovote, author = {Agnieszka Geras and Grzegorz Siudem and Marek Gagolewski}, journal = {Journal of the Association for Information Science and Technology}, title = {Time to vote: {T}emporal clustering of user activity on {S}tack {O}verflow}, year = {2022}, volume = {73}, number = {12}, pages = {1681--1691}, doi = {10.1002/asi.24658}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022timetovote.pdf}, abstract = { Question-and-answer (Q\&A) sites improve access to information and ease transfer of knowledge. In recent years, they have grown in popularity and importance, enabling research on behavioural patterns of their users. We study the dynamics related to the casting of 7M votes across a sample of 700k posts on Stack Overflow, a large community of professional software developers. We employ log-Gaussian mixture modelling and Markov chains to formulate a simple yet elegant description of the considered phenomena. We indicate that the inter-event times can naturally be clustered into 3 typical time scales: those which occur within hours, weeks, and months and show how the events become rarer and rarer as time passes. It turns out that the posts' popularity in a short period after publication is a weak predictor of its overall success, contrary to what was observed, e.g., in case of YouTube clips. Nonetheless, the sleeping beauties sometimes awake and can receive bursts of votes following each other relatively quickly. 
}, keywords = {clustering; inter-event times; log-normal mixtures; Q\&A networks; Stack Overflow; burstiness} } @article{SiudemNowakGagolewski2022:pricepareto2, title = {Power laws, the {P}rice Model, and the {P}areto type-2 distribution}, author = {Grzegorz Siudem and Przemysław Nowak and Marek Gagolewski}, journal = {Physica A: Statistical Mechanics and its Applications}, doi = {10.1016/j.physa.2022.128059}, year = {2022}, volume = {606}, pages = {128059}, abstract = { We consider a version of D. Price's model for the growth of a bibliographic network, where in each iteration, a constant number of citations is randomly allocated according to a weighted combination of the accidental (uniformly distributed) and the preferential (rich-get-richer) rule. Instead of relying on the typical master equation approach, we formulate and solve this problem in terms of the rank-size distribution. We show that, asymptotically, such a process leads to a Pareto-type 2 distribution with a new, appealingly interpretable parametrisation. We prove that the solution to the Price model expressed in terms of the rank-size distribution coincides with the expected values of order statistics in an independent Paretian sample. An empirical analysis of a large repository of academic papers yields a good fit not only in the tail of the distribution (as it is usually the case in the power law-like framework), but also across a significantly larger fraction of the data domain. 
}, keywords = {Price model; Pareto distribution; power laws; rich get richer; complex networks; citations }, preprint = {https://arxiv.org/pdf/2201.11456} } @book{Gagolewski2022:datawranglingpy, author = {Marek Gagolewski}, title = {Minimalist Data Wrangling with {P}ython}, doi = {10.5281/zenodo.6451068}, isbn = {978-0-6455719-1-2}, address = {Melbourne}, edition = {v1.0.3}, note = {🔓}, pages = {442}, url = {https://datawranglingpy.gagolewski.com/}, year = {2023}, abstract = { Minimalist Data Wrangling with Python is envisaged as a student's first introduction to data science, providing a high-level overview as well as discussing key concepts in detail. We explore methods for cleaning data gathered from different sources, transforming, selecting, and extracting features, performing exploratory data analysis and dimensionality reduction, identifying naturally occurring data clusters, modelling patterns in data, comparing data between groups, and reporting the results. This textbook is a non-profit project. Its online and PDF versions are freely available at https://datawranglingpy.gagolewski.com/. }, keywords = {data wrangling; data science; Python; numpy; scipy; pandas; matplotlib; regression; classification; clustering; scikit-learn; time series; text processing; data frames; matrices; vectors; data cleansing; missing values; outliers}, preprint = {https://datawranglingpy.gagolewski.com/datawranglingpy.pdf} } @book{Gagolewski2022:aipp, author = {Marek Gagolewski}, title = {Algorytmy i podstawy programowania w języku {C++} (Introduction to Algorithms and Programming in {C++})}, url = {https://github.com/gagolews/aipp}, address = {Melbourne}, doi = {10.5281/zenodo.6451054}, isbn = {978-0-6455719-0-5}, year = {2022}, pages = {209}, edition = {v1.2.0}, note = {🇵🇱 🔓}, abstract = { Skrypt do wykładu z Algorytmów i podstaw programowania w języku C++, prowadzonego w latach 2010–2016 na Wydziale Matematyki i Nauk Informacyjnych Politechniki Warszawskiej dla studentów I roku kierunku Matematyka. 
Zawiera wiele przykładowych zadań na ćwiczenia i laboratoria. Książka dystrybuowana jest bezpłatnie. }, keywords = {algorytmy; programowanie; C++}, preprint = {https://raw.githubusercontent.com/gagolews/aipp/master/aipp.pdf} } @article{GagolewskiETAL2022:ockham, author = {Marek Gagolewski and Barbara Żogała-Siudem and Grzegorz Siudem and Anna Cena}, journal = {Scientometrics}, title = {{O}ckham's index of citation impact}, year = {2022}, doi = {10.1007/s11192-022-04345-2}, volume = {127}, pages = {2829--2845}, abstract = { We demonstrate that by using a triple of simple numerical summaries: an author's productivity, their overall impact, and a single other bibliometric index that aims to capture the shape of the citation distribution, we can reconstruct other popular metrics of bibliometric impact with a sufficient degree of precision. We thus conclude that the use of many indices may be unnecessary – entities should not be multiplied beyond necessity. Such a study was possible thanks to our new agent-based model (Siudem, Żogała-Siudem, Cena, Gagolewski; PNAS 117; 2020), which not only assumes that citations are distributed according to a mixture of the rich-get-richer rule and sheer chance, but also fits real bibliometric data quite well. We investigate which bibliometric indices have good discriminative power, which measures can be easily predicted as functions of other ones, and what implications to the research evaluation practice our findings have.}, keywords = {3DSI model; h-index; g-index; w-index; equivalence of bibliometric indices}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022ockham.pdf} } @article{MrowinskiETAL2022:journals, author = {Maciej J. 
Mrowiński and Marek Gagolewski and Grzegorz Siudem}, journal = {Journal of Informetrics}, title = {Accidentality in journal citation patterns}, year = {2022}, volume = {16}, number = {4}, pages = {101341}, doi = {10.1016/j.joi.2022.101341}, abstract = { We study an agent-based model for generating citation distributions in complex networks of scientific papers, where a fraction of citations is allotted according to the preferential attachment rule (rich get richer) and the remainder is allocated accidentally (purely at random, uniformly). Previously, we derived and analysed such a process in the context of describing individual authors, but now we apply it to scientific journals in computer and information sciences. Based on the large DBLP dataset as well as the CORE (Computing Research and Education Association of Australasia) journal ranking, we find that the impact of journals is correlated with the degree of accidentality of their citation distribution. Citations to impactful journals tend to be more preferential, while citations to lower-ranked journals are distributed in a more accidental manner. Further, applied fields of research such as artificial intelligence seem to be driven by a stronger preferential component – and hence have a higher degree of inequality – than the more theoretical ones, e.g., mathematics and computation theory. }, keywords = {complex networks; DBLP; CORE; rich get richer; Pareto principle}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022journals.pdf} } @article{CenaETAL2022:validcit, author = {Anna Cena and Marek Gagolewski and Grzegorz Siudem and Barbara Żogała-Siudem}, journal = {Journal of Informetrics}, title = {Validating citation models by proxy indices}, year = {2022}, doi = {10.1016/j.joi.2022.101267}, volume = {16}, number = {2}, pages = {101267}, abstract = { There are many approaches to the modelling of citation vectors of individual authors. 
Models may serve different purposes, but usually they are evaluated with regards to how well they align to citation distributions in large networks of papers. Here we compare a few leading models in terms of their ability to correctly reproduce the values of selected bibliometric indices of individual authors. Our recently-proposed three-dimensional model of scientific impact serves this purpose equally well as the discrete generalised beta distribution and the log-normal models, but has fewer parameters which additionally are all easy to interpret. We also indicate which indices can be predicted with high accuracy and which are more difficult to model.}, keywords = {science of science; bibliometric indices; scientometrics; citation models; power law}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022validcit.pdf} } @article{Gagolewski2022:stringi, author = {Marek Gagolewski}, title = {{stringi}: {F}ast and portable character string processing in {R}}, journal = {Journal of Statistical Software}, year = {2022}, doi = {10.18637/jss.v103.i02}, volume = {103}, number = {2}, pages = {1--59}, url = {https://stringi.gagolewski.com/}, keywords = {stringi; character strings; text; ICU; Unicode; regular expressions; data cleansing; natural language processing; R}, abstract = { Effective processing of character strings is required at various stages of data analysis pipelines: from data cleansing and preparation, through information extraction, to report generation. Pattern searching, string collation and sorting, normalisation, transliteration, and formatting are ubiquitous in text mining, natural language processing, and bioinformatics. 
This paper discusses and demonstrates how and why stringi, a mature R package for fast and portable handling of string data based on the ICU library (International Components for Unicode), should be included in each statistician's or data scientist's repertoire to complement their numerical computing and data wrangling skills.}, preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022stringi.pdf} } @article{BeliakovGagolewskiJames2022:mobius, author = {Gleb Beliakov and Marek Gagolewski and Simon James}, title = {Hierarchical data fusion processes involving the {M}öbius representation of capacities}, journal = {Fuzzy Sets and Systems}, year = {2022}, doi = {10.1016/j.fss.2021.02.006}, pages = {1--21}, volume = {433}, keywords = {non-additive measures; capacities; fuzzy measures; 2-step Choquet integral; aggregation operators; high dimensional data}, abstract = { The use of the Choquet integral in data fusion processes allows for the effective modelling of interactions and dependencies between data features or criteria. Its application requires identification of the defining capacity (also known as fuzzy measure) values. The main limiting factor is the complexity of the underlying parameter learning problem, which grows exponentially in the number of variables. However, in practice we may have expert knowledge regarding which of the subsets of criteria interact with each other, and which groups are independent. In this paper we study hierarchical aggregation processes, architecturally similar to feed-forward neural networks, but which allow for the simplification of the fitting problem both in terms of the number of variables and monotonicity constraints. We note that the Möbius representation lets us identify a number of relationships between the overall fuzzy measure and the data pipeline structure. 
Included in our findings are simplified fuzzy measures that generalise both k-intolerant and k-interactive capacities.},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022mobius.pdf}
}

@article{GagolewskiBartoszukCena2021:cvi,
  author    = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
  journal   = {Information Sciences},
  title     = {Are cluster validity measures (in)valid?},
  year      = {2021},
  pages     = {620--636},
  volume    = {581},
  url       = {https://github.com/gagolews/optim_cvi},
  doi       = {10.1016/j.ins.2021.10.004},
  abstract  = {
    Internal cluster validity measures (such as the Caliński--Harabasz, Dunn, or Davies--Bouldin indices) are frequently used for selecting the appropriate number of partitions a dataset should be split into. In this paper we consider what happens if we treat such indices as objective functions in unsupervised learning activities. Is the optimal grouping with regards to, say, the Silhouette index really meaningful? It turns out that many cluster (in)validity indices promote clusterings that match expert knowledge quite poorly. We also introduce a new, well-performing variant of the Dunn index that is built upon OWA operators and the near-neighbour graph so that subspaces of higher density, regardless of their shapes, can be separated from each other better.
  },
  keywords  = {clustering methodology; cluster validity index; Dunn index; nearest neighbours (NNs); ordered weighted averaging (OWA) operator; no free lunch },
  preprint  = {https://arxiv.org/pdf/2208.01261}
}

@article{BartoszukGagolewski2021:tnormsimilar,
  author    = {Maciej Bartoszuk and Marek Gagolewski},
  title     = {T-norms or t-conorms?
{H}ow to aggregate similarity degrees for plagiarism detection},
  journal   = {Knowledge-Based Systems},
  year      = {2021},
  volume    = {231},
  pages     = {107427},
  doi       = {10.1016/j.knosys.2021.107427},
  keywords  = {fuzzy logic connectives; similarity aggregation; decision making; data-driven optimisation; R language},
  abstract  = {
    Making correct decisions as to whether code chunks should be considered similar becomes increasingly important in software design and education and not only can improve the quality of computer programs, but also help assure the integrity of student assessments. In this paper we test numerous source code similarity detection tools on pairs of code fragments written in the data science-oriented functional programming language R. Contrary to mainstream approaches, instead of considering symmetric measures of “how much code chunks A and B are similar to each other”, we propose and study the nonsymmetric degrees of inclusion “to what extent A is a subset of B” and “to what degree B is included in A”. Overall, t-norms yield better precision (how many suspicious pairs are actually similar), t-conorms maximise recall (how many similar pairs are successfully retrieved), and custom aggregation functions fitted to training data provide a good balance between the two. Also, we find that program dependence graph-based methods tend to outperform those relying on normalised source code text, tokens, and names of functions invoked.
},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021tnormsimilar.pdf}
}

@article{LasekGagolewski2021:iteratings,
  title     = {Interpretable sports team rating models based on the gradient descent algorithm},
  author    = {Jan Lasek and Marek Gagolewski},
  journal   = {International Journal of Forecasting},
  doi       = {10.1016/j.ijforecast.2020.11.008},
  year      = {2021},
  volume    = {37},
  number    = {3},
  pages     = {1061--1071},
  abstract  = {
    We introduce several new sport team rating models based upon the gradient descent algorithm. More precisely, the models can be formulated by maximising the likelihood of match results observed using a single step of this optimisation heuristic. The framework proposed, inspired by the prominent Elo rating system, yields an iterative version of the ordinal logistic regression as well as different variants of the Poisson regression-based models. This construction makes the update equations easy to interpret as well as adjusts ratings once new match results are observed. Thus, it naturally handles temporal changes in team strength.
Moreover, a study of association football data indicates that the new models yield more accurate forecasts and are less computationally demanding than corresponding methods that jointly optimise likelihood for the whole set of matches
  },
  keywords  = {rating systems; association football; match outcome forecasting; gradient descent; Poisson regression; ordinal logistic regression; Elo rating system},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021iteratings.pdf}
}

@article{Gagolewski2021:genieclust,
  author    = {Marek Gagolewski},
  title     = {{genieclust}: {F}ast and robust hierarchical clustering},
  journal   = {SoftwareX},
  year      = {2021},
  doi       = {10.1016/j.softx.2021.100722},
  volume    = {15},
  pages     = {100722},
  url       = {https://genieclust.gagolewski.com/},
  keywords  = {hierarchical clustering; robust methods; noise points; Python; R},
  abstract  = {
    genieclust is an open source Python and R package that implements the hierarchical clustering algorithm called Genie. This method frequently outperforms other state-of-the-art approaches in terms of clustering quality and speed, supports various distances over dense, sparse, and string data domains, and can be robustified even further with the built-in noise point detector. As domain-independent software, it can be used for solving problems arising in all data-driven research and development activities, including environmental, health, biological, physical, decision, and social sciences as well as technology and engineering. The Python version provides a scikit-learn-compliant API, whereas the R variant is compatible with the classic hclust(). Numerous tutorials, use cases, non-trivial examples, documentation, installation instructions, benchmark results and timings can be found at https://genieclust.gagolewski.com/.
},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021genieclust.pdf}
}

@article{PerezGagolewskiDeBaets2021:compositional,
  author    = {Raúl Pérez-Fernández and Marek Gagolewski and De~Baets, Bernard},
  title     = {On the aggregation of compositional data},
  journal   = {Information Fusion},
  year      = {2021},
  doi       = {10.1016/j.inffus.2021.02.021},
  volume    = {73},
  pages     = {103--110},
  keywords  = {aggregation; compositional data; beset; centroid},
  abstract  = {
    Compositional data naturally appear in many fields of application. For instance, in chemistry, the relative contributions of different chemical substances to a product are typically described in terms of a compositional data vector. Although the aggregation of compositional data frequently arises in practice, the functions formalizing this process do not fit the standard order-based aggregation framework. This is due to the fact that there is no intuitive order that carries the semantics of the set of compositional data vectors (referred to as the standard simplex). In this paper, we consider the more general betweenness-based aggregation framework that yields a natural definition of an aggregation function for compositional data. The weighted centroid is proved to fit within this definition and discussed to be linked to a very tangible interpretation. Other functions for the aggregation of compositional data are presented and their fit within the proposed definition is discussed.
},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021compositional.pdf}
}

@article{BeliakovGagolewskiJames2020:dcsugeno,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title     = {{DC} optimization for constructing discrete {S}ugeno integrals and learning nonadditive measures},
  doi       = {10.1080/02331934.2019.1705300},
  journal   = {Optimization},
  volume    = {69},
  number    = {12},
  year      = {2020},
  pages     = {2515--2534},
  abstract  = {
    Defined solely by means of order-theoretic operations meet (min) and join (max), weighted lattice polynomial functions are particularly useful for modelling data on an ordinal scale. A special case, the discrete Sugeno integral, defined with respect to a nonadditive measure (a capacity), enables accounting for the interdependencies between input variables. However, until recently the problem of identifying the fuzzy measure values with respect to various objectives and requirements has not received a great deal of attention. By expressing the learning problem as the difference of convex functions, we are able to apply DC (difference of convex) optimization methods. Here we formulate one of the global optimization steps as a local linear programming problem and investigate the improvement under different conditions.
  },
  keywords  = {aggregation functions; nonadditive measures; Sugeno integral; capacities; DC optimization},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020dcsugeno.pdf}
}

@article{BartoszukGagolewski2020:similar,
  author    = {Maciej Bartoszuk and Marek Gagolewski},
  title     = {{SimilaR}: {R} Code Clone and Plagiarism Detection},
  journal   = {R Journal},
  doi       = {10.32614/RJ-2020-017},
  url       = {https://CRAN.R-project.org/package=SimilaR},
  year      = {2020},
  pages     = {367--385},
  volume    = {12},
  number    = {1},
  abstract  = {
    Third-party software for assuring source code quality is becoming increasingly popular.
Tools that evaluate the coverage of unit tests, perform static code analysis, or inspect run-time memory use are crucial in the software development life cycle. More sophisticated methods allow for performing meta-analyses of large software repositories, e.g., to discover abstract topics they relate to or common design patterns applied by their developers. They may be useful in gaining a better understanding of the component interdependencies, avoiding cloned code as well as detecting plagiarism in programming classes. A meaningful measure of similarity of computer programs often forms the basis of such tools. While there are a few noteworthy instruments for similarity assessment, none of them turns out particularly suitable for analysing R code chunks. Existing solutions rely on rather simple techniques and heuristics and fail to provide a user with the kind of sensitivity and specificity required for working with R scripts. In order to fill this gap, we propose a new algorithm based on a Program Dependence Graph, implemented in the SimilaR package. It can serve as a tool not only for improving R code quality but also for detecting plagiarism, even when it has been masked by applying some obfuscation techniques or imputing dead code. We demonstrate its accuracy and efficiency in a real-world case study.
  },
  keywords  = {plagiarism detection; R; code clones},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020similar.pdf}
}

@article{SiudemZogalaCenaGagolewski2020:pnas3d,
  author    = {Grzegorz Siudem and Barbara Żogała-Siudem and Anna Cena and Marek Gagolewski},
  title     = {Three dimensions of scientific impact},
  journal   = {Proceedings of the National Academy of Sciences of the United States of America (PNAS)},
  doi       = {10.1073/pnas.2001064117},
  number    = {25},
  pages     = {13896--13900},
  volume    = {117},
  year      = {2020},
  abstract  = {
    The growing popularity of bibliometric indexes (whose most famous example is the h index by J. E. Hirsch [J. E.
Hirsch, Proc. Natl. Acad. Sci. U.S.A. 102, 16569--16572 (2005)]) is opposed by those claiming that one's scientific impact cannot be reduced to a single number. Some even believe that our complex reality fails to submit to any quantitative description. We argue that neither of the two controversial extremes is true. By assuming that some citations are distributed according to the rich get richer rule (success breeds success, preferential attachment) while some others are assigned totally at random (all in all, a paper needs a bibliography), we have crafted a model that accurately summarizes citation records with merely three easily interpretable parameters: productivity, total impact, and how lucky an author has been so far.
  },
  keywords  = {science of science; scientometrics; bibliometric indexes; rich get richer},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020pnas3d.pdf}
}

@article{CoroianuETAL2019:owacomonotone,
  author    = {Lucian Coroianu and Robert Fullér and Marek Gagolewski and Simon James},
  title     = {Constrained ordered weighted averaging aggregation with multiple comonotone constraints},
  doi       = {10.1016/j.fss.2019.09.006},
  journal   = {Fuzzy Sets and Systems},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019owacomonotone.pdf},
  pages     = {21--39},
  volume    = {395},
  year      = {2020},
  abstract  = {
    The constrained ordered weighted averaging (OWA) aggregation problem arises when we aim to maximize or minimize a convex combination of order statistics under linear inequality constraints that act on the variables with respect to their original sources. The standalone approach to optimizing the OWA under constraints is to consider all permutations of the inputs, which becomes quickly infeasible when there are more than a few variables, however in certain cases we can take advantage of the relationships amongst the constraints and the corresponding solution structures.
For example, we can consider a land-use allocation satisfaction problem with an auxiliary aim of balancing land-types, whereby the response curves for each species are non-decreasing with respect to the land-types. This results in comonotone constraints, which allow us to drastically reduce the complexity of the problem. In this paper, we show that if we have an arbitrary number of constraints that are comonotone (i.e., they share the same ordering permutation of the coefficients), then the optimal solution occurs for decreasing components of the solution. After investigating the form of the solution in some special cases and providing theoretical results that shed light on the form of the solution, we detail practical approaches to solving and give real-world examples.
  },
  keywords  = {multiple criteria evaluation; ordered weighted averaging; constrained OWA aggregation; ecology; work allocation}
}

@article{CenaGagolewski2020:genieowa,
  author    = {Anna Cena and Marek Gagolewski},
  title     = {{Genie+OWA}: {R}obustifying hierarchical clustering with {OWA}-based linkages},
  journal   = {Information Sciences},
  doi       = {10.1016/j.ins.2020.02.025},
  pages     = {324--336},
  volume    = {520},
  year      = {2020},
  abstract  = {
    We investigate the application of the Ordered Weighted Averaging (OWA) data fusion operator in agglomerative hierarchical clustering. The examined setting generalises the well-known single, complete and average linkage schemes. It allows to embody expert knowledge in the cluster merge process and to provide a much wider range of possible linkages. We analyse various families of weighting functions on numerous benchmark data sets in order to assess their influence on the resulting cluster structure. Moreover, we inspect the correction for the inequality of cluster size distribution -- similar to the one in the Genie algorithm.
Our results demonstrate that by robustifying the procedure with the Genie correction, we can obtain a significant performance boost in terms of clustering quality. This is particularly beneficial in the case of the linkages based on the closest distances between clusters, including the single linkage and its "smoothed" counterparts. To explain this behaviour, we propose a new linkage process called three-stage OWA which yields further improvements. This way we confirm the intuition that hierarchical cluster analysis should rather take into account a few nearest neighbours of each point, instead of trying to adapt to their non-local neighbourhood.
  },
  keywords  = {hierarchical clustering; OWA; data fusion; aggregation; Genie},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020genieowa.pdf}
}

@article{GagolewskiPerezDeBaets2020:inherent,
  author    = {Marek Gagolewski and Raúl Pérez-Fernández and De~Baets, Bernard},
  title     = {An inherent difficulty in the aggregation of multidimensional data},
  journal   = {IEEE Transactions on Fuzzy Systems},
  doi       = {10.1109/TFUZZ.2019.2908135},
  number    = {3},
  pages     = {602--606},
  volume    = {28},
  year      = {2020},
  abstract  = {In the field of information fusion, the problem of data aggregation has been formalized as an order-preserving process that builds upon the property of monotonicity. However, fields such as computational statistics, data analysis and geometry, usually emphasize the role of equivariances to various geometrical transformations in aggregation processes. Admittedly, if we consider a unidimensional data fusion task, both requirements are often compatible with each other. Nevertheless, in this paper we show that, in the multidimensional setting, the only idempotent functions that are monotone and orthogonal equivariant are the over-simplistic weighted centroids. Even more, this result still holds after replacing monotonicity and orthogonal equivariance by the weaker property of orthomonotonicity.
This implies that the aforementioned approaches to the aggregation of multidimensional data are irreconcilable, and that, if a weighted centroid is to be avoided, we must choose between monotonicity and a desirable behaviour with regard to orthogonal transformations.
  },
  keywords  = {multidimensional data aggregation; monotonicity; orthogonal equivariance; centroid}
}

@article{BeliakovGagolewskiJames2019:SugenoRobustGeneral,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title     = {Robust fitting for the {S}ugeno integral with respect to general fuzzy measures},
  journal   = {Information Sciences},
  doi       = {10.1016/j.ins.2019.11.024},
  pages     = {449--461},
  volume    = {514},
  year      = {2020},
  keywords  = {Sugeno integral; fuzzy measure; parameter learning; aggregation functions},
  abstract  = {
    The Sugeno integral is an expressive aggregation function with potential applications across a range of decision contexts. Its calculation requires only the lattice minimum and maximum operations, making it particularly suited to ordinal data and robust to scale transformations. However, for practical use in data analysis and prediction, we require efficient methods for learning the associated fuzzy measure. While such methods are well developed for the Choquet integral, the fitting problem is more difficult for the Sugeno integral because it is not amenable to being expressed as a linear combination of weights, and more generally due to plateaus and non-differentiability in the objective function. Previous research has hence focused on heuristic approaches or simplified fuzzy measures. Here we show that the problem of fitting the Sugeno integral to data such that the maximum absolute error is minimized can be solved using an efficient bilevel program. This method can be incorporated into algorithms that learn fuzzy measures with the aim of minimizing the median residual.
This equips us with tools that make the Sugeno integral a feasible option in robust data regression and analysis. We provide experimental comparison with a genetic algorithms approach and an example in data analysis.
  }
}

@article{GerasETAL2020:dislike,
  author    = {Agnieszka Geras and Grzegorz Siudem and Marek Gagolewski},
  doi       = {10.1002/ASI.24231},
  journal   = {Journal of the Association for Information Science and Technology},
  number    = {2},
  pages     = {221--229},
  title     = {Should we introduce a dislike button for academic papers?},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019dislike.pdf},
  volume    = {71},
  year      = {2020},
  abstract  = {
    On the grounds of the revealed, mutual resemblance between the behaviour of users of the Stack Exchange and the dynamics of the citations accumulation process in the scientific community, we tackled an outwardly intractable problem of assessing the impact of introducing negative citations. Although the most frequent reason to cite a paper is to highlight the connection between the two publications, researchers sometimes mention an earlier work to cast a negative light. While computing citation-based scores, for instance the h-index, information about the reason why a paper was mentioned is neglected. Therefore it can be questioned whether these indices describe scientific achievements accurately. In this contribution we shed insight into the problem of negative citations, analysing data from Stack Exchange and, to draw more universal conclusions, we derive an approximation of citations scores. Here we show that the quantified influence of introducing negative citations is of lesser importance and that they could be used as an indicator of where attention of scientific community is allocated.
},
  keywords  = {citation analysis; the Hirsch index; negative citations; research evaluation; science of science}
}

@article{PerezDeBaetsGagolewski2019:taxonomy,
  author    = {Raúl Pérez-Fernández and De~Baets, Bernard and Marek Gagolewski},
  doi       = {10.1016/j.inffus.2019.05.006},
  journal   = {Information Fusion},
  pages     = {322--334},
  title     = {A taxonomy of monotonicity properties for the aggregation of multidimensional data},
  volume    = {52},
  year      = {2019},
  abstract  = {
    The property of monotonicity, which requires a function to preserve a given order, has been considered the standard in the aggregation of real numbers for decades. In this paper, we argue that, for the case of multidimensional data, an order-based definition of monotonicity is far too restrictive. We propose several meaningful alternatives to this property not involving the preservation of a given order by returning to its early origins stemming from the field of calculus. Numerous aggregation methods for multidimensional data commonly used by practitioners are studied within our new framework.
  },
  keywords  = { Monotonicity; Aggregation; Multidimensional data; Centroid; Spatial median },
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019taxonomymonotonicity.pdf}
}

@article{GagolewskiJamesBeliakov2019:fitsugenol1,
  author    = {Marek Gagolewski and Simon James and Gleb Beliakov},
  doi       = {10.1109/TFUZZ.2019.2895565},
  journal   = {IEEE Transactions on Fuzzy Systems},
  number    = {4},
  pages     = {810--815},
  title     = {Supervised learning to aggregate data with the {S}ugeno integral},
  volume    = {27},
  year      = {2019},
  abstract  = {
    The problem of learning symmetric capacities (or fuzzy measures) from data is investigated toward applications in data analysis and prediction as well as decision making.
Theoretical results regarding the solution minimizing the mean absolute error are exploited to develop an exact branch-refine-and-bound-type algorithm for fitting Sugeno integrals (weighted lattice polynomial functions, max-min operators) with respect to symmetric capacities. The proposed method turns out to be particularly suitable for acting on ordinal data. In addition to providing a model that can be used for the general data regression task, the results can be used, among others, to calibrate generalized h-indices to bibliometric data.
  },
  keywords  = {Fuzzy measures; h-index; lattice polynomials; ordinal data fitting; Sugeno integral; weight learning},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019fitsugenol1.pdf}
}

@article{CoroianuGagolewskiGrzegorzewski2019:piecewise,
  author    = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski},
  doi       = {10.1007/s00500-019-03800-2},
  journal   = {Soft Computing},
  number    = {19},
  pages     = {9491--9505},
  url       = {https://CRAN.R-project.org/package=FuzzyNumbers},
  title     = {Piecewise linear approximation of fuzzy numbers: {A}lgorithms, arithmetic operations and stability of characteristics},
  volume    = {23},
  year      = {2019}
}

@article{BeliakovGagolewskiJames2019:SugenoBiomed,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  doi       = {10.1016/j.ins.2019.06.023},
  journal   = {Information Sciences},
  pages     = {377--387},
  title     = {Aggregation on ordinal scales with the {S}ugeno integral for biomedical applications},
  volume    = {501},
  year      = {2019}
}

@incollection{CoroianuGagolewski2019:penaltyvector,
  author    = {Lucian Coroianu and Marek Gagolewski},
  booktitle = {New Trends in Aggregation Theory},
  doi       = {10.1007/978-3-030-19494-9_15},
  editor    = {Radomír Halaš and others},
  pages     = {160--171},
  publisher = {Springer},
  series    = {Advances in Intelligent Systems and Computing},
  title     = {Penalty-based data aggregation in real normed vector spaces},
  volume    = {981},
  year      = {2019}
}
@article{LasekGagolewski2018:leagues,
  author    = {Jan Lasek and Marek Gagolewski},
  doi       = {10.1177/1471082X18798426},
  journal   = {Statistical Modelling},
  number    = {5--6},
  pages     = {411--435},
  title     = {The efficacy of league formats in ranking teams},
  volume    = {18},
  year      = {2018},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2018leagues.pdf}
}

@article{BeliakovETAL2019:traffic,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James and Shannon Pace and Nicola Pastorello and Elodie Thilliez and Rajesh Vasa},
  doi       = {10.1016/j.asoc.2017.07.014},
  journal   = {Applied Soft Computing},
  pages     = {910--919},
  title     = {Measuring traffic congestion: {A}n approach based on learning weighted inequality, spread and aggregation indices from comparison data},
  volume    = {67},
  year      = {2018},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2018traffic.pdf}
}

@incollection{BeliakovETAL2018:lmslts,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems. Theory and Foundations},
  doi       = {10.1007/978-3-319-91476-3_31},
  editor    = {Jesus Medina and others},
  pages     = {367--378},
  publisher = {Springer},
  title     = {Least median of squares ({LMS}) and least trimmed squares ({LTS}) fitting for the weighted arithmetic mean},
  year      = {2018}
}

@incollection{BartoszukGagolewski2017:binagopplagiarism,
  author    = {Maciej Bartoszuk and Marek Gagolewski},
  booktitle = {Proc. 2017 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)},
  doi       = {10.1109/FUZZ-IEEE.2017.8015582},
  pages     = {8015582},
  publisher = {IEEE},
  title     = {Binary aggregation functions in software plagiarism detection},
  year      = {2017}
}

@incollection{CenaGagolewski2017:owagenie,
  author    = {Anna Cena and Marek Gagolewski},
  booktitle = {Proc.
2017 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)},
  doi       = {10.1109/FUZZ-IEEE.2017.8015652},
  pages     = {8015652},
  publisher = {IEEE},
  title     = {{OWA}-based linkage and the {G}enie correction for hierarchical clustering},
  year      = {2017},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2017genieowatest.pdf}
}

@incollection{GagolewskiJames2017:eusflat,
  author    = {Marek Gagolewski and Simon James},
  booktitle = {Advances in Fuzzy Logic and Technology 2017},
  doi       = {10.1007/978-3-319-66824-6_10},
  editor    = {Janusz Kacprzyk and others},
  pages     = {104--116},
  publisher = {Springer},
  series    = {Advances in Intelligent Systems and Computing},
  title     = {Fitting symmetric fuzzy measures for discrete {S}ugeno integration},
  volume    = {642},
  year      = {2018}
}

@article{Gagolewski2017:pbamultidim,
  author    = {Marek Gagolewski},
  doi       = {10.1016/j.fss.2016.12.009},
  journal   = {Fuzzy Sets and Systems},
  pages     = {4--20},
  title     = {Penalty-based aggregation of multidimensional data},
  volume    = {325},
  year      = {2017},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2017penaltymultidim.pdf}
}

@article{MesiarGagolewski2016:defects,
  author    = {Radko Mesiar and Marek Gagolewski},
  doi       = {10.1109/TFUZZ.2016.2516579},
  journal   = {IEEE Transactions on Fuzzy Systems},
  number    = {6},
  pages     = {1668--1672},
  title     = {{H}-index and other {S}ugeno integrals: {S}ome defects and their compensation},
  volume    = {24},
  year      = {2016},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016hdeffects.pdf}
}

@article{BeliakovETAL2016:penaltyinequality,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  doi       = {10.1142/S0218488516400018},
  journal   = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
  pages     = {1--23},
  title     = {Penalty-based and other representations of economic inequality},
  volume    = {24(Suppl.1)},
  year      = {2016},
  preprint  =
{https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016penaltyeconomic.pdf}
}

@incollection{GagolewskiETAL2016:genie2,
  author    = {Marek Gagolewski and Anna Cena and Maciej Bartoszuk},
  booktitle = {Modeling Decisions for Artificial Intelligence},
  doi       = {10.1007/978-3-319-45656-0_16},
  editor    = {Vicenç Torra and others},
  pages     = {191--202},
  publisher = {Springer},
  series    = {Lecture Notes in Artificial Intelligence},
  title     = {Hierarchical clustering via penalty-based aggregation and the {G}enie approach},
  volume    = {9880},
  year      = {2016},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016genie2.pdf}
}

@article{GagolewskiETAL2016:genie,
  author    = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
  doi       = {10.1016/j.ins.2016.05.003},
  journal   = {Information Sciences},
  pages     = {8--23},
  url       = {https://genieclust.gagolewski.com/},
  title     = {Genie: {A} new, fast, and outlier-resistant hierarchical clustering algorithm},
  volume    = {363},
  year      = {2016},
  preprint  = {https://arxiv.org/pdf/2209.05757}
}

@incollection{CenaGagolewski2016:generickmeans,
  author    = {Anna Cena and Marek Gagolewski},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  doi       = {10.1007/978-3-319-40581-0_36},
  editor    = {J.P. Carvalho and others},
  pages     = {445--456},
  publisher = {Springer},
  series    = {Communications in Computer and Information Science},
  title     = {Fuzzy k-minpen clustering and k-nearest-minpen classification procedures incorporating generic distance-based penalty minimizers},
  volume    = {611},
  year      = {2016}
}

@incollection{BartoszukETAL2016:fitagop1,
  author    = {Maciej Bartoszuk and Gleb Beliakov and Marek Gagolewski and Simon James},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  doi       = {10.1007/978-3-319-40581-0_62},
  editor    = {J.P.
Carvalho and others},
  pages     = {767--779},
  publisher = {Springer},
  series    = {Communications in Computer and Information Science},
  title     = {Fitting aggregation functions to data: {Part} {I} -- {L}inearization and regularization},
  volume    = {611},
  year      = {2016},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016fitagop1.pdf}
}

@incollection{BartoszukETAL2016:fitagop2,
  author    = {Maciej Bartoszuk and Gleb Beliakov and Marek Gagolewski and Simon James},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  doi       = {10.1007/978-3-319-40581-0_63},
  editor    = {J.P. Carvalho and others},
  pages     = {780--789},
  publisher = {Springer},
  series    = {Communications in Computer and Information Science},
  title     = {Fitting aggregation functions to data: {Part} {II} -- {I}dempotization},
  volume    = {611},
  year      = {2016},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016fitagop2.pdf}
}

@article{LasekETAL2016:fifa,
  author    = {Jan Lasek and Zoltan Szlavik and Marek Gagolewski and Sandjai Bhulai},
  doi       = {10.1080/02664763.2015.1100593},
  journal   = {Journal of Applied Statistics},
  number    = {7},
  pages     = {1349--1368},
  title     = {How to improve a team's position in the {FIFA} ranking -- {A} simulation study},
  volume    = {43},
  year      = {2016},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016improvefifa.pdf}
}

@article{ZogalaETAL2016:agenth,
  author    = {Barbara Żogała-Siudem and Grzegorz Siudem and Anna Cena and Marek Gagolewski},
  doi       = {10.1140/epjb/e2015-60757-1},
  journal   = {European Physical Journal B},
  number    = {21},
  title     = {Agent-based model for the bibliometric h-index -- {E}xact solution},
  volume    = {89},
  year      = {2016},
  preprint  = {https://arxiv.org/pdf/1509.05798}
}

@incollection{BartoszukGagolewski2014:fuzzyrsimilar,
  author    = {Maciej Bartoszuk and Marek Gagolewski},
  booktitle = {Information Processing and Management of Uncertainty in
Knowledge-Based Systems, Part III},
  doi = {10.1007/978-3-319-08852-5_3},
  editor = {A. Laurent and others},
  pages = {21--30},
  publisher = {Springer},
  series = {Communications in Computer and Information Science},
  title = {A fuzzy {R} code similarity detection algorithm},
  volume = {444},
  year = {2014}
}

@incollection{BartoszukGagolewski2015:similar2,
  author = {Maciej Bartoszuk and Marek Gagolewski},
  booktitle = {Proc. IFSA/EUSFLAT'15},
  doi = {10.2991/ifsa-eusflat-15.2015.61},
  editor = {J. M. Alonso and H. Bustince and M. Reformat},
  pages = {419--426},
  publisher = {Atlantis Press},
  title = {Detecting similarity of {R} functions via a fusion of multiple heuristic methods},
  year = {2015}
}

@article{CenaETAL2015:prodclust,
  author = {Anna Cena and Marek Gagolewski and Radko Mesiar},
  doi = {10.1016/j.joi.2015.02.005},
  journal = {Journal of Informetrics},
  number = {2},
  pages = {273--284},
  title = {Problems and challenges of information resources producers' clustering},
  volume = {9},
  year = {2015}
}

@incollection{CenaGagolewski2013:om31,
  author = {Anna Cena and Marek Gagolewski},
  booktitle = {Aggregation Functions in Theory and in Practise},
  doi = {10.1007/978-3-642-39165-1_13},
  editor = {Humberto Bustince and others},
  pages = {93--103},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation operators -- {P}art {I}: {A}xiomatic analysis under arity-dependence},
  volume = {228},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013om31.pdf}
}

@incollection{CenaGagolewski2013:om32,
  author = {Anna Cena and Marek Gagolewski},
  booktitle = {Aggregation Functions in Theory and in Practise},
  doi = {10.1007/978-3-642-39165-1_14},
  editor = {Humberto Bustince and others},
  pages = {105--115},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation
operators -- {P}art {II}: {A}~simulation study},
  volume = {228},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013om32.pdf}
}

@incollection{CenaGagolewski2015:kmeansinformetric,
  author = {Anna Cena and Marek Gagolewski},
  booktitle = {Proc. IFSA/EUSFLAT'15},
  doi = {10.2991/ifsa-eusflat-15.2015.77},
  editor = {J. M. Alonso and H. Bustince and M. Reformat},
  pages = {536--543},
  publisher = {Atlantis Press},
  title = {A {K}-means-like algorithm for informetric data clustering},
  year = {2015}
}

@article{CenaGagolewski2015:om3fss,
  author = {Anna Cena and Marek Gagolewski},
  doi = {10.1016/j.fss.2014.04.001},
  journal = {Fuzzy Sets and Systems},
  pages = {138--159},
  title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation operators -- {A}xiomatic and probabilistic properties in an arity-monotonic setting},
  volume = {264},
  year = {2015},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015om3fss.pdf}
}

@article{CoroianuETAL2013:piecewise1,
  author = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski},
  doi = {10.1016/j.fss.2013.02.005},
  journal = {Fuzzy Sets and Systems},
  pages = {26--51},
  title = {Nearest piecewise linear approximation of fuzzy numbers},
  url = {https://CRAN.R-project.org/package=FuzzyNumbers},
  volume = {233},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013plfnknot1.pdf}
}

@incollection{CoroianuETAL2014:piecewise1suppcore,
  author = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski and Adabitabar Firozja, M. and Tahereh Houlari},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  doi = {10.1007/978-3-319-08855-6_25},
  editor = {A.
Laurent and others},
  pages = {244--254},
  publisher = {Springer},
  series = {Communications in Computer and Information Science},
  title = {Piecewise linear approximation of fuzzy numbers preserving the support and core},
  volume = {443},
  year = {2014}
}

@article{Gagolewski2011:CITAN,
  author = {Marek Gagolewski},
  doi = {10.1016/j.joi.2011.06.006},
  journal = {Journal of Informetrics},
  number = {4},
  pages = {678--692},
  url = {https://CRAN.R-project.org/package=CITAN},
  title = {Bibliometric impact assessment with {R} and the {CITAN} package},
  volume = {5},
  year = {2011},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011citan.pdf}
}

@incollection{Gagolewski2012:effortdom,
  author = {Marek Gagolewski},
  booktitle = {Advances in Computational Intelligence, Part III},
  doi = {10.1007/978-3-642-31718-7_29},
  editor = {Salvatore Greco and others},
  pages = {276--285},
  publisher = {Springer},
  series = {Communications in Computer and Information Science},
  title = {On the relation between effort-dominating and symmetric minitive aggregation operators},
  volume = {299},
  year = {2012},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2012effortdom.pdf}
}

@incollection{Gagolewski2012:smps,
  author = {Marek Gagolewski},
  booktitle = {Synergies of Soft Computing and Statistics for Intelligent Data Analysis},
  doi = {10.1007/978-3-642-33042-1_39},
  editor = {Rudolf Kruse and others},
  pages = {359--367},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {Statistical hypothesis test for the difference between {H}irsch indices of two {P}areto-distributed random samples},
  volume = {190},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013htest.pdf}
}

@article{Gagolewski2013:fair,
  author = {Marek Gagolewski},
  doi = {10.1016/j.joi.2013.07.001},
  journal = {Journal of Informetrics},
  number = {4},
  pages = {792--802},
  title = {Scientific
impact assessment cannot be fair},
  volume = {7},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013fair.pdf}
}

@article{Gagolewski2013:om3,
  author = {Marek Gagolewski},
  doi = {10.1016/j.ins.2012.09.005},
  journal = {Information Sciences},
  pages = {170--180},
  title = {On the relationship between symmetric maxitive, minitive, and modular aggregation operators},
  volume = {221},
  year = {2013},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013rel3.pdf}
}

@incollection{Gagolewski2015:hconfint,
  author = {Marek Gagolewski},
  booktitle = {Strengthening Links Between Data Analysis and Soft Computing},
  doi = {10.1007/978-3-319-10765-3_28},
  editor = {P. Grzegorzewski and others},
  pages = {233--240},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {Sugeno integral-based confidence intervals for the theoretical h-index},
  volume = {315},
  year = {2015},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015hconfint.pdf}
}

@incollection{Gagolewski2015:normalizedspread,
  author = {Marek Gagolewski},
  booktitle = {Proc. IFSA/EUSFLAT'15},
  doi = {10.2991/ifsa-eusflat-15.2015.32},
  editor = {J. M. Alonso and H. Bustince and M.
Reformat},
  pages = {210--216},
  publisher = {Atlantis Press},
  title = {Normalized {WD$_p$WAM} and {WD$_p$OWA} spread measures},
  year = {2015},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015normalizedspread.pdf}
}

@article{Gagolewski2015:spread,
  author = {Marek Gagolewski},
  doi = {10.1016/j.ejor.2014.08.034},
  journal = {European Journal of Operational Research},
  number = {2},
  pages = {469--477},
  title = {Spread measures and their relation to aggregation functions},
  volume = {241},
  year = {2015},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015spread.pdf}
}

@article{GagolewskiGrzegorzewski2009:geometricapproach,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  doi = {10.1007/s11192-008-2253-y},
  journal = {Scientometrics},
  number = {3},
  pages = {617--634},
  title = {A geometric approach to the construction of scientific impact indices},
  volume = {81},
  year = {2009},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2009geometricapproach.pdf}
}

@incollection{GagolewskiGrzegorzewski2010:ipmu,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems},
  doi = {10.1007/978-3-642-14055-6_73},
  editor = {E.
Hüllermeier and others},
  pages = {693--702},
  publisher = {Springer},
  series = {Communications in Computer and Information Science},
  title = {Arity-monotonic extended aggregation operators},
  volume = {80},
  year = {2010},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2010aritymonotonic.pdf}
}

@incollection{GagolewskiGrzegorzewski2010:smps,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Combining Soft Computing and Statistical Methods in Data Analysis},
  doi = {10.1007/978-3-642-14746-3_35},
  editor = {Christian Borgelt and others},
  pages = {281--288},
  publisher = {Springer},
  series = {Advances in Intelligent and Soft Computing},
  title = {{S}-statistics and their basic properties},
  volume = {77},
  year = {2010},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2010sstatprop.pdf}
}

@incollection{GagolewskiGrzegorzewski2011:axcharquasils,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Proc.
EUSFLAT/LFA'11},
  doi = {10.2991/eusflat.2011.112},
  editor = {Sylvie Galichet and others},
  location = {Aix-Les-Bains, France},
  pages = {53--58},
  publisher = {Atlantis Press},
  title = {Axiomatic characterizations of {(quasi-)} {L}-statistics and {S}-statistics and the {P}roducer {A}ssessment {P}roblem},
  year = {2011},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011axcharquasils.pdf}
}

@article{GagolewskiGrzegorzewski2011:possibilistic,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  doi = {10.1016/j.ijar.2011.01.010},
  journal = {International Journal of Approximate Reasoning},
  number = {9},
  pages = {1312--1324},
  title = {Possibilistic analysis of arity-monotonic aggregation operators and its relation to bibliometric impact assessment of individuals},
  volume = {52},
  year = {2011},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011possibilistic.pdf}
}

@incollection{GagolewskiLasek2015:fuzzrelpap,
  author = {Marek Gagolewski and Jan Lasek},
  booktitle = {Proc. 7th IEEE International Conference Intelligent Systems IS'2014, Vol. 2: Tools, Architectures, Systems, Applications},
  doi = {10.1007/978-3-319-11310-4_25},
  pages = {289--300},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {The use of fuzzy relations in the assessment of information resources producers' performance},
  volume = {323},
  year = {2015}
}

@incollection{GagolewskiLasek2015:preflearn,
  author = {Marek Gagolewski and Jan Lasek},
  booktitle = {Proc. IFSA/EUSFLAT'15},
  doi = {10.2991/ifsa-eusflat-15.2015.70},
  editor = {J. M. Alonso and H. Bustince and M.
Reformat},
  pages = {484--491},
  publisher = {Atlantis Press},
  title = {Learning experts' preferences from informetric data},
  year = {2015}
}

@article{GagolewskiMesiar2012:pqm,
  author = {Marek Gagolewski and Radko Mesiar},
  doi = {10.1016/j.joi.2012.05.001},
  journal = {Journal of Informetrics},
  number = {4},
  pages = {566--579},
  title = {Aggregating different paper quality measures with a generalized {h}-index},
  volume = {6},
  year = {2012},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2012pqm.pdf}
}

@article{GagolewskiMesiar2014:integrals,
  author = {Marek Gagolewski and Radko Mesiar},
  doi = {10.1016/j.ins.2013.12.004},
  journal = {Information Sciences},
  pages = {166--174},
  title = {Monotone measures and universal integrals in a uniform framework for the scientific impact assessment problem},
  volume = {263},
  year = {2014},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2014integrals.pdf}
}

@incollection{LasekGagolewski2015:fireman,
  author = {Jan Lasek and Marek Gagolewski},
  booktitle = {Proc. FedCSIS'15},
  doi = {10.15439/2015F418},
  editor = {M. Ganzha and L. Maciaszek and M. Paprzycki},
  pages = {375--380},
  publisher = {IEEE},
  title = {The winning solution to the {AAIA'15} {D}ata {M}ining {C}ompetition: {T}agging firefighter activities at a fire scene},
  year = {2015},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015fireman.pdf}
}

@incollection{GagolewskiDebskiNowakiewicz2013:graphintegrals,
  author = {Marek Gagolewski and Michał Dębski and Michał Nowakiewicz},
  booktitle = {Proc.
Uncertainty Modeling},
  editor = {Radko Mesiar and Tomas Bacigal},
  isbn = {978-80-227-4067-8},
  pages = {17--23},
  publisher = {STU Bratislava},
  title = {Efficient algorithm for computing certain graph-based monotone integrals: {T}he {$l_p$}-indices},
  year = {2013},
  abstract = { The Choquet, Sugeno, and Shilkret integrals with respect to monotone measures are useful as tools in decision support systems. In this paper we propose a new class of graph-based integrals that generalize these three operations. Then, an efficient linear-time algorithm for computing their special case, that is lp-indices, 1 ≤ p < ∞, is presented. The algorithm is based on R.L. Graham's routine for determining the convex hull of a finite planar set. },
  keywords = {monotone measures; Choquet, Sugeno, and Shilkret integral; lp-index; convex hull; Graham's scan; scientific impact indices},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013graphintegrals.pdf}
}

@incollection{GagolewskiGrzegorzewski2009:possibleh,
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Proc. IFSA/EUSFLAT'09},
  editor = {J. P. Carvalho and others},
  location = {Lisbon, Portugal},
  pages = {1691--1695},
  publisher = {IFSA},
  title = {Possible and necessary {h}-indices},
  year = {2009}
}

@incollection{LasekGagolewski2015:esttourmet,
  author = {Jan Lasek and Marek Gagolewski},
  booktitle = {Selected problems in information technologies (Proc. ITRIA'15 vol. 2)},
  pages = {67--78},
  publisher = {Institute of Computer Science, Polish Academy of Sciences},
  title = {Estimation of tournament metrics for association football league formats},
  year = {2015}
}

@incollection{CenaGagolewski2015:clustaginf,
  author = {Anna Cena and Marek Gagolewski},
  booktitle = {Computational methods in data analysis (Proc. ITRIA'15 vol.
1)},
  pages = {5--26},
  publisher = {Institute of Computer Science, Polish Academy of Sciences},
  title = {Clustering and aggregation of informetric data sets},
  year = {2015}
}

@incollection{Gagolewski2015:issuesmultidim,
  author = {Marek Gagolewski},
  booktitle = {Proc. 8th International Summer School on Aggregation Operators (AGOP 2015)},
  editor = {Michał Baczyński and De~Baets, Bernard and Radko Mesiar},
  isbn = {978-83-8012-519-3},
  pages = {127--132},
  publisher = {University of Silesia},
  title = {Some issues in aggregation of multidimensional data},
  year = {2015}
}

@incollection{CenaGagolewski2015:fuzzycmeansinformetric,
  author = {Anna Cena and Marek Gagolewski},
  booktitle = {Proc. 8th International Summer School on Aggregation Operators (AGOP 2015)},
  editor = {Michał Baczyński and De~Baets, Bernard and Radko Mesiar},
  isbn = {978-83-8012-519-3},
  pages = {79--84},
  publisher = {University of Silesia},
  title = {Aggregation and soft clustering of informetric data},
  year = {2015}
}

@incollection{GagolewskiGrzegorzewski2010:metodyiproblemy,
  address = {Warsaw},
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Psychologia i~informatyka. Synergia i~kontradykcje},
  editor = {Tomasz Rowiński and Ryszard Tadeusiewicz},
  isbn = {978-83-707-2679-9},
  note = {in Polish},
  pages = {103--125},
  publisher = {Wyd.~UKSW},
  title = {Metody i~problemy naukometrii (Methods and problems of scientometrics)},
  year = {2010}
}

@incollection{GagolewskiGrzegorzewski2009:uogolnienieh,
  address = {Lublin},
  author = {Marek Gagolewski and Przemysław Grzegorzewski},
  booktitle = {Kadry i infrastruktura nowoczesnej nauki: Teoria i praktyka, Proc. 1st Intl. Conf. Zarządzanie Nauką},
  editor = {P. Kawalec and P.
Lipski},
  isbn = {978-83-61671-12-1},
  note = {in Polish},
  pages = {15--29},
  publisher = {Wydawnictwo Lubelskiej Szkoły Biznesu},
  title = {O pewnym uogólnieniu indeksu Hirscha},
  volume = {2},
  year = {2009}
}

@incollection{RowinskiGagolewski2011:internetkryzys,
  address = {Warsaw},
  author = {Tomasz Rowiński and Marek Gagolewski},
  booktitle = {Kryzys: Pułapka czy szansa?},
  editor = {M. Jankowska and M. Starzomska},
  isbn = {978-83-609-5885-8},
  note = {in Polish},
  pages = {211--224},
  publisher = {WN Akapit},
  title = {Internet a kryzys},
  year = {2011}
}

@article{RowinskiGagolewski2007:online,
  author = {Tomasz Rowiński and Marek Gagolewski},
  journal = {Studia Psychologica UKSW},
  note = {in Polish},
  pages = {195--210},
  title = {Preferencje i postawy wobec pomocy online (Attitudes towards online counselling and psychotherapy)},
  volume = {7},
  year = {2007}
}

@phdthesis{Gagolewski2011:PhD,
  author = {Marek Gagolewski},
  note = {in Polish},
  school = {Systems Research Institute, Polish Academy of Sciences},
  title = {Aggregation operators and their application in a formal model for quality evaluation system of scientific research (Wybrane operatory agregacji i~ich zastosowanie w~modelu formalnym systemu jakości w~nauce)},
  year = {2011}
}

@proceedings{HalasETAL2019:agop2019,
  editor = {Radomír Halaš and Marek Gagolewski and Radko Mesiar},
  isbn = {978-3-030-19493-2},
  pages = {348},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {New Trends in Aggregation Theory},
  volume = {981},
  doi = {10.1007/978-3-030-19494-9},
  year = {2019}
}

@proceedings{GrzegorzewskiETAL2015:smps2014,
  editor = {Przemysław Grzegorzewski and Marek Gagolewski and Olgierd Hryniewicz and María Ángeles Gil},
  isbn = {978-3-319-10764-6},
  pages = {294},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {Strengthening Links Between Data Analysis and Soft Computing},
  doi = {10.1007/978-3-319-10765-3},
  volume = {315},
  year = {2015}
}
@proceedings{FerraroETAL2017:smps2016,
  editor = {Maria Brigida Ferraro and Paolo Giordani and Barbara Vantaggi and Marek Gagolewski and María Ángeles Gil and Przemysław Grzegorzewski and Olgierd Hryniewicz},
  isbn = {978-3-319-42971-7},
  pages = {535},
  publisher = {Springer},
  series = {Advances in Intelligent Systems and Computing},
  title = {Soft Methods for Data Science},
  doi = {10.1007/978-3-319-42972-4},
  volume = {456},
  year = {2017}
}

@book{GagolewskiETAL2016:Pythonksiazka,
  address = {Warsaw},
  author = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
  isbn = {978-83-01-18940-2},
  note = {🇵🇱},
  pages = {369},
  publisher = {Wydawnictwo Naukowe PWN},
  title = {Przetwarzanie i analiza danych w języku {Python} (Data Processing and Analysis in Python)},
  year = {2016},
  url = {https://github.com/gagolews/Analiza_danych_w_jezyku_Python}
}

@book{Gagolewski2016:Rksiazka,
  address = {Warsaw},
  author = {Marek Gagolewski},
  edition = {2nd},
  isbn = {978-83-01-18939-6},
  note = {🇵🇱},
  pages = {550},
  publisher = {Wydawnictwo Naukowe PWN},
  title = {Programowanie w języku {R}. {A}naliza danych, obliczenia, symulacje (R Programming.
Data Analysis, Computing, Simulations)},
  year = {2016},
  url = {https://github.com/gagolews/Programowanie_w_jezyku_R}
}

@book{Gagolewski2015:datafusionbook,
  author = {Marek Gagolewski},
  isbn = {978-83-63159-20-7},
  note = {🔓},
  pages = {290},
  publisher = {Institute of Computer Science, Polish Academy of Sciences},
  address = {Warsaw},
  title = {Data Fusion: {T}heory, Methods, and Applications},
  year = {2015},
  url = {https://github.com/gagolews/datafusion},
  preprint = {https://raw.githubusercontent.com/gagolews/datafusion/master/datafusion.pdf},
  keywords = {data aggregation, data fusion, means, t-norms, spread measures, multidimensional data, strings},
  abstract = { A proper fusion of complex data is of interest to many researchers in diverse fields, including computational statistics, computational geometry, bioinformatics, machine learning, pattern recognition, quality management, engineering, statistics, finance, economics, etc. It plays a crucial role in: synthetic description of data processes or whole domains, creation of rule bases for approximate reasoning tasks, reaching consensus and selection of the optimal strategy in decision support systems, imputation of missing values, data deduplication and consolidation, record linkage across heterogeneous databases, and clustering. This open-access research monograph integrates the spread-out results from different domains using the methodology of the well-established classical aggregation framework, introduces researchers and practitioners to Aggregation 2.0, as well as points out the challenges and interesting directions for further research. }
}

@book{Gagolewski2014:Rksiazka,
  address = {Warsaw},
  author = {Marek Gagolewski},
  edition = {1st},
  isbn = {978-83-01-17461-3},
  note = {🇵🇱},
  pages = {494},
  publisher = {Wydawnictwo Naukowe PWN},
  title = {Programowanie w języku {R}. {A}naliza danych, obliczenia, symulacje (R Programming.
Data Analysis, Computing, Simulations)},
  year = {2014}
}

@book{GrzegorzewskiETAL2014:wnioskowaniestatystyczne,
  address = {Warsaw},
  author = {Przemysław Grzegorzewski and Marek Gagolewski and Konstancja Bobecka-Wesołowska},
  isbn = {978-83-93-72601-1},
  note = {🇵🇱 🔓},
  pages = {183},
  publisher = {Politechnika Warszawska},
  title = {Wnioskowanie statystyczne z wykorzystaniem środowiska R (Statistical Inference with R)},
  year = {2014},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2014wnioskowaniestatystyczne.pdf}
}