@misc{BrzozowskiETAL2023:geniegraphs,
  author    = {Lukasz Brzozowski and Grzegorz Siudem and Marek Gagolewski},
  title     = {Community detection in complex networks via node similarity, graph representation learning, and hierarchical clustering},
  year      = {2023},
  publisher = {arXiv},
  doi       = {10.48550/arXiv.2303.12212},
  preprint  = {https://arxiv.org/pdf/2303.12212.pdf},
  note      = {under review (preprint)}
}
@article{BertoliBarsottiETAL2024:inequality3dsi,
  author   = {Lucio Bertoli-Barsotti and Marek Gagolewski and Grzegorz Siudem and Barbara Żogała-Siudem},
  title    = {Equivalence of inequality indices in the three-dimensional model of informetric impact},
  journal  = {Journal of Informetrics},
  year     = {2024},
  volume   = {18},
  number   = {4},
  pages    = {101566},
  doi      = {10.1016/j.joi.2024.101566},
  preprint = {https://arxiv.org/pdf/2304.07479.pdf},
  keywords = {
Gini index;
Bonferroni index;
Power law;
Rich-get-richer;
Inequality;
Sensitivity
},
  abstract = {
Inequality is an inherent part of our lives: we see it in the distribution of incomes, talents, citations, to name a few. However, its intensity varies across environments: there are systems where the available resources are relatively evenly distributed but also where a small group of items or agents controls the majority of assets. Numerous indices for quantifying the degree of inequality have been proposed but in general, they work quite differently.
We recently observed (Siudem et al., 2020) that many rank-size distributions might be approximated by a time-dependent agent-based model involving a mixture of preferential (rich-get-richer) and accidental (sheer chance) attachment. In this paper, we point out its relationship to an iterative process that generates rank distributions of any length and a predefined level of inequality, as measured by the Gini index.
We prove that, under our model, the Gini, Bonferroni, De Vergottini, and Hoover indices are equivalent for samples of similar sizes. Given one of them, we can recreate the value of another measure. Thanks to the obtained formulae, we can also understand how they depend on the sample size. An empirical analysis of a large database of citation records in economics (RePEc) yields a good match with our theoretical derivations.
}
}
@inproceedings{BeliakovETAL2024:tractable-choquet,
  author    = {Gleb Beliakov and Simon James and Jian-Zhang Wu and Marek Gagolewski},
  title     = {A Tractable Approach to Fitting the {C}hoquet Integral for Explainable Prediction and Analysis},
  booktitle = {Proc. 2024 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)},
  year      = {2024},
  pages     = {10611753},
  publisher = {IEEE},
  doi       = {10.1109/FUZZ-IEEE60900.2024.10611753},
  abstract  = {
When it comes to explainable prediction, there is great potential for modeling data with high accuracy and flexibility using fuzzy integrals such as the Choquet integral. In this contribution, we investigate the trade-off between flexibility and tractability when learning fuzzy measures, and propose a method involving random subset selection for reducing the size of the fitting problem when datasets are too large for learning a general fuzzy measure. We conduct some numerical experiments to compare some existing simplification approaches and show that random subset selection, especially when based on partitions, could serve as a suitable compromise if we want to incorporate interaction between larger subsets. We note the savings in both the number of variables and number of constraints required depending on how the random subsets are chosen.
}
}
@article{GagolewskiETAL2024:cvimst,
  author   = {Marek Gagolewski and Anna Cena and Maciej Bartoszuk and Lukasz Brzozowski},
  title    = {Clustering with minimum spanning trees: {H}ow good can it be?},
  journal  = {Journal of Classification},
  year     = {2024},
  doi      = {10.1007/s00357-024-09483-1},
  preprint = {https://link.springer.com/content/pdf/10.1007/s00357-024-09483-1.pdf},
  note     = {in press},
  keywords = {
hierarchical partitional clustering;
minimum spanning tree;
MST;
cluster validity measure;
single linkage;
Genie algorithm;
mutual information
},
  abstract = {
Minimum spanning trees (MSTs) provide a convenient representation of datasets in numerous pattern recognition activities. Moreover, they are relatively fast to compute. In this paper, we quantify the extent to which they are meaningful in low-dimensional partitional data clustering tasks. By identifying the upper bounds for the agreement between the best (oracle) algorithm and the expert labels from a large battery of benchmark data, we discover that MST methods can be very competitive. Next, we review, study, extend, and generalise a few existing, state-of-the-art MST-based partitioning schemes. This leads to some new noteworthy approaches. Overall, the Genie and the information-theoretic methods often outperform the non-MST algorithms such as K-means, Gaussian mixtures, spectral clustering, Birch, density-based, and classical hierarchical agglomerative procedures. Nevertheless, we identify that there is still some room for improvement, and thus the development of novel algorithms is encouraged.
}
}
@article{Gagolewski2024:nca,
  author   = {Marek Gagolewski},
  title    = {Normalised clustering accuracy: {A}n asymmetric external cluster validity measure},
  journal  = {Journal of Classification},
  year     = {2024},
  doi      = {10.1007/s00357-024-09482-2},
  preprint = {https://link.springer.com/content/pdf/10.1007/s00357-024-09482-2.pdf},
  note     = {in press},
  keywords = {
clustering;
external cluster validity;
optimal set matching;
normalisation;
accuracy;
adjusted Rand index;
mutual information
},
  abstract = {
There is no, nor will there ever be, single best clustering algorithm. Nevertheless, we would still like to be able to distinguish between methods that work well on certain task types and those that systematically underperform. Clustering algorithms are traditionally evaluated using either internal or external validity measures. Internal measures quantify different aspects of the obtained partitions, e.g., the average degree of cluster compactness or point separability. However, their validity is questionable because the clusterings they endorse can sometimes be meaningless. External measures, on the other hand, compare the algorithms' outputs to fixed ground truth groupings provided by experts. In this paper, we argue that the commonly used classical partition similarity scores, such as the normalised mutual information, Fowlkes--Mallows, or adjusted Rand index, miss some desirable properties. In particular, they do not identify worst-case scenarios correctly, nor are they easily interpretable. As a consequence, the evaluation of clustering algorithms on diverse benchmark datasets can be difficult. To remedy these issues, we propose and analyse a new measure: a version of the optimal set-matching accuracy, which is normalised, monotonic with respect to some similarity relation, scale-invariant, and corrected for the imbalancedness of cluster sizes (but neither symmetric nor adjusted for chance).
}
}
@article{BertoliBarsottiETAL2024:lorenz,
  author   = {Lucio Bertoli-Barsotti and Marek Gagolewski and Grzegorz Siudem and Barbara Żogała-Siudem},
  title    = {{G}ini-stable {L}orenz curves and their relation to the generalised {P}areto distribution},
  journal  = {Journal of Informetrics},
  year     = {2024},
  volume   = {18},
  number   = {2},
  pages    = {101499},
  doi      = {10.1016/j.joi.2024.101499},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2024-gini-stable-lorenz-curves.pdf},
  keywords = {
Gini index; Lorenz ordering; inequality; Generalised Pareto Distributions; information production process; econometrics and scientometrics
},
  abstract = {
We introduce an iterative discrete information production process where we can extend ordered normalised vectors by new elements based on a simple affine transformation, while preserving the predefined level of inequality, G, as measured by the Gini index.
Then, we derive the family of empirical Lorenz curves of the corresponding vectors and prove that it is stochastically ordered with respect to both the sample size and G which plays the role of the uncertainty parameter. We prove that asymptotically, we obtain all, and only, Lorenz curves generated by a new, intuitive parametrisation of the finite-mean Pickands' Generalised Pareto Distribution (GPD) that unifies three other families, namely: the Pareto Type II, exponential, and scaled beta distributions. The family is not only totally ordered with respect to the parameter G, but also, thanks to our derivations, has a nice underlying interpretation. Our result may thus shed a new light on the genesis of this family of distributions.
Our model fits bibliometric, informetric, socioeconomic, and environmental data reasonably well. It is quite user-friendly for it only depends on the sample size and its Gini index.
}
}
@article{WuETAL2024:randomfm,
  author   = {Jian-Zhang Wu and Gleb Beliakov and Simon James and Marek Gagolewski},
  title    = {Random generation of linearly constrained fuzzy measures and domain coverage performance evaluation},
  journal  = {Information Sciences},
  year     = {2024},
  volume   = {659},
  pages    = {120080},
  doi      = {10.1016/j.ins.2023.120080},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2024-random-linearly-constrained-fuzzy-measures.pdf},
  keywords = {Fuzzy measure; random generation; linear programming; domain coverage; convex combination},
  abstract = {
The random generation of fuzzy measures under complex linear constraints holds significance in various fields, including optimization solutions, machine learning, decision making, and property investigation. However, most existing random generation methods primarily focus on addressing the monotonicity and normalization conditions inherent in the construction of fuzzy measures, rather than the linear constraints that are crucial for representing special families of fuzzy measures and additional preference information. In this paper, we present two categories of methods to address the generation of linearly constrained fuzzy measures using linear programming models. These methods enable a comprehensive exploration and coverage of the entire feasible convex domain. The first category involves randomly selecting a subset and assigning measure values within the allowable range under given linear constraints. The second category utilizes convex combinations of constrained extreme fuzzy measures and vertex fuzzy measures. Then we employ some indices of fuzzy measures, objective functions, and distances to domain boundaries to evaluate the coverage performance of these methods across the entire feasible domain. We further provide enhancement techniques to improve the coverage ratios. Finally, we discuss and demonstrate potential applications of these generation methods in practical scenarios.
}
}
@article{GagolewskiETAL2023:owalink,
  author   = {Marek Gagolewski and Anna Cena and Simon James and Gleb Beliakov},
  title    = {Hierarchical clustering with {OWA}-based linkages, the {L}ance--{W}illiams formula, and dendrogram inversions},
  journal  = {Fuzzy Sets and Systems},
  year     = {2023},
  volume   = {473},
  pages    = {108740},
  doi      = {10.1016/j.fss.2023.108740},
  preprint = {https://arxiv.org/pdf/2303.05683.pdf},
  keywords = {OWA operators; hierarchical clustering; dendrogram; inversion; the Lance-Williams formula},
  abstract = {Agglomerative hierarchical clustering based on Ordered Weighted Averaging (OWA) operators not only generalises the single, complete, and average linkages, but also includes intercluster distances based on a few nearest or farthest neighbours, trimmed and winsorised means of pairwise point similarities, amongst many others. We explore the relationships between the famous Lance-Williams update formula and the extended OWA-based linkages with weights generated via infinite coefficient sequences. Furthermore, we provide some conditions for the weight generators to guarantee the resulting dendrograms to be free from unaesthetic inversions.}
}
@book{Gagolewski2023:deepr,
  author   = {Marek Gagolewski},
  title    = {Deep {R} Programming},
  year     = {2023},
  pages    = {456},
  edition  = {v1.0.0},
  address  = {Melbourne},
  isbn     = {978-0-6455719-2-9},
  doi      = {10.5281/zenodo.7490464},
  url      = {https://deepr.gagolewski.com/},
  preprint = {https://deepr.gagolewski.com/deepr.pdf},
  note     = {🔓},
  keywords = {R; S; programming; data wrangling; data science; statistics;
machine learning; data frames; matrices; vectors; tensors;
data cleansing; text processing; graphics},
  abstract = {
Deep R Programming is a comprehensive and in-depth introductory course
on one of the most popular languages for data science. It equips
ambitious students, professionals, and researchers with the knowledge
and skills to become independent users of this potent environment
so that they can tackle any problem related to data wrangling and
analytics, numerical computing, statistics, and machine learning.
This textbook is a non-profit project. Its online and PDF versions
are freely available at https://deepr.gagolewski.com/.
}
}
@article{BoczekETAL2023:benchmarkint,
  author   = {Michał Boczek and Marek Gagolewski and Marek Kaluszka and Andrzej Okolewski},
  title    = {A benchmark-type generalization of the {S}ugeno integral with applications in bibliometrics},
  journal  = {Fuzzy Sets and Systems},
  year     = {2023},
  volume   = {466},
  pages    = {108479},
  doi      = {10.1016/j.fss.2023.01.014},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2023benchmark-sugeno.pdf},
  keywords = {scientometric indices; h-index; Sugeno integral;
subadditivity; Jensen's inequality; monotone measure},
  abstract = {
We propose a new generalization of the classical Sugeno integral
motivated by the Hirsch, Woeginger, and other geometrically-inspired
indices of scientific impact. The new integral adapts to the rank-size
curve better as it allows for putting more emphasis on highly-valued
items and/or the tail of the distribution (level measure). We study
its fundamental properties and give the conditions guaranteeing the
fulfillment of subadditivity as well as the Jensen, Liapunov, Hardy,
Markov, and Paley-Zygmund type inequalities. We discuss its
applications in scientometrics.
}
}
@misc{Gagolewski2022:clustering-data-v1.1.0,
  author = {Marek Gagolewski and others},
  title  = {A benchmark suite for clustering algorithms: Version 1.1.0},
  year   = {2022},
  doi    = {10.5281/zenodo.7088171},
  url    = {https://github.com/gagolews/clustering-data-v1/releases/tag/v1.1.0}
}
@article{Gagolewski2022:clustering-benchmarks,
  author   = {Marek Gagolewski},
  title    = {A framework for benchmarking clustering algorithms},
  journal  = {SoftwareX},
  year     = {2022},
  volume   = {20},
  pages    = {101270},
  doi      = {10.1016/j.softx.2022.101270},
  url      = {https://clustering-benchmarks.gagolewski.com/},
  preprint = {https://arxiv.org/pdf/2209.09493.pdf},
  keywords = {
clustering; machine learning; benchmark data; noise points;
external cluster validity; partition similarity score
},
  abstract = {
The evaluation of clustering algorithms can involve running them
on a variety of benchmark problems, and comparing their outputs
to the reference, ground-truth groupings provided by experts.
Unfortunately, many research papers and graduate theses consider
only a small number of datasets. Also, the fact that there can be many
equally valid ways to cluster a given problem set is rarely taken into
account. In order to overcome these limitations, we have developed
a framework whose aim is to introduce a consistent methodology for
testing clustering algorithms. Furthermore, we have aggregated,
polished, and standardised many clustering benchmark dataset
collections referred to across the machine learning and data mining
literature, and included new datasets of different dimensionalities,
sizes, and cluster types. An interactive datasets explorer, the
documentation of the Python API, a description of the ways to
interact with the framework from other programming languages such
as R or MATLAB, and other details are all provided at
https://clustering-benchmarks.gagolewski.com/.
}
}
@article{ZogalaETAL2023:interpretable-citation-models,
  author   = {Barbara Żogała-Siudem and Anna Cena and Grzegorz Siudem
and Marek Gagolewski},
  title    = {Interpretable reparameterisations of citation models},
  journal  = {Journal of Informetrics},
  year     = {2023},
  volume   = {17},
  number   = {1},
  pages    = {101355},
  doi      = {10.1016/j.joi.2022.101355},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2023interpretable-citation-models.pdf},
  keywords = {science of science; bibliometric indices; informetrics;
citation models; interpretability
},
  abstract = {
This paper aims to find the reasons why some citation models can
predict a set of specific bibliometric indices extremely well.
We show why fitting a model that preserves the total sum of a vector
can be beneficial in the case of heavy-tailed data that are frequently
observed in informetrics and similar disciplines. Based on this
observation, we introduce the reparameterised versions of the discrete
generalised beta distribution (DGBD) and power law models that preserve
the total sum of elements in a citation vector and, as a byproduct,
they enjoy much better predictive power when predicting many
bibliometric indices as well as partial cumulative sums. This also
results in the underlying model parameters' being easier to fit
numerically. Moreover, they are also more interpretable. Namely,
just like in our recently-introduced 3DSI (three dimensions of
scientific impact) model, we have a clear distinction between the
coefficients determining the total productivity (size), total impact
(sum), and those that affect the shape of the resulting theoretical
curve.
}
}
@book{Gagolewski2021:lmlcr,
  author   = {Marek Gagolewski},
  title    = {Lightweight Machine Learning Classics with R},
  year     = {2022},
  address  = {Melbourne},
  doi      = {10.5281/zenodo.3679976},
  url      = {https://lmlcr.gagolewski.com/},
  preprint = {https://lmlcr.gagolewski.com/lmlcr.pdf},
  note     = {draft:v0.2.3 🔓},
  keywords = {machine learning; classification; regression; clustering;
recommender systems; optimisation; R},
  abstract = {
Explore some of the most fundamental algorithms which have stood the
test of time and provide the basis for innovative solutions in
data-driven AI. Learn how to use the R language for implementing
various stages of data processing and modelling activities.
Appreciate mathematics as the universal language for formalising
data-intense problems and communicating their solutions.
The book is for you if you're yet to be fluent with university-level
linear algebra, calculus and probability theory or you've forgotten
all the maths you've ever learned, and are seeking a gentle,
albeit thorough, introduction to the topic.
This textbook is a non-profit project. Its online and PDF versions
are freely available at https://lmlcr.gagolewski.com/.
}
}
@article{BeliakovGagolewskiJames2022:antibuoyant,
  author   = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title    = {Reduction of variables and constraints in fitting antibuoyant fuzzy
measures to data using linear programming},
  journal  = {Fuzzy Sets and Systems},
  year     = {2022},
  volume   = {451},
  pages    = {266--284},
  doi      = {10.1016/j.fss.2022.06.025},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022antibuoyant.pdf},
  keywords = {fuzzy measures; Choquet integral; supermodularity;
capacities; progressive transfers},
  abstract = {
The discrete Choquet integral with respect to various types
of fuzzy measures serves as an important aggregation function which
accounts for mutual dependencies between the inputs. The Choquet
integral can be used as an objective (or constraint) in optimisation
problems, and the type of fuzzy measure used determines its complexity.
This paper examines the class of antibuoyant fuzzy measures, which
restrict the supermodular (convex) measures and satisfy the Pigou–Dalton
progressive transfers principle. We determine subsets of extreme points
of the set of antibuoyant fuzzy measures, whose convex combinations form
a basis of three proposed algorithms for random generation of fuzzy
measures from that class, and also for fitting fuzzy measures to
empirical data or solving best approximation problems. Potential
applications of the proposed methods are envisaged in social
welfare, ecology, and optimisation.
}
}
@article{GerasETAL2022:timetovote,
  author   = {Agnieszka Geras and Grzegorz Siudem and Marek Gagolewski},
  title    = {Time to vote: {T}emporal clustering of user activity
on {S}tack {O}verflow},
  journal  = {Journal of the Association for Information Science and Technology},
  year     = {2022},
  volume   = {73},
  number   = {12},
  pages    = {1681--1691},
  doi      = {10.1002/asi.24658},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022timetovote.pdf},
  keywords = {clustering; inter-event times; log-normal mixtures;
Q\&A networks; Stack Overflow; burstiness},
  abstract = {
Question-and-answer (Q\&A) sites improve access to information
and ease transfer of knowledge. In recent years, they have grown
in popularity and importance, enabling research on behavioural
patterns of their users. We study the dynamics related to the casting
of 7M votes across a sample of 700k posts on Stack Overflow,
a large community of professional software developers. We employ
log-Gaussian mixture modelling and Markov chains to formulate
a simple yet elegant description of the considered phenomena.
We indicate that the inter-event times can naturally be clustered
into 3 typical time scales: those which occur within hours, weeks,
and months and show how the events become rarer and rarer as time
passes. It turns out that the posts' popularity in a short period
after publication is a weak predictor of its overall success,
contrary to what was observed, e.g., in case of YouTube clips.
Nonetheless, the sleeping beauties sometimes awake and can receive
bursts of votes following each other relatively quickly.
}
}
@article{SiudemNowakGagolewski2022:pricepareto2,
  author   = {Grzegorz Siudem and Przemysław Nowak and Marek Gagolewski},
  title    = {Power laws, the {P}rice model, and the {P}areto type-2 distribution},
  journal  = {Physica A: Statistical Mechanics and its Applications},
  year     = {2022},
  volume   = {606},
  pages    = {128059},
  doi      = {10.1016/j.physa.2022.128059},
  preprint = {https://arxiv.org/pdf/2201.11456},
  keywords = {Price model; Pareto distribution; power laws;
rich get richer; complex networks; citations
},
  abstract = {
We consider a version of D. Price's model for the growth
of a bibliographic network, where in each iteration, a constant number
of citations is randomly allocated according to a weighted combination
of the accidental (uniformly distributed) and the preferential
(rich-get-richer) rule. Instead of relying on the typical master
equation approach, we formulate and solve this problem in terms of
the rank-size distribution. We show that, asymptotically, such a process
leads to a Pareto-type 2 distribution with a new, appealingly
interpretable parametrisation. We prove that the solution to the Price
model expressed in terms of the rank-size distribution coincides with
the expected values of order statistics in an independent Paretian
sample. An empirical analysis of a large repository of academic papers
yields a good fit not only in the tail of the distribution (as it is
usually the case in the power law-like framework), but also across
a significantly larger fraction of the data domain.
}
}
@book{Gagolewski2022:datawranglingpy,
  author   = {Marek Gagolewski},
  title    = {Minimalist Data Wrangling with Python},
  year     = {2023},
  pages    = {442},
  edition  = {v1.0.3},
  address  = {Melbourne},
  isbn     = {978-0-6455719-1-2},
  doi      = {10.5281/zenodo.6451068},
  url      = {https://datawranglingpy.gagolewski.com/},
  preprint = {https://datawranglingpy.gagolewski.com/datawranglingpy.pdf},
  note     = {🔓},
  keywords = {data wrangling; data science; Python; numpy; scipy; pandas;
matplotlib; regression; classification; clustering; scikit-learn;
time series; text processing; data frames; matrices; vectors;
data cleansing; missing values; outliers},
  abstract = {
Minimalist Data Wrangling with Python is envisaged as a student's
first introduction to data science, providing a high-level overview
as well as discussing key concepts in detail. We explore methods
for cleaning data gathered from different sources, transforming,
selecting, and extracting features, performing exploratory data
analysis and dimensionality reduction, identifying naturally
occurring data clusters, modelling patterns in data, comparing
data between groups, and reporting the results.
This textbook is a non-profit project. Its online and PDF versions
are freely available at https://datawranglingpy.gagolewski.com/.
}
}
@book{Gagolewski2022:aipp,
  author   = {Marek Gagolewski},
  title    = {Algorytmy i podstawy programowania w języku C++
(Introduction to Algorithms and Programming in {C++})},
  year     = {2022},
  pages    = {209},
  edition  = {v1.2.0},
  address  = {Melbourne},
  isbn     = {978-0-6455719-0-5},
  doi      = {10.5281/zenodo.6451054},
  url      = {https://github.com/gagolews/aipp},
  preprint = {https://raw.githubusercontent.com/gagolews/aipp/master/aipp.pdf},
  note     = {🇵🇱 🔓},
  keywords = {algorytmy; programowanie; C++},
  abstract = {
Skrypt do wykładu z Algorytmów i podstaw programowania
w języku C++, prowadzonego w latach 2010–2016 na Wydziale Matematyki
i Nauk Informacyjnych Politechniki Warszawskiej dla studentów I roku
kierunku Matematyka. Zawiera wiele przykładowych zadań na ćwiczenia
i laboratoria. Książka dystrybuowana jest bezpłatnie.
}
}
@article{GagolewskiETAL2022:ockham,
  author   = {Marek Gagolewski and Barbara Żogała-Siudem
and Grzegorz Siudem and Anna Cena},
  title    = {{O}ckham's index of citation impact},
  journal  = {Scientometrics},
  year     = {2022},
  volume   = {127},
  pages    = {2829--2845},
  doi      = {10.1007/s11192-022-04345-2},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022ockham.pdf},
  keywords = {3DSI model; h-index; g-index; w-index;
equivalence of bibliometric indices},
  abstract = {
We demonstrate that by using a triple of simple numerical
summaries: an author's productivity, their overall impact, and a single
other bibliometric index that aims to capture the shape
of the citation distribution, we can reconstruct other popular metrics
of bibliometric impact with a sufficient degree of precision.
We thus conclude that the use of many indices may be unnecessary –
entities should not be multiplied beyond
necessity. Such a study was possible thanks to our new agent-based model
(Siudem, Żogała-Siudem, Cena, Gagolewski; PNAS 117; 2020), which
not only assumes that citations are distributed according to a mixture
of the rich-get-richer rule and sheer chance, but also fits real
bibliometric data quite well. We investigate which bibliometric
indices have good discriminative power, which measures can be easily
predicted as functions of other ones, and what implications to the
research evaluation practice our findings have.}
}
@article{MrowinskiETAL2022:journals,
  author   = {Maciej J. Mrowiński and Marek Gagolewski and Grzegorz Siudem},
  title    = {Accidentality in journal citation patterns},
  journal  = {Journal of Informetrics},
  year     = {2022},
  volume   = {16},
  number   = {4},
  pages    = {101341},
  doi      = {10.1016/j.joi.2022.101341},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022journals.pdf},
  keywords = {complex networks; DBLP; CORE; rich get richer; Pareto principle},
  abstract = {
We study an agent-based model for generating citation distributions
in complex networks of scientific papers, where a fraction of citations
is allotted according to the preferential attachment rule
(rich get richer) and the remainder is allocated accidentally
(purely at random, uniformly). Previously, we derived and analysed
such a process in the context of describing individual authors,
but now we apply it to scientific journals in computer and
information sciences. Based on the large DBLP dataset as well as the
CORE (Computing Research and Education Association of Australasia)
journal ranking, we find that the impact of journals is correlated
with the degree of accidentality of their citation distribution.
Citations to impactful journals tend to be more preferential,
while citations to lower-ranked journals are distributed in a more
accidental manner. Further, applied fields of research such as
artificial intelligence seem to be driven by a stronger preferential
component – and hence have a higher degree of inequality – than the more
theoretical ones, e.g., mathematics and computation theory.
}
}
@article{CenaETAL2022:validcit,
  author   = {Anna Cena and Marek Gagolewski
and Grzegorz Siudem and Barbara Żogała-Siudem},
  title    = {Validating citation models by proxy indices},
  journal  = {Journal of Informetrics},
  year     = {2022},
  volume   = {16},
  number   = {2},
  pages    = {101267},
  doi      = {10.1016/j.joi.2022.101267},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022validcit.pdf},
  keywords = {science of science; bibliometric indices; scientometrics;
citation models; power law},
  abstract = {
There are many approaches to the modelling of citation vectors
of individual authors. Models may serve different purposes, but usually
they are evaluated with regards to how well they align to citation
distributions in large networks of papers. Here we compare a few
leading models in terms of their ability to correctly reproduce
the values of selected bibliometric indices of individual authors.
Our recently-proposed three-dimensional model of scientific impact
serves this purpose equally well as the discrete generalised beta
distribution and the log-normal models, but has fewer parameters which
additionally are all easy to interpret. We also indicate which indices
can be predicted with high accuracy and which are more difficult to
model.}
}
@article{Gagolewski2022:stringi,
  author   = {Marek Gagolewski},
  title    = {{stringi}: {F}ast and portable character string processing in {R}},
  journal  = {Journal of Statistical Software},
  year     = {2022},
  volume   = {103},
  number   = {2},
  pages    = {1--59},
  doi      = {10.18637/jss.v103.i02},
  url      = {https://stringi.gagolewski.com/},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022stringi.pdf},
  keywords = {stringi; character strings; text; ICU; Unicode;
regular expressions; data cleansing; natural language processing; R},
  abstract = {
Effective processing of character strings is required at various
stages of data analysis pipelines: from data cleansing and preparation,
through information extraction, to report generation. Pattern searching,
string collation and sorting, normalisation, transliteration,
and formatting are ubiquitous in text mining, natural language
processing, and bioinformatics. This paper discusses and demonstrates
how and why stringi, a mature R package for fast and
portable handling of string data based on the ICU library
(International Components for Unicode), should be included in each
statistician's or data scientist's repertoire to complement their
numerical computing and data wrangling skills.}
}
@article{BeliakovGagolewskiJames2022:mobius,
  author   = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title    = {Hierarchical data fusion processes involving
the {M}öbius representation of capacities},
  journal  = {Fuzzy Sets and Systems},
  year     = {2022},
  volume   = {433},
  pages    = {1--21},
  doi      = {10.1016/j.fss.2021.02.006},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2022mobius.pdf},
  keywords = {non-additive measures; capacities; fuzzy measures;
2-step Choquet integral; aggregation operators; high dimensional data},
  abstract = {
The use of the Choquet integral in data fusion processes allows
for the effective modelling of interactions and dependencies between
data features or criteria. Its application requires identification
of the defining capacity (also known as fuzzy measure) values.
The main limiting factor is the complexity of the underlying parameter
learning problem, which grows exponentially in the number of variables.
However, in practice we may have expert knowledge regarding which of
the subsets of criteria interact with each other, and which groups are
independent. In this paper we study hierarchical aggregation processes,
architecturally similar to feed-forward neural networks, but which
allow for the simplification of the fitting problem both in terms
of the number of variables and monotonicity constraints. We note that
the Möbius representation lets us identify a number of relationships
between the overall fuzzy measure and the data pipeline structure.
Included in our findings are simplified fuzzy measures that generalise
both k-intolerant and k-interactive capacities.}
}
@article{GagolewskiBartoszukCena2021:cvi,
  author   = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
  title    = {Are cluster validity measures (in)valid?},
  journal  = {Information Sciences},
  volume   = {581},
  pages    = {620--636},
  year     = {2021},
  doi      = {10.1016/j.ins.2021.10.004},
  url      = {https://github.com/gagolews/optim_cvi},
  preprint = {https://arxiv.org/pdf/2208.01261},
  keywords = {clustering methodology;
cluster validity index; Dunn index; nearest neighbours (NNs);
ordered weighted averaging (OWA) operator; no free lunch
},
  abstract = {
Internal cluster validity measures (such as the Caliński--Harabasz,
Dunn, or Davies--Bouldin indices) are frequently used for selecting
the appropriate number of partitions a dataset should be split into.
In this paper we consider what happens if we treat
such indices as objective functions in unsupervised learning activities.
Is the optimal grouping with regards to, say, the Silhouette index
really meaningful?
It turns out that many cluster (in)validity indices
promote clusterings that match expert knowledge quite poorly.
We also introduce a new, well-performing variant of the Dunn index that
is built upon OWA operators and the near-neighbour graph
so that subspaces of higher density,
regardless of their shapes, can be separated from each other better.
}
}
@article{BartoszukGagolewski2021:tnormsimilar,
  author   = {Maciej Bartoszuk and Marek Gagolewski},
  title    = {T-norms or t-conorms? {H}ow to aggregate similarity
degrees for plagiarism detection},
  journal  = {Knowledge-Based Systems},
  volume   = {231},
  pages    = {107427},
  year     = {2021},
  doi      = {10.1016/j.knosys.2021.107427},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021tnormsimilar.pdf},
  keywords = {fuzzy logic connectives; similarity aggregation;
decision making; data-driven optimisation; R language},
  abstract = {
Making correct decisions as to whether code chunks should be
considered similar becomes increasingly important in software design
and education and not only can improve the quality of computer programs,
but also help assure the integrity of student assessments. In this paper
we test numerous source code similarity detection tools on pairs of code
fragments written in the data science-oriented functional programming
language R. Contrary to mainstream approaches, instead of considering
symmetric measures of “how much code chunks A and B are similar to
each other”, we propose and study the nonsymmetric degrees of inclusion
“to what extent A is a subset of B” and “to what degree B is included
in A”. Overall, t-norms yield better precision (how many suspicious
pairs are actually similar), t-conorms maximise recall (how many
similar pairs are successfully retrieved), and custom aggregation
functions fitted to training data provide a good balance between
the two. Also, we find that program dependence graph-based methods
tend to outperform those relying on normalised source code text,
tokens, and names of functions invoked.
}
}
@article{LasekGagolewski2021:iteratings,
title = {Interpretable sports team rating models based
on the gradient descent algorithm},
author = {Jan Lasek and Marek Gagolewski},
journal = {International Journal of Forecasting},
doi = {10.1016/j.ijforecast.2020.11.008},
year = {2021},
volume = {37},
number = {3},
pages = {1061--1071},
abstract = {
We introduce several new sport team rating models based upon
the gradient descent algorithm. More precisely, the models can be
formulated by maximising the likelihood of match results observed
using a single step of this optimisation heuristic.
The framework proposed, inspired by the prominent Elo rating system,
yields an iterative version of the ordinal logistic regression
as well as different variants of the Poisson regression-based models.
This construction makes the update equations easy to interpret
as well as adjusts ratings once new match results are observed.
Thus, it naturally handles temporal changes in team strength. Moreover,
a study of association football data indicates that the new models yield
more accurate forecasts and are less computationally demanding than
corresponding methods that jointly optimise likelihood for the whole
set of matches.
},
keywords = {rating systems; association football;
match outcome forecasting; gradient descent; Poisson regression;
ordinal logistic regression; Elo rating system},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021iteratings.pdf}
}
@article{Gagolewski2021:genieclust,
  author   = {Marek Gagolewski},
  title    = {{genieclust}: {F}ast and robust hierarchical clustering},
  journal  = {SoftwareX},
  volume   = {15},
  pages    = {100722},
  year     = {2021},
  doi      = {10.1016/j.softx.2021.100722},
  url      = {https://genieclust.gagolewski.com/},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021genieclust.pdf},
  keywords = {hierarchical clustering; robust methods;
noise points; Python; R},
  abstract = {
genieclust is an open source Python and R package that
implements the hierarchical clustering algorithm called Genie.
This method frequently outperforms other state-of-the-art approaches
in terms of clustering quality and speed, supports various distances over
dense, sparse, and string data domains, and can be robustified even
further with the built-in noise point detector. As domain-independent
software, it can be used for solving problems arising in all data-driven
research and development activities, including environmental, health,
biological, physical, decision, and social sciences as well as
technology and engineering. The Python version provides a
scikit-learn-compliant API, whereas the R variant is compatible with the
classic hclust(). Numerous tutorials, use cases, non-trivial examples,
documentation, installation instructions, benchmark results and timings
can be found at https://genieclust.gagolewski.com/.
}
}
@article{PerezGagolewskiDeBaets2021:compositional,
  author   = {Raúl Pérez-Fernández and Marek Gagolewski and De~Baets, Bernard},
  title    = {On the aggregation of compositional data},
  journal  = {Information Fusion},
  volume   = {73},
  pages    = {103--110},
  year     = {2021},
  doi      = {10.1016/j.inffus.2021.02.021},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2021compositional.pdf},
  keywords = {aggregation; compositional data; beset; centroid},
  abstract = {
Compositional data naturally appear in many fields of
application. For instance, in chemistry, the relative contributions
of different chemical substances to a product are typically described
in terms of a compositional data vector. Although the aggregation
of compositional data frequently arises in practice, the functions
formalizing this process do not fit the standard order-based
aggregation framework. This is due to the fact that there is
no intuitive order that carries the semantics of the set of
compositional data vectors (referred to as the standard simplex).
In this paper, we consider the more general betweenness-based
aggregation framework that yields a natural definition of an
aggregation function for compositional data.
The weighted centroid is proved to fit within this definition and
discussed to be linked to a very tangible interpretation. Other
functions for the aggregation of compositional data are presented
and their fit within the proposed definition is discussed.
}
}
@article{BeliakovGagolewskiJames2020:dcsugeno,
  author   = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title    = {{DC} optimization for constructing discrete {S}ugeno
integrals and learning nonadditive measures},
  journal  = {Optimization},
  volume   = {69},
  number   = {12},
  pages    = {2515--2534},
  year     = {2020},
  doi      = {10.1080/02331934.2019.1705300},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020dcsugeno.pdf},
  keywords = {aggregation functions; nonadditive measures; Sugeno integral;
capacities; DC optimization},
  abstract = {
Defined solely by means of order-theoretic operations meet
(min) and join (max), weighted lattice polynomial functions
are particularly useful for modelling data on an ordinal scale.
A special case, the discrete Sugeno integral, defined with respect
to a nonadditive measure (a capacity), enables accounting for the
interdependencies between input variables. However, until recently
the problem of identifying the fuzzy measure values with respect to
various objectives and requirements has not received a great deal of
attention. By expressing the learning problem as the difference of
convex functions, we are able to apply DC (difference of convex)
optimization methods. Here we formulate one of the global optimization
steps as a local linear programming problem and investigate the
improvement under different conditions.
}
}
@article{BartoszukGagolewski2020:similar,
author = {Maciej Bartoszuk and Marek Gagolewski},
title = {{SimilaR}: {R} Code Clone and Plagiarism Detection},
journal = {The R Journal},
doi = {10.32614/RJ-2020-017},
url = {https://CRAN.R-project.org/package=SimilaR},
year = {2020},
pages = {367--385},
volume = {12},
number = {1},
abstract = {
Third-party software for assuring source code quality
is becoming increasingly popular. Tools that evaluate the coverage
of unit tests, perform static code analysis, or inspect run-time memory
use are crucial in the software development life cycle. More
sophisticated methods allow for performing meta-analyses of large
software repositories, e.g., to discover abstract topics they relate
to or common design patterns applied by their developers. They may be
useful in gaining a better understanding of the component
interdependencies, avoiding cloned code as well as detecting plagiarism
in programming classes. A meaningful
measure of similarity of computer programs often forms the basis of
such tools. While there are a few noteworthy instruments for
similarity assessment, none of them turns out particularly suitable
for analysing R code chunks. Existing solutions rely on rather
simple techniques and heuristics and fail to provide a user with
the kind of sensitivity and specificity required for working with
R scripts. In order to fill this gap, we propose a new algorithm
based on a Program Dependence Graph, implemented in the SimilaR package.
It can serve as a tool not only for improving R code quality but also
for detecting plagiarism, even when it has been masked by applying some
obfuscation techniques or imputing dead code. We demonstrate its
accuracy and efficiency in a real-world case study.
},
keywords = {plagiarism detection; R; code clones},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020similar.pdf}
}
@article{SiudemZogalaCenaGagolewski2020:pnas3d,
author = {Grzegorz Siudem and Barbara Żogała-Siudem
and Anna Cena and Marek Gagolewski},
title = {Three dimensions of scientific impact},
journal = {Proceedings of the National Academy of Sciences
of the United States of America (PNAS)},
doi = {10.1073/pnas.2001064117},
number = {25},
pages = {13896--13900},
volume = {117},
year = {2020},
abstract = {
The growing popularity of bibliometric indexes
(whose most famous example is the h index by J. E. Hirsch
[J. E. Hirsch, Proc. Natl. Acad. Sci. U.S.A. 102, 16569--16572 (2005)])
is opposed by those claiming that one's scientific impact cannot be reduced
to a single number. Some even believe that our complex reality fails
to submit to any quantitative description. We argue that neither of
the two controversial extremes is true. By assuming that some citations
are distributed according to the rich get richer rule (success breeds
success, preferential attachment) while some others are assigned totally
at random (all in all, a paper needs a bibliography), we have crafted
a model that accurately summarizes citation records with merely
three easily interpretable parameters: productivity, total impact,
and how lucky an author has been so far.
},
keywords = {science of science; scientometrics; bibliometric indexes; rich get richer},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020pnas3d.pdf}
}
@article{CoroianuETAL2019:owacomonotone,
  author   = {Lucian Coroianu and Robert Fullér
and Marek Gagolewski and Simon James},
  title    = {Constrained ordered weighted averaging aggregation
with multiple comonotone constraints},
  journal  = {Fuzzy Sets and Systems},
  volume   = {395},
  pages    = {21--39},
  year     = {2020},
  doi      = {10.1016/j.fss.2019.09.006},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019owacomonotone.pdf},
  keywords = {multiple criteria evaluation; ordered weighted averaging;
constrained OWA aggregation; ecology; work allocation},
  abstract = {
The constrained ordered weighted averaging (OWA) aggregation
problem arises when we aim to maximize or minimize a convex combination
of order statistics under linear inequality constraints that act on
the variables with respect to their original sources. The standalone
approach to optimizing the OWA under constraints is to consider all
permutations of the inputs, which becomes quickly infeasible when
there are more than a few variables, however in certain cases we can
take advantage of the relationships amongst the constraints and
the corresponding solution structures. For example, we can consider
a land-use allocation satisfaction problem with an auxiliary aim of
balancing land-types, whereby the response curves for each species
are non-decreasing with respect to the land-types. This results in
comonotone constraints, which allow us to drastically reduce
the complexity of the problem.
In this paper, we show that if we have an arbitrary number of
constraints that are comonotone (i.e., they share the same ordering
permutation of the coefficients), then the optimal solution occurs
for decreasing components of the solution. After investigating the
form of the solution in some special cases and providing theoretical
results that shed light on the form of the solution, we detail
practical approaches to solving and give real-world examples.
}
}
@article{CenaGagolewski2020:genieowa,
  author   = {Anna Cena and Marek Gagolewski},
  title    = {{Genie+OWA}: {R}obustifying hierarchical clustering with {OWA}-based linkages},
  journal  = {Information Sciences},
  volume   = {520},
  pages    = {324--336},
  year     = {2020},
  doi      = {10.1016/j.ins.2020.02.025},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2020genieowa.pdf},
  keywords = {hierarchical clustering; OWA; data fusion; aggregation; Genie},
  abstract = {
We investigate the application of the Ordered Weighted
Averaging (OWA) data fusion operator in agglomerative hierarchical
clustering. The examined setting generalises the well-known single,
complete and average linkage schemes. It allows to embody expert
knowledge in the cluster merge process and to provide a much wider
range of possible linkages. We analyse various families of weighting
functions on numerous benchmark data sets in order to assess their
influence on the resulting cluster structure. Moreover, we inspect
the correction for the inequality of cluster size distribution --
similar to the one in the Genie algorithm. Our results demonstrate
that by robustifying the procedure with the Genie correction,
we can obtain a significant performance boost in terms of clustering
quality. This is particularly beneficial in the case of the linkages
based on the closest distances between clusters, including the single
linkage and its "smoothed" counterparts. To explain this behaviour,
we propose a new linkage process called three-stage OWA which yields
further improvements. This way we confirm the intuition that
hierarchical cluster analysis should rather take into account
a few nearest neighbours of each point, instead of trying to adapt
to their non-local neighbourhood.
}
}
@article{GagolewskiPerezDeBaets2020:inherent,
author = {Marek Gagolewski and Raúl Pérez-Fernández and De~Baets, Bernard},
title = {An inherent difficulty in the aggregation of multidimensional data},
journal = {IEEE Transactions on Fuzzy Systems},
doi = {10.1109/TFUZZ.2019.2908135},
number = {3},
pages = {602--606},
volume = {28},
year = {2020},
abstract = {In the field of information fusion, the problem of data
aggregation has been formalized as an order-preserving process that
builds upon the property of monotonicity. However, fields such as
computational statistics, data analysis and geometry, usually emphasize
the role of equivariances to various geometrical transformations in
aggregation processes. Admittedly, if we consider a unidimensional
data fusion task, both requirements are often compatible with each
other. Nevertheless, in this paper we show that, in the
multidimensional setting, the only idempotent functions that are
monotone and orthogonal equivariant are the over-simplistic weighted
centroids. Even more, this result still holds after replacing
monotonicity and orthogonal equivariance by the weaker property of
orthomonotonicity. This implies that the aforementioned approaches
to the aggregation of multidimensional data are irreconcilable,
and that, if a weighted centroid is to be avoided, we must choose
between monotonicity and a desirable behaviour with regard to
orthogonal transformations.
},
keywords = {multidimensional data aggregation; monotonicity; orthogonal equivariance; centroid}
}
@article{BeliakovGagolewskiJames2019:SugenoRobustGeneral,
  author   = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title    = {Robust fitting for the {S}ugeno integral
with respect to general fuzzy measures},
  journal  = {Information Sciences},
  volume   = {514},
  pages    = {449--461},
  year     = {2020},
  doi      = {10.1016/j.ins.2019.11.024},
  keywords = {Sugeno integral; fuzzy measure; parameter learning;
aggregation functions},
  abstract = {
The Sugeno integral is an expressive aggregation function with
potential applications across a range of decision contexts.
Its calculation requires only the lattice minimum and maximum
operations, making it particularly suited to ordinal data and robust
to scale transformations. However, for practical use in data analysis
and prediction, we require efficient methods for learning the associated
fuzzy measure. While such methods are well developed for the Choquet
integral, the fitting problem is more difficult for the Sugeno integral
because it is not amenable to being expressed as a linear combination
of weights, and more generally due to plateaus and non-differentiability
in the objective function. Previous research has hence focused on
heuristic approaches or simplified fuzzy measures. Here we show
that the problem of fitting the Sugeno integral to data such that
the maximum absolute error is minimized can be solved using an
efficient bilevel program. This method can be incorporated into
algorithms that learn fuzzy measures with the aim of minimizing
the median residual. This equips us with tools that make the Sugeno
integral a feasible option in robust data regression and analysis.
We provide experimental comparison with a genetic algorithms approach
and an example in data analysis.
}
}
@article{GerasETAL2020:dislike,
  author   = {Agnieszka Geras and Grzegorz Siudem and Marek Gagolewski},
  title    = {Should we introduce a dislike button for academic papers?},
  journal  = {Journal of the Association for Information Science and Technology},
  volume   = {71},
  number   = {2},
  pages    = {221--229},
  year     = {2020},
  doi      = {10.1002/ASI.24231},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019dislike.pdf},
  keywords = {citation analysis; the Hirsch index; negative citations; research evaluation; science of science},
  abstract = {
On the grounds of the revealed, mutual resemblance between the behaviour of users of the Stack Exchange and the dynamics of the citations accumulation process in the scientific community, we tackled an outwardly intractable problem of assessing the impact of introducing negative citations.
Although the most frequent reason to cite a paper is to highlight the connection between the two publications, researchers sometimes mention an earlier work to cast a negative light. While computing citation-based scores, for instance the h-index, information about the reason why a paper was mentioned is neglected. Therefore it can be questioned whether these indices describe scientific achievements accurately. In this contribution we shed insight into the problem of negative citations, analysing data from Stack Exchange and, to draw more universal conclusions, we derive an approximation of citations scores. Here we show that the quantified influence of introducing negative citations is of lesser importance and that they could be used as an indicator of where attention of scientific community is allocated.
}
}
@article{PerezDeBaetsGagolewski2019:taxonomy,
  author   = {Raúl Pérez-Fernández and De~Baets, Bernard and Marek Gagolewski},
  title    = {A taxonomy of monotonicity properties for the
aggregation of multidimensional data},
  journal  = {Information Fusion},
  volume   = {52},
  pages    = {322--334},
  year     = {2019},
  doi      = {10.1016/j.inffus.2019.05.006},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019taxonomymonotonicity.pdf},
  keywords = {
Monotonicity; Aggregation; Multidimensional data;
Centroid; Spatial median
},
  abstract = {
The property of monotonicity, which requires a function to preserve
a given order, has been considered the standard in the aggregation
of real numbers for decades. In this paper, we argue that, for the
case of multidimensional data, an order-based definition of monotonicity
is far too restrictive. We propose several meaningful alternatives to
this property not involving the preservation of a given order by
returning to its early origins stemming from the field of calculus.
Numerous aggregation methods for multidimensional data commonly
used by practitioners are studied within our new framework.
}
}
@article{GagolewskiJamesBeliakov2019:fitsugenol1,
  author   = {Marek Gagolewski and Simon James and Gleb Beliakov},
  title    = {Supervised learning to aggregate data with the {S}ugeno integral},
  journal  = {IEEE Transactions on Fuzzy Systems},
  volume   = {27},
  number   = {4},
  pages    = {810--815},
  year     = {2019},
  doi      = {10.1109/TFUZZ.2019.2895565},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2019fitsugenol1.pdf},
  keywords = {Fuzzy measures; h-index; lattice polynomials;
ordinal data fitting; Sugeno integral; weight learning},
  abstract = {
The problem of learning symmetric capacities (or fuzzy measures)
from data is investigated toward applications in data analysis and
prediction as well as decision making. Theoretical results regarding
the solution minimizing the mean absolute error are exploited to develop
an exact branch-refine-and-bound-type algorithm for fitting Sugeno
integrals (weighted lattice polynomial functions, max-min operators)
with respect to symmetric capacities. The proposed method turns out
to be particularly suitable for acting on ordinal data. In addition
to providing a model that can be used for the general data regression
task, the results can be used, among others, to calibrate generalized
h-indices to bibliometric data.
}
}
@article{CoroianuGagolewskiGrzegorzewski2019:piecewise,
  author  = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski},
  title   = {Piecewise linear approximation of fuzzy numbers:
{A}lgorithms, arithmetic operations and stability of characteristics},
  journal = {Soft Computing},
  volume  = {23},
  number  = {19},
  pages   = {9491--9505},
  year    = {2019},
  doi     = {10.1007/s00500-019-03800-2},
  url     = {https://CRAN.R-project.org/package=FuzzyNumbers}
}
@article{BeliakovGagolewskiJames2019:SugenoBiomed,
  author  = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title   = {Aggregation on ordinal scales with the {S}ugeno integral
for biomedical applications},
  journal = {Information Sciences},
  volume  = {501},
  pages   = {377--387},
  year    = {2019},
  doi     = {10.1016/j.ins.2019.06.023}
}
@incollection{CoroianuGagolewski2019:penaltyvector,
  author    = {Lucian Coroianu and Marek Gagolewski},
  title     = {Penalty-based data aggregation in real normed vector spaces},
  booktitle = {New Trends in Aggregation Theory},
  editor    = {Radomír Halaš and others},
  series    = {Advances in Intelligent Systems and Computing},
  volume    = {981},
  pages     = {160--171},
  year      = {2019},
  publisher = {Springer},
  doi       = {10.1007/978-3-030-19494-9_15}
}
@article{LasekGagolewski2018:leagues,
  author   = {Jan Lasek and Marek Gagolewski},
  title    = {The efficacy of league formats in ranking teams},
  journal  = {Statistical Modelling},
  volume   = {18},
  number   = {5--6},
  pages    = {411--435},
  year     = {2018},
  doi      = {10.1177/1471082X18798426},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2018leagues.pdf}
}
@article{BeliakovETAL2019:traffic,
author = {Gleb Beliakov and Marek Gagolewski and Simon James
and Shannon Pace and Nicola Pastorello and Elodie Thilliez and Rajesh Vasa},
doi = {10.1016/j.asoc.2017.07.014},
journal = {Applied Soft Computing},
pages = {910--919},
title = {Measuring traffic congestion: {A}n approach based on learning
weighted inequality, spread and aggregation indices from comparison data},
volume = {67},
year = {2018},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2018traffic.pdf}
}
@incollection{BeliakovETAL2018:lmslts,
  author    = {Gleb Beliakov and Marek Gagolewski and Simon James},
  title     = {Least median of squares ({LMS}) and least trimmed squares ({LTS})
fitting for the weighted arithmetic mean},
  booktitle = {Information Processing and Management of Uncertainty
in Knowledge-Based Systems. Theory and Foundations},
  editor    = {Jesus Medina and others},
  pages     = {367--378},
  year      = {2018},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-91476-3_31}
}
@incollection{BartoszukGagolewski2017:binagopplagiarism,
  author    = {Maciej Bartoszuk and Marek Gagolewski},
  title     = {Binary aggregation functions in software plagiarism detection},
  booktitle = {Proc. 2017 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)},
  pages     = {8015582},
  year      = {2017},
  publisher = {IEEE},
  doi       = {10.1109/FUZZ-IEEE.2017.8015582}
}
@incollection{CenaGagolewski2017:owagenie,
  author    = {Anna Cena and Marek Gagolewski},
  title     = {{OWA}-based linkage and the {G}enie correction for hierarchical clustering},
  booktitle = {Proc. 2017 IEEE International Conference on Fuzzy Systems (FUZZ-IEEE)},
  pages     = {8015652},
  year      = {2017},
  publisher = {IEEE},
  doi       = {10.1109/FUZZ-IEEE.2017.8015652},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2017genieowatest.pdf}
}
@incollection{GagolewskiJames2017:eusflat,
  author    = {Marek Gagolewski and Simon James},
  title     = {Fitting symmetric fuzzy measures for discrete {S}ugeno integration},
  booktitle = {Advances in Fuzzy Logic and Technology 2017},
  editor    = {Janusz Kacprzyk and others},
  series    = {Advances in Intelligent Systems and Computing},
  volume    = {642},
  pages     = {104--116},
  year      = {2018},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-66824-6_10}
}
@article{Gagolewski2017:pbamultidim,
  author   = {Marek Gagolewski},
  title    = {Penalty-based aggregation of multidimensional data},
  journal  = {Fuzzy Sets and Systems},
  volume   = {325},
  pages    = {4--20},
  year     = {2017},
  doi      = {10.1016/j.fss.2016.12.009},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2017penaltymultidim.pdf}
}
@article{MesiarGagolewski2016:defects,
  author   = {Radko Mesiar and Marek Gagolewski},
  title    = {{H}-index and other {S}ugeno integrals: {S}ome defects and their compensation},
  journal  = {IEEE Transactions on Fuzzy Systems},
  volume   = {24},
  number   = {6},
  pages    = {1668--1672},
  year     = {2016},
  doi      = {10.1109/TFUZZ.2016.2516579},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016hdeffects.pdf}
}
@article{BeliakovETAL2016:penaltyinequality,
author = {Gleb Beliakov and Marek Gagolewski and Simon James},
doi = {10.1142/S0218488516400018},
journal = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
pages = {1--23},
title = {Penalty-based and other representations of economic inequality},
volume = {24},
number = {Suppl. 1},
year = {2016},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016penaltyeconomic.pdf}
}
@incollection{GagolewskiETAL2016:genie2,
  author    = {Marek Gagolewski and Anna Cena and Maciej Bartoszuk},
  title     = {Hierarchical clustering via penalty-based aggregation and the {G}enie approach},
  booktitle = {Modeling Decisions for Artificial Intelligence},
  editor    = {Vicenç Torra and others},
  series    = {Lecture Notes in Artificial Intelligence},
  volume    = {9880},
  pages     = {191--202},
  year      = {2016},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-45656-0_16},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016genie2.pdf}
}
@article{GagolewskiETAL2016:genie,
  author   = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
  title    = {Genie: {A} new, fast, and outlier-resistant hierarchical clustering algorithm},
  journal  = {Information Sciences},
  volume   = {363},
  pages    = {8--23},
  year     = {2016},
  doi      = {10.1016/j.ins.2016.05.003},
  url      = {https://genieclust.gagolewski.com/},
  preprint = {https://arxiv.org/pdf/2209.05757}
}
@incollection{CenaGagolewski2016:generickmeans,
  author    = {Anna Cena and Marek Gagolewski},
  title     = {Fuzzy k-minpen clustering and k-nearest-minpen classification procedures incorporating generic distance-based penalty minimizers},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  editor    = {J.P. Carvalho and others},
  series    = {Communications in Computer and Information Science},
  volume    = {611},
  pages     = {445--456},
  year      = {2016},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-40581-0_36}
}
@incollection{BartoszukETAL2016:fitagop1,
  author    = {Maciej Bartoszuk and Gleb Beliakov and Marek Gagolewski and Simon James},
  title     = {Fitting aggregation functions to data: {Part} {I} -- {L}inearization and regularization},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  editor    = {J.P. Carvalho and others},
  series    = {Communications in Computer and Information Science},
  volume    = {611},
  pages     = {767--779},
  year      = {2016},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-40581-0_62},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016fitagop1.pdf}
}
@incollection{BartoszukETAL2016:fitagop2,
  author    = {Maciej Bartoszuk and Gleb Beliakov and Marek Gagolewski and Simon James},
  title     = {Fitting aggregation functions to data: {Part} {II} -- {I}dempotization},
  booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
  editor    = {J.P. Carvalho and others},
  series    = {Communications in Computer and Information Science},
  volume    = {611},
  pages     = {780--789},
  year      = {2016},
  publisher = {Springer},
  doi       = {10.1007/978-3-319-40581-0_63},
  preprint  = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016fitagop2.pdf}
}
@article{LasekETAL2016:fifa,
  author   = {Jan Lasek and Zoltan Szlavik and Marek Gagolewski and Sandjai Bhulai},
  title    = {How to improve a team's position in the {FIFA} ranking -- {A} simulation study},
  journal  = {Journal of Applied Statistics},
  volume   = {43},
  number   = {7},
  pages    = {1349--1368},
  year     = {2016},
  doi      = {10.1080/02664763.2015.1100593},
  preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2016improvefifa.pdf}
}
@article{ZogalaETAL2016:agenth,
author = {Barbara Żogała-Siudem and Grzegorz Siudem and Anna Cena and Marek Gagolewski},
doi = {10.1140/epjb/e2015-60757-1},
journal = {European Physical Journal B},
pages = {21},
title = {Agent-based model for the bibliometric h-index -- {E}xact solution},
volume = {89},
year = {2016},
preprint = {https://arxiv.org/pdf/1509.05798}
}
@incollection{BartoszukGagolewski2014:fuzzyrsimilar,
author = {Maciej Bartoszuk and Marek Gagolewski},
title = {A fuzzy {R} code similarity detection algorithm},
booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part III},
editor = {A. Laurent and others},
series = {Communications in Computer and Information Science},
volume = {444},
pages = {21--30},
publisher = {Springer},
year = {2014},
doi = {10.1007/978-3-319-08852-5_3}
}
@incollection{BartoszukGagolewski2015:similar2,
author = {Maciej Bartoszuk and Marek Gagolewski},
booktitle = {Proc. IFSA/EUSFLAT'15},
doi = {10.2991/ifsa-eusflat-15.2015.61},
editor = {J. M. Alonso and H. Bustince and M. Reformat},
pages = {419--426},
publisher = {Atlantis Press},
title = {Detecting similarity of {R} functions via a fusion of multiple heuristic methods},
year = {2015}
}
@article{CenaETAL2015:prodclust,
author = {Anna Cena and Marek Gagolewski and Radko Mesiar},
doi = {10.1016/j.joi.2015.02.005},
journal = {Journal of Informetrics},
number = {2},
pages = {273--284},
title = {Problems and challenges of information resources producers' clustering},
volume = {9},
year = {2015}
}
@incollection{CenaGagolewski2013:om31,
author = {Anna Cena and Marek Gagolewski},
title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation operators -- {P}art {I}: {A}xiomatic analysis under arity-dependence},
booktitle = {Aggregation Functions in Theory and in Practise},
editor = {Humberto Bustince and others},
series = {Advances in Intelligent Systems and Computing},
volume = {228},
pages = {93--103},
publisher = {Springer},
year = {2013},
doi = {10.1007/978-3-642-39165-1_13},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013om31.pdf}
}
@incollection{CenaGagolewski2013:om32,
author = {Anna Cena and Marek Gagolewski},
title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation operators -- {P}art {II}: {A}~simulation study},
booktitle = {Aggregation Functions in Theory and in Practise},
editor = {Humberto Bustince and others},
series = {Advances in Intelligent Systems and Computing},
volume = {228},
pages = {105--115},
publisher = {Springer},
year = {2013},
doi = {10.1007/978-3-642-39165-1_14},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013om32.pdf}
}
@incollection{CenaGagolewski2015:kmeansinformetric,
author = {Anna Cena and Marek Gagolewski},
booktitle = {Proc. IFSA/EUSFLAT'15},
doi = {10.2991/ifsa-eusflat-15.2015.77},
editor = {J. M. Alonso and H. Bustince and M. Reformat},
pages = {536--543},
publisher = {Atlantis Press},
title = {A {K}-means-like algorithm for informetric data clustering},
year = {2015}
}
@article{CenaGagolewski2015:om3fss,
author = {Anna Cena and Marek Gagolewski},
title = {{OM3}: {O}rdered maxitive, minitive, and modular aggregation operators -- {A}xiomatic and probabilistic properties in an arity-monotonic setting},
journal = {Fuzzy Sets and Systems},
volume = {264},
pages = {138--159},
year = {2015},
doi = {10.1016/j.fss.2014.04.001},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015om3fss.pdf}
}
@article{CoroianuETAL2013:piecewise1,
author = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski},
title = {Nearest piecewise linear approximation of fuzzy numbers},
journal = {Fuzzy Sets and Systems},
volume = {233},
pages = {26--51},
year = {2013},
doi = {10.1016/j.fss.2013.02.005},
url = {https://CRAN.R-project.org/package=FuzzyNumbers},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013plfnknot1.pdf}
}
@incollection{CoroianuETAL2014:piecewise1suppcore,
author = {Lucian Coroianu and Marek Gagolewski and Przemysław Grzegorzewski and Adabitabar Firozja, M. and Tahereh Houlari},
title = {Piecewise linear approximation of fuzzy numbers preserving the support and core},
booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems, Part II},
editor = {A. Laurent and others},
series = {Communications in Computer and Information Science},
volume = {443},
pages = {244--254},
publisher = {Springer},
year = {2014},
doi = {10.1007/978-3-319-08855-6_25}
}
@article{Gagolewski2011:CITAN,
author = {Marek Gagolewski},
title = {Bibliometric impact assessment with {R} and the {CITAN} package},
journal = {Journal of Informetrics},
volume = {5},
number = {4},
pages = {678--692},
year = {2011},
doi = {10.1016/j.joi.2011.06.006},
url = {https://CRAN.R-project.org/package=CITAN},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011citan.pdf}
}
@incollection{Gagolewski2012:effortdom,
author = {Marek Gagolewski},
title = {On the relation between effort-dominating and symmetric minitive aggregation operators},
booktitle = {Advances in Computational Intelligence, Part III},
editor = {Salvatore Greco and others},
series = {Communications in Computer and Information Science},
volume = {299},
pages = {276--285},
publisher = {Springer},
year = {2012},
doi = {10.1007/978-3-642-31718-7_29},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2012effortdom.pdf}
}
@incollection{Gagolewski2012:smps,
author = {Marek Gagolewski},
title = {Statistical hypothesis test for the difference between {H}irsch indices of two {P}areto-distributed random samples},
booktitle = {Synergies of Soft Computing and Statistics for Intelligent Data Analysis},
editor = {Rudolf Kruse and others},
series = {Advances in Intelligent Systems and Computing},
volume = {190},
pages = {359--367},
publisher = {Springer},
year = {2013},
doi = {10.1007/978-3-642-33042-1_39},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013htest.pdf}
}
@article{Gagolewski2013:fair,
author = {Marek Gagolewski},
title = {Scientific impact assessment cannot be fair},
journal = {Journal of Informetrics},
volume = {7},
number = {4},
pages = {792--802},
year = {2013},
doi = {10.1016/j.joi.2013.07.001},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013fair.pdf}
}
@article{Gagolewski2013:om3,
author = {Marek Gagolewski},
title = {On the relationship between symmetric maxitive, minitive, and modular aggregation operators},
journal = {Information Sciences},
volume = {221},
pages = {170--180},
year = {2013},
doi = {10.1016/j.ins.2012.09.005},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013rel3.pdf}
}
@incollection{Gagolewski2015:hconfint,
author = {Marek Gagolewski},
title = {Sugeno integral-based confidence intervals for the theoretical h-index},
booktitle = {Strengthening Links Between Data Analysis and Soft Computing},
editor = {P. Grzegorzewski and others},
series = {Advances in Intelligent Systems and Computing},
volume = {315},
pages = {233--240},
publisher = {Springer},
year = {2015},
doi = {10.1007/978-3-319-10765-3_28},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015hconfint.pdf}
}
@incollection{Gagolewski2015:normalizedspread,
author = {Marek Gagolewski},
booktitle = {Proc. IFSA/EUSFLAT'15},
doi = {10.2991/ifsa-eusflat-15.2015.32},
editor = {J. M. Alonso and H. Bustince and M. Reformat},
pages = {210--216},
publisher = {Atlantis Press},
title = {Normalized {WD$_p$WAM} and {WD$_p$OWA} spread measures},
year = {2015},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015normalizedspread.pdf}
}
@article{Gagolewski2015:spread,
author = {Marek Gagolewski},
title = {Spread measures and their relation to aggregation functions},
journal = {European Journal of Operational Research},
volume = {241},
number = {2},
pages = {469--477},
year = {2015},
doi = {10.1016/j.ejor.2014.08.034},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015spread.pdf}
}
@article{GagolewskiGrzegorzewski2009:geometricapproach,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {A geometric approach to the construction of scientific impact indices},
journal = {Scientometrics},
volume = {81},
number = {3},
pages = {617--634},
year = {2009},
doi = {10.1007/s11192-008-2253-y},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2009geometricapproach.pdf}
}
@incollection{GagolewskiGrzegorzewski2010:ipmu,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {Arity-monotonic extended aggregation operators},
booktitle = {Information Processing and Management of Uncertainty in Knowledge-Based Systems},
editor = {E. Hüllermeier and others},
series = {Communications in Computer and Information Science},
volume = {80},
pages = {693--702},
publisher = {Springer},
year = {2010},
doi = {10.1007/978-3-642-14055-6_73},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2010aritymonotonic.pdf}
}
@incollection{GagolewskiGrzegorzewski2010:smps,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {{S}-statistics and their basic properties},
booktitle = {Combining Soft Computing and Statistical Methods in Data Analysis},
editor = {Christian Borgelt and others},
series = {Advances in Intelligent and Soft Computing},
volume = {77},
pages = {281--288},
publisher = {Springer},
year = {2010},
doi = {10.1007/978-3-642-14746-3_35},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2010sstatprop.pdf}
}
@incollection{GagolewskiGrzegorzewski2011:axcharquasils,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {Axiomatic characterizations of {(quasi-)} {L}-statistics and {S}-statistics and the {P}roducer {A}ssessment {P}roblem},
booktitle = {Proc. EUSFLAT/LFA'11},
editor = {Sylvie Galichet and others},
pages = {53--58},
publisher = {Atlantis Press},
location = {Aix-Les-Bains, France},
year = {2011},
doi = {10.2991/eusflat.2011.112},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011axcharquasils.pdf}
}
@article{GagolewskiGrzegorzewski2011:possibilistic,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {Possibilistic analysis of arity-monotonic aggregation operators and its relation to bibliometric impact assessment of individuals},
journal = {International Journal of Approximate Reasoning},
volume = {52},
number = {9},
pages = {1312--1324},
year = {2011},
doi = {10.1016/j.ijar.2011.01.010},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2011possibilistic.pdf}
}
@incollection{GagolewskiLasek2015:fuzzrelpap,
author = {Marek Gagolewski and Jan Lasek},
title = {The use of fuzzy relations in the assessment of information resources producers' performance},
booktitle = {Proc. 7th IEEE International Conference Intelligent Systems IS'2014, Vol. 2: Tools, Architectures, Systems, Applications},
series = {Advances in Intelligent Systems and Computing},
volume = {323},
pages = {289--300},
publisher = {Springer},
year = {2015},
doi = {10.1007/978-3-319-11310-4_25}
}
@incollection{GagolewskiLasek2015:preflearn,
author = {Marek Gagolewski and Jan Lasek},
booktitle = {Proc. IFSA/EUSFLAT'15},
doi = {10.2991/ifsa-eusflat-15.2015.70},
editor = {J. M. Alonso and H. Bustince and M. Reformat},
pages = {484--491},
publisher = {Atlantis Press},
title = {Learning experts' preferences from informetric data},
year = {2015}
}
@article{GagolewskiMesiar2012:pqm,
author = {Marek Gagolewski and Radko Mesiar},
title = {Aggregating different paper quality measures with a generalized {h}-index},
journal = {Journal of Informetrics},
volume = {6},
number = {4},
pages = {566--579},
year = {2012},
doi = {10.1016/j.joi.2012.05.001},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2012pqm.pdf}
}
@article{GagolewskiMesiar2014:integrals,
author = {Marek Gagolewski and Radko Mesiar},
title = {Monotone measures and universal integrals in a uniform framework for the scientific impact assessment problem},
journal = {Information Sciences},
volume = {263},
pages = {166--174},
year = {2014},
doi = {10.1016/j.ins.2013.12.004},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2014integrals.pdf}
}
@incollection{LasekGagolewski2015:fireman,
author = {Jan Lasek and Marek Gagolewski},
title = {The winning solution to the {AAIA'15} {D}ata {M}ining {C}ompetition: {T}agging firefighter activities at a fire scene},
booktitle = {Proc. FedCSIS'15},
editor = {M. Ganzha and L. Maciaszek and M. Paprzycki},
pages = {375--380},
publisher = {IEEE},
year = {2015},
doi = {10.15439/2015F418},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2015fireman.pdf}
}
@incollection{GagolewskiDebskiNowakiewicz2013:graphintegrals,
author = {Marek Gagolewski and Michał Dębski and Michał Nowakiewicz},
booktitle = {Proc. Uncertainty Modeling},
editor = {Radko Mesiar and Tomas Bacigal},
isbn = {978-80-227-4067-8},
pages = {17--23},
publisher = {STU Bratislava},
title = {Efficient algorithm for computing certain graph-based monotone integrals: {T}he {$l_p$}-indices},
year = {2013},
abstract = {
The Choquet, Sugeno, and Shilkret integrals with respect to monotone
measures are useful as tools in decision support systems.
In this paper we propose a new class of graph-based integrals that
generalize these three operations. Then, an efficient linear-time
algorithm for computing their special case, that is lp-indices,
1 ≤ p < ∞, is presented. The algorithm is based on R.L. Graham's
routine for determining the convex hull of a finite planar set.
},
keywords = {monotone measures; Choquet, Sugeno, and Shilkret integral;
lp-index; convex hull; Graham's scan; scientific impact indices},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2013graphintegrals.pdf}
}
@incollection{GagolewskiGrzegorzewski2009:possibleh,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {Possible and necessary {h}-indices},
booktitle = {Proc. IFSA/EUSFLAT'09},
editor = {J. P. Carvalho and others},
pages = {1691--1695},
publisher = {IFSA},
location = {Lisbon, Portugal},
year = {2009}
}
@incollection{LasekGagolewski2015:esttourmet,
author = {Jan Lasek and Marek Gagolewski},
title = {Estimation of tournament metrics for association football league formats},
booktitle = {Selected problems in information technologies (Proc. ITRIA'15 vol. 2)},
pages = {67--78},
publisher = {Institute of Computer Science, Polish Academy of Sciences},
year = {2015}
}
@incollection{CenaGagolewski2015:clustaginf,
author = {Anna Cena and Marek Gagolewski},
title = {Clustering and aggregation of informetric data sets},
booktitle = {Computational methods in data analysis (Proc. ITRIA'15 vol. 1)},
pages = {5--26},
publisher = {Institute of Computer Science, Polish Academy of Sciences},
year = {2015}
}
@incollection{Gagolewski2015:issuesmultidim,
author = {Marek Gagolewski},
title = {Some issues in aggregation of multidimensional data},
booktitle = {Proc. 8th International Summer School on Aggregation Operators (AGOP 2015)},
editor = {Michał Baczyński and De~Baets, Bernard and Radko Mesiar},
pages = {127--132},
publisher = {University of Silesia},
year = {2015},
isbn = {978-83-8012-519-3}
}
@incollection{CenaGagolewski2015:fuzzycmeansinformetric,
author = {Anna Cena and Marek Gagolewski},
title = {Aggregation and soft clustering of informetric data},
booktitle = {Proc. 8th International Summer School on Aggregation Operators (AGOP 2015)},
editor = {Michał Baczyński and De~Baets, Bernard and Radko Mesiar},
pages = {79--84},
publisher = {University of Silesia},
year = {2015},
isbn = {978-83-8012-519-3}
}
@incollection{GagolewskiGrzegorzewski2010:metodyiproblemy,
author = {Marek Gagolewski and Przemysław Grzegorzewski},
title = {Metody i~problemy naukometrii (Methods and problems of scientometrics)},
booktitle = {Psychologia i~informatyka. Synergia i~kontradykcje},
editor = {Tomasz Rowiński and Ryszard Tadeusiewicz},
pages = {103--125},
publisher = {Wyd.~UKSW},
address = {Warsaw},
year = {2010},
isbn = {978-83-707-2679-9},
note = {in Polish}
}
@incollection{GagolewskiGrzegorzewski2009:uogolnienieh,
address = {Lublin},
author = {Marek Gagolewski and Przemysław Grzegorzewski},
booktitle = {Kadry i infrastruktura nowoczesnej nauki: Teoria i praktyka, Proc. 1st Intl. Conf. Zarządzanie Nauką},
editor = {P. Kawalec and P. Lipski},
isbn = {978-83-61671-12-1},
note = {in Polish},
pages = {15--29},
publisher = {Wydawnictwo Lubelskiej Szkoły Biznesu},
title = {O pewnym uogólnieniu indeksu {H}irscha},
volume = {2},
year = {2009}
}
@incollection{RowinskiGagolewski2011:internetkryzys,
author = {Tomasz Rowiński and Marek Gagolewski},
title = {Internet a kryzys},
booktitle = {Kryzys: Pułapka czy szansa?},
editor = {M. Jankowska and M. Starzomska},
pages = {211--224},
publisher = {WN Akapit},
address = {Warsaw},
year = {2011},
isbn = {978-83-609-5885-8},
note = {in Polish}
}
@article{RowinskiGagolewski2007:online,
author = {Tomasz Rowiński and Marek Gagolewski},
title = {Preferencje i postawy wobec pomocy online (Attitudes towards online counselling and psychotherapy)},
journal = {Studia Psychologica UKSW},
volume = {7},
pages = {195--210},
year = {2007},
note = {in Polish}
}
@phdthesis{Gagolewski2011:PhD,
author = {Marek Gagolewski},
title = {Aggregation operators and their application in a formal model for quality evaluation system of scientific research (Wybrane operatory agregacji i~ich zastosowanie w~modelu formalnym systemu jakości w~nauce)},
school = {Systems Research Institute, Polish Academy of Sciences},
year = {2011},
note = {in Polish}
}
@proceedings{HalasETAL2019:agop2019,
editor = {Radomír Halaš and Marek Gagolewski and Radko Mesiar},
title = {New Trends in Aggregation Theory},
series = {Advances in Intelligent Systems and Computing},
volume = {981},
pages = {348},
publisher = {Springer},
year = {2019},
isbn = {978-3-030-19493-2},
doi = {10.1007/978-3-030-19494-9}
}
@proceedings{GrzegorzewskiETAL2015:smps2014,
editor = {Przemysław Grzegorzewski and Marek Gagolewski and Olgierd Hryniewicz and María Ángeles Gil},
title = {Strengthening Links Between Data Analysis and Soft Computing},
series = {Advances in Intelligent Systems and Computing},
volume = {315},
pages = {294},
publisher = {Springer},
year = {2015},
isbn = {978-3-319-10764-6},
doi = {10.1007/978-3-319-10765-3}
}
@proceedings{FerraroETAL2017:smps2016,
editor = {Maria Brigida Ferraro and Paulo Giordani and Barbara Vantaggi and Marek Gagolewski and María Ángeles Gil and Przemysław Grzegorzewski and Olgierd Hryniewicz},
title = {Soft Methods for Data Science},
series = {Advances in Intelligent Systems and Computing},
volume = {456},
pages = {535},
publisher = {Springer},
year = {2017},
isbn = {978-3-319-42971-7},
doi = {10.1007/978-3-319-42972-4}
}
@book{GagolewskiETAL2016:Pythonksiazka,
author = {Marek Gagolewski and Maciej Bartoszuk and Anna Cena},
title = {Przetwarzanie i analiza danych w języku {Python} (Data Processing and Analysis in Python)},
publisher = {Wydawnictwo Naukowe PWN},
address = {Warsaw},
pages = {369},
year = {2016},
isbn = {978-83-01-18940-2},
url = {https://github.com/gagolews/Analiza_danych_w_jezyku_Python},
note = {🇵🇱}
}
@book{Gagolewski2016:Rksiazka,
author = {Marek Gagolewski},
title = {Programowanie w języku {R}. {A}naliza danych, obliczenia, symulacje (R Programming. Data Analysis, Computing, Simulations)},
edition = {2nd},
publisher = {Wydawnictwo Naukowe PWN},
address = {Warsaw},
pages = {550},
year = {2016},
isbn = {978-83-01-18939-6},
url = {https://github.com/gagolews/Programowanie_w_jezyku_R},
note = {🇵🇱}
}
@book{Gagolewski2015:datafusionbook,
author = {Marek Gagolewski},
title = {Data Fusion: {T}heory, Methods, and Applications},
publisher = {Institute of Computer Science, Polish Academy of Sciences},
address = {Warsaw},
pages = {290},
year = {2015},
isbn = {978-83-63159-20-7},
url = {https://github.com/gagolews/datafusion},
preprint = {https://raw.githubusercontent.com/gagolews/datafusion/master/datafusion.pdf},
note = {🔓},
keywords = {data aggregation, data fusion, means, t-norms, spread measures,
multidimensional data, strings},
abstract = {
A proper fusion of complex data is of interest to many researchers
in diverse fields, including computational statistics, computational
geometry, bioinformatics, machine learning, pattern recognition,
quality management, engineering, statistics, finance, economics, etc.
It plays a crucial role in: synthetic description of data processes
or whole domains, creation of rule bases for approximate reasoning
tasks, reaching consensus and selection of the optimal strategy in
decision support systems, imputation of missing values, data
deduplication and consolidation, record linkage across heterogeneous
databases, and clustering. This open-access research monograph
integrates the spread-out results from different domains using the
methodology of the well-established classical aggregation framework,
introduces researchers and practitioners to Aggregation 2.0,
as well as points out the challenges and interesting directions
for further research.
}
}
@book{Gagolewski2014:Rksiazka,
author = {Marek Gagolewski},
title = {Programowanie w języku {R}. {A}naliza danych, obliczenia, symulacje (R Programming. Data Analysis, Computing, Simulations)},
edition = {1st},
publisher = {Wydawnictwo Naukowe PWN},
address = {Warsaw},
pages = {494},
year = {2014},
isbn = {978-83-01-17461-3},
note = {🇵🇱}
}
@book{GrzegorzewskiETAL2014:wnioskowaniestatystyczne,
author = {Przemysław Grzegorzewski and Marek Gagolewski and Konstancja Bobecka-Wesołowska},
title = {Wnioskowanie statystyczne z wykorzystaniem środowiska R (Statistical Inference with R)},
publisher = {Politechnika Warszawska},
address = {Warsaw},
pages = {183},
year = {2014},
isbn = {978-83-93-72601-1},
note = {🇵🇱 🔓},
preprint = {https://raw.githubusercontent.com/gagolews/bibliography/master/preprints/2014wnioskowaniestatystyczne.pdf}
}