@comment{
  Mining-software-repositories references.
  Conventions used below: one field per line; month given as an unquoted
  three-letter macro; titles entered in title case with braces only around
  words whose capitalisation must survive style recasing; bare DOIs.
}

@article{Dyer2015,
  author   = {Dyer, Robert and Nguyen, Hoan Anh and Rajan, Hridesh and Nguyen, Tien N.},
  title    = {{Boa}: Ultra-Large-Scale Software Repository and Source-Code Mining},
  journal  = {ACM Transactions on Software Engineering and Methodology},
  volume   = {25},
  number   = {1},
  pages    = {1--34},
  month    = dec,
  year     = {2015},
  doi      = {10.1145/2803171},
  issn     = {1049-331X},
  keywords = {Boa,Domain-specific language,Ease of use,Lower barrier to entry,Mining software repositories,Scalable},
  abstract = {In today's software-centric world, ultra-large-scale software repositories, such as SourceForge, GitHub, and Google Code, are the new library of Alexandria. They contain an enormous corpus of software and related information. Scientists and engineers alike are interested in analyzing this wealth of information. However, systematic extraction and analysis of relevant data from these repositories for testing hypotheses is hard, and best left for mining software repository (MSR) experts! Specifically, mining source code yields significant insights into software development artifacts and processes. Unfortunately, mining source code at a large scale remains a difficult task. Previous approaches had to either limit the scope of the projects studied, limit the scope of the mining task to be more coarse grained, or sacrifice studying the history of the code. In this article we address mining source code: (a) at a very large scale; (b) at a fine-grained level of detail; and (c) with full history information. To address these challenges, we present domain-specific language features for source-code mining in our language and infrastructure called Boa. The goal of Boa is to ease testing MSR-related hypotheses. Our evaluation demonstrates that Boa substantially reduces programming efforts, thus lowering the barrier to entry. We also show drastic improvements in scalability.},
}

@inproceedings{Hassan2008,
  author    = {Hassan, Ahmed E.},
  title     = {The Road Ahead for {Mining Software Repositories}},
  booktitle = {2008 Frontiers of Software Maintenance},
  pages     = {48--57},
  publisher = {IEEE},
  month     = sep,
  year      = {2008},
  doi       = {10.1109/FOSM.2008.4659248},
  isbn      = {978-1-4244-2654-6},
  abstract  = {Source control repositories, bug repositories, archived communications, deployment logs, and code repositories are examples of software repositories that are commonly available for most software projects. The Mining Software Repositories (MSR) field analyzes and cross-links the rich data available in these repositories to uncover interesting and actionable information about software systems. By transforming these repositories from static record-keeping ones into active repositories, we can guide decision processes in modern software projects. For example, data in source control repositories, traditionally used to archive code, could be linked with data in bug repositories to help practitioners propagate complex changes and to warn them about risky code based on prior changes and bugs. In this paper, we present a brief history of the MSR field and discuss several recent achievements and results of using MSR techniques to support software research and practice. We then discuss the various opportunities and challenges that lie in the road ahead for this important and emerging field. {\textcopyright} 2008 IEEE.},
}

@comment{
  NOTE(review): Hassan2010 records a single page (161) rather than a page
  range; confirm the full range against the publisher record.
}

@inproceedings{Hassan2010,
  author    = {Hassan, Ahmed E. and Xie, Tao},
  title     = {Software Intelligence: The Future of Mining Software Engineering Data},
  booktitle = {Proceedings of the FSE/SDP workshop on Future of software engineering research - FoSER '10},
  pages     = {161},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2010},
  doi       = {10.1145/1882362.1882397},
  isbn      = {9781450304276},
  keywords  = {Mining software engineering data,Mining software repositories,Software intelligence},
  abstract  = {Mining software engineering data has emerged as a successful research direction over the past decade. In this position paper, we advocate Software Intelligence (SI) as the future of mining software engineering data, within modern software engineering research, practice, and education. We coin the name SI as an inspiration from the Business Intelligence (BI) field, which offers concepts and techniques to improve business decision making by using fact-based support systems. Similarly, SI offers software practitioners (not just developers) up-to-date and pertinent information to support their daily decision-making processes. SI should support decision-making processes throughout the lifetime of a software system not just during its development phase. The vision of SI has yet to become a reality that would enable software engineering research to have a strong impact on modern software practice. Nevertheless, recent advances in the Mining Software Repositories (MSR) field show great promise and provide strong support for realizing SI in the near future. This position paper summarizes the state of practice and research of SI, and lays out future research directions for mining software engineering data to enable SI. Copyright 2010 ACM.},
}

@inproceedings{Kalliamvakou2014,
  author    = {Kalliamvakou, Eirini and Gousios, Georgios and Blincoe, Kelly and Singer, Leif and German, Daniel M. and Damian, Daniela},
  title     = {The Promises and Perils of Mining {GitHub}},
  booktitle = {Proceedings of the 11th Working Conference on Mining Software Repositories - MSR 2014},
  pages     = {92--101},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2014},
  doi       = {10.1145/2597073.2597074},
  isbn      = {9781450328630},
  keywords  = {Bias,Code reviews,Git,GitHub,Mining software repositories},
  abstract  = {With over 10 million git repositories, GitHub is becoming one of the most important source of software artifacts on the Internet. Researchers are starting to mine the information stored in GitHub's event logs, trying to understand how its users employ the site to collaborate on software. However, so far there have been no studies describing the quality and properties of the data available from GitHub. We document the results of an empirical study aimed at understanding the characteristics of the repositories in GitHub and how users take advantage of GitHub's main features---namely commits, pull requests, and issues. Our results indicate that, while GitHub is a rich source of data on software development, mining GitHub for research purposes should take various potential perils into consideration. We show, for example, that the majority of the projects are personal and inactive; that GitHub is also being used for free storage and as a Web hosting service; and that almost 40{\%} of all pull requests do not appear as merged, even though they were. We provide a set of recommendations for software engineering researchers on how to approach the data in GitHub.},
}

@inproceedings{Poncin2011,
  author    = {Poncin, Wouter and Serebrenik, Alexander and van den Brand, Mark},
  title     = {Process Mining Software Repositories},
  booktitle = {2011 15th European Conference on Software Maintenance and Reengineering},
  pages     = {5--14},
  publisher = {IEEE},
  month     = mar,
  year      = {2011},
  doi       = {10.1109/CSMR.2011.5},
  isbn      = {978-1-61284-259-2},
  issn      = {1534-5351},
  keywords  = {Process mining,Software repositories},
  abstract  = {Software developers' activities are in general recorded in software repositories such as version control systems, bug trackers and mail archives. While abundant information is usually present in such repositories, successful information extraction is often challenged by the necessity to simultaneously analyze different repositories and to combine the information obtained. We propose to apply process mining techniques, originally developed for business process analysis, to address this challenge. However, in order for process mining to become applicable, different software repositories should be combined, and ``related'' software development events should be matched: e.g., mails sent about a file, modifications of the file and bug reports that can be traced back to it. The combination and matching of events has been implemented in FRASR (FRamework for Analyzing Software Repositories), augmenting the process mining framework ProM. FRASR has been successfully applied in a series of case studies addressing such aspects of the development process as roles of different developers and the way bug reports are handled. {\textcopyright} 2011 IEEE.},
}

@inproceedings{Spadini2018,
  author    = {Spadini, Davide and Aniche, Maur{\'{i}}cio and Bacchelli, Alberto},
  title     = {{PyDriller}: {Python} Framework for Mining Software Repositories},
  booktitle = {Proceedings of the 2018 26th ACM Joint Meeting on European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  pages     = {908--911},
  publisher = {ACM},
  address   = {New York, NY, USA},
  month     = oct,
  year      = {2018},
  doi       = {10.1145/3236024.3264598},
  isbn      = {9781450355735},
  keywords  = {Git,GitPython,Mining Software Repositories,Python},
  abstract  = {Software repositories contain historical and valuable information about the overall development of software systems. Mining software repositories (MSR) is nowadays considered one of the most interesting growing fields within software engineering. MSR focuses on extracting and analyzing data available in software repositories to uncover interesting, useful, and actionable information about the system. Even though MSR plays an important role in software engineering research, few tools have been created and made public to support developers in extracting information from Git repository. In this paper, we present Pydriller, a Python Framework that eases the process of mining Git. We compare our tool against the state-of-the-art Python Framework GitPython, demonstrating that Pydriller can achieve the same results with, on average, 50{\%} less LOC and significantly lower complexity.},
}