---
---

%% NOTE
%% The two lines above are required for Jekyll and the jekyll-scholar plugin to correctly parse
%% and render the references on the web page.
%% You may remove them when using this file with BibTeX natively in your publications to avoid
%% nasty BibTeX errors.
%
% The entries in this file are sorted by YEAR (ascending).
% If inserting new entries, please put them accordingly.
%
%
%
%% Convention for BibTeX keys
%
% Construct the BibTeX key according to the following schemes.
% In the following, please replace these patterns accordingly:
%
%   YYYY    year of publication
%   Aaaa    first author's surname in CamelCase
%   Bbbb    second author's surname in CamelCase
%
%   - for single-author publications:   AaaaYYYY
%   - for two-author publications:      AaaaBbbbYYYY
%   - for three or more authors:        AaaaEtAlYYYY
%
% In case an author has published more than one publication in the same year, append lower case
% letters for the second and subsequent publications:
%
%   first publication:    AaaaEtAlYYYY
%   second publication:   AaaaEtAlYYYYb
%
%
%
%% Convention for BibTeX fields
%
% Please, stick to the following formatting convention to ease maintainability:
%
%   - indent BibTeX fields of entries with 2 spaces (NOT tabs!)
%
%   - order the BibTeX fields in alphabetic order
%
%   - enclose all values within curly brackets '{...}'
%
%   - do line breaks at 100 chars; this is ............................................... here -->|
%
%   - and add a continuation indent of 2 after line breaks
%
%   - put a comma ',' as well after the last field/value pair of an entry
%
%
%
%% Important
%
% Please, treat this file no differently from usual source code files.
%

@misc{rudi2024cgkit,
  title = {CG-Kit: Code Generation Toolkit for Performant and Maintainable Variants of Source
    Code Applied to Flash-X Hydrodynamics Simulations},
  author = {Johann Rudi and Youngjun Lee and Aidan H. Chadha and Mohamed Wahib and Klaus Weide and
    Jared P.
O'Neal and Anshu Dubey},
  archivePrefix = {arXiv},
  eprint = {2401.03378},
  note = {submitted to FGCS},
  primaryClass = {cs.DC},
  year = {2024},
}

% The author lists of the two ICPP '23 entries below use a single pair of braces so that
% each person is parsed as a separate author (double braces would yield one corporate name).
@inproceedings{TanEtAl2023,
  address = {New York, NY, USA},
  author = {Tan, Nigel and Luettgau, Jakob and Marquez, Jack and Teranishi, Keita and
    Morales, Nicolas and Bhowmick, Sanjukta and Cappello, Franck and Taufer, Michela and
    Nicolae, Bogdan},
  booktitle = {{Proceedings of the 52nd International Conference on Parallel Processing}},
  doi = {10.1145/3605573.3605639},
  isbn = {9798400708435},
  keywords = {incremental storage, de-duplication, data versioning, GPU parallelization,
    Checkpointing},
  location = {Salt Lake City, UT, USA},
  numpages = {10},
  pages = {665--674},
  publisher = {{Association for Computing Machinery}},
  series = {ICPP '23},
  title = {{Scalable Incremental Checkpointing using GPU-Accelerated De-Duplication}},
  url = {https://doi.org/10.1145/3605573.3605639},
  year = {2023},
}

@inproceedings{ChanningEtAl2023,
  address = {New York, NY, USA},
  author = {Channing, Georgia and Patel, Ria and Olaya, Paula and Rorabaugh, Ariel and
    Miyashita, Osamu and Caino-Lores, Silvina and Schuman, Catherine and Tama, Florence and
    Taufer, Michela},
  booktitle = {{Proceedings of the 52nd International Conference on Parallel Processing}},
  doi = {10.1145/3605573.3605636},
  isbn = {9798400708435},
  keywords = {Deep learning, Early termination, Neural architecture search, Neural networks,
    Predictive modeling, Protein diffraction},
  location = {Salt Lake City, UT, USA},
  numpages = {1},
  pages = {1},
  publisher = {{Association for Computing Machinery}},
  series = {ICPP '23},
  title = {{Composable Workflow for Accelerating Neural Architecture Search Using
    In Situ Analytics for Protein Classification}},
  url = {https://doi.org/10.1145/3605573.3605636},
  year = {2023},
}

@inproceedings{RosendoEtAl2023,
  author = {Rosendo, Daniel and Keahey, Kate and Costan, Alexandru and Simonin, Matthieu and
    Valduriez, Patrick and Antoniu, Gabriel},
  title = {KheOps: Cost-effective
Repeatability, Reproducibility, and Replicability of Edge-to-Cloud Experiments},
  address = {New York, NY, USA},
  booktitle = {Proceedings of the 2023 ACM Conference on Reproducibility and Replicability},
  doi = {10.1145/3589806.3600032},
  isbn = {9798400701764},
  keywords = {Workflows, Reproducibility, Replicability, Repeatability, Edge Computing,
    Computing Continuum, Cloud Computing},
  location = {Santa Cruz, CA, USA},
  numpages = {12},
  pages = {62--73},
  publisher = {Association for Computing Machinery},
  series = {ACM REP '23},
  url = {https://doi.org/10.1145/3589806.3600032},
  year = {2023},
}

@inproceedings{BarbutEtAl2023,
  author = {Barbut, Quentin and Benoit, Anne and Herault, Thomas and Robert, Yves and
    Vivien, Frédéric},
  booktitle = {Proceedings of Fault Tolerance for HPC at eXtreme Scales (FTXS) Workshop},
  date = {2023-11-12},
  location = {Denver, United States},
  title = {When to checkpoint at the end of a fixed-length reservation?},
  url = {https://inria.hal.science/hal-04215554},
  year = {2023},
}

% "O'Neal" is written with a plain ASCII apostrophe in the two entries below, matching the
% spelling used for the same author in rudi2024cgkit.
@inproceedings{onealEtAl2022,
  author = {O'Neal, Jared and Wahib, Mohamed and Dubey, Anshu and Weide, Klaus and
    Klosterman, Tom and Rudi, Johann},
  booktitle = {European Conference on Parallel Processing},
  organization = {Springer, Cham},
  pages = {154--165},
  title = {Domain-specific runtime to orchestrate computation on heterogeneous platforms},
  year = {2022},
}

@article{dubeyEtAl2022,
  author = {Dubey, Anshu and Weide, Klaus and O'Neal, Jared and Dhruv, Akash and Couch, Sean and
    Harris, J Austin and Klosterman, Tom and Jain, Rajeev and Rudi, Johann and
    Messer, Bronson and others},
  journal = {SoftwareX},
  pages = {101168},
  publisher = {Elsevier},
  title = {Flash-X: A multiphysics simulation software instrument},
  volume = {19},
  year = {2022},
}

@inproceedings{denoyelle2022isc,
  author = {Denoyelle, Nicolas and Perarnau, Swann and Iskra, Kamil and Gerofi, Balazs},
  title = {Rapid Execution Time Estimation for Heterogeneous Memory Systems Through
    Differential Tracing},
  booktitle = {High Performance Computing},
year = {2022},
  publisher = {Springer International Publishing},
}

@inproceedings{hawila2022ccta,
  author = {Hawila, Ismail and Cerf, Sophie and Bleuse, Rapha{\"e}l and Perarnau, Swann and
    Rutten, {\'E}ric},
  booktitle = {6th IEEE Conference on Control Technology and Applications},
  publisher = {IEEE},
  title = {Adaptive Power Control for Sober High-Performance Computing},
  year = {2022},
}

% Empty placeholder fields (volume, number, pages) were dropped from the entries below;
% add them back only together with real values.
@inproceedings{libpressio,
  author = {Underwood, Robert and Malvoso, Victoriana and Calhoun, Jon C. and Di, Sheng and
    Cappello, Franck},
  booktitle = {2021 7th International Workshop on Data Analysis and Reduction for Big Scientific
    Data (DRBSD-7)},
  doi = {10.1109/DRBSD754563.2021.00005},
  pages = {1--10},
  title = {Productive and Performant Generic Lossy Data Compression with LibPressio},
  year = {2021},
}

@article{Ruttgers2022APIN,
  author = {R{\"{u}}ttgers, Mario and Waldmann, Moritz and Schr{\"{o}}der, Wolfgang and
    Lintermann, Andreas},
  doi = {10.1007/s10489-021-02808-2},
  issn = {0924-669X},
  journal = {Applied Intelligence},
  month = jan,
  number = {first online},
  title = {{A machine-learning-based method for automatizing lattice-Boltzmann simulations of
    respiratory flows}},
  url = {https://link.springer.com/10.1007/s10489-021-02808-2},
  year = {2022},
}

@article{Lintermann2020,
  author = {Lintermann, Andreas and Meinke, Matthias and Schröder, Wolfgang},
  doi = {10.1080/10618562.2020.1742328},
  journal = {International Journal of Computational Fluid Dynamics},
  month = mar,
  title = {Zonal Flow Solver (ZFS): a highly efficient multi-physics simulation framework},
  volume = {34},
  year = {2020},
}

@article{Ogoke2021,
  author = {Ogoke, Francis and Meidani, Kazem and Hashemi, Amirreza and Barati Farimani, Amir},
  doi = {10.1088/2632-2153/ac1fc9},
  journal = {Machine Learning: Science and Technology},
  month = sep,
  title = {Graph Convolutional Networks applied to unstructured flow field data},
  volume = {2},
  year = {2021},
}

@inproceedings{yoshii2023hardware,
  title = {Hardware specialization:
Estimating Monte Carlo cross-section lookup kernel performance and area},
  author = {Yoshii, Kazutomo and Tramm, John and Allen, Bryce and Ueno, Tomohiro and
    Sano, Kentaro and Siegel, Andrew and Beckman, Pete},
  booktitle = {Proceedings of the SC'23 Workshops of The International Conference on High
    Performance Computing, Network, Storage, and Analysis},
  pages = {1274--1278},
  year = {2023},
}

@inproceedings{yoshii2023streaming,
  author = {Yoshii, Kazutomo and Ueno, Tomohiro and Sano, Kentaro and Miceli, Antonino and
    Cappello, Franck},
  booktitle = {Proceedings of the SC'23 Workshops of The International Conference on High
    Performance Computing, Network, Storage, and Analysis},
  pages = {289--297},
  title = {Streaming Hardware Compressor Generator Framework},
  year = {2023},
}

@misc{yoshii2021hardware,
  archivePrefix = {arXiv},
  author = {Kazutomo Yoshii and Rajesh Sankaran and Sebastian Strempfer and Maksim Levental and
    Mike Hammer and Antonino Miceli},
  eprint = {2111.01380},
  primaryClass = {physics.ins-det},
  title = {A Hardware Co-design Workflow for Scientific Instruments at the Edge},
  year = {2021},
}

@inproceedings{ueno2021virtual,
  author = {Ueno, Tomohiro and Koshiba, Atsushi and Sano, Kentaro},
  booktitle = {2021 IEEE 32nd International Conference on Application-specific Systems,
    Architectures and Processors (ASAP)},
  organization = {IEEE},
  pages = {41--48},
  title = {Virtual Circuit-Switching Network with Flexible Topology for High-Performance FPGA
    Cluster},
  year = {2021},
}

@inproceedings{cerf2021europar,
  author = {Cerf, Sophie and Bleuse, Rapha{\"e}l and Reis, Valentin and Perarnau, Swann and
    Rutten, {\'E}ric},
  booktitle = {Euro-Par 2021: Parallel Processing},
  doi = {10.1007/978-3-030-85665-6_21},
  publisher = {Springer International Publishing},
  title = {{Sustaining Performance While Reducing Energy Consumption: A Control Theory
    Approach}},
  year = {2021},
}

@incollection{Ruttgers2021ISC,
  address = {Frankfurt/Main, Germany},
  author = {R{\"{u}}ttgers, Mario and Waldmann, Moritz and Schr{\"{o}}der, Wolfgang and
    Lintermann,
Andreas},
  booktitle = {High Performance Computing, Proceedings of the 36th International Conference,
    ISC High Performance 2021},
  doi = {10.1007/978-3-030-90539-2_1},
  pages = {7--22},
  publisher = {Springer International Publishing},
  title = {{Machine-Learning-Based Control of Perturbed and Heated Channel Flows}},
  url = {https://link.springer.com/10.1007/978-3-030-90539-2{\_}1},
  year = {2021},
}

@incollection{Ando2021ISC,
  address = {Frankfurt/Main, Germany},
  author = {Ando, Kazuto and Onishi, Keiji and Bale, Rahul and Tsubokura, Makoto and
    Kuroda, Akiyoshi and Minami, Kazuo},
  booktitle = {High Performance Computing, Proceedings of the 36th International Conference,
    ISC High Performance 2021},
  doi = {10.1007/978-3-030-90539-2_8},
  pages = {122--137},
  publisher = {Springer International Publishing},
  title = {{Nonlinear Mode Decomposition and Reduced-Order Modeling for Three-Dimensional
    Cylinder Flow by Distributed Learning on Fugaku}},
  url = {https://link.springer.com/10.1007/978-3-030-90539-2{\_}8},
  year = {2021},
}

@article{Hori21,
  author = {Atsushi Hori and Emmanuel Jeannot and George Bosilca and Takahiro Ogura and
    Balazs Gerofi and Jie Yin and Yutaka Ishikawa},
  journal = {{Parallel Computing}},
  note = {(submitted)},
  publisher = {{Elsevier}},
  title = {{An International Survey on MPI Users}},
  year = {2021},
}

% Names are given as "Surname, Given": "Liang, Xin" and "Rivera, Cody" were previously inverted.
@inproceedings{Tian21,
  author = {Tian, Jiannan and Rivera, Cody and Di, Sheng and Chen, Jieyang and Liang, Xin and
    Tao, Dingwen and Cappello, Franck},
  booktitle = {{IPDPS'21: 35th IEEE International parallel and distributed processing
    symposium}},
  title = {{Revisiting Huffman Coding: Toward Extreme Performance on Modern GPU Architectures}},
  year = {2021},
}

@inproceedings{Tian20,
  title = {{cuSZ: An Efficient GPU Based Error-Bounded Lossy Compression Framework for
    Scientific Data}},
  author = {Tian, Jiannan and Di, Sheng and Zhao, Kai and Rivera, Cody and Hickman, Megan and
    Underwood, Robert and Jin, Sian and Liang, Xin and Calhoun, Jon and Tao, Dingwen and
    Cappello,
Franck},
  booktitle = {{PACT'20: ACM International Conference on Parallel Architectures and Compilation
    Techniques}},
  year = {2020},
}

% NOTE(review): year set to 2021 to agree with the CCGRID'21 venue named in booktitle
% (it was 2020) -- please confirm against the published proceedings.
@inproceedings{Rupak21,
  author = {Roy, Rupak and Sato, Kento and Bhattacharya, Subhadeep and Fang, Xingang and
    Joti, Yasumasa and Hatsui, Takaki and Hiraki, Toshiyuki and Guo, Jian and Yu, Weikuan},
  booktitle = {{CCGRID'21: IEEE/ACM International Symposium on Cluster, Cloud and Internet
    Computing}},
  title = {{Compression of Time Evolutionary Image Data through Predictive Deep Neural
    Networks}},
  year = {2021},
}

@techreport{agullo:hal-02572910,
  author = {Agullo, Emmanuel and Cappello, Franck and Di, Sheng and Giraud, Luc and Liang, Xin and
    Schenkels, Nick},
  hal_id = {hal-02572910},
  hal_version = {v2},
  institution = {{Inria Bordeaux Sud-Ouest}},
  keywords = {Mixed precision ; Lossy compression ; Flexible GMRES ; Inexact Krylov ;
    Compression avec perte ; Pr{\'e}cision mixte},
  month = May,
  number = {RR-9342},
  pdf = {https://hal.inria.fr/hal-02572910v2/file/RR-9342.pdf},
  title = {{Exploring variable accuracy storage through lossy compression techniques in
    numerical linear algebra: a first application to flexible GMRES}},
  type = {Research Report},
  url = {https://hal.inria.fr/hal-02572910},
  year = {2020},
}

@inproceedings{Underwood20,
  author = {Underwood, Robert and Di, Sheng and Calhoun, Jon and Cappello, Franck},
  booktitle = {{IPDPS20: 34th IEEE International parallel and distributed processing
    symposium}},
  title = {{FRaZ: A Generic High-Fidelity Fixed-Ratio Lossy Compression Framework for Scientific
    Floating-point Data}},
  year = {2020},
}

@inproceedings{tnb19+,
  author = {Tseng, Shu-Mei and Nicolae, Bogdan and Bosilca, George and Jeannot, Emmanuel and
    Chandramowlishwaran, Aparna and Cappello, Franck},
  title = {{Towards Portable Online Prediction of Network Utilization using MPI-level
    Monitoring}},
  url = {https://hal.inria.fr/hal-02184204},
  booktitle = {{EuroPar'19: 25th International European Conference on Parallel and Distributed
Systems}},
  address = {Goettingen, Germany},
  hal_id = {hal-02184204},
  hal_version = {v1},
  keywords = {Work stealing ; Prediction of resource utilization ; Timeseries forecasting ;
    Network monitoring ; Online learning},
  month = Aug,
  pdf = {https://hal.inria.fr/hal-02184204/file/paper.pdf},
  year = {2019},
}

% The journal name carries its ampersand again (dropped by the publisher export), the month
% uses the standard macro and the page range uses a double hyphen.
@article{Yildiz2019,
  author = {O. {Yildiz} and J. {Ejarque} and H. {Chan} and S. {Sankaranarayanan} and
    R. M. {Badia} and T. {Peterka}},
  doi = {10.1109/MCSE.2019.2918766},
  issn = {1558-366X},
  journal = {Computing in Science \& Engineering},
  keywords = {natural sciences computing;workflow management software;hierarchical heterogeneous
    workflow;heterogeneous hierarchical workflow composition;workflow systems;automated
    end-to-end path;single workflow system;end-to-end workflow;in situ
    workflows;subworkflows;data models;materials science use cases;Task
    analysis;Computational modeling;Workflow management software;Data models;Data
    visualization;Analytical models;Heterogeneous networks},
  month = jul,
  number = {4},
  pages = {76--86},
  title = {Heterogeneous Hierarchical Workflow Composition},
  volume = {21},
  year = {2019},
}

@inproceedings{yildiz2021dynamic,
  author = {Yildiz, Orcun and Morozov, Dmitriy and Nicolae, Bogdan and Peterka, Tom},
  booktitle = {2021 IEEE Workshop on Workflows in Support of Large-Scale Science (WORKS)},
  organization = {IEEE Computer Society},
  pages = {25--32},
  title = {Dynamic Heterogeneous Task Specification and Execution for In Situ Workflows},
  year = {2021},
}

@inproceedings{Rupak20,
  author = {Rupak Roy and Kento Sato and Jian Guo and Jens Domke and Weikuan Yu and
    Takaki Hatsui and Yasumasa Joti},
  booktitle = {{SC'2019}, the IEEE/ACM Conference on High Performance Computing Networking,
    Storage and Analysis},
  publisher = {ACM Press},
  title = {Poster: Improving Data Compression with Deep Predictive Neural Network for Time
    Evolutional Data},
  year = {2019},
}

% "Xin Liang" and "Sheng Di" previously carried a trailing "y" (affiliation-footnote residue).
@unpublished{Schenkels20,
  author = {Nick Schenkels and Emmanuel Agullo and Luc Giraud and Xin Liang and Sheng Di and
    Franck
Cappello},
  note = {(to be submitted)},
  title = {Flexible generalized minimal residual method with a compressed search space},
  year = {2020},
}

% NOTE(review): the venue was stored in a journal field, which inproceedings entries do not
% use; it is now in booktitle. Confirm that "SIGHPC" is the intended venue name.
@inproceedings{ahori-IPSJ-2019-July,
  author = {Atsushi Hori and George Bosilca and Emmanuel Jeannot and Takahiro Ogura and
    Yutaka Ishikawa},
  booktitle = {SIGHPC},
  month = jul,
  number = {34},
  publisher = {Information Processing Society of Japan},
  title = {{Is Japanese HPC another Galapagos? - Interim Report of MPI International Survey -}},
  year = {2019},
}

@inproceedings{clusterWangDCS18,
  author = {Chen Wang and Nikoli Dryden and Franck Cappello and Marc Snir},
  booktitle = {{IEEE} International Conference on Cluster Computing, {CLUSTER} 2018, Belfast, UK,
    September 10-13, 2018},
  pages = {168--178},
  title = {Neural Network Based Silent Error Detector},
  year = {2018},
}

@inproceedings{HPDC2018,
  author = {Dingwen Tao and Sheng Di and Xin Liang and Zizhong Chen and Franck Cappello},
  booktitle = {Proceedings of the 27th International Symposium on High-Performance Parallel and
    Distributed Computing, {HPDC} 2018, Tempe, AZ, USA, June 11-15, 2018},
  pages = {52--65},
  title = {Improving performance of iterative methods by lossy checkpointing},
  year = {2018},
}

% NOTE(review): this appears to describe the same paper as key Underwood20; both keys are
% kept because either may be cited elsewhere.
@inproceedings{UnderwoodEtAl2020,
  author = {Underwood, Robert and Calhoun, Jon and Di, Sheng and Cappello, Franck},
  booktitle = {2020 {IEEE} International Parallel and Distributed Processing Symposium,
    {IPDPS} 2020, New Orleans, USA, May 18-22, 2020},
  publisher = {{IEEE}},
  title = {FRaZ: A Generic High-Fidelity Fixed-Ratio Lossy Compression Framework for Scientific
    Data},
  year = {2020},
}

@inproceedings{RezaEtAl2019,
  author = {Reza, Tasmia and Calhoun, Jon and Keipert, Kristopher and Di, Sheng and
    Cappello, Franck},
  booktitle = {2019 IEEE/ACM 5th International Workshop on Data Analysis and Reduction for Big
    Scientific Data (DRBSD)},
  title = {Analyzing the Performance and Accuracy of Lossy Checkpointing on Sub-Iteration of
    NWChem},
  year = {2019},
month = nov,
}

@inproceedings{ElmoreCalhoun2019,
  address = {Washington, DC, USA},
  author = {Elmore, Donald and Calhoun, Jon},
  booktitle = {Poster Session of the 2019 ACM/IEEE International Conference for High Performance
    Computing, Networking, Storage and Analysis},
  publisher = {IEEE Computer Society},
  series = {SC '19},
  title = {Evaluating Lossy Compressors for Inline Compression},
  year = {2019},
}

@article{Calhoun18,
  author = {Calhoun, Jon and Cappello, Franck and Olson, Luke N. and Snir, Marc and
    Gropp, William D.},
  journal = {Int. J. High Perform. Comput. Appl.},
  publisher = {Sage Publications, Inc.},
  title = {Exploring the Feasibility of Lossy Compression for PDE Simulations},
  volume = {27},
  year = {2018},
}

% This Ph.D. manuscript is recorded as a thesis, with the handle moved from the journal field
% into url.
% NOTE(review): school is inferred from the ideals.illinois.edu handle -- please confirm.
@phdthesis{Calhoun17,
  author = {Calhoun, Jon},
  school = {University of Illinois at Urbana-Champaign},
  title = {From detection to optimization: impact of soft errors on high-performance computing
    applications},
  url = {https://www.ideals.illinois.edu/handle/2142/98379},
  year = {2017},
}

@misc{NarayananEtAl2021,
  author = {Hasco{\"e}t, Laurent and H{\"u}ckelheim, Jan Christian and Hovland, Paul and
    Narayanan, Sri Hari Krishna},
  title = {{SIAM} {CSE21} {Minitutorial}: {Automatic} {Differentiation} as a {Tool} for
    {Computational} {Science}},
  url = {https://github.com/sriharikrishna/siamcse21},
  year = {2021},
}

@article{HuckelheimHascoet2022,
  address = {New York, NY, USA},
  author = {H{\"u}ckelheim, Jan and Hasco{\"e}t, Laurent},
  doi = {10.1145/3472796},
  issn = {0098-3500},
  issue_date = {March 2022},
  number = {1},
  publisher = {Association for Computing Machinery},
  title = {Source-to-Source Automatic Differentiation of OpenMP Parallel Loops},
  url = {https://doi.org/10.1145/3472796},
  volume = {48},
  year = {2022},
  abstract = {This article presents our work toward correct and efficient automatic
    differentiation of OpenMP parallel worksharing loops in forward and reverse mode.
Automatic differentiation is a method to obtain gradients of numerical programs, which are
    crucial in optimization, uncertainty quantification, and machine learning. The
    computational cost to compute gradients is a common bottleneck in practice. For
    applications that are parallelized for multicore CPUs or GPUs using OpenMP, one also
    wishes to compute the gradients in parallel. We propose a framework to reason about the
    correctness of the generated derivative code, from which we justify our OpenMP extension
    to the differentiation model. We implement this model in the automatic differentiation
    tool Tapenade and present test cases that are differentiated following our extended
    differentiation procedure. Performance of the generated derivative programs in forward
    and reverse mode is better than sequential, although our reverse mode often scales worse
    than the input programs.},
  journal = {ACM Trans. Math. Softw.},
  month = feb,
  articleno = {7},
  numpages = {32},
  keywords = {Automatic differentiation, OpenMP, shared-memory parallel, multicore},
}

@inproceedings{ppopp22-poster-ad-z3,
  author = {H{\"u}ckelheim, Jan and Hasco{\"e}t, Laurent},
  booktitle = {Principles and Practice of Parallel Programming (PPoPP) 2022},
  date-added = {2022-03-15 08:59:01 -0500},
  date-modified = {2022-03-15 09:01:06 -0500},
  title = {{POSTER}: Automatic Differentiation of Parallel Loops with Formal Methods},
  year = {2022},
}

% "Le Fèvre" is a two-word surname, so it is written in "Surname, Given" form; in
% "Given Surname" form BibTeX would take only "Fèvre" as the surname.
@inproceedings{LeFevreEtAl2020,
  author = {Le Fèvre, Valentin and Herault, Thomas and Langou, Julien and Robert, Yves},
  booktitle = {{Resilience}: 13th Workshop on Resiliency in High Performance Computing in
    Clusters, Clouds, and Grids, jointly published with {Euro-Par 2020}},
  publisher = {Springer Verlag},
  series = {LNCS},
  title = {A comparison of several fault-tolerance methods for the detection and correction of
    floating-point errors in matrix-matrix multiplication},
  year = {2020},
}

@inproceedings{LeFevreEtAl2018,
  author = {Le Fèvre, Valentin and George Bosilca and Aurelien
Bouteiller and Thomas Herault and Atsushi Hori and Yves Robert and Jack Dongarra},
  booktitle = {{Resilience}: 11th Workshop on Resiliency in High Performance Computing in
    Clusters, Clouds, and Grids, jointly published with {Euro-Par 2018}},
  publisher = {Springer Verlag},
  series = {LNCS},
  title = {{Do moldable applications perform better on failure-prone HPC platforms?}},
  year = {2018},
}

@inproceedings{HeraultEtAl2018,
  author = {Thomas Hérault and Yves Robert and Aurélien Bouteiller and Dorian Arnold and
    Kurt Ferreira and George Bosilca and Jack Dongarra},
  booktitle = {20th Workshop on Advances in Parallel and Distributed Computational Models
    {APDCM 2018}},
  publisher = {IEEE Computer Society Press},
  title = {Optimal cooperative checkpointing for shared high-performance computing platforms},
  year = {2018},
}

@inproceedings{BenoitEtAl2022,
  address = {New York, NY, USA},
  author = {Benoit, Anne and Du, Yishu and Herault, Thomas and Marchal, Loris and
    Pallez, Guillaume and Perotin, Lucas and Robert, Yves and Sun, Hongyang and
    Vivien, Frederic},
  booktitle = {Proceedings of the 2022 Fourteenth International Conference on Contemporary
    Computing},
  doi = {10.1145/3549206.3549328},
  isbn = {9781450396752},
  location = {Noida, India},
  numpages = {10},
  pages = {701--710},
  publisher = {Association for Computing Machinery},
  series = {IC3-2022},
  title = {Checkpointing {\`a} La Young/Daly: An Overview},
  url = {https://doi.org/10.1145/3549206.3549328},
  year = {2022},
}

% "Le Fèvre" is a two-word surname, so the author list uses "Surname, Given" form.
@article{BosilcaEtAl2022,
  author = {Bosilca, George and Bouteiller, Aurélien and Herault, Thomas and
    Le Fèvre, Valentin and Robert, Yves and Dongarra, Jack},
  journal = {Int. J. of Networking and Computing},
  number = {1},
  pages = {26--46},
  title = {{Comparing distributed termination detection algorithms for modern HPC platforms}},
  volume = {12},
  year = {2022},
}

@article{DuEtAl2022,
  author = {Yishu Du and Guillaume Pallez and Loris Marchal and Yves Robert},
  journal = {IEEE Trans.
Parallel Distributed Systems},
  number = {3},
  pages = {507--522},
  title = {Optimal checkpointing strategies for iterative applications},
  volume = {33},
  year = {2022},
}

% "Le Fèvre" is a two-word surname, so the author lists below use "Surname, Given" form.
@article{BenoitEtAl2021,
  author = {Benoit, Anne and Le Fèvre, Valentin and Perotin, Lucas and Raghavan, Padma and
    Robert, Yves and Sun, Hongyang},
  journal = {IEEE Transactions on Computers},
  title = {Resilient scheduling of moldable parallel jobs to cope with silent errors},
  year = {2021},
}

@inproceedings{BenoitEtAl2019,
  author = {Benoit, Anne and Hérault, Thomas and Le Fèvre, Valentin and Robert, Yves},
  booktitle = {{SC'2019}, the IEEE/ACM Conference on High Performance Computing Networking,
    Storage and Analysis},
  publisher = {ACM Press},
  title = {Replication is more efficient than you think},
  year = {2019},
}

@inproceedings{BenoitEtAl2018b,
  author = {Anne Benoit and Swann Perarnau and Loïc Pottier and Yves Robert},
  booktitle = {{ICPP'2018}, the 47th Int. Conf. on Parallel Processing},
  publisher = {{IEEE} Computer Society Press},
  title = {A performance model to execute workflows on high-bandwidth memory architectures},
  year = {2018},
}

@article{BenoitEtAl2018,
  author = {Anne Benoit and Aurélien Cavelan and Franck Cappello and Padma Raghavan and
    Yves Robert and Hongyang Sun},
  journal = {J.
Parallel and Distributed Computing},
  badpages = {8-24},
  badvolume = {98},
  title = {Coping with silent and fail-stop errors at scale by combining replication and
    checkpointing},
  year = {2018},
}

@inproceedings{benoitEtAl2017identifying,
  author = {Benoit, Anne and Cavelan, Aur{\'e}lien and Cappello, Franck and Raghavan, Padma and
    Robert, Yves and Sun, Hongyang},
  booktitle = {Proceedings of the 7th Workshop on Fault Tolerance for HPC at eXtreme Scale
    (FTXS)},
  keywords = {mine,Workshop},
  title = {Identifying the right replication level to detect and correct silent errors at scale},
  year = {2017},
}

@inproceedings{benoitEtAl2017optimal,
  author = {Benoit, Anne and Cavelan, Aur{\'e}lien and Le F{\`e}vre, Valentin and Robert, Yves},
  booktitle = {Proceedings of the 7th Workshop on Fault Tolerance for HPC at eXtreme Scale
    (FTXS)},
  keywords = {mine,Workshop},
  title = {Optimal checkpointing period with replicated execution on heterogeneous platforms},
  year = {2017},
}

@inproceedings{tmv+16,
  address = {Salt-Lake City, United States},
  author = {Tessier, Fran{\c c}ois and Malakar, Preeti and Vishwanath, Venkatram and
    Jeannot, Emmanuel and Isaila, Florin},
  booktitle = {{1st Workshop on Optimization of Communication in HPC runtime systems (IEEE
    COM-HPC16)}},
  hal_id = {hal-01394741},
  hal_version = {v1},
  month = Nov,
  pdf = {https://hal.inria.fr/hal-01394741/file/topoIO-paper.pdf},
  publisher = {{IEEE}},
  title = {{Topology-Aware Data Aggregation for Intensive I/O on Large-Scale Supercomputers}},
  url = {https://hal.inria.fr/hal-01394741},
  year = {2016},
}

@inproceedings{BautEtAl2014,
  author = {Bautista-Gomez, Leonardo and Cappello, Franck and Carro, Luigi and
    DeBardeleben, Nathan and Fang, Bo and Gurumurthi, Sudhanva and Pattabiraman, Karthik and
    Rech, Paolo and Reorda, M Sonza},
  booktitle = {Proceedings of the conference on Design, Automation \& Test in Europe},
  organization = {European Design and Automation Association},
  pages = {341},
  title = {GPGPUs: How to combine high computational power with high
reliability},
  year = {2014},
}

% NOTE(review): booktitle previously read "High-Performance Performance Computing"; the
% duplicated word was removed. Confirm the exact conference name and edition.
@inproceedings{BautEtAl2015b,
  author = {Bautista-Gomez, Leonardo and Benoit, Anne and Cavelan, Aur{\'e}lien and
    Raina, Saurabh K and Robert, Yves and Sun, Hongyang},
  booktitle = {Proceedings of the 24th International Conference on High-Performance Computing},
  organization = {IEEE},
  title = {Which Verification for Soft Error Detection?},
  year = {2015},
}

@inproceedings{BautEtAl2015,
  author = {Bautista-Gomez, Leonardo Arturo and Cappello, Franck},
  booktitle = {Proceedings of the 2015 IEEE International Conference on Cluster Computing},
  organization = {IEEE Computer Society},
  pages = {595--602},
  title = {Detecting and correcting data corruption in stencil applications through multivariate
    interpolation},
  year = {2015},
}

% The language value is braced: a bare word is read as an undefined string macro and
% expands to nothing. Empty volume/number/pages placeholders were dropped.
@article{BleuseR2017Scheduling,
  author = {Rapha{\"{e}}l Bleuse and Sascha Hunold and Safia Kedad{-}Sidhoum and
    Florence Monna and Gr{\'{e}}gory Mouni{\'{e}} and Denis Trystram},
  doi = {10.1109/TPDS.2017.2675891},
  issn = {1045-9219},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  language = {english},
  note = {in print},
  publisher = {IEEE},
  title = {{Scheduling Independent Moldable Tasks on Multi-Cores with GPUs}},
  year = {2017},
}

% A workshop paper carrying a booktitle field, hence inproceedings rather than article.
@inproceedings{CappelloEtAl2016,
  author = {Franck Cappello and Kazutomo Yoshii and Hal Finkel and Jason Cong},
  booktitle = {{The 2016 Post-Moore's Era Supercomputing (PMES) Workshop}},
  title = {{Re-Form: FPGA-Powered True Codesign Flow for High-Performance Computing In The
    Post-Moore Era}},
  year = {2016},
}

@inproceedings{Cheriere2020Pufferscale,
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu and Antoniu, Gabriel and
    Wild, Stefan M and Leyffer, Sven and Ross, Robert},
  booktitle = {Proceedings of the 20th IEEE/ACM International Symposium on Cluster, Cloud and
    Internet Computing (Ccgrid)},
  location = {Melbourne, Australia},
  series = {CCgrid '20},
  title = {Pufferscale: Rescaling HPC Data Services for High Energy Physics Applications},
  year = {2020},
keywords = {Distributed Storage System, Elasticity, Rescaling, Load balancing,
    High Energy Physics},
  publisher = {IEEE/ACM},
}

% Empty placeholder fields (volume, pages, pdf, url, ...) were dropped from the entries
% around here; add them back only together with real values.
@article{Cheriere2020HowFast,
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu and Antoniu, Gabriel},
  journal = {Journal of Parallel and Distributed Computing},
  title = {How Fast Can One Resize a Distributed File System?},
  year = {2020},
}

@inproceedings{CheriereEtAl2016,
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu},
  booktitle = {{IEEE/ACM International Conference for High Performance Computing, Networking,
    Storage and Analysis (SC) - ACM Student Research Competition}},
  pdf = {http://sc16.supercomputing.org/sc-archive/src_poster/poster_files/spost146s2-file2.pdf},
  title = {{Design and Evaluation of Topology-aware Scatter and AllGather Algorithms for
    Dragonfly Networks}},
  url = {http://sc16.supercomputing.org/sc-archive/src_poster/src_poster_pages/spost146.html},
  year = {2016},
}

@inproceedings{Cheriere2017How,
  address = {Boston, United States},
  author = {Cheriere, Nathana{\"e}l and Antoniu, Gabriel},
  booktitle = {{BigData}},
  doi = {10.1109/BigData.2017.8257922},
  hal_id = {hal-01644928},
  hal_version = {v1},
  keywords = {Decommission ; Model ; Malleable File System ; Distributed File System ;
    Elastic Storage},
  month = Dec,
  pdf = {https://hal.archives-ouvertes.fr/hal-01644928/file/ModelingDecommision.pdf},
  title = {{How Fast Can One Scale Down a Distributed File System?}},
  url = {https://hal.archives-ouvertes.fr/hal-01644928},
  year = {2017},
}

@techreport{Cheriere2018LowerCommission,
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu and Antoniu, Gabriel},
  institution = {{Inria Rennes - Bretagne Atlantique}},
  number = {RR-9186},
  pages = {1--26},
  title = {{A Lower Bound for the Commission Times in Replication-Based Distributed Storage
    Systems}},
  type = {Research Report},
  url = {https://hal.archives-ouvertes.fr/hal-01817638},
  year =
{2018},
  hal_id = {hal-01817638},
  hal_version = {v2},
  keywords = {Commission ; Elastic Storage ; Distributed File System ; Malleable File System ;
    Lower Bound},
  month = Jun,
  pdf = {https://hal.archives-ouvertes.fr/hal-01817638/file/RR-9186.pdf},
}

@inproceedings{Cheriere2018Pufferbench,
  address = {Dallas, United States},
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu and Antoniu, Gabriel},
  booktitle = {{PDSW-DISCS 2018: 3rd Joint International workshop on Parallel Data Storage \&
    Data Intensive Scalable computing Systems}},
  hal_id = {hal-01892713},
  hal_version = {v1},
  keywords = {Distributed Storage System Malleability ; Benchmark ; Pufferbench},
  month = Nov,
  pages = {1--10},
  pdf = {https://hal.archives-ouvertes.fr/hal-01892713/file/Paper.pdf},
  title = {{Pufferbench: Evaluating and Optimizing Malleability of Distributed Storage}},
  url = {https://hal.archives-ouvertes.fr/hal-01892713},
  year = {2018},
}

@techreport{Cheriere2018LowerRelaxed,
  author = {Cheriere, Nathana{\"e}l and Dorier, Matthieu and Antoniu, Gabriel},
  hal_id = {hal-01943964},
  hal_version = {v2},
  institution = {{Inria Rennes - Bretagne Atlantique}},
  keywords = {Distributed Storage Systems ; Malleable Storage ; Fault Tolerance ;
    Elastic Storage ; Syst{\`e}me de stockage distribu{\'e} ; Stockage {\'e}lastique ;
    Stockage mall{\'e}able ; D{\'e}commission ; Tol{\'e}rance aux pannes},
  month = Dec,
  number = {RR-9229},
  pages = {1--28},
  pdf = {https://hal.archives-ouvertes.fr/hal-01943964/file/Report.pdf},
  title = {{Lower Bounds for the Duration of Decommission Operations with Relaxed Fault
    Tolerance in Replication-based Distributed Storage Systems}},
  type = {Research Report},
  url = {https://hal.archives-ouvertes.fr/hal-01943964},
  year = {2018},
}

@inproceedings{DiEtAl2014,
  acmid = {2683692},
  address = {Piscataway, NJ, USA},
  author = {Di, Sheng and Bautista-Gomez, Leonardo and Cappello, Franck},
  booktitle = {Proceedings of the International Conference for High Performance Computing,
    Networking, Storage and
Analysis},
  doi = {10.1109/SC.2014.79},
  isbn = {978-1-4799-5500-8},
  location = {New Orleans, Louisiana},
  numpages = {12},
  pages = {907--918},
  publisher = {IEEE Press},
  series = {SC '14},
  title = {Optimization of a Multilevel Checkpoint Model with Uncertain Execution Scales},
  year = {2014},
}

@inproceedings{DorierEtAl2014a,
  acmid = {2683662},
  address = {Piscataway, NJ, USA},
  author = {Dorier, Matthieu and Ibrahim, Shadi and Antoniu, Gabriel and Ross, Rob},
  booktitle = {Proceedings of the International Conference for High Performance Computing,
    Networking, Storage and Analysis},
  doi = {10.1109/SC.2014.56},
  isbn = {978-1-4799-5500-8},
  keywords = {HPC, I/O, Omnisc'IO, exascale, grammar, prediction, storage},
  location = {New Orleans, Louisiana},
  numpages = {12},
  pages = {623--634},
  publisher = {IEEE Press},
  series = {SC '14},
  title = {Omnisc'IO: A Grammar-based Approach to Spatial and Temporal I/O Patterns Prediction},
  year = {2014},
}

@inproceedings{DorierEtAl2014b,
  address = {Phoenix, United States},
  author = {Dorier, Matthieu and Antoniu, Gabriel and Ross, Robert and Kimpe, Dries and
    Ibrahim, Shadi},
  booktitle = {IPDPS - International Parallel and Distributed Processing Symposium},
  hal_id = {hal-00916091},
  hal_version = {v1},
  month = {May},
  pdf = {https://hal.inria.fr/hal-00916091/file/CALCioM.pdf},
  title = {CALCioM: Mitigating I/O Interference in HPC Systems through Cross-Application
    Coordination},
  url = {https://hal.inria.fr/hal-00916091},
  year = {2014},
}

% Journal article: the venue is stored in the journal field, as the article type requires.
% NOTE(review): volume, number and pages are not recorded here - add them when known.
@article{ShengEtCappello2016,
  author = {Sheng Di and Franck Cappello},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  title = {Adaptive Impact-Driven Detection of Silent Data Corruption for HPC Applications},
  year = {2016},
}

@inproceedings{DorierEtAl2015,
  TITLE = {{Lessons Learned from Building In Situ Coupling Frameworks}},
  AUTHOR = {Dorier, Matthieu and Dreher, Matthieu and Peterka, Tom and Antoniu, Gabriel and
    Raffin, Bruno and Wozniak, Justin M.},
  URL =
{https://hal.inria.fr/hal-01224846},
  BOOKTITLE = {{First Workshop on In Situ Infrastructures for Enabling Extreme-Scale Analysis and
    Visualization}},
  ADDRESS = {Austin, United States},
  YEAR = {2015},
  MONTH = Nov,
  DOI = {10.1145/2828612.2828622},
  KEYWORDS = {Exascale ; In Situ Visualization ; Simulation ; Coupling ; Damaris ; Decaf ; FlowVR},
  PDF = {https://hal.inria.fr/hal-01224846/file/paper-no-cr.pdf},
  HAL_ID = {hal-01224846},
  HAL_VERSION = {v1},
}

% Inria research report RR-8855 (HAL record hal-01273718).
@techreport{DorierEtAlRR2016,
  author = {Dorier, Matthieu and Sisneros, Robert and Bautista-Gomez, Leonardo and Peterka, Tom and
    Orf, Leigh G and Ross, Rob and Rahmani, Lokman and Antoniu, Gabriel and Boug{\'e}, Luc},
  hal_id = {hal-01273718},
  hal_version = {v1},
  institution = {{INRIA Rennes - Bretagne Atlantique}},
  keywords = {In Situ Visualization ; Performance ; Exascale},
  month = Feb,
  number = {RR-8855},
  pages = {27},
  pdf = {https://hal.inria.fr/hal-01273718/file/RR-8855.pdf},
  title = {{Performance-Constrained In Situ Visualization of Atmospheric Simulations}},
  type = {Research Report},
  url = {https://hal.inria.fr/hal-01273718},
  year = {2016},
}

% Author names are spelled as in the research report entry above so both works share one
% spelling per author.
@inproceedings{DorierEtAl2016a,
  author = {Dorier, Matthieu and Sisneros, Robert and Bautista-Gomez, Leonardo and Peterka, Tom and
    Orf, Leigh and Rahmani, Lokman and Antoniu, Gabriel and Boug{\'e}, Luc},
  booktitle = {IEEE International Conference on Cluster Computing (CLUSTER)},
  organization = {IEEE},
  pdf = {https://hal.inria.fr/hal-01351919/file/SmartViz-final-1.pdf},
  title = {{Adaptive Performance-Constrained In Situ Visualization of Atmospheric Simulations}},
  url = {https://hal.inria.fr/hal-01351919},
  year = {2016},
}

@article{DorierEtAl2016TOPC,
  title = {Damaris: Addressing Performance Variability in Data Management for Post-Petascale
    Simulations},
  author = {Dorier, Matthieu and Antoniu, Gabriel and Cappello, Franck and Snir, Marc and
    Sisneros, Robert and Yildiz, Orcun and Ibrahim, Shadi and Peterka, Tom and Orf, Leigh},
  journal = {ACM Transactions on Parallel Computing (TOPC)},
volume = {3},
  number = {3},
  pages = {15},
  year = {2016},
  publisher = {ACM},
}

% Journal article (HAL record hal-01238103).
@article{DorierTPDS2015,
  author = {Dorier, Matthieu and Ibrahim, Shadi and Antoniu, Gabriel and Ross, Rob},
  doi = {10.1109/TPDS.2015.2485980},
  hal_id = {hal-01238103},
  hal_version = {v1},
  journal = {{IEEE Transactions on Parallel and Distributed Systems}},
  keywords = {Omnisc'IO ; Grammar ; Prediction ; I/O ; Storage ; HPC},
  pdf = {https://hal.inria.fr/hal-01238103/file/paper.pdf},
  publisher = {{Institute of Electrical and Electronics Engineers}},
  title = {{Using Formal Grammars to Predict I/O Behaviors in HPC: the Omnisc'IO Approach}},
  url = {https://hal.inria.fr/hal-01238103},
  year = {2015},
}

% Workshop paper (see the type field).
@inproceedings{DorierWORKS2017,
  author = {Dorier, Matthieu and Wozniak, Justin and Ross, Robert},
  booktitle = {{Proceedings of the WORKS 2017 workshop (SC17)}},
  pdf = {http://delivery.acm.org/10.1145/3160000/3151001/a5-dorier.pdf},
  title = {{Supporting Task-level Fault-Tolerance in HPC Workflows by Launching MPI Jobs
    inside MPI Jobs}},
  type = {workshop},
  url = {https://dl.acm.org/citation.cfm?id=3151001},
  year = {2017},
}

@inproceedings{DreherEtAl2016,
  author = {Dreher, Matthieu and Peterka, Tom},
  booktitle = {Cluster Computing (CLUSTER), 2016 IEEE International Conference on},
  organization = {IEEE},
  pages = {279--288},
  title = {Bredala: Semantic Data Redistribution for In Situ Applications},
  year = {2016},
}

@article{GainaruEtAl2013,
  acmid = {2509969},
  address = {Thousand Oaks, CA, USA},
  author = {Gainaru, Ana and Cappello, Franck and Snir, Marc and Kramer, William},
  doi = {10.1177/1094342013488258},
  issn = {1094-3420},
  issue_date = {August 2013},
  journal = {Int. J. High Perform. Comput.
Appl.},
  keywords = {failure prediction, fault tolerance, signal analysis},
  month = {aug},
  number = {3},
  numpages = {10},
  pages = {273--282},
  publisher = {Sage Publications, Inc.},
  title = {Failure Prediction for HPC Systems and Applications: Current Situation and Open Issues},
  volume = {27},
  year = {2013},
}

@inproceedings{GuhurEtAl2016,
  author = {Guhur, Pierre-Louis and Zhang, Hong and Peterka, Tom and Constantinescu, Emil and
    Cappello, Franck},
  booktitle = {European Conference on Parallel Processing},
  organization = {Springer},
  pages = {644--656},
  title = {Lightweight and Accurate Silent Data Corruption Detection in Ordinary Differential
    Equation Solvers},
  year = {2016},
}

% DBLP export (see bibsource / biburl).
@inproceedings{KellerTesserEtAl2014,
  author = {Rafael Keller Tesser and Laercio Lima Pilla and Fabrice Dupros and
    Philippe Olivier Alexandre Navaux and Jean-Francois Mehaut and Celso L. Mendes},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/pdp/TesserPDNMM14},
  booktitle = {22nd Euromicro International Conference on Parallel, Distributed, and Network-Based
    Processing, {PDP} 2014, Torino, Italy, February 12-14, 2014},
  doi = {10.1109/PDP.2014.37},
  pages = {196--203},
  timestamp = {Tue, 03 Feb 2015 17:12:45 +0100},
  title = {Improving the Performance of Seismic Wave Simulations with Dynamic Load Balancing},
  url = {http://dx.doi.org/10.1109/PDP.2014.37},
  year = {2014},
}

% The publication status is carried by the note field only, so the journal field holds the
% plain journal name.
@article{KellerTesserEtAl2014a,
  author = {Rafael Keller Tesser and Laercio Lima Pilla and Fabrice Dupros and
    Philippe Olivier Alexandre Navaux and Jean-Francois Mehaut and Celso L. Mendes},
  journal = {International Journal of High Performance Computing Applications},
  note = {accepted},
  title = {Dynamic load balancing for seismic wave propagation models},
  year = {2014},
}

@inproceedings{MartsinkevichEtAl2015,
  author = {Tatiana V. Martsinkevich and Omer Subasi and Osman S.
Unsal and Franck Cappello and Jes{\'{u}}s Labarta},
  title = {Fault-Tolerant Protocol for Hybrid Task-Parallel Message-Passing Applications},
  booktitle = {2015 {IEEE} International Conference on Cluster Computing, {CLUSTER} 2015, Chicago,
    IL, USA, September 8-11, 2015},
  pages = {563--570},
  year = {2015},
  url = {http://dx.doi.org/10.1109/CLUSTER.2015.104},
  doi = {10.1109/CLUSTER.2015.104},
}

% Short paper (HAL record hal-01617655). The pdf URL is kept on a single line on purpose.
@inproceedings{MatriEtAl2017,
  address = {Honolulu, United States},
  author = {Matri, Pierre and Alforov, Yevhen and Brandon, Alvaro and Kuhn, Michael and
    Carns, Philip and Ludwig, Thomas},
  booktitle = {{CLUSTER 2017 - IEEE International Conference on Cluster Computing}},
  doi = {10.1109/CLUSTER.2017.63},
  hal_id = {hal-01617655},
  hal_version = {v1},
  month = Sep,
  pages = {81--86},
  pdf = {https://hal.inria.fr/hal-01617655/file/HPC_BD_Convergence___Short_Paper___Cluster_17%20%282%29.pdf},
  title = {{Could Blobs Fuel Storage-Based Convergence Between HPC and Big Data?}},
  url = {https://hal.inria.fr/hal-01617655},
  year = {2017},
}

% Accented letters are written as LaTeX escapes, like the other entries of this file, so the
% entry also works with classic 8-bit BibTeX.
@inproceedings{MatriEtAl2018,
  author = {Matri, Pierre and P{\'e}rez, Mar{\'i}a S and Costan, Alexandru and Antoniu, Gabriel},
  booktitle = {2018 IEEE International Conference on Cluster Cloud and Grid Computing (CCGrid'18)},
  organization = {IEEE},
  title = {{T{\'y}rFS: Increasing Small Files Access Performance with Dynamic Metadata
    Replication}},
  year = {2018},
}

@inproceedings{MatriEtAl2018Streaming,
  author = {Matri, Pierre and Carns, Philip and Ross, Robert and Costan, Alexandru and
    P{\'e}rez, Mar{\'i}a S and Antoniu, Gabriel},
  booktitle = {2018 IEEE 38th International Conference on Distributed Computing Systems (ICDCS)},
  organization = {IEEE},
  title = {{SLoG: Large-Scale Logging Middleware for HPC and Big Data Convergence}},
  year = {2018},
}

% Submitted manuscript: venue, volume, number and pages are not known yet and therefore
% omitted; the submission status and report number are recorded in the note field.
@article{NarayananHascoet2016,
  author = {Sri Hari Krishna Narayanan and Laurent Hascoet},
  title = {Interfacing OpenAD and Tapenade},
  year = {2016},
  note = {submitted, also available as
ANL/MCS-P5588-0316},
}

@article{PeterkaEtAl2016,
  author = {Peterka, Tom and Croubois, Hadrien and Li, Nan and Rangel, Esteban and
    Cappello, Franck},
  journal = {SIAM Journal on Scientific Computing},
  number = {5},
  pages = {S646--S666},
  publisher = {SIAM},
  title = {Self-adaptive density estimation of particle data},
  volume = {38},
  year = {2016},
}

% Poster (see the note field); the venue is given in howpublished.
@misc{PinedaEtAl2015,
  author = {Pineda-Morales, Luis and Subramaniam, Balaji and Keahey, Kate and Antoniu, Gabriel and
    Costan, Alexandru and Wang, Shaowen and Padmanabhan, Anand and Soliman, Aiman},
  hal_id = {hal-01241718},
  hal_version = {v1},
  howpublished = {{SC15 - ACM/IEEE International Conference in Supercomputing}},
  keywords = {spatial data ; cloud computing ; elastic provisioning},
  month = {Nov},
  note = {Poster},
  pdf = {https://hal.inria.fr/hal-01241718/file/Pineda-Morales_SC.pdf},
  title = {{Scaling Smart Appliances for Spatial Data Synthesis}},
  url = {https://hal.inria.fr/hal-01241718},
  year = {2015},
}

@inproceedings{SubasiEtAl2015,
  author = {O. Subasi and J. Arias and O. Unsal and J. Labarta and A.
Cristal},
  booktitle = {2015 23rd Euromicro International Conference on Parallel, Distributed and
    Network-Based Processing (PDP)},
  title = {NanoCheckpoints: A Task-Based Asynchronous Dataflow Framework for Efficient and
    Scalable Checkpoint/Restart},
  year = {2015},
  pages = {99--102},
  doi = {10.1109/PDP.2015.17},
  ISSN = {1066-6192},
  month = {March},
}

% Authors are separated by the keyword "and" only; a comma inside a name would be read by
% BibTeX as a "Last, First" separator.
@inproceedings{SubasiEtAl2016,
  author = {Omer Subasi and Sheng Di and Leonardo Bautista-Gomez and Prasanna Balaprakash and
    Osman Unsal and Jesus Labarta and Adrian Cristal and Franck Cappello},
  booktitle = {Proceedings of the 2016 IEEE/ACM International Symposium on Cluster Cloud and Grid
    Computing},
  organization = {IEEE},
  title = {Spatial Support Vector Regression to Detect Silent Errors in the Exascale Era},
  year = {2016},
}

% All keywords live in one comma-separated keywords field (a field name may appear only once
% per entry); the doi field holds the bare identifier without resolver prefix.
% NOTE(review): volume and number were empty in the publisher export - add them when known.
@article{Subasi2018,
  author = {Omer Subasi and Sheng Di and Leonardo Bautista-Gomez and Prasanna Balaprakash and
    Osman Unsal and Jesus Labarta and Adrian Cristal and Sriram Krishnamoorthy and
    Franck Cappello},
  doi = {10.1016/j.suscom.2018.01.004},
  issn = {2210-5379},
  journal = {Sustainable Computing: Informatics and Systems},
  keywords = {Silent Data Corruptions, Support Vector Machines, HPC Applications},
  title = {Exploring the Capabilities of Support Vector Machines in Detecting Silent Data
    Corruptions},
  url = {https://www.sciencedirect.com/science/article/pii/S2210537917300896},
  year = {2018},
}

@article{bautista2016coping,
  author = {Bautista-Gomez, Leonardo and Benoit, Anne and Cavelan, Aur{\'e}lien and
    Raina, Saurabh K and Robert, Yves and Sun, Hongyang},
  journal = {Journal of Parallel and Distributed Computing},
  pages = {8--24},
  publisher = {Elsevier},
  title = {Coping with recall and precision of soft error detectors},
  volume = {98},
  year = {2016},
}

@inproceedings{YildizIPDPS2016,
  TITLE = {{On the Root Causes of Cross-Application I/O Interference in HPC Storage Systems}},
  AUTHOR = {Yildiz, Orcun and Dorier, Matthieu and Ibrahim,
Shadi and Ross, Rob and Antoniu, Gabriel},
  URL = {https://hal.inria.fr/hal-01270630},
  BOOKTITLE = {{IPDPS - International Parallel and Distributed Processing Symposium}},
  ADDRESS = {Chicago, United States},
  YEAR = {2016},
  MONTH = May,
  KEYWORDS = {Exascale I/O ; Parallel File Systems ; Cross-Application Contention ; Interference},
  PDF = {https://hal.inria.fr/hal-01270630/file/IPDPS%2716-CR.pdf},
  HAL_ID = {hal-01270630},
  HAL_VERSION = {v1},
}

% Workshop paper: the venue is given as booktitle, which only proceedings-type entries print.
@inproceedings{YoshiiEtAl2016,
  author = {Kazutomo Yoshii and Hal Finkel and Franck Cappello},
  booktitle = {{Second International Workshop on Heterogeneous High-performance Reconfigurable
    Computing}},
  title = {{Benchmarking Under the Hood of OpenCL FPGA Platforms}},
  year = {2016},
}

@inproceedings{subasi2017rep,
  author = {Subasi, Omer and Yalcin, Gulay and Zyulkyarov, Ferad and Unsal, Osman and
    Labarta, Jesus},
  booktitle = {2017 IEEE International Conference on Cluster Cloud and Grid Computing (CCGrid'17)},
  organization = {IEEE},
  title = {Designing and Modelling Selective Replication for Fault-tolerant HPC Applications},
  year = {2017},
}

@inproceedings{subasi2016run,
  author = {Subasi, Omer and Yalcin, Gulay and Zyulkyarov, Ferad and Unsal, Osman and
    Labarta, Jesus},
  booktitle = {2016 IEEE International Conference on Cluster Computing (CLUSTER'16)},
  organization = {IEEE},
  pages = {498--505},
  title = {A runtime heuristic to selectively replicate tasks for application-specific reliability
    targets},
  year = {2016},
}

@inproceedings{di2015detect,
  author = {Di, Sheng and Berrocal, Eduardo and Cappello, Franck},
  booktitle = {2015 IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing
    (CCGrid'15)},
  organization = {IEEE},
  pages = {271--280},
  title = {An efficient silent data corruption detection method with error-feedback control and
    even sampling for HPC applications},
  year = {2015},
}

@inproceedings{TsujEtAl2017,
  author = {Miwako Tsuji and William T. C.
Kramer and Mitsuhisa Sato},
  title = {A Performance Projection of Mini-Applications onto Benchmarks Toward the Performance
    Projection of Real-Applications},
  booktitle = {2017 IEEE International Conference on Cluster Computing (CLUSTER), Workshop on
    Representative Applications (WRAp)},
  publisher = {IEEE},
  year = {2017},
  pages = {On Line},
}

@inproceedings{tvj17,
  author = {Tessier, Fran{\c{c}}ois and Vishwanath, Venkatram and Jeannot, Emmanuel},
  booktitle = {Cluster Computing (CLUSTER), 2017 IEEE International Conference on},
  organization = {IEEE},
  pages = {70--80},
  title = {TAPIOCA: An I/O Library for Optimized Topology-Aware Data Aggregation on Large-Scale
    Supercomputers},
  year = {2017},
}

@inproceedings{drehercluster17,
  author = {Dreher, Matthieu and Sasikumar, Kiran and Sankaranarayanan, Subramanian and
    Peterka, Tom},
  booktitle = {Cluster Computing (CLUSTER), 2017 IEEE International Conference on},
  organization = {IEEE},
  pages = {509--519},
  title = {Manala: a Flexible Flow Control Library for Asynchronous Task Communication},
  year = {2017},
}

@inproceedings{mommessincluster17,
  author = {Mommessin, Cl{\'e}ment and Dreher, Matthieu and Raffin, Bruno and Peterka, Tom},
  booktitle = {Cluster Computing (CLUSTER), 2017 IEEE International Conference on},
  organization = {IEEE},
  pages = {370--378},
  title = {Automatic Data Filtering for In Situ Workflows},
  year = {2017},
}

@inproceedings{dorierpdsw17,
  author = {Matthieu Dorier and Matthieu Dreher and Tom Peterka and Robert Ross},
  booktitle = {Proceedings of PDSW SC17 Workshop},
  title = {CoSS: Proposing a Contract-Based Storage System for HPC},
  year = {2017},
}

@inproceedings{dreherisav17,
  author = {Matthieu Dreher and Swann Perarnau and Tom Peterka and Kamil Iskra and Pete Beckman},
  booktitle = {Proceedings of ISAV SC17 Workshop},
  title = {In Situ Workflows at Exascale: System Software to the Rescue},
  year = {2017},
}

% Author surnames are given without the affiliation-marker letters that a copy from the
% paper's title page had appended to them. The citation key is kept unchanged.
@inproceedings{Kettimuthua17,
  title = {Transferring a Petabyte in a Day},
  author = {Rajkumar Kettimuthu and Zhengchun Liu and David Wheeler and Ian Foster and Katrin
Heitmann and Franck Cappello},
  booktitle = {IEEE/ACM SC17 Workshop on Innovating the Network for Data Intensive Science
    (INDIS 2017)},
  year = {2017},
}

% Workshop report. Authors are separated by the keyword "and" so that BibTeX sees twelve
% individual names instead of one malformed name.
@misc{BermanEtAl,
  address = {Tokyo, Japan},
  author = {M. Berman and T. Friedman and A. Gosain and K. Keahey and R. McGeer and I. Moerman and
    A. Nakao and L. Nussbaum and K. Rauschenbach and V. Syrotiuk and A. Veeraraghavan and
    N. Yamanaka},
  month = {October},
  title = {Report of the Third Global Experimentation for Future Internet (GEFI 2018) Workshop},
  url = {http://indico.rnp.br/conferenceDisplay.py?confId=260},
  year = {2018},
}

% Fields without a known value (pages, numpages, url, doi, acmid, pdf) are omitted rather
% than left empty; add them once the values are available.
% NOTE(review): the key says 2020 while the entry is dated 2021 - key kept for existing
% citations.
@inproceedings{Hobson2020Dhmem,
  author = {Hobson, Tanner and Yildiz, Orcun and Nicolae, Bogdan and Huang, Jian and Peterka, Tom},
  booktitle = {Proceedings of the 21st IEEE/ACM International Symposium on Cluster, Cloud and
    Internet Computing (CCGrid)},
  keywords = {shared memory, workflow systems, containers},
  location = {Melbourne, Australia},
  publisher = {IEEE/ACM},
  series = {CCGrid '21},
  title = {Shared-Memory Communication for Containerized Workflows},
  year = {2021},
}

@inproceedings{DanielEtAl2020,
  author = {Rosendo, Daniel and Silva, Pedro and Simonin, Matthieu and Costan, Alexandru and
    Antoniu, Gabriel},
  booktitle = {2020 IEEE International Conference on Cluster Computing (CLUSTER)},
  organization = {IEEE},
  pages = {176--186},
  title = {E2clab: Exploring the computing continuum through repeatable, replicable and
    reproducible edge-to-cloud experiments},
  year = {2020},
}

@incollection{Ruttgers2020,
  address = {Frankfurt/Main, Germany},
  author = {R{\"{u}}ttgers, Mario and Koh, Seong-Ryong and Jitsev, Jenia and
    Schr{\"{o}}der, Wolfgang and Lintermann, Andreas},
  booktitle = {High Performance Computing, Proceedings of the 35th International Conference, ISC
    High Performance 2020},
  doi = {10.1007/978-3-030-59851-8_6},
  pages = {81--101},
  publisher = {Springer International Publishing},
  title = {{Prediction of Acoustic Fields Using a Lattice-Boltzmann Method and Deep
Learning}},
  url = {http://link.springer.com/10.1007/978-3-030-59851-8_6},
  year = {2020},
}

% NOTE(review): the volume was empty in the original entry - add it when known.
@article{TsujEtAl2021,
  author = {Miwako Tsuji and William T. C. Kramer and Jean-Christophe Weill and
    Jean-Philippe Nomin{\'e} and Mitsuhisa Sato},
  journal = {The Journal of Supercomputing},
  pages = {1--29},
  publisher = {Springer},
  title = {A new sustained system performance metric for scientific performance evaluation},
  year = {2021},
}

% The doi field holds the bare identifier; the resolver URL is built by the style/template.
@article{ahori-PARCO21-survey,
  author = {Atsushi Hori and Emmanuel Jeannot and George Bosilca and Takahiro Ogura and
    Balazs Gerofi and Jie Yin and Yutaka Ishikawa},
  doi = {10.1016/j.parco.2021.102853},
  issn = {0167-8191},
  journal = {Parallel Computing},
  keywords = {Message Passing Interface (MPI), Survey},
  pages = {102853},
  title = {An international survey on MPI users},
  url = {https://www.sciencedirect.com/science/article/pii/S0167819121000983},
  volume = {108},
  year = {2021},
}

@inproceedings{DanielEtAl2021,
  author = {Rosendo, Daniel and Costan, Alexandru and Antoniu, Gabriel and Simonin, Matthieu and
    Lombardo, Jean-Christophe and Joly, Alexis and Valduriez, Patrick},
  booktitle = {2021 IEEE International Conference on Cluster Computing (CLUSTER)},
  organization = {IEEE},
  pages = {23--34},
  title = {Reproducible performance optimization of complex applications on the edge-to-cloud
    continuum},
  year = {2021},
}

@article{DanielEtAl2022,
  author = {Rosendo, Daniel and Costan, Alexandru and Valduriez, Patrick and Antoniu, Gabriel},
  journal = {Journal of Parallel and Distributed Computing},
  publisher = {Elsevier},
  title = {Distributed intelligence on the Edge-to-Cloud Continuum: A systematic literature
    review},
  year = {2022},
}

@inproceedings{OlayaEtAl2022,
  author = {Olaya, Paula and Ca{\'i}no-Lores, Silvina and Lama, Vanessa and Patel, Ria and
    Rorabaugh, Ariel Keller and Miyashita, Osamu and Tama, Florence and Taufer, Michela},
  booktitle = {2022 IEEE 18th International Conference on e-Science (e-Science)},
  title = {Identifying Structural Properties of Proteins from X-ray
Free Electron Laser Diffraction Patterns},
  year = {2022},
  pages = {21--31},
  doi = {10.1109/eScience55777.2022.00017},
}

@inproceedings{PatelEtAl2022,
  author = {Patel, Ria and Rorabaugh, Ariel Keller and Olaya, Paula and Caino-Lores, Silvina and
    Channing, Georgia and Schuman, Catherine and Miyashita, Osamu and Tama, Florence and
    Taufer, Michela},
  booktitle = {2022 IEEE 18th International Conference on e-Science (e-Science)},
  doi = {10.1109/eScience55777.2022.00052},
  pages = {389--390},
  title = {A Methodology to Generate Efficient Neural Networks for Classification of Scientific
    Datasets},
  year = {2022},
}

@inproceedings{MateevitsiEtAl2023,
  author = {Mateevitsi, Victor A. and Bode, Mathis and Ferrier, Nicola and Fischer, Paul and
    G{\"{o}}bbert, Jens Henrik and Insley, Joseph A. and Lan, Yu-Hsiang and Min, Misun and
    Papka, Michael E. and Patel, Saumil and Rizzi, Silvio and Windgassen, Jonathan},
  booktitle = {Workshops of The International Conference on High Performance Computing, Network,
    Storage, and Analysis (SC-W 2023)},
  doi = {10.1145/3624062.3624159},
  title = {{Scaling Computational Fluid Dynamics: In Situ Visualization of NekRS using SENSEI}},
  year = {2023},
}

% Authors are separated by the keyword "and"; the ampersand in the publisher name is escaped
% for LaTeX. The DOI is available through the doi and url fields.
@article{UnderwoodEtAl2023,
  author = {Robert Underwood and Chunhong Yoon and Ali Gok and Sheng Di and Franck Cappello},
  doi = {10.1080/08940886.2023.2245722},
  journal = {Synchrotron Radiation News},
  number = {4},
  pages = {17--22},
  publisher = {Taylor \& Francis},
  title = {ROIBIN-SZ: Fast and Science-Preserving Compression for Serial Crystallography},
  url = {https://doi.org/10.1080/08940886.2023.2245722},
  volume = {36},
  year = {2023},
}

% NOTE(review): no venue or howpublished information is recorded for this work yet.
@misc{TalukdarEtAl2023,
  author = {Isita Talukdar and Amarjit Singh and Robert Underwood and Kento Sato and Weikuan Yu},
  title = {Integrating TEZip into LibPressio: A Case Study of Integrating a Dynamic Application
    into a Static C Environment},
  year = {2023},
}

@inproceedings{peterka2023lowfive,
  title = {LowFive: In Situ Data
Transport for High-Performance Workflows},
  author = {Peterka, Tom and Morozov, Dmitriy and Nigmetov, Arnur and Yildiz, Orcun and
    Nicolae, Bogdan and Davis, Philip E},
  booktitle = {IPDPS'23: The 37th IEEE International Parallel and Distributed Processing
    Symposium},
  year = {2023},
}

% arXiv preprint; the arXiv identifier is repeated in the eid, pages, doi and eprint fields.
@article{GaikwadEtAl2024,
  archivePrefix = {arXiv},
  author = {{Gaikwad}, Shreyas Sunil and {Krishna Narayanan}, Sri Hari and {Hascoet}, Laurent and
    {Campin}, Jean-Michel and {Pillar}, Helen and {Nguyen}, An and {H{\"u}ckelheim}, Jan and
    {Hovland}, Paul and {Heimbach}, Patrick},
  doi = {10.48550/arXiv.2401.11952},
  eid = {arXiv:2401.11952},
  eprint = {2401.11952},
  journal = {arXiv e-prints},
  keywords = {Physics - Atmospheric and Oceanic Physics},
  month = jan,
  pages = {arXiv:2401.11952},
  primaryClass = {physics.ao-ph},
  title = {{{MITgcm-AD} v2: Open source tangent linear and adjoint modeling framework for the
    oceans and atmosphere enabled by the Automatic Differentiation tool Tapenade}},
  year = {2024},
}