% Conference paper published in the CARLA 2016 revised selected papers
% (series: Communications in Computer and Information Science).
% The opaque citation key is kept unchanged so existing citations keep resolving.
% Acronyms are braced so that sentence-casing styles do not lowercase them;
% names use the unambiguous "Last, First" form.
@inproceedings{b9e170f45c3a47aaaa49f5048ad04e95,
  author    = {Meneses, Esteban},
  title     = {Reducing the overhead of message logging in fault-tolerant {HPC} applications},
  booktitle = {High Performance Computing -- 3rd Latin American Conference, {CARLA} 2016, Revised Selected Papers},
  editor    = {Barrios Hernandez, Carlos Jaime and Gitler, Isidoro and Klapp, Jaime},
  series    = {Communications in Computer and Information Science},
  publisher = {Springer Verlag},
  year      = {2017},
  pages     = {204--218},
  doi       = {10.1007/978-3-319-57972-6_15},
  isbn      = {9783319579719},
  language  = {english},
  keywords  = {Fault tolerance, Message logging, Resilience},
  note      = {Publisher Copyright: {\textcopyright} Springer International Publishing AG 2017.; 3rd Latin American Conference on High Performance Computing, CARLA 2016 ; Conference date: 29-08-2016 Through 02-09-2016},
  abstract  = {With the exascale era within reach, the high performance computing community is preparing to embrace the challenges associated with extreme-scale systems. Resilience raises as one of the major hurdles in making those systems usable for the advance of science and industry. Message logging is a well-known strategy to provide fault tolerance, one that is promising due to its ability to avoid global restart. However, message-logging protocols may suffer considerable overhead if implemented for the general case. This paper introduces a new message-logging protocol that leverages the benefits of a flexible parallel programming paradigm. We evaluate the protocol using a particular type of applications and demonstrate it can keep a low performance penalization when scaling up to 128,000 cores.},
}