Main Papers Reports Books BibTeX Biography Wiki


BibTeX entries for journal articles, conference proceedings papers, and book chapters


@inproceedings{11402485,
  author    = {Yim, Keun Soo},
  title     = {Forecasting Extreme Production Outages in Agile, Big Data and Machine Learning Services: Simple, Two-Parameter Software Reliability Models for Root Cause Insights},
  booktitle = {2025 IEEE International Conference on Big Data (BigData)},
  year      = {2025},
  month     = dec,
  pages     = {3914--3923},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  doi       = {10.1109/BigData66926.2025.11402485},
  url       = {https://doi.ieeecomputersociety.org/10.1109/BigData66926.2025.11402485},
  keywords  = {Computational modeling;Agile software development;Production;Machine learning;Big Data;Predictive models;Data models;Software reliability;Forecasting;Distribution functions},
  abstract  = {Time series forecasting models have diverse real-world applications, yet forecasting sporadic or spiky production outages of cloud computing services remains a challenging target. Traditional one-parameter Software Reliability Growth Models (SRGMs) are inadequate for accurately estimating outages in modern agile software environments for big data computing. This inadequacy stems from the continuous introduction and removal of defects, constantly evolving total defect counts, and non-constant defect detection rates in agile software, further complicated by operational issues like release and deployment challenges contributing to outages. In this paper, we address these limitations by optimizing a fundamental reliability model to estimate aggregated time series of sporadic, spiky production outages of big data machine learning (ML) services. Our analysis utilizes three years of production incident statistics from planet-scale services with billions of users. We conduct a comprehensive curve fitting study across daily, weekly, and monthly aggregated outage counts against a total of 55 standard distribution functions. We empirically demonstrate that two-parameter distributions, specifically beta and wrapped Cauchy, consistently provide the best fit for total production outages across all granularities, highlighting the necessity of multi-parameter models for agile software reliability. Furthermore, by classifying outages by their root cause type (e.g., experiments, ML, and migration) based on manual post-mortem analyses, we find that root cause-specific outages often represent even more extreme events than total outage counts, requiring two- or multi-parameter models for accurate forecasting. This granular understanding is crucial for big data service operators (e.g., on-call engineers) to identify root causes and apply mitigation techniques in a timely manner.},
}

@misc{yim2025evaluationfoundationalmodelstochastic,
  author        = {Keun Soo Yim},
  title         = {Evaluation of a Foundational Model and Stochastic Models for Forecasting Sporadic or Spiky Production Outages of High-Performance Machine Learning Services},
  year          = {2025},
  eprint        = {2507.01067},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2507.01067},
}

@misc{yim2024predictinglikelyvulnerablecodechanges,
  author        = {Yim, Keun Soo},
  title         = {Predicting Likely-Vulnerable Code Changes: Machine Learning-based Vulnerability Protections for {Android Open Source Project}},
  year          = {2024},
  eprint        = {2405.16655},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CR},
  url           = {https://arxiv.org/abs/2405.16655},
}

@misc{yim2024taskorientedqueriesbenchmarktoqb,
  author        = {Yim, Keun Soo},
  title         = {The Task-oriented Queries Benchmark ({ToQB})},
  year          = {2024},
  eprint        = {2406.02943},
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR},
  url           = {https://arxiv.org/abs/2406.02943},
}

@incollection{Yim2023assessment,
  author    = {Yim, Keun Soo},
  editor    = {Wang, Long and Pattabiraman, Karthik and Di Martino, Catello and Athreya, Arjun and Bagchi, Saurabh},
  title     = {Assessment of Security Defense of Native Programs Against Software Faults},
  booktitle = {System Dependability and Analytics: Approaching System Dependability from Data, System and Analytics Perspectives},
  year      = {2023},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {69--98},
  isbn      = {978-3-031-02063-6},
  doi       = {10.1007/978-3-031-02063-6_5},
  url       = {https://doi.org/10.1007/978-3-031-02063-6_5},
}

  
@article{YIHB2019TECS,
  author     = {Yim, Keun Soo and Malchev, Iliyan and Hsieh, Andrew and Burke, Dave},
  title      = {Treble: Fast Software Updates by Creating an Equilibrium in an Active Software Ecosystem of Globally Distributed Stakeholders},
  journal    = {ACM Transactions on Embedded Computing Systems},
  year       = {2019},
  month      = oct,
  volume     = {18},
  number     = {5s},
  articleno  = {104},
  numpages   = {23},
  issue_date = {October 2019},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {1539-9087},
  doi        = {10.1145/3358237},
  url        = {https://doi.org/10.1145/3358237},
}

@inproceedings{yim2016bigdatamonitoring,
  author       = {Yim, Keun Soo},
  title        = {Evaluation metrics of reliability monitoring rules of a big data service},
  booktitle    = {Proceedings of the IEEE International Symposium on Software Reliability Engineering (ISSRE)},
  year         = {2016},
  pages        = {376--387},
  organization = {IEEE},
}

@inproceedings{yim2016rowhammerattack,
  author       = {Yim, Keun Soo},
  title        = {The rowhammer attack injection methodology},
  booktitle    = {Proceedings of the IEEE Symposium on Reliable Distributed Systems (SRDS)},
  year         = {2016},
  pages        = {1--10},
  organization = {IEEE},
}

@inproceedings{yim2014nbody,
  author       = {Yim, Keun Soo},
  title        = {Characterization of impact of transient faults and detection of data corruption errors in large-scale {N-body} programs using computational accelerators},
  booktitle    = {Proceedings of the IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year         = {2014},
  pages        = {458--467},
  organization = {IEEE},
}

@phdthesis{yim2013phd,
  author = {Yim, Keun Soo},
  title  = {From Experiment To Design -- Fault Characterization and Detection in Parallel Computer Systems Using Computational Accelerators},
  school = {University of Illinois at Urbana-Champaign},
  year   = {2013},
}

@inproceedings{yim2013pluggablewatchdog,
  author       = {Yim, Keun Soo and Kalbarczyk, Z.},
  title        = {Pluggable watchdog: Transparent failure detection for {MPI} programs},
  booktitle    = {Proceedings of the IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year         = {2013},
  pages        = {489--500},
  organization = {IEEE},
}

@inproceedings{yim2011hauberk,
  author       = {Yim, Keun Soo and Pham, C. and Saleheen, M. and Kalbarczyk, Z.},
  title        = {Hauberk: Lightweight silent data corruption error detector for {GPGPU}},
  booktitle    = {Proceedings of the IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year         = {2011},
  pages        = {287--300},
  organization = {IEEE},
}

@inproceedings{yim2010dynamicmemory,
  author       = {Yim, Keun Soo and Kalbarczyk, Z.},
  title        = {Measurement-based analysis of fault and error sensitivities of dynamic memory},
  booktitle    = {Proceedings of the IEEE International Conference on Dependable Systems and Networks (DSN)},
  year         = {2010},
  pages        = {431--436},
  organization = {IEEE},
}

@inproceedings{yim2009longlatency,
  author       = {Yim, Keun Soo and Kalbarczyk, Z.},
  title        = {Quantitative analysis of long latency failure in system software},
  booktitle    = {Proceedings of the IEEE Pacific Rim International Symposium on Dependable Computing (PRDC)},
  year         = {2009},
  pages        = {23--30},
  organization = {IEEE},
}

@inproceedings{yim2008synergyfs,
  author    = {Yim, Keun Soo and Son, J. C.},
  title     = {{SynergyFS}: A stackable file system creating synergies between heterogeneous storage devices},
  booktitle = {Proceedings of the Ottawa Linux Symposium (OLS)},
  year      = {2008},
  pages     = {255--259},
}

@article{yim2006svm,
  author    = {Yim, Keun Soo and Lee, J. D. and Park, J. and Im, C. and Yoo, J.-J. and Ryu, Y.},
  title     = {A software reproduction of virtual memory for deeply embedded systems},
  journal   = {Lecture Notes in Computer Science (LNCS)},
  year      = {2006},
  volume    = {3980},
  pages     = {1000--1009},
  publisher = {Springer-Verlag},
}

@article{yim2006cata,
  author    = {Yim, Keun Soo and Ryu, L.-z. and Han, Y.},
  title     = {{CATA}: A garbage collection scheme for flash memory file systems},
  journal   = {Lecture Notes in Computer Science (LNCS)},
  year      = {2006},
  volume    = {4159},
  pages     = {103--112},
  publisher = {Springer-Verlag},
}

@inproceedings{yim2006rtcsa,
  author       = {Yim, Keun Soo and Yoo, J.-J. and Lee, J. D. and Kim, J.},
  title        = {Operating system support for procedural abstraction in embedded systems},
  booktitle    = {Proceedings of the IEEE International Conference on Embedded and Real-Time Computing Systems and Applications (RTCSA)},
  year         = {2006},
  pages        = {378--382},
  organization = {IEEE},
}

@article{yim2005nvmemory,
  author  = {Yim, Keun Soo},
  title   = {A novel memory hierarchy for flash memory based storage systems},
  journal = {Journal of Semiconductor Technology and Science (JSTS)},
  year    = {2005},
  volume  = {5},
  number  = {4},
  pages   = {69--76},
}

@inproceedings{yim2005fastboot,
  author       = {Yim, Keun Soo and Kim, J. and Koh, K.},
  title        = {A fast start-up technique for flash memory based computing systems},
  booktitle    = {Proceedings of the ACM Symposium on Applied Computing (SAC)},
  year         = {2005},
  pages        = {852--858},
  organization = {ACM},
}

@article{yim2004fcl,
  author  = {Yim, Keun Soo and Bahn, H. and Koh, K.},
  title   = {A flash compression layer for {SmartMedia Card} systems},
  journal = {IEEE Transactions on Consumer Electronics (TCE)},
  year    = {2004},
  volume  = {50},
  number  = {1},
  pages   = {192--197},
}

@inproceedings{yim2004reliabletransport,
  author    = {Yim, Keun Soo and Kim, J. and Koh, K.},
  title     = {An energy-efficient reliable transport for wireless sensor networks},
  booktitle = {Lecture Notes in Computer Science (LNCS)},
  year      = {2004},
  volume    = {3090},
  pages     = {54--64},
  publisher = {Springer-Verlag},
}

@inproceedings{yim2004energyrouting,
  author    = {Yim, Keun Soo and Kim, J. and Koh, K.},
  title     = {An energy-efficient routing and reporting scheme to exploit data similarities in wireless sensor networks},
  booktitle = {Lecture Notes in Computer Science (LNCS)},
  year      = {2004},
  volume    = {3207},
  pages     = {515--527},
  publisher = {Springer-Verlag},
}

@inproceedings{yim2004compressedcache,
  author    = {Yim, Keun Soo and Lee, J.-S. and Kim, J. and Kim, S.-D. and Koh, K.},
  title     = {A space-efficient on-chip compressed cache organization for high performance computing},
  booktitle = {Lecture Notes in Computer Science (LNCS)},
  year      = {2004},
  volume    = {3358},
  pages     = {952--964},
  publisher = {Springer-Verlag},
}

@inproceedings{yim2004nicnet,
  author    = {Yim, Keun Soo and Cha, H. and Koh, K.},
  title     = {{NIC-NET}: A host-independent network solution for high-end network servers},
  booktitle = {Lecture Notes in Computer Science (LNCS)},
  year      = {2004},
  volume    = {3320},
  pages     = {401--405},
  publisher = {Springer-Verlag},
}

@inproceedings{yim2004isocc,
  author    = {Yim, Keun Soo and Lee, S. and Koh, K.},
  title     = {An energy-efficient compression algorithm for wireless communication},
  booktitle = {Proceedings of the International SoC Design Conference (ISOCC)},
  year      = {2004},
  pages     = {578--581},
}

@inproceedings{yim2003pdpta,
  author    = {Yim, Keun Soo and Kim, J. and Koh, K.},
  title     = {Performance analysis of on-chip cache and main memory compression systems for high-end parallel computers},
  booktitle = {Proceedings of the International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA)},
  year      = {2003},
  pages     = {469--475},
}


Return to the top.