@inproceedings{TrainVerify2025SOSP,
  author    = {Lu, Yunchi and Miao, Youshan and Tan, Cheng and Huang, Peng and Zhu, Yi and Yang, Fan},
  title     = {{TrainVerify}: Equivalence-Based Verification for Distributed {LLM} Training},
  booktitle = {Proceedings of ACM SIGOPS 31st Symposium on Operating Systems Principles},
  series    = {SOSP '25},
  month     = oct,
  year      = {2025},
  address   = {New York, NY, USA},
  location  = {Seoul, Republic of Korea},
  isbn      = {9798400718700},
  pages     = {237--253},
  doi       = {10.1145/3731569.3764850},
  url       = {https://doi.org/10.1145/3731569.3764850},
  publisher = {Association for Computing Machinery},
}
Slow-Fault
One-Size-Fits-None: Understanding and Enhancing Slow-Fault Tolerance in Modern Distributed Systems
@inproceedings{SlowFaultStudy2025NSDI,
  author    = {Lu, Ruiming and Lu, Yunchi and Jiang, Yuxuan and Xue, Guangtao and Huang, Peng},
  title     = {One-Size-Fits-None: Understanding and Enhancing Slow-Fault Tolerance in Modern Distributed Systems},
  booktitle = {Proceedings of the 22nd USENIX Symposium on Networked Systems Design and Implementation},
  series    = {NSDI '25},
  publisher = {USENIX Association},
  location  = {Philadelphia, PA, USA},
  year      = {2025},
  month     = apr,
  pages     = {359--378},
  isbn      = {978-1-939133-46-5},
  url       = {https://www.usenix.org/conference/nsdi25/presentation/lu},
}