Dissertation: Opportunistic Memory Systems in
Presence of Hardware Variability
Committee: Puneet Gupta, Lara Dolecek, Mani Srivastava, Glenn Reinman
% Ph.D. dissertation record (school, DOI, and eScholarship URL given below).
@phdthesis{Gottscho2017,
  author   = {Gottscho, Mark William},
  title    = {{Opportunistic Memory Systems in Presence of Hardware Variability}},
  school   = {University of California, Los Angeles},
  type     = {Ph.D. Dissertation},
  year     = {2017},
  doi      = {10.13140/RG.2.2.13446.98885},
  url      = {http://escholarship.org/uc/item/85c1r1q9},
  keywords = {CAD,DRAM,EDA,IoT,Linux,SRAM,approximate computing,benchmarks,caches,cloud,computer architecture,electronic design automation,embedded systems,error-correcting codes,hardware/software interface,memory systems,operating systems,reliability,resiliency,scratchpads,supercomputers,tools,variation-aware}
}
Project: ViPZonE: Exploiting DRAM Power Variability for Energy Savings in Linux x86-64
Advisor: Puneet Gupta
Raghu Prabhakar, Ram Sivaramakrishnan, Darshan Gandhi, Yun Du, Mingran Wang, Xiangyu Song, Kejie Zhang, Tianren Gao, Angela Wang, Karen Li, Joshua Brot, Calvin Leung, Tuowen Zhao, Mark Gottscho, Edison Chen, Kaizhao Liang, Swayambhoo Jain, Urmish Thakker, Kevin J. Brown, and Kunle Olukotun. Composition of Experts on the SN40L Reconfigurable Dataflow Unit. IEEE Micro preprint. August 2024.
% IEEE Micro article. Names are separated strictly by " and "; a stray comma
% after a given name would be parsed as a "Last, Jr, First" form.
@article{RPrabhakarMicro2024,
  author  = {Prabhakar, Raghu and Sivaramakrishnan, Ram and Gandhi, Darshan and
             Du, Yun and Wang, Mingran and Song, Xiangyu and Zhang, Kejie and
             Gao, Tianren and Wang, Angela and Li, Karen and Brot, Joshua and
             Leung, Calvin and Zhao, Tuowen and Gottscho, Mark and
             Chen, Edison and Liang, Kaizhao and Jain, Swayambhoo and
             Thakker, Urmish and Brown, Kevin J. and Olukotun, Kunle},
  title   = {{Composition of Experts on the SN40L Reconfigurable Dataflow Unit}},
  journal = {IEEE Micro},
  year    = {2024},
  doi     = {10.1109/MM.2024.3428548},
}
Raghu Prabhakar, Ram Sivaramakrishnan, Darshan Gandhi, Yun Du, Mingran Wang, Xiangyu Song, Kejie Zhang, Tianren Gao, Angela Wang, Karen Li, Yongning Sheng, Joshua Brot, Denis Sokolov, Apurv Vivek, Calvin Leung, Arjun Sabnis, Jiayu Bai, Tuowen Zhao, Mark Gottscho, David Jackson, Mark Luttrell, Manish K. Shah, Edison Chen, Kaizhao Liang, Swayambhoo Jain, Urmish Thakker, Dawei Huang, Sumti Jairath, Kevin J. Brown, and Kunle Olukotun. SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts. Arxiv preprint. May 2024.
% arXiv preprint: stored as a misc entry with eprint fields rather than as an
% article without a journal. The eprint number is the suffix of the DOI.
@misc{RPrabhakarArxiv2024,
  author        = {Prabhakar, Raghu and Sivaramakrishnan, Ram and Gandhi, Darshan and
                   Du, Yun and Wang, Mingran and Song, Xiangyu and Zhang, Kejie and
                   Gao, Tianren and Wang, Angela and Li, Karen and Sheng, Yongning and
                   Brot, Joshua and Sokolov, Denis and Vivek, Apurv and Leung, Calvin and
                   Sabnis, Arjun and Bai, Jiayu and Zhao, Tuowen and Gottscho, Mark and
                   Jackson, David and Luttrell, Mark and Shah, Manish K. and
                   Chen, Edison and Liang, Kaizhao and Jain, Swayambhoo and
                   Thakker, Urmish and Huang, Dawei and Jairath, Sumti and
                   Brown, Kevin J. and Olukotun, Kunle},
  title         = {{SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts}},
  year          = {2024},
  eprint        = {2405.07518},
  archiveprefix = {arXiv},
  doi           = {10.48550/arXiv.2405.07518},
}
Norman P. Jouppi, Doe Hyun Yoon, Matthew Ashcraft, Mark Gottscho, Thomas B. Jablin, George Kurian, James Laudon, Sheng Li, Peter Ma, Xiaoyu Ma, Thomas Norrie, Nishant Patil, Sushma Prasad, Cliff Young, Zongwei Zhou, and David Patterson. Ten Lessons from Three Generations Shaped Google's TPUv4i: Industrial Product. In ACM/IEEE International Symposium on Computer Architecture (ISCA), in Valencia, Spain (virtual/online). June 2021.
% Conference paper: the venue belongs in booktitle of an inproceedings entry
% (there is no standard field named "inproceedings").
@inproceedings{NJouppiISCA2021,
  author    = {Jouppi, Norman P. and Yoon, Doe Hyun and Ashcraft, Matthew and
               Gottscho, Mark and Jablin, Thomas B. and Kurian, George and
               Laudon, James and Li, Sheng and Ma, Peter and Ma, Xiaoyu and
               Norrie, Thomas and Patil, Nishant and Prasad, Sushma and
               Young, Cliff and Zhou, Zongwei and Patterson, David},
  title     = {{Ten Lessons from Three Generations Shaped Google's TPUv4i: Industrial Product}},
  booktitle = {ACM/IEEE International Symposium on Computer Architecture (ISCA)},
  year      = {2021},
  doi       = {10.1109/ISCA52012.2021.00010},
}
Clayton Schoeny, Frederic Sala, Mark Gottscho, Irina Alam, Puneet Gupta, and Lara Dolecek. Context-Aware Resiliency: Unequal Message Protection for Random-Access Memories. In IEEE Transactions on Information Theory, Vol. 65, issue 10, pp. 6146-6159. May 2019.
% Journal article: venue in journal, issue in number, a single year field,
% and an en-dash (double hyphen) page range.
@article{SchoenyTransIT2019,
  author  = {Schoeny, Clayton and Sala, Frederic and Gottscho, Mark and
             Alam, Irina and Gupta, Puneet and Dolecek, Lara},
  title   = {{Context-Aware Resiliency: Unequal Message Protection for Random-Access Memories}},
  journal = {IEEE Transactions on Information Theory},
  volume  = {65},
  number  = {10},
  pages   = {6146--6159},
  year    = {2019},
  doi     = {10.1109/TIT.2019.2918209}
}
Clayton Schoeny, Irina Alam, Mark Gottscho, Puneet Gupta, and Lara Dolecek. Error Correction and Detection for Computing Memories Using System Side Information. In IEEE Information Theory Workshop (ITW), in Guangzhou, China. November 2018.
% Workshop paper: the venue belongs in booktitle of an inproceedings entry
% (there is no standard field named "inproceedings").
@inproceedings{CSchoenyITW2018,
  author    = {Schoeny, Clayton and Alam, Irina and Gottscho, Mark and
               Gupta, Puneet and Dolecek, Lara},
  title     = {{Error Correction and Detection for Computing Memories Using System Side Information}},
  booktitle = {IEEE Information Theory Workshop (ITW)},
  year      = {2018},
  doi       = {10.1109/ITW.2018.8613473},
}
Mark Gottscho, Clayton Schoeny, Lara Dolecek, and Puneet Gupta. Software-Defined ECC: Heuristic Recovery from Uncorrectable Memory Errors. UCLA Technical Report, published online October 2017.
% Technical report: techreport requires an institution rather than a journal.
@techreport{GottschoUCLATR2017,
  author      = {Gottscho, Mark and Schoeny, Clayton and Dolecek, Lara and Gupta, Puneet},
  title       = {{Software-Defined ECC: Heuristic Recovery from Uncorrectable Memory Errors}},
  institution = {University of California, Los Angeles},
  year        = {2017},
}
Mark Gottscho, Irina Alam, Clayton Schoeny, Lara Dolecek, and Puneet Gupta. Low-Cost Memory Fault Tolerance for IoT Devices. In ACM Transactions on Embedded Computing Systems (TECS), online July 2017. TECS ESWEEK Special Issue, appeared at the ACM/IEEE International Conference on Compilers, Architecture, and System Synthesis (CASES) in Seoul, South Korea, October 2017.
Best Paper Award
% Journal article: article entries take the venue in journal, not booktitle.
@article{GottschoESWEEK2017,
  author  = {Gottscho, Mark and Alam, Irina and Schoeny, Clayton and
             Dolecek, Lara and Gupta, Puneet},
  title   = {{Low-Cost Memory Fault Tolerance for IoT Devices}},
  journal = {ACM Transactions on Embedded Computing Systems (TECS) ESWEEK Special Issue},
  volume  = {16},
  number  = {5s},
  year    = {2017},
  address = {Seoul, South Korea},
  doi     = {10.1145/3126534},
}
Mark Gottscho, Mohammed Shoaib, Sriram Govindan, Bikash Sharma, Di Wang, and Puneet Gupta. Measuring the Impact of Memory Errors on Application Performance. In IEEE Computer Architecture Letters (CAL), Vol. 16, No. 1, pp. 51-55, online August 2016, print 2017.
% Journal article in IEEE Computer Architecture Letters, vol. 16, no. 1.
@article{GottschoCAL2016,
  author  = {Gottscho, Mark and Shoaib, Mohammed and Govindan, Sriram and Sharma, Bikash and Wang, Di and Gupta, Puneet},
  title   = {{Measuring the Impact of Memory Errors on Application Performance}},
  journal = {IEEE Computer Architecture Letters (CAL)},
  volume  = {16},
  number  = {1},
  pages   = {51--55},
  year    = {2016},
  doi     = {10.1109/LCA.2016.2599513},
}
Mark Gottscho, Clayton Schoeny, Lara Dolecek, and Puneet Gupta. Software-Defined Error-Correcting Codes. In Proceedings of the IEEE/IFIP International Conference on Dependable Systems and Networks Workshops (DSN-W), pp. 276-282, Best of SELSE Special Session. Toulouse, France, June 2016.
Best Paper Award (SELSE)
% Workshop paper in the DSN-W proceedings, pages 276--282.
@inproceedings{GottschoDSNW2016,
  author    = {Gottscho, Mark and Schoeny, Clayton and Dolecek, Lara and Gupta, Puneet},
  title     = {{Software-Defined Error-Correcting Codes}},
  booktitle = {IEEE/IFIP International Conference on Dependable Systems and Networks Workshops (DSN-W)},
  pages     = {276--282},
  year      = {2016},
  address   = {Toulouse, France},
  doi       = {10.1109/DSN-W.2016.67}
}
Mark Gottscho, Sriram Govindan, Mohammed Shoaib, Bikash Sharma, and Puneet Gupta. X-Mem: A Cross-Platform and Extensible Memory Characterization Tool for the Cloud. In Proceedings of the IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 263-273, April 2016.
% Conference paper: venue recorded in booktitle of an inproceedings entry.
% The citation key is kept exactly as-is so existing citations keep resolving.
@inproceedings{GottschoISPASS016,
  author    = {Gottscho, Mark and Govindan, Sriram and Shoaib, Mohammed and
               Sharma, Bikash and Gupta, Puneet},
  title     = {{X-Mem: A Cross-Platform and Extensible Memory Characterization Tool for the Cloud}},
  booktitle = {IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  pages     = {263--273},
  year      = {2016},
  doi       = {10.1109/ISPASS.2016.7482101}
}
Qixiang Zhang, Liangzhen Lai, Mark Gottscho, and Puneet Gupta. Multi-Story Power Distribution Networks for GPUs. In IEEE Design, Automation, and Test in Europe, pp. 451-456. Dresden, Germany, March 2016.
% Conference paper in the DATE proceedings; the page range must be ascending.
@inproceedings{ZhangDATE2016,
  author    = {Zhang, Qixiang and Lai, Liangzhen and Gottscho, Mark and Gupta, Puneet},
  title     = {{Multi-Story Power Distribution Networks for GPUs}},
  booktitle = {IEEE Design, Automation, and Test in Europe (DATE)},
  pages     = {451--456},
  year      = {2016},
  address   = {Dresden, Germany}
}
Mark Gottscho, Abbas BanaiyanMofrad, Nikil Dutt, Alex Nicolau, and Puneet Gupta. DPCS: Dynamic Power/Capacity Scaling for SRAM Caches in the Nanoscale Era. In ACM Transactions on Architecture and Code Optimization (TACO), Vol. 12, No. 3, Article 27. Online August 2015, in print October 2015.
% Journal article in ACM TACO, vol. 12, no. 3.
@article{GottschoTACO2015,
  author  = {Gottscho, Mark and BanaiyanMofrad, Abbas and Dutt, Nikil and Nicolau, Alex and Gupta, Puneet},
  title   = {{DPCS: Dynamic Power/Capacity Scaling for SRAM Caches in the Nanoscale Era}},
  journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
  volume  = {12},
  number  = {3},
  year    = {2015},
  doi     = {10.1145/2792982}
}
Lucas Wanner, Liangzhen Lai, Abbas Rahimi, Mark Gottscho, Pietro Mercati, Chu-Hsiang Huang, Frederic Sala, Yuvraj Agarwal, Lara Dolecek, Nikil Dutt, Puneet Gupta, Rajesh Gupta, Ranjit Jhala, Rakesh Kumar, Sorin Lerner, Subhasish Mitra, Alexandru Nicolau, Tajana Simunic Rosing, Mani B. Srivastava, Steve Swanson, Dennis Sylvester, and Yuanyuan Zhou. NSF Expedition on Variability-Aware Software: Recent Results and Contributions. In De Gruyter Information Technology (it), Vol. 57, No. 3. Invited paper. June 2015.
% Journal article, vol. 57, no. 3 (2015). Every field is comma-terminated and
% appears once; no page range is recorded for this entry.
@article{Wannerit2015,
  author  = {Wanner, Lucas and Lai, Liangzhen and Rahimi, Abbas and Gottscho, Mark and
             Mercati, Pietro and Huang, Chu-Hsiang and Sala, Frederic and
             Agarwal, Yuvraj and Dolecek, Lara and Dutt, Nikil and Gupta, Puneet and
             Gupta, Rajesh and Jhala, Ranjit and Kumar, Rakesh and Lerner, Sorin and
             Mitra, Subhasish and Nicolau, Alexandru and {Simunic Rosing}, Tajana and
             Srivastava, Mani B. and Swanson, Steve and Sylvester, Dennis and
             Zhou, Yuanyuan},
  title   = {{NSF Expedition on Variability-Aware Software: Recent Results and Contributions}},
  journal = {De Gruyter it - Information Technology},
  volume  = {57},
  number  = {3},
  year    = {2015},
  doi     = {10.1515/itit-2014-1085}
}
Salma Elmalaki, Mark Gottscho, Puneet Gupta, and Mani Srivastava. A Case for Battery Charging-Aware Power Management and Deferrable Task Scheduling. In USENIX Workshop on Power-Aware Computing and Systems (HotPower), 6 pages. Broomfield, Colorado, October 2014.
% Workshop paper; author order follows the accompanying citation text
% (Elmalaki, Gottscho, Gupta, Srivastava).
@inproceedings{ElmalakiHotPower2014,
  author    = {Elmalaki, Salma and Gottscho, Mark and Gupta, Puneet and Srivastava, Mani B.},
  title     = {{A Case for Battery Charging-Aware Power Management and Deferrable Task Scheduling}},
  booktitle = {USENIX Workshop on Power-Aware Computing and Systems (HotPower)},
  year      = {2014}
}
Mark Gottscho, Abbas BanaiyanMofrad, Nikil Dutt, Alex Nicolau, and Puneet Gupta. Power / Capacity Scaling: Energy Savings With Simple Fault-Tolerant Caches. In Proceedings of the ACM/IEEE Design Automation Conference (DAC), San Francisco, California. June 2014.
% Conference paper in the DAC proceedings (2014).
@inproceedings{GottschoDAC2014,
  author    = {Gottscho, Mark and BanaiyanMofrad, Abbas and Dutt, Nikil and Nicolau, Alex and Gupta, Puneet},
  title     = {{Power / Capacity Scaling: Energy Savings With Simple Fault-Tolerant Caches}},
  booktitle = {Proceedings of the ACM/IEEE Design Automation Conference (DAC)},
  year      = {2014},
  doi       = {10.1145/2593069.2593184}
}
Nikil Dutt, Alex Nicolau, Puneet Gupta, Mark Gottscho, and Majid Shoushtari. Multi-Layer Memory Resiliency. In Proceedings of the ACM/IEEE Design Automation Conference (DAC), San Francisco, California. June 2014.
Invited paper
% Conference paper in the DAC proceedings (2014).
@inproceedings{DuttDAC2014,
  author    = {Dutt, Nikil and Nicolau, Alex and Gupta, Puneet and Gottscho, Mark and Shoushtari, Majid},
  title     = {{Multi-Layer Memory Resiliency}},
  booktitle = {Proceedings of the ACM/IEEE Design Automation Conference (DAC)},
  year      = {2014},
  doi       = {10.1145/2593069.2596684}
}
Mark Gottscho, Luis A.D. Bathen, Nikil Dutt, Alex Nicolau, and Puneet Gupta. ViPZonE: Hardware Power Variability-Aware Virtual Memory Management for Energy Savings. In IEEE Transactions on Computers, Vol. 64, Issue 5, pp. 1483-1496. Published online June 2014, print May 2015.
% Journal article; volume, issue, and page range taken from the accompanying
% citation text (Vol. 64, Issue 5, pp. 1483-1496).
@article{GottschoTC2015,
  author  = {Gottscho, Mark and Bathen, Luis A. D. and Dutt, Nikil and
             Nicolau, Alex and Gupta, Puneet},
  title   = {{ViPZonE: Hardware Power Variability-Aware Virtual Memory Management for Energy Savings}},
  journal = {IEEE Transactions on Computers (TC)},
  volume  = {64},
  number  = {5},
  pages   = {1483--1496},
  year    = {2015},
  doi     = {10.1109/TC.2014.2329675}
}
Nikil Dutt, Puneet Gupta, Alex Nicolau, Luis A. D. Bathen, and Mark Gottscho. Variability-Aware Memory Management for Nanoscale Computing. In Proceedings of the ACM/IEEE Asia and South Pacific Design Automation Conference (ASP-DAC), pp. 125-132. Yokohama, Japan, January 2013.
Invited paper
% Conference paper; page range taken from the accompanying citation text
% (pp. 125-132).
@inproceedings{DuttASPDAC2013,
  author    = {Dutt, Nikil and Gupta, Puneet and Nicolau, Alex and
               Bathen, Luis A. D. and Gottscho, Mark},
  title     = {{Variability-Aware Memory Management for Nanoscale Computing}},
  booktitle = {Proceedings of the ACM/IEEE Asia and South Pacific Design Automation Conference (ASP-DAC)},
  pages     = {125--132},
  year      = {2013},
  doi       = {10.1109/ASPDAC.2013.6509584}
}
Luis A.D. Bathen, Mark Gottscho, Nikil Dutt, Alex Nicolau, and Puneet Gupta. ViPZonE: OS-Level Memory Variability-Driven Physical Address Zoning for Energy Savings. In Proceedings of the ACM International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS), pp. 33-42. Tampere, Finland, October 2012.
% Conference paper; page range taken from the accompanying citation text
% (pp. 33-42).
@inproceedings{BathenESWEEK2012,
  author    = {Bathen, Luis A. D. and Gottscho, Mark and Dutt, Nikil and
               Nicolau, Alex and Gupta, Puneet},
  title     = {{ViPZonE: OS-Level Memory Variability-Driven Physical Address Zoning for Energy Savings}},
  booktitle = {Proceedings of the ACM International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS)},
  pages     = {33--42},
  year      = {2012},
  doi       = {10.1145/2380445.2380457}
}
Mark Gottscho, Abde Ali Kagalwalla, and Puneet Gupta. Power Variability in Contemporary DRAMs. In IEEE Embedded Systems Letters (ESL), Vol. 4, No. 2. June 2012.
% Journal article; volume and issue taken from the accompanying citation text
% (Vol. 4, No. 2).
@article{GottschoESL2012,
  author  = {Gottscho, Mark and Kagalwalla, Abde Ali and Gupta, Puneet},
  title   = {{Power Variability in Contemporary DRAMs}},
  journal = {IEEE Embedded Systems Letters (ESL)},
  volume  = {4},
  number  = {2},
  year    = {2012},
  doi     = {10.1109/LES.2012.2192414}
}
Mark William Gottscho, Matthew William Ashcraft, Thomas Norrie, and Oliver Bowen. Direct Memory Access Architecture with Multi-Level Multi-Striding. Current assignee: Google LLC. US Patent App. 16/838,796, filed Apr. 2, 2020, published Aug. 19, 2021.
Mark W. Gottscho, Mohammed Shoaib, Sriram Govindan, Mark Santaniello, Bikash Sharma, J. Michael Andrewartha, Jie Liu, and Badriddine Khessib. Opportunistic Memory Tuning for Dynamic Workloads. Current assignee: Microsoft Technology Licensing, LLC. US Patent App. 15/055,466, filed Feb. 26, 2016, published Aug. 31, 2017.
Working on hardware and the mission to develop artificial general intelligence (AGI) that benefits all of humanity.
Moonlighted as technical advisor to ChipStack, a post-seed startup that is developing revolutionary AI-based tools for chip design and verification.
Founded and managed the Operators Team that consisted of ten compiler engineers. The team was responsible for formalizing internal MLIR dialects and developing high-performance custom dataflow kernels. Technical lead for the next-generation Reconfigurable Dataflow Unit (RDU); drove hardware/software codesign activities. Architected novel compiler support for mixed precision; led the team to land on a short timeline and enabled improved accuracy for customers. Architected new hardware features to accelerate computer vision models. Bootstrapped a new docs-as-code methodology and created a new dataflow assembly language to enable next-chip architecture spec development. Proposed and implemented a migration of the compiler build from CMake to Bazel to enable >10X speedups for developer workflows. Initiated a collaboration with OpenXLA. Up-leveled the compiler organization by organizing regular knowledge shares and supporting individuals' careers at junior and senior levels. Multiple patents pending.
Memory system/NoC, host interface, and inter-chip interconnect architecture, microarchitecture, logic design, and cross-functional technical leadership for multiple generations of Google's Tensor Processing Unit (TPU) datacenter AI accelerator chips (TPUv4i, TPUv4, TPUv5e, TPUv5p, and Trillium). Co-founded the Chip Development Kit (CDK) project which enabled large improvements in development velocity, microarchitecture quality, and verification quality using composable and formally verified libraries along with code generation tools. Pioneered a "correct-by-construction" methodology that has been silicon proven. Led a team of ~15 engineers.
Worked in the Sensing and Energy Research Group (SERG) in collaboration with Microsoft Cloud Server Infrastructure. Performance evaluation of memory system fault tolerance schemes for the cloud. Measured and modeled the impact of corrected memory errors on application performance.
Worked in the Sensing and Energy Research Group (SERG) in collaboration with Microsoft Cloud Server Infrastructure. Developed X-Mem, an Extensible Memory characterization tool, for exploring the impact of system configurations on cloud server performance.
RTL design and functional verification of soft IP core to identify malformed PCI Express transaction-layer packets.
Conducted lab and field tests of GPS and wireless sensor system upgrades for research aircraft.
Designed, integrated, and documented custom hardware panels for PCBs and wiring harnesses using 3D CAD.
General I.T. duties, including computer repair and retirement as well as hardware and software troubleshooting.
For my key design contributions to the TPUv4 and TPUv4i memory system architecture and on-chip interconnect (OCI).
For my CASES paper titled "Low-Cost Memory Fault Tolerance for IoT Devices."
This campus-wide fellowship is open to all PhD candidates nearing the completion of their dissertation. The award includes a stipend up to $20,000 over three academic quarters as well as potentially-waived tuition and fees. Received during the 2016-2017 academic year.
Won this fellowship for my proposal titled "Software-Defined Error-Correcting Codes," which was co-authored with Clayton Schoeny. This prestigious fellowship received 129 applications that year from top-18 US universities, from which 34 finalists (26.4%) and eight winners (6.2%) were selected. The award included $100,000 to support both of us for one academic year and also included mentorship by Qualcomm researchers and engineers. Department news article here.
For my SELSE paper titled "Software-Defined Error-Correcting Codes."
Written as an employee of SambaNova. This post attracted significant social media and press attention.
Written as an employee of SambaNova.