Stardust: Compiling Sparse Tensor Algebra to a Reconfigurable Dataflow Architecture
Abstract
We introduce Stardust, a compiler from sparse tensor algebra languages to a sparse reconfigurable dataflow architecture (RDA) via a parallel-patterns programming model. Stardust lets performance engineers specify the placement of data into memories separately from the placement of computation onto compute units. Users first schedule data placement onto an abstract memory model, and then Stardust binds that data to complex, on-chip physical memories. With guidance from user schedules, Stardust binds computation using these on-chip data structures to the appropriate parallel patterns. Through cycle-accurate simulation, we show that Stardust generates nine more tensor algebra kernels than the original Capstan sparse RDA work. The generated kernels perform, on average, 138x better than generated CPU kernels and 41x better than generated GPU kernels.
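As a rough illustration of the scheduling split the abstract describes, the Python sketch below models the two steps: the user first places tensors onto an abstract memory model, and the compiler then binds that data to physical on-chip memories. All names here (Schedule, place, bind, the PMU/PCU resource tags) are hypothetical and chosen for illustration; they do not reflect Stardust's actual API.

# A minimal, hypothetical sketch (not Stardust's real API) of scheduling
# data placement separately from its binding to physical memories.
from dataclasses import dataclass, field

@dataclass
class Schedule:
    # tensor name -> abstract memory (the user-facing memory model)
    placements: dict = field(default_factory=dict)
    # tensor name -> physical on-chip memory chosen by the compiler
    bindings: dict = field(default_factory=dict)

    def place(self, tensor: str, abstract_mem: str) -> "Schedule":
        """Step 1: the user schedules data onto the abstract memory model."""
        self.placements[tensor] = abstract_mem
        return self

    def bind(self, physical_map: dict) -> "Schedule":
        """Step 2: the compiler binds each placed tensor to a physical memory."""
        for tensor, mem in self.placements.items():
            self.bindings[tensor] = physical_map[mem]
        return self

# Example: place a sparse matrix's values on-chip and a dense vector in
# registers, then bind both to (illustrative) Capstan-style resources.
sched = (Schedule()
         .place("B_vals", "OnChipSRAM")
         .place("c", "Registers")
         .bind({"OnChipSRAM": "PMU[0]", "Registers": "PCU[0].regs"}))
print(sched.bindings)  # {'B_vals': 'PMU[0]', 'c': 'PCU[0].regs'}

Keeping the two steps separate, as in this sketch, is what lets the same data-placement schedule be retargeted to different physical memory configurations without rewriting the computation.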
BibTeX
@inproceedings{hsu2025stardust,
author = {Hsu, Olivia and Rucker, Alexander and Zhao, Tian and Desai, Varun and Olukotun, Kunle and Kjolstad, Fredrik},
title = {Stardust: Compiling Sparse Tensor Algebra to a Reconfigurable Dataflow Architecture},
year = {2025},
isbn = {9798400712753},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3696443.3708918},
doi = {10.1145/3696443.3708918},
abstract = {We introduce Stardust, a compiler from sparse tensor algebra
languages to a sparse reconfigurable dataflow architecture via a
parallel-patterns programming model. Stardust lets performance engineers
specify the placement of data into memories separately from the placement of
computation onto compute units. Users first schedule data placement onto an
abstract memory model, and then Stardust binds that data to complex, on-chip
physical memories. With guidance from user schedules, Stardust binds
computation using these on-chip data structures to the appropriate parallel
patterns. Through cycle-accurate simulation, we show that Stardust generates
nine more tensor algebra kernels than the original Capstan sparse RDA work. The
generated kernels perform, on average, 138x better than generated
CPU kernels and 41x better than generated GPU kernels.},
booktitle = {Proceedings of the 23rd ACM/IEEE International Symposium on Code Generation and Optimization},
pages = {628--643},
numpages = {16},
keywords = {DSLs, compilers, dataflow, parallel patterns, reconfigurable architectures, sparse tensor algebra},
location = {Las Vegas, NV, USA},
series = {CGO '25}
}