Abstract

Recent years have seen considerable work on compiling sparse tensor algebra expressions. This paper addresses a shortcoming in that work, namely how to generate efficient code (in time and space) that scatters values into a sparse result tensor. We address this shortcoming through a compiler design that generates code that uses sparse intermediate tensors (sparse workspaces) as efficient adapters between compute code that scatters and result tensors that do not support random insertion. Our compiler automatically detects sparse scattering behavior in tensor expressions and inserts necessary intermediate workspace tensors. We present an algorithm template for workspace insertion that is the backbone of our code generation algorithm. Our algorithm template is modular by design, supporting sparse workspaces that span multiple user-defined implementations. Our evaluation shows that sparse workspaces can be up to 27.12x faster than the dense workspaces of prior work. On the other hand, dense workspaces can be up to 7.58x faster than the sparse workspaces generated by our compiler in other situations, which motivates our compiler design that supports both. Our compiler produces sequential code that is competitive with hand-optimized linear and tensor algebra libraries on the expressions they support, but that generalizes to any other expression. Sparse workspaces are also more memory efficient than dense workspaces as they compress away zeros. This compression can asymptotically decrease memory usage, enabling tensor computations on data that would otherwise run out of memory.

Article

pdf

Movie

BibTeX

 @article{10.1145/3656426, 
author = {Zhang, Genghan and Hsu, Olivia and Kjolstad, Fredrik},
title = {Compilation of Modular and General Sparse Workspaces},
year = {2024},
issue_date = {June 2024},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {8},
number = {PLDI},
url = {https://doi-org.stanford.idm.oclc.org/10.1145/3656426},
doi = {10.1145/3656426},
abstract = {Recent years have seen considerable work on compiling sparse
tensor algebra expressions. This paper addresses a shortcoming in that work,
namely how to generate efficient code (in time and space) that scatters values
into a sparse result tensor. We address this shortcoming through a compiler
design that generates code that uses sparse intermediate tensors (sparse
workspaces) as efficient adapters between compute code that scatters and result
tensors that do not support random insertion. Our compiler automatically
detects sparse scattering behavior in tensor expressions and inserts necessary
intermediate workspace tensors. We present an algorithm template for workspace
insertion that is the backbone of our code generation algorithm. Our algorithm
template is modular by design, supporting sparse workspaces that span multiple
user-defined implementations. Our evaluation shows that sparse workspaces can
be up to 27.12 exttimes{} faster than the dense workspaces of prior work. On
the other hand, dense workspaces can be up to 7.58 exttimes{} faster than the
sparse workspaces generated by our compiler in other situations, which
motivates our compiler design that supports both. Our compiler produces
sequential code that is competitive with hand-optimized linear and tensor
algebra libraries on the expressions they support, but that generalizes to any
other expression. Sparse workspaces are also more memory efficient than dense
workspaces as they compress away zeros. This compression can asymptotically
decrease memory usage, enabling tensor computations on data that would
otherwise run out of memory.},
journal = {Proc. ACM Program. Lang.},
month = {jun},
articleno = {196},
numpages = {26},
keywords = {code composition, compilation, sparse tensor algebra, sparse workspaces}
}