@@ -45,109 +45,6 @@ namespace linalg {
 /// when used on distributed loops with memref semantics!
 void hoistRedundantVectorTransfers(func::FuncOp func);
 
-/// Greedily hoist redundant subset extract/insert operations on tensors
-/// outside of `forOp`. The logic follows:
-/// 1. Look for a write walking back from the `forOp` yield.
-/// 2. Check the uses of the matching block argument and look for a matching
-///    read (i.e. extract_slice or transfer_read) with matching indices.
-/// 3. In the case of a transfer_write, we can bypass other non-conflicting
-///    operations and find more hoisting opportunities.
-/// 4. Hoist the read/write pair and update the tensor SSA links.
-///
-/// Return the unmodified `forOp` if no hoisting occurred.
-/// Return a new scf::ForOp if hoisting on tensors occurred.
-///
-/// After this transformation the returned scf::ForOp may have unused
-/// arguments that can be removed by application of canonicalization patterns.
-///
-/// Example:
-/// ========
-/// IR resembling:
-///
-/// ```
-/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
-///   %1 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0) -> (tensor<10xf32>) {
-///     %e = tensor.extract_slice %a6[%i][%sz][1] : tensor<10xf32> to tensor<?xf32>
-///     %r = vector.transfer_read %e[%c0], %cst : tensor<?xf32>, vector<4xf32>
-///     %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
-///     %w = vector.transfer_write %u, %e[%c0] : vector<4xf32>, tensor<?xf32>
-///     %st = tensor.insert_slice %w into %a6[%i][%sz][1]
-///         : tensor<?xf32> into tensor<10xf32>
-///     scf.yield %st : tensor<10xf32>
-///   }
-///   scf.yield %1 : tensor<10xf32>
-/// }
-/// ```
-///
-/// Progressively hoists to:
-///
-/// ```
-/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
-///   %e = tensor.extract_slice %a0[%i][%sz][1] : tensor<10xf32> to tensor<?xf32>
-///   %1:2 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0, %a7 = %e)
-///       -> (tensor<10xf32>, tensor<?xf32>) {
-///     %r = vector.transfer_read %a7[%c0], %cst : tensor<?xf32>, vector<4xf32>
-///     %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
-///     %w = vector.transfer_write %u, %a7[%c0] : vector<4xf32>, tensor<?xf32>
-///     scf.yield %a6, %w : tensor<10xf32>, tensor<?xf32>
-///   }
-///   %st = tensor.insert_slice %1#1 into %1#0[%i][%sz][1]
-///       : tensor<?xf32> into tensor<10xf32>
-///   scf.yield %st : tensor<10xf32>
-/// }
-/// ```
-///
-/// and
-///
-/// ```
-/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
-///   %e = tensor.extract_slice %a0[%i][%sz][1] : tensor<10xf32> to tensor<?xf32>
-///   %r = vector.transfer_read %e[%c0], %cst : tensor<?xf32>, vector<4xf32>
-///   %1:3 = scf.for %j = %l to %u step %s
-///       iter_args(%a6 = %a0, %a7 = %e, %a8 = %r)
-///       -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
-///     %u = "some_use"(%a8) : (vector<4xf32>) -> vector<4xf32>
-///     scf.yield %a6, %a7, %u : tensor<10xf32>, tensor<?xf32>, vector<4xf32>
-///   }
-///   %w = vector.transfer_write %1#2, %1#1[%c0] : vector<4xf32>, tensor<?xf32>
-///   %st = tensor.insert_slice %w into %1#0[%i][%sz][1]
-///       : tensor<?xf32> into tensor<10xf32>
-///   scf.yield %st : tensor<10xf32>
-/// }
-/// ```
-///
-/// It can then canonicalize to:
-///
-/// ```
-/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>) {
-///   %e = tensor.extract_slice %a0[%i][%sz][1] : tensor<10xf32> to tensor<?xf32>
-///   %r = vector.transfer_read %e[%c0], %cst : tensor<?xf32>, vector<4xf32>
-///   %1 = scf.for %j = %l to %u step %s iter_args(%a7 = %r)
-///       -> (vector<4xf32>) {
-///     %u = "some_use"(%a7) : (vector<4xf32>) -> vector<4xf32>
-///     scf.yield %u : vector<4xf32>
-///   }
-///   %w = vector.transfer_write %1, %e[%c0] : vector<4xf32>, tensor<?xf32>
-///   %st = tensor.insert_slice %w into %a0[%i][%sz][1]
-///       : tensor<?xf32> into tensor<10xf32>
-///   scf.yield %st : tensor<10xf32>
-/// }
-/// ```
-///
-// TODO: This should be further generalized along a few different axes:
-//   - Loops other than scf.ForOp that operate on tensors (both sequential
-//     and parallel loops).
-//   - Subset extract/insert pairs other than tensor.extract/insert_slice
-//     and vector.transfer_read/write.
-//   - A more general areSubsetDisjoint analysis/interface to work across
-//     all subset op types and allow bypassing non-WAW-conflicting
-//     operations in more cases.
-scf::ForOp hoistRedundantSubsetExtractInsert(RewriterBase &rewriter,
-                                             scf::ForOp forOp);
-
-/// Call into `hoistRedundantSubsetExtractInsert` without a RewriterBase.
-// TODO: obsolete and should be retired.
-void hoistRedundantVectorTransfersOnTensor(func::FuncOp func);
-
 } // namespace linalg
 } // namespace mlir
 
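For reference, a minimal sketch of how the removed entry point was typically driven from a pass. Only `hoistRedundantSubsetExtractInsert` comes from this header; the wrapper function, the up-front loop collection, and the choice to visit only top-level loops are illustrative assumptions, not part of this patch:

```cpp
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/PatternMatch.h"
#include "llvm/ADT/SmallVector.h"

// Hypothetical driver: applies the hoisting to every top-level scf.for in
// `func`. Loops are collected up front because a successful hoist replaces
// the loop with a new scf::ForOp, which would invalidate an in-flight walk.
static void hoistSubsetsOnTensors(mlir::func::FuncOp func) {
  mlir::IRRewriter rewriter(func.getContext());
  llvm::SmallVector<mlir::scf::ForOp> loops;
  for (mlir::scf::ForOp forOp : func.getBody().getOps<mlir::scf::ForOp>())
    loops.push_back(forOp);
  for (mlir::scf::ForOp forOp : loops)
    // Returns `forOp` unchanged if nothing was hoisted, or the new loop.
    (void)mlir::linalg::hoistRedundantSubsetExtractInsert(rewriter, forOp);
}
```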