
Commit 5f56b96

Author: ZhangYan (committed)
update deeptile
1 parent: fa07870

File tree: 4 files changed, +99 -12 lines changed

include/gc/Analysis/MatmulConfigAnalysis.h
Lines changed: 23 additions & 1 deletion

@@ -28,11 +28,29 @@ struct SystemDesc {
   // get runtime OMP_NUM_THREADS
   uint32_t getNumThreads() {
     char *numThreads = getenv("OMP_NUM_THREADS");
-    if (numThreads) {
+    if (!threads_limited && numThreads) {
       return std::stoi(numThreads);
     }
+    return curThreads;
+  }
+
+  // set the expected threads
+  void limitOnSingleNode(uint32_t numa_node) {
+    char *cacheSize = getenv("NUMA_THREADS");
+    if (cacheSize) {
+      curThreads = std::stoi(cacheSize);
+      threads_limited = true;
+    }
+  }
+
+  uint32_t getNumNodes() {
+    char *numThreads = getenv("OMP_NUM_THREADS");
+    if (threads_limited && numThreads) {
+      return std::stoi(numThreads) / curThreads;
+    }
     return 1;
   }
+
   // get cache size by cacheLevel
   size_t getCacheSize(uint8_t cacheLevel) {
     if (cacheLevel == 1) {
@@ -57,6 +75,10 @@ struct SystemDesc {
   SmallVector<size_t> getContractionOperationMaxVectorLength() {
     return {512UL, 512UL};
   }
+
+private:
+  uint32_t curThreads = 1;
+  bool threads_limited = false;
 };

 struct MatmulConfig {
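
With these changes SystemDesc carries a little state: OMP_NUM_THREADS is still the total thread budget, NUMA_THREADS (read in limitOnSingleNode, although the variable holding it is named cacheSize) caps the threads used once an op is pinned to a single node, and getNumNodes derives the node count as total threads divided by per-node threads. A minimal standalone mock of that arithmetic, with the environment values assumed purely for illustration:

// Simplified mock of the patched SystemDesc thread accounting; the env values
// below (56 total threads, 28 per NUMA node) are assumed, not from the commit.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

struct MockSystemDesc {
  uint32_t getNumThreads() {
    char *numThreads = getenv("OMP_NUM_THREADS");
    if (!threads_limited && numThreads)
      return std::stoi(numThreads);
    return curThreads;
  }
  void limitOnSingleNode(uint32_t /*numa_node*/) {
    char *numaThreads = getenv("NUMA_THREADS"); // named cacheSize in the patch
    if (numaThreads) {
      curThreads = std::stoi(numaThreads);
      threads_limited = true;
    }
  }
  uint32_t getNumNodes() {
    char *numThreads = getenv("OMP_NUM_THREADS");
    if (threads_limited && numThreads)
      return std::stoi(numThreads) / curThreads;
    return 1;
  }

private:
  uint32_t curThreads = 1;
  bool threads_limited = false;
};

int main() {
  setenv("OMP_NUM_THREADS", "56", 1); // assumed machine-wide budget
  setenv("NUMA_THREADS", "28", 1);    // assumed threads per NUMA node
  MockSystemDesc desc;
  std::printf("before: threads=%u nodes=%u\n", desc.getNumThreads(),
              desc.getNumNodes()); // 56 threads, 1 node
  desc.limitOnSingleNode(/*numa_node=*/0);
  std::printf("after:  threads=%u nodes=%u\n", desc.getNumThreads(),
              desc.getNumNodes()); // 28 threads, 2 nodes
}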

lib/gc/Analysis/MatmulConfigAnalysis.cpp
Lines changed: 6 additions & 0 deletions

@@ -345,6 +345,12 @@ previous matmul
 MatmulConfigAnalysis::MatmulConfigAnalysis(Operation *root) {
   SystemDesc sysDesc;
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(root)) {
+    // Check if the operation has an attribute named 'splited'
+    auto splitedAttr = linalgOp->getAttrOfType<IntegerAttr>("splited");
+    if (splitedAttr) {
+      sysDesc.limitOnSingleNode(splitedAttr.getInt());
+      llvm::outs() << "splited mm, and should be allocated on numa node 0.\n";
+    }
     auto oprandDimType = *getOprandDimType(linalgOp);
     // get the origin M,N,K size
     auto MDimTypeIdx = extractDimTypeIdx(oprandDimType[0], DimType::M);
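
The constructor only consumes the splited attribute; the commit does not show which pass attaches it. A hypothetical producer-side helper, sketched with standard MLIR attribute APIs, for tagging a matmul so it takes this path (the helper and the choice of an i32 attribute are assumptions; only the attribute name "splited" comes from the patch):

// Hypothetical helper, not part of the commit: mark a linalg op so that
// MatmulConfigAnalysis calls SystemDesc::limitOnSingleNode for it.
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/IR/Builders.h"

static void markAsSplitMatmul(mlir::linalg::LinalgOp op, int32_t numaNode) {
  mlir::OpBuilder builder(op->getContext());
  // Read back above via getAttrOfType<IntegerAttr>("splited").getInt().
  op->setAttr("splited", builder.getI32IntegerAttr(numaNode));
}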

lib/gc/Transforms/DeepTileContractionNamedOp.cpp
Lines changed: 6 additions & 1 deletion

@@ -471,6 +471,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
       else
         tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), tile);
     }
+
     SmallVector<Range> loopRanges =
         cast<TilingInterface>(currentOp.getOperation()).getIterationDomain(b);
     OpBuilder::InsertionGuard guard(b);
@@ -482,7 +483,6 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
         tileSizes[idx] = loopRanges[idx].size;
       }
     }
-
     SmallVector<OpFoldResult> newParallelDims;
     for (auto i = 0UL; i < reductionDims.size(); i++) {
       newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
@@ -595,6 +595,11 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
     auto NOuterBlockSize = NDimPos.size() > 1
                                ? (cfg.NBlock - 1) / cfg.innerMostNBlock + 1
                                : cfg.NBlock;
+    // Outermost Numa loop
+    option.nestedTileSizes.emplace_back(
+        SmallVector<size_t>{uint32_t(MFirstDim / 2)});
+    option.loopType.emplace_back(OuterLoopGenerationOption::LoopType::ForallOp);
+    option.loopDim.emplace_back(SmallVector<size_t>{MDimPos[0]});
     // Outer
     option.nestedTileSizes.emplace_back(SmallVector<size_t>{
         MParallelBlockSize, NParallelBlockSize, KParallelBlockSize});
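
The added entries prepend one more level in front of the existing "Outer" level: a forall loop over the first M dimension with tile size MFirstDim / 2, so the M iterations are split into two halves, matching the two NUMA nodes implied by the hard-coded divisor. A small worked example of how the levels stack, with every size assumed purely for illustration:

// Worked example of the added outermost tiling level (all values assumed).
#include <cstdio>

int main() {
  unsigned MFirstDim = 64;         // assumed outer M iteration count
  unsigned MParallelBlockSize = 8; // assumed existing "Outer" tile on M

  unsigned numaTile = MFirstDim / 2;          // 32, as in MFirstDim / 2 above
  unsigned numaChunks = MFirstDim / numaTile; // 2 chunks, one per NUMA node
  unsigned outerChunksPerNode = numaTile / MParallelBlockSize; // 4

  std::printf("NUMA chunks: %u, outer M chunks per node: %u\n", numaChunks,
              outerChunksPerNode);
}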

lib/gc/Transforms/Tiling.cpp
Lines changed: 64 additions & 10 deletions

@@ -782,6 +782,22 @@ FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
   return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
 }

+FailureOr<TilingResult>
+getTiledImplementationOnNuma(Operation *op, OpBuilder &b,
+                             ArrayRef<OpFoldResult> offsets,
+                             ArrayRef<OpFoldResult> sizes) {
+  // Leave the `sizeBounds` value empty. That is only needed when the `sizes`
+  // specified could lead to out of bounds accesses.
+  Location loc = op->getLoc();
+  LinalgOp linalgOp = cast<LinalgOp>(op);
+  SmallVector<Value> valuesToTile = linalgOp->getOperands();
+
+  SmallVector<Type> resultTensorTypes =
+      getTensorOutputTypes(linalgOp, valuesToTile);
+  Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, valuesToTile);
+  return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
+}
+
 FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
     RewriterBase &b, PartialReductionOpInterface op,
     ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,
@@ -964,6 +980,16 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
     // 4.b. Clone the op and update init operands.
     // We cannot use a IRMapping here because it can replace
     // different OpOperands with the same value.
+    bool isNumaLoop = false;
+    if (tileSizes.size() == iterationDomain.size()) {
+      for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
+        if (idx == 0 && tileSizes[idx] == iterationDomain[idx].size)
+          break;
+        if (idx > 0 && tileSizes[idx] != iterationDomain[idx].size)
+          break;
+        isNumaLoop = true;
+      }
+    }
     Operation *clonedOp = b.clone(*op.getOperation());
     b.modifyOpInPlace(clonedOp, [&]() {
       for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
@@ -974,17 +1000,32 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
     });
     // 5. Tile the cloned op and delete the clone.
     if (tileSizes.empty() || threadNums.empty()) {
-      FailureOr<TilingResult> tilingResult =
-          cast<TilingInterface>(clonedOp).getTiledImplementation(
-              b, tiledOffsets, tiledSizes);
-      if (failed(tilingResult))
-        return clonedOp->emitError("Failed to tile op: ");
-      if (tilingResult->tiledOps.size() != 1) {
-        return clonedOp->emitError("expected a single produced tiled op, got ")
-               << tilingResult->tiledOps.size();
+      if (!isNumaLoop) {
+        FailureOr<TilingResult> tilingResult =
+            cast<TilingInterface>(clonedOp).getTiledImplementation(
+                b, tiledOffsets, tiledSizes);
+        if (failed(tilingResult))
+          return clonedOp->emitError("Failed to tile op: ");
+        if (tilingResult->tiledOps.size() != 1) {
+          return clonedOp->emitError(
+                     "expected a single produced tiled op, got ")
+                 << tilingResult->tiledOps.size();
+        }
+        tiledOp = tilingResult->tiledOps.front();
+        tilingResults = tilingResult->tiledValues;
+      } else {
+        FailureOr<TilingResult> tilingResult = getTiledImplementationOnNuma(
+            cast<TilingInterface>(clonedOp), b, tiledOffsets, tiledSizes);
+        if (failed(tilingResult))
+          return clonedOp->emitError("Failed to tile op: ");
+        if (tilingResult->tiledOps.size() != 1) {
+          return clonedOp->emitError(
+                     "expected a single produced tiled op, got ")
+                 << tilingResult->tiledOps.size();
+        }
+        tiledOp = tilingResult->tiledOps.front();
+        tilingResults = tilingResult->tiledValues;
       }
-      tiledOp = tilingResult->tiledOps.front();
-      tilingResults = tilingResult->tiledValues;
     } else {
       LinalgTilingOptions options;
       FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
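
The isNumaLoop flag computed in the hunk above decides whether the cloned op is tiled by extracting slices (getTiledImplementation) or simply re-created on the full operands (getTiledImplementationOnNuma). A standalone mock of the predicate with the OpFoldResult comparisons replaced by plain integers and the sample domains assumed; note that once dimension 0 is seen to be partially tiled the flag stays set, because the later breaks do not clear it:

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the isNumaLoop check in tileAllUsingForall, with OpFoldResult /
// Range replaced by plain integers for illustration.
static bool isNumaLoop(const std::vector<int64_t> &tileSizes,
                       const std::vector<int64_t> &iterationDomain) {
  bool numa = false;
  if (tileSizes.size() == iterationDomain.size()) {
    for (size_t idx = 0; idx < tileSizes.size(); ++idx) {
      if (idx == 0 && tileSizes[idx] == iterationDomain[idx])
        break; // dim 0 not tiled at all: never a NUMA loop
      if (idx > 0 && tileSizes[idx] != iterationDomain[idx])
        break; // a later dim is tiled: stop scanning (flag already set)
      numa = true; // set as soon as dim 0 is partially tiled
    }
  }
  return numa;
}

int main() {
  // Assumed 4096x4096x4096 matmul domain, M tiled in half: detected.
  std::printf("%d\n", isNumaLoop({2048, 4096, 4096}, {4096, 4096, 4096})); // 1
  // Dim 0 left untiled: not detected.
  std::printf("%d\n", isNumaLoop({4096, 4096, 4096}, {4096, 4096, 4096})); // 0
}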
@@ -1039,6 +1080,19 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
         nonZeroDimIdx++;
       }
     }
+    if (auto attr = resultSizesRank[0].dyn_cast<Attribute>()) {
+      if (auto intAttr = attr.dyn_cast<IntegerAttr>()) {
+        if (intAttr.getInt() == 16)
+          resultSizesRank[0] = b.getIndexAttr(32);
+      }
+    } else if (auto value = resultSizesRank[0].dyn_cast<Value>()) {
+      if (auto constantOp = value.getDefiningOp<arith::ConstantOp>()) {
+        if (auto intAttr = constantOp.getValue().dyn_cast<IntegerAttr>()) {
+          if (intAttr.getInt() == 16)
+            resultSizesRank[0] = b.getIndexAttr(32);
+        }
+      }
+    }
     if (hasReductionThreads) {
       for (auto [parallelDims, redVar] :
            llvm::zip(constantNewParallelDims, reductionInductionVars)) {
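
This last hunk bumps a constant leading result size of 16 up to 32, unwrapping the OpFoldResult by hand for both the Attribute case and the arith.constant-defined Value case. MLIR's getConstantIntValue helper performs, to my understanding, the same unwrapping; a sketch of an equivalent, more compact form (the helper function below is mine, and the 16 and 32 are simply the constants hard-coded in the patch):

#include <cstdint>
#include <optional>

#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Builders.h"

// Sketch only: bump a statically-known size of 16 to 32, as the patch does for
// resultSizesRank[0]. getConstantIntValue already looks through both an
// IntegerAttr and an integer arith.constant.
static mlir::OpFoldResult bumpSixteenToThirtyTwo(mlir::OpBuilder &b,
                                                 mlir::OpFoldResult size) {
  if (std::optional<int64_t> cst = mlir::getConstantIntValue(size))
    if (*cst == 16)
      return b.getIndexAttr(32);
  return size;
}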
