pytorch · JulianCloudNTH · Jun 13, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -40,6 +40,7 @@ set(WEBGPU_SRCS
     runtime/ops/quantized_linear/QuantizedLinear.cpp
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
+    runtime/ops/prepack/Prepack.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -230,6 +230,10 @@ void WebGPUGraph::build(
 
   const auto* graph = vkgraph::GetVkGraph(flatbuffer_data);
 
+  // .pte byte sources for prepack-time constant materialization (build-only).
+  constant_data_ = constant_data;
+  named_data_map_ = named_data_map;
+
   // Phase 1: Create all values
   const auto* values = graph->values();
   const int num_vals = values ? values->size() : 0;
@@ -241,6 +245,41 @@ void WebGPUGraph::build(
   bools_.resize(num_vals, false);
   value_lists_.resize(num_vals);
 
+  // Pre-scan the op chain: a constant may be DEFERRED (no eager GPU buffer; the
+  // prepack node materializes it once) only if it is a prepack source AND never
+  // a direct arg of a non-prepack op. ValueList args are expanded so a constant
+  // reached through a list still counts as a direct use.
+  std::unordered_set<int> prepack_src_ids;
+  std::unordered_set<int> direct_use_ids;
+  const auto* chain_prescan = graph->chain();
+  if (chain_prescan) {
+    for (unsigned ci = 0; ci < chain_prescan->size(); ci++) {
+      const auto* oc = chain_prescan->Get(ci);
+      const bool is_prepack = oc->name()->str() == "et_vk.prepack.default";
+      const auto* a = oc->args();
+      if (!a) {
+        continue;
+      }
+      for (unsigned j = 0; j < a->size(); j++) {
+        int id = static_cast<int>(a->Get(j));
+        if (is_prepack && j == 0) {
+          prepack_src_ids.insert(id);
+        } else if (!is_prepack) {
+          direct_use_ids.insert(id);
+          const auto* v = values ? values->Get(id) : nullptr;
+          if (v && v->value_type() == vkgraph::GraphTypes::ValueList) {
+            const auto* items = v->value_as_ValueList()->items();
+            if (items) {
+              for (unsigned k = 0; k < items->size(); k++) {
+                direct_use_ids.insert(static_cast<int>(items->Get(k)));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
   for (int i = 0; i < num_vals; i++) {
     const auto* val = values->Get(i);
     if (!val || val->value_type() == vkgraph::GraphTypes::NONE) {
@@ -269,60 +308,94 @@ void WebGPUGraph::build(
         int constant_id = vk_tensor->constant_id();
         int mem_obj_id = vk_tensor->mem_obj_id();
 
-        // Constants always get dedicated buffers regardless of mem_obj_id
+        // Constants are dedicated. A prepack-routed constant is recorded as a
+        // ConstantSource (materialized once by its prepack node); if it is used
+        // ONLY via prepack it is deferred (no eager GPU buffer).
         if (constant_id >= 0 || mem_obj_id < 0) {
           tensor_mem_obj_ids_[i] = -1;
-          WGPUBufferDescriptor buf_desc = {};
-          buf_desc.size = std::max(tensor.nbytes, size_t(4));
-          buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
-              WGPUBufferUsage_CopySrc;
-          buf_desc.mappedAtCreation = false;
-          tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
-
-          if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+
+          // Record the source for every prepack-routed constant (metadata only,
+          // resolved straight from the constants table) so materialize_constant
+          // always has a source -- independent of the defer decision below.
+          const bool is_prepack_src =
+              constant_id >= 0 && prepack_src_ids.count(i) != 0;
+          if (is_prepack_src) {
             const auto* constants = graph->constants();
-            if (constants &&
-                constant_id < static_cast<int>(constants->size())) {
-              const auto* vk_bytes = constants->Get(constant_id);
-              if (vk_bytes->offset() != UINT64_MAX) {
-                const uint8_t* src = constant_data + vk_bytes->offset();
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, src, tensor.nbytes);
-              } else if (
-                  vk_bytes->named_key() != nullptr &&
-                  named_data_map != nullptr) {
-                // Constant stored in the PTE named-data map.
-                auto buf =
-                    named_data_map->get_data(vk_bytes->named_key()->c_str());
-                if (!buf.ok()) {
-                  throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() +
-                      "' not found in NamedDataMap");
-                }
-                if (buf->size() < tensor.nbytes) {
+            if (!constants ||
+                constant_id >= static_cast<int>(constants->size())) {
+              throw std::runtime_error(
+                  "WebGPU: constant_id set but the constants table is missing "
+                  "or the id is out of range");
+            }
+            const auto* vk_bytes = constants->Get(constant_id);
+            ConstantSource cs;
+            cs.nbytes = tensor.nbytes;
+            if (vk_bytes->offset() != UINT64_MAX) {
+              cs.inline_offset = vk_bytes->offset();
+            } else if (vk_bytes->named_key() != nullptr) {
+              cs.named_key = vk_bytes->named_key()->str();
+            } else {
+              throw std::runtime_error(
+                  "WebGPU: constant has no inline offset and no named-data key");
+            }
+            constant_sources_[i] = std::move(cs);
+          }
+
+          // Defer constants consumed solely via prepack: skip the eager buffer.
+          const bool defer = is_prepack_src && direct_use_ids.count(i) == 0;
+          if (!defer) {
+            WGPUBufferDescriptor buf_desc = {};
+            buf_desc.size = std::max(tensor.nbytes, size_t(4));
+            buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst |
+                WGPUBufferUsage_CopySrc;
+            buf_desc.mappedAtCreation = false;
+            tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc);
+
+            if (constant_id >= 0 && constant_data && tensor.nbytes > 0) {
+              const auto* constants = graph->constants();
+              if (constants &&
+                  constant_id < static_cast<int>(constants->size())) {
+                const auto* vk_bytes = constants->Get(constant_id);
+                if (vk_bytes->offset() != UINT64_MAX) {
+                  const uint8_t* src = constant_data + vk_bytes->offset();
+                  wgpuQueueWriteBuffer(
+                      queue_, tensor.buffer, 0, src, tensor.nbytes);
+                } else if (
+                    vk_bytes->named_key() != nullptr &&
+                    named_data_map != nullptr) {
+                  // Constant stored in the PTE named-data map.
+                  auto buf =
+                      named_data_map->get_data(vk_bytes->named_key()->c_str());
+                  if (!buf.ok()) {
+                    throw std::runtime_error(
+                        std::string("WebGPU: named constant '") +
+                        vk_bytes->named_key()->c_str() +
+                        "' not found in NamedDataMap");
+                  }
+                  if (buf->size() < tensor.nbytes) {
+                    throw std::runtime_error(
+                        std::string("WebGPU: named constant '") +
+                        vk_bytes->named_key()->c_str() + "' undersized: have " +
+                        std::to_string(buf->size()) + " bytes, need " +
+                        std::to_string(tensor.nbytes));
+                  }
+                  wgpuQueueWriteBuffer(
+                      queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
+                  buf->Free();
+                } else {
                   throw std::runtime_error(
-                      std::string("WebGPU: named constant '") +
-                      vk_bytes->named_key()->c_str() + "' undersized: have " +
-                      std::to_string(buf->size()) + " bytes, need " +
-                      std::to_string(tensor.nbytes));
+                      "WebGPU: constant has no inline offset and no named-data key");
                 }
-                wgpuQueueWriteBuffer(
-                    queue_, tensor.buffer, 0, buf->data(), tensor.nbytes);
-                buf->Free();
               } else {
                 throw std::runtime_error(
-                    "WebGPU: constant has no inline offset and no named-data key");
+                    "WebGPU: constant_id set but the constants table is missing "
+                    "or the id is out of range");
               }
-            } else {
+            } else if (constant_id >= 0 && tensor.nbytes > 0) {
+              // constant_id set but constant_data null -> fail loud.
               throw std::runtime_error(
-                  "WebGPU: constant_id set but the constants table is missing "
-                  "or the id is out of range");
+                  "WebGPU: constant_id set but constant_data is null");
             }
-          } else if (constant_id >= 0 && tensor.nbytes > 0) {
-            // constant_id set but constant_data null -> fail loud.
-            throw std::runtime_error(
-                "WebGPU: constant_id set but constant_data is null");
           }
         } else {
           // Shared buffer: track required size, defer allocation to pass 2
@@ -458,6 +531,47 @@ void WebGPUGraph::build(
       webgpu_operator_registry().get_op_fn(op_name)(*this, args);
     }
   }
+
+  // Prepack nodes (Phase 3) materialized their constants directly into the
+  // consumer buffers via materialize_constant; no separate copy pass needed.
+  // The .pte bytes are freed right after build() returns (WebGPUBackend
+  // processed->Free()), so clear the build-only source pointers.
+  constant_data_ = nullptr;
+  named_data_map_ = nullptr;
+}
+
+void WebGPUGraph::materialize_constant(int const_value_id, WGPUBuffer dst) {
+  auto it = constant_sources_.find(const_value_id);
+  if (it == constant_sources_.end()) {
+    throw std::runtime_error(
+        "WebGPU prepack: no source recorded for constant id " +
+        std::to_string(const_value_id));
+  }
+  const ConstantSource& cs = it->second;
+  if (cs.nbytes == 0) {
+    return;
+  }
+  if (cs.inline_offset != UINT64_MAX) {
+    if (constant_data_ == nullptr) {
+      throw std::runtime_error("WebGPU prepack: inline constant data is null");
+    }
+    wgpuQueueWriteBuffer(
+        queue_, dst, 0, constant_data_ + cs.inline_offset, cs.nbytes);
+  } else if (!cs.named_key.empty() && named_data_map_ != nullptr) {
+    auto buf = named_data_map_->get_data(cs.named_key.c_str());
+    if (!buf.ok()) {
+      throw std::runtime_error(
+          "WebGPU prepack: named constant '" + cs.named_key + "' not found");
+    }
+    if (buf->size() < cs.nbytes) {
+      throw std::runtime_error(
+          "WebGPU prepack: named constant '" + cs.named_key + "' undersized");
+    }
+    wgpuQueueWriteBuffer(queue_, dst, 0, buf->data(), cs.nbytes);
+    buf->Free();
+  } else {
+    throw std::runtime_error("WebGPU prepack: constant has no source");
+  }
 }
 
 WGPUShaderModule WebGPUGraph::get_or_create_shader(
@@ -773,10 +887,11 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const {
   for (size_t i = 0; i < value_types_.size(); i++) {
     if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) {
       stats.num_tensors++;
-      // Shared tensors are tracked via shared_buffer_sizes_
+      // Shared tensors are tracked via shared_buffer_sizes_; a deferred
+      // prepack-routed constant has no buffer (no GPU memory) -> not counted.
       bool is_shared =
           i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0;
-      if (!is_shared) {
+      if (!is_shared && tensors_[i].buffer != nullptr) {
         stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes;
       }
     }

diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -50,6 +50,15 @@ struct OutputCopy {
   size_t nbytes = 0;
 };
 
+// CPU-side record for a prepack-routed constant; mirrors Vulkan's TensorRef
+// (sizes + a data reference, not a live GPU tensor). The prepack node is the
+// sole materialization, so the constant needs no eager GPU buffer.
+struct ConstantSource {
+  uint64_t inline_offset = UINT64_MAX; // offset into constant_data_; else key
+  std::string named_key; // non-empty => fetch from named_data_map_
+  size_t nbytes = 0;
+};
+
 struct ExecuteConfig {
   size_t chunk_size = 0;
   size_t initial_chunk_size = 0;
@@ -180,6 +189,11 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Materialize a recorded prepack-routed constant into dst via one CPU->GPU
+  // transfer. Build-time only (the .pte bytes are freed after build()).
+  // Mirrors Vulkan prepack_standard.
+  void materialize_constant(int const_value_id, WGPUBuffer dst);
+
   void add_uniform_buffer_bytes(size_t bytes) {
     uniform_buffer_bytes_ += bytes;
   }
@@ -286,6 +300,13 @@ class WebGPUGraph {
 
   std::vector<WebGPUDispatch> dispatches_;
 
+  // Prepack-routed constant sources (offset/named-key + size); the prepack node
+  // materializes these once. constant_data_/named_data_map_ point at the .pte
+  // bytes and are valid only during build().
+  const uint8_t* constant_data_ = nullptr;
+  const executorch::runtime::NamedDataMap* named_data_map_ = nullptr;
+  std::unordered_map<int, ConstantSource> constant_sources_;
+
   ExecuteConfig execute_config_;
 
   // Caches for reusing GPU objects across dispatches.

diff --git a/backends/webgpu/runtime/ops/prepack/Prepack.cpp b/backends/webgpu/runtime/ops/prepack/Prepack.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+
+#include <stdexcept>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Materialize a constant into the prepack-output buffer via one CPU->GPU write.
+void prepack_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  // et_vk.prepack.default args: [src (constant), out].
+  if (args.size() != 2) {
+    throw std::runtime_error("WebGPU prepack: expected 2 args (src, out)");
+  }
+  const auto& src = graph.get_tensor(args.at(0));
+  const auto& out = graph.get_tensor(args.at(1));
+
+  if (src.dims != out.dims) {
+    throw std::runtime_error("WebGPU prepack: src/out shape mismatch");
+  }
+  if (src.elem_size != out.elem_size) {
+    throw std::runtime_error(
+        "WebGPU prepack: src/out dtype mismatch (cast unsupported)");
+  }
+  if (src.nbytes != out.nbytes) {
+    throw std::runtime_error("WebGPU prepack: src/out byte-size mismatch");
+  }
+  if (out.buffer == nullptr) {
+    throw std::runtime_error("WebGPU prepack: null out buffer binding");
+  }
+
+  // Sole materialization: write the .pte bytes once, straight into the
+  // consumer's buffer (no eager src buffer, no buffer->buffer copy).
+  graph.materialize_constant(args.at(0), out.buffer);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS {
+  WEBGPU_REGISTER_OP(et_vk.prepack.default, prepack_impl);
+}
+
+} // namespace executorch::backends::webgpu