Revert "[Reland] fix missing-prototypes warnings in torch_cpu (Part 4) (
Browse files Browse the repository at this point in the history
pytorch#101949)"

This reverts commit 4f2c007.

Reverted pytorch#101949 on behalf of https://github.com/osalpekar due to As noted in @izaitsevfb's comment, we are still seeing linker errors, this time due to `nnc_prepacked_linear_clamp_run` being made a static function. ([comment](pytorch#101949 (comment)))
pytorchmergebot committed May 23, 2023
1 parent 45a8f69 commit 32ce06a
Showing 138 changed files with 772 additions and 572 deletions.
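Why making a function `static` can break the link: `static` at namespace scope gives the definition internal linkage, so its symbol is not exported from its translation unit, and any other object file that still declares and calls the function fails at link time even though every file compiles. Below is a minimal two-file sketch of the failure mode cited in the revert message, with hypothetical names (the real symbol was `nnc_prepacked_linear_clamp_run` in torch_cpu):

// util.cpp -- hypothetical stand-in for a torch_cpu source file. The reverted
// change added `static` to definitions like this one to silence
// -Wmissing-prototypes; `static` gives the function internal linkage, so its
// symbol is not visible to other object files.
static int prepacked_run(int x) {
  return 2 * x;
}

// caller.cpp -- a separate translation unit still declares the function with
// external linkage and calls it. Both files compile; the link fails.
int prepacked_run(int x);

int main() {
  return prepacked_run(21);
}

// $ g++ -c util.cpp caller.cpp && g++ util.o caller.o
// caller.o: undefined reference to `prepacked_run(int)'

The BUILD.bazel hunk below drops `-Wno-error=unused-function`, which the reverted change had added, presumably because a newly `static` function that its own translation unit never calls trips `-Wunused-function`.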
1 change: 0 additions & 1 deletion BUILD.bazel
@@ -1598,7 +1598,6 @@ TORCH_COPTS = COMMON_COPTS + [
     "-fvisibility-inlines-hidden",
     "-fno-math-errno ",
     "-fno-trapping-math",
-    "-Wno-error=unused-function",
 ]

 torch_sources = {
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue.cpp
@@ -763,7 +763,7 @@ IValueComparator getGreaterThanComparator(const IValue& v) {
   };
 }

-std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
+static std::ostream& operator<<(std::ostream& out, const ivalue::EnumHolder& v) {
   out << v.qualifiedClassName() << "." << v.name();
   return out;
 }
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue_inl.h
@@ -1628,7 +1628,7 @@ struct ivalue::EnumHolder : c10::intrusive_ptr_target {

   TORCH_API friend std::ostream& operator<<(
       std::ostream& out,
-      const ivalue::EnumHolder& v);
+      const EnumHolder& v);

   TORCH_API const std::string qualifiedClassName() const;
4 changes: 2 additions & 2 deletions aten/src/ATen/functorch/BatchRulesReduceOps.cpp
@@ -405,7 +405,7 @@ static std::tuple<Tensor,optional<int64_t>> searchsorted_batch_rule(
   TORCH_INTERNAL_ASSERT(false);
 }

-static Tensor bucketize_decomp_Tensor(
+Tensor bucketize_decomp_Tensor(
     const Tensor& self,
     const Tensor& boundaries,
     bool out_int32,
@@ -415,7 +415,7 @@ static Tensor bucketize_decomp_Tensor(
   return at::searchsorted(boundaries, self, out_int32, right, nullopt, nullopt);
 }

-static Tensor bucketize_decomp_Scalar(
+Tensor bucketize_decomp_Scalar(
     const Scalar& self,
     const Tensor& boundaries,
     bool out_int32,
5 changes: 3 additions & 2 deletions aten/src/ATen/native/Activation.cpp
@@ -374,8 +374,8 @@ TORCH_IMPL_FUNC(softshrink_backward_out) (
   shrink_backward_stub(device_type(), *this, lambd);
 }

-#if AT_MKLDNN_ENABLED()
 static bool use_mkldnn(const Tensor& input) {
+#if AT_MKLDNN_ENABLED()
   if (!at::globalContext().userEnabledMkldnn()) {
     return false;
   }
@@ -386,8 +386,9 @@ static bool use_mkldnn(const Tensor& input) {
       (input.device().is_cpu() &&
        (((input.scalar_type() == kBFloat16) && mkldnn_bf16_device_check()) ||
         (input.scalar_type() == kFloat))); // input is dense layout and bfloat16/float32
-}
 #endif
+  return false;
+}

 TORCH_IMPL_FUNC(gelu_out_cpu) (
   const Tensor& self, c10::string_view approximate, const Tensor& result
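The `use_mkldnn` hunk above restores the guard-the-body pattern: `#if AT_MKLDNN_ENABLED()` wraps only the MKLDNN-specific logic, with an unconditional `return false;` so the function is defined in every build, whereas the reverted change guarded the whole definition. A generic sketch of the two styles, with a stand-in flag and names rather than the ATen source:

#include <cstdio>

#define FEATURE_ENABLED 0 // stand-in for a build flag like AT_MKLDNN_ENABLED()

// Style restored by the revert: only the body is conditional. The function
// exists in every build; without the feature it simply reports "unavailable".
static bool use_feature(int input) {
#if FEATURE_ENABLED
  return input > 0; // real check, compiled only when the feature is built in
#endif
  return false; // fallback keeps the definition complete in feature-less builds
}

#if FEATURE_ENABLED
// Style from the reverted change: the whole definition is guarded, so the
// symbol does not exist in feature-less builds and every caller needs its
// own #if guard.
static bool use_feature_guarded(int input) {
  return input > 0;
}
#endif

int main() {
  std::printf("use_feature(1) = %d\n", use_feature(1));
  return 0;
}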
6 changes: 3 additions & 3 deletions aten/src/ATen/native/BinaryOps.cpp
@@ -809,7 +809,7 @@ Tensor& arctan2_out(const Tensor& self, const Tensor& other, Tensor& result) {
   return at::atan2_out(result, self, other);
 }

-static Tensor& add_relu_impl(
+Tensor& add_relu_impl(
     Tensor& result, const Tensor& self, const Tensor& other, const Scalar& alpha) {
   auto iter = TensorIterator::binary_op(result, self, other);
   Scalar min_val;
@@ -1003,7 +1003,7 @@ Tensor& mul__scalar_sparse_csr(Tensor& self, const Scalar& other) {
   return self;
 }

-static Device correct_out_device(const Tensor& self, const Tensor& other) {
+Device correct_out_device(const Tensor& self, const Tensor& other) {
   if (self.device() == at::kCPU){
     return other.device();
   } else {
@@ -1049,7 +1049,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
   }
 }

-static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
+Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const Scalar& alpha) {
   auto out_device = correct_out_device(self, other);
   // hack to use the TensorIterator to get the correct broadcasting and type promotion logic
   auto device_ = Device(DeviceType::Meta);
2 changes: 0 additions & 2 deletions aten/src/ATen/native/Convolution.cpp
@@ -770,7 +770,6 @@ static void check_input_same_type_as_parameters(
   check_input_same_type_as_parameters(input, weight, /*bias=*/ Tensor());
 }

-#if AT_MKLDNN_ENABLED()
 static void check_input_same_type_as_parameters(
     const Tensor& input,
     const Tensor& weight,
@@ -789,7 +788,6 @@ static void check_input_same_type_as_parameters(
     check_input_same_type_as_parameters(input, weight, bias);
   }
 }
-#endif

 static auto view4d(const at::Tensor& tensor) -> at::Tensor {
   TORCH_CHECK(tensor.ndimension() == 3,
1 change: 0 additions & 1 deletion aten/src/ATen/native/Copy.cpp
@@ -21,7 +21,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_copy_from.h>
-#include <ATen/ops/_propagate_xla_data.h>
 #include <ATen/ops/copy_native.h>
 #include <ATen/ops/empty.h>
 #include <ATen/ops/expand_copy.h>
5 changes: 0 additions & 5 deletions aten/src/ATen/native/LegacyBatching.cpp
@@ -3,11 +3,6 @@
 #include <ATen/WrapDimUtils.h>
 #include <ATen/LegacyVmapTransforms.h>

-#ifdef AT_PER_OPERATOR_HEADERS
-#include <ATen/ops/_add_batch_dim_native.h>
-#include <ATen/ops/_remove_batch_dim_native.h>
-#endif
-
 namespace at { namespace native {

 // Adds a batch dimension to the tensor `self` out-of-place
2 changes: 1 addition & 1 deletion aten/src/ATen/native/LinearAlgebra.cpp
@@ -1893,7 +1893,7 @@ The behavior depends on the dimensionality of the Tensors as follows:
 - Otherwise, we return bmm, after broadcasting and folding the batched dimensions if
   there's more than one
 */
-static Tensor _matmul_impl(
+Tensor _matmul_impl(
     Tensor& out,
     const Tensor& tensor1,
     const Tensor& tensor2) {
2 changes: 1 addition & 1 deletion aten/src/ATen/native/PackedSequence.cpp
@@ -20,7 +20,7 @@

 namespace at { namespace native {

-static void checkLongTensor(const Tensor& tensor) {
+void checkLongTensor(const Tensor& tensor) {
   TORCH_CHECK(tensor.dim() == 1 && tensor.device().type() == at::kCPU && tensor.scalar_type() == at::kLong,
               "'lengths' argument should be a 1D CPU int64 tensor, but got ",
               tensor.dim(), "D ", tensor.device().str(), " ", tensor.scalar_type(), " tensor");
2 changes: 1 addition & 1 deletion aten/src/ATen/native/RNN.cpp
@@ -1809,7 +1809,7 @@ std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data(
       std::move(std::get<2>(results)));
 }

-static std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
+std::tuple<Tensor, Tensor, Tensor> quantized_lstm_data_legacy(
     const Tensor& data,
     const Tensor& batch_sizes,
     c10::List<at::Tensor> hx_,
1 change: 0 additions & 1 deletion aten/src/ATen/native/Resize.cpp
@@ -11,7 +11,6 @@
 #include <ATen/ops/resize_as_native.h>
 #include <ATen/ops/resize_native.h>
 #include <ATen/ops/resize.h>
-#include <ATen/ops/_resize_output.h>
 #endif

 namespace at { namespace native {
12 changes: 8 additions & 4 deletions aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -400,7 +400,7 @@ static void build_index_op(
   iter.build(config);
 }

-static void check_indices_on_cpu_or_selfdevice(
+void check_indices_on_cpu_or_selfdevice(
     const Tensor& self,
     const at::MaterializedIOptTensorListRef& indices) {
   auto dev = self.device();
@@ -965,7 +965,7 @@ TORCH_IMPL_FUNC(index_add_cpu_out)
   }
 }

-static void index_reduce_func_impl(
+void index_reduce_func_impl(
     const Tensor& self,
     int64_t dim,
     const Tensor& index,
@@ -1149,7 +1149,7 @@ static void check_indexarray_range(
   }
 }

-static Tensor & index_select_out_cpu_dim1_(
+Tensor & index_select_out_cpu_dim1_(
     Tensor & result_contig, const Tensor & self, const Tensor & index_contig) {

   auto self_contig = self.contiguous();
@@ -1379,6 +1379,10 @@ Tensor index_select_quantized_cpu_(const Tensor & self, int64_t dim, const Tenso
   return at::native::index_select_out_cpu_(self, dim, index, result);
 }

+Tensor index_select_backward(const Tensor& grad, at::IntArrayRef self_sizes, int64_t dim, const Tensor& index) {
+  return at::native::index_select_backward_symint(grad, c10::fromIntArrayRefSlow(self_sizes), dim, index);
+}
+
 Tensor index_select_backward_symint(const Tensor& grad, c10::SymIntArrayRef self_sizes, int64_t dim, const Tensor& index) {
   // for composite compliance, use out-of-place variant of
   // `index_add` if index tensor is a Tensor Subclass.
@@ -1533,7 +1537,7 @@ static void scatter_reduce_exclude_self_helper(
   });
 }

-static void _scatter_via_index_put(
+void _scatter_via_index_put(
     const Tensor& self,
     int64_t dim,
     const Tensor& index,
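The re-added `index_select_backward` overload above shows a common ATen forwarding pattern: the plain-integer entry point widens its sizes to symbolic integers and delegates to the `_symint` variant, keeping a single implementation. A self-contained sketch of that pattern with simplified stand-in types (not ATen's):

#include <cstdint>
#include <vector>

// Simplified stand-in for c10::SymInt: an integer that may later be symbolic.
struct SymInt {
  int64_t v;
};

// The "real" implementation works on symbolic sizes.
int64_t count_elements_symint(const std::vector<SymInt>& sizes) {
  int64_t n = 1;
  for (const auto& s : sizes) {
    n *= s.v;
  }
  return n;
}

// Thin non-symbolic overload: widen each concrete int64_t into a SymInt and
// forward, mirroring how index_select_backward delegates to the _symint variant.
int64_t count_elements(const std::vector<int64_t>& sizes) {
  std::vector<SymInt> sym;
  sym.reserve(sizes.size());
  for (int64_t s : sizes) {
    sym.push_back(SymInt{s});
  }
  return count_elements_symint(sym);
}

int main() {
  std::vector<int64_t> sizes{2, 3, 4};
  return count_elements(sizes) == 24 ? 0 : 1; // 2*3*4 elements
}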
6 changes: 3 additions & 3 deletions aten/src/ATen/native/TensorConversions.cpp
@@ -1009,7 +1009,7 @@ Tensor dense_to_sparse_bsc(const Tensor& self, IntArrayRef blocksize, c10::optio
   return dense_to_sparse_compressed<Layout::SparseBsc>(self, blocksize, dense_dim_opt);
 }

-static void _check_blocksize_matches(
+void _check_blocksize_matches(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize_opt,
     const std::string& name) {
@@ -1023,7 +1023,7 @@ static void _check_blocksize_matches(
   }
 }

-static Tensor sparse_compressed_clone(
+Tensor sparse_compressed_clone(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize,
     const std::string& name) {
@@ -1046,7 +1046,7 @@ static Tensor sparse_compressed_clone(
       values.device());
 }

-static Tensor sparse_compressed_to_flipped(
+Tensor sparse_compressed_to_flipped(
     const Tensor& self,
     c10::optional<IntArrayRef> blocksize,
     const std::string& name) {
1 change: 0 additions & 1 deletion aten/src/ATen/native/Unfold3d.cpp
@@ -1,6 +1,5 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/core/Tensor.h>
-#include <ATen/native/Unfold3d.h>
 #include <ATen/Config.h>
 #include <ATen/Dispatch.h>
 #include <ATen/Parallel.h>
2 changes: 0 additions & 2 deletions aten/src/ATen/native/WeightNorm.cpp
@@ -10,8 +10,6 @@
 #else
 #include <ATen/ops/_weight_norm_differentiable_backward_native.h>
 #include <ATen/ops/_weight_norm_interface.h>
-#include <ATen/ops/_weight_norm_interface_backward_native.h>
-#include <ATen/ops/_weight_norm_interface_native.h>
 #include <ATen/ops/_weight_norm_native.h>
 #include <ATen/ops/empty_strided.h>
 #include <ATen/ops/norm_except_dim.h>
4 changes: 2 additions & 2 deletions aten/src/ATen/native/cpu/PowKernel.cpp
@@ -13,7 +13,7 @@ namespace at::native {

 inline namespace CPU_CAPABILITY {

-static void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
+void pow_tensor_tensor_kernel(TensorIteratorBase& iter) {
   const auto dtype = iter.common_dtype();
   if (isFloatingType(dtype) || isComplexType(dtype)) {
     AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, dtype, "pow", [&]() {
@@ -90,7 +90,7 @@ void reciprocal_kernel(TensorIteratorBase& iter);
 void rsqrt_kernel(TensorIteratorBase& iter);
 void sqrt_kernel(TensorIteratorBase& iter);

-static void pow_tensor_scalar_kernel(
+void pow_tensor_scalar_kernel(
     TensorIteratorBase& iter,
     const Scalar& exp_scalar) {
   // prevent multiple calls to iter.common_dtype()
2 changes: 1 addition & 1 deletion aten/src/ATen/native/mkl/SparseBlasImpl.cpp
@@ -32,7 +32,6 @@ namespace mkl {

 namespace {

-#if AT_USE_MKL_SPARSE()
 c10::MaybeOwned<Tensor> prepare_dense_matrix_for_mkl(
     const Tensor& tensor) {
   if (tensor.is_non_overlapping_and_dense() ||
@@ -111,6 +110,7 @@ void inline col_indices_and_values_resize_(const Tensor& input, int64_t nnz) {
 /*
   Resizes `input` tensor and fills it with the data from MKL.
 */
+#if AT_USE_MKL_SPARSE()
 template <typename scalar_t>
 void mkl_result_copy_(const Tensor& input, sparse_matrix_t mkl_desc) {
   sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO;
2 changes: 1 addition & 1 deletion aten/src/ATen/native/nested/NestedTensorFactories.cpp
@@ -6,7 +6,7 @@
 namespace at {
 namespace native {

-static TensorOptions verify_empty_parameters(
+TensorOptions verify_empty_parameters(
     const at::Tensor& self,
     c10::optional<ScalarType> dtype,
     c10::optional<Layout> layout,
58 changes: 58 additions & 0 deletions aten/src/ATen/native/nested/NestedTensorMatmul.cpp
@@ -79,6 +79,64 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) {
   return output;
 }

+// utilities support `matmul_nested`
+namespace {
+// Args:
+//     self_sizes: the sizes of `self` in `matmul_nested`
+//     mat2_sizes: the sizes of `mat2` in `matmul_nested`
+//     buffer_op: the options for new buffer
+//     sizemat_op: the options for new size matrix
+// Returns:
+//     the batch size of each input underlying tensor, i.e. the product of batch-dimension sizes
+//     the empty output nested tensor
+inline std::tuple<std::vector<int64_t>, Tensor>
+matmul_nested_helper(
+    const std::vector<IntArrayRef>& self_sizes,
+    const std::vector<IntArrayRef>& mat2_sizes,
+    const c10::TensorOptions& buffer_op,
+    const c10::TensorOptions& sizemat_op) {
+  int64_t ntensors = self_sizes.size(),
+      ndims = self_sizes[0].size();
+  std::vector<int64_t> batch_sizes(ntensors, 1);
+  Tensor sizemat = at::empty({ntensors, ndims}, sizemat_op);
+  int64_t* sizemat_ptr = sizemat.mutable_data_ptr<int64_t>();
+  int64_t numel = 0;
+  for (int64_t i = 0; i < ntensors; i++) {
+    const IntArrayRef& self_size = self_sizes[i],
+        & mat2_size = mat2_sizes[i];
+    int64_t& batch_size = batch_sizes[i];
+    // batch dimensions
+    for (int64_t j = 0; j < ndims - 2; j++) {
+      const int64_t& self_sizej = self_size[j],
+          & mat2_sizej = mat2_size[j];
+      TORCH_CHECK(
+          self_sizej == mat2_sizej,
+          "matmul: For nested tensors, no broadcasting is currently performed: ",
+          i, "-th nested matrices in batch at dimension ", j + 1,
+          " have mismatching sizes ", self_sizej, " and ", mat2_sizej);
+      sizemat_ptr[j] = self_sizej;
+      batch_size *= sizemat_ptr[j];
+    }
+    // matrix multiplication dimensions
+    const int64_t& self_size0 = self_size[ndims - 2], & self_size1 = self_size[ndims - 1],
+        & mat2_size0 = mat2_size[ndims - 2], & mat2_size1 = mat2_size[ndims - 1];
+    TORCH_CHECK(
+        self_size1 == mat2_size0,
+        "matmul: ",
+        i, "-th nested matrices in batch cannot be multiplied (",
+        self_size0, "x", self_size1, " and ",
+        mat2_size0, "x", mat2_size1, ")");
+    sizemat_ptr[ndims - 2] = self_size0;
+    sizemat_ptr[ndims - 1] = mat2_size1;
+    sizemat_ptr += ndims;
+    numel += batch_size * self_size0 * mat2_size1;
+  }
+  Tensor buffer = at::empty(numel, buffer_op);
+  Tensor output = wrap_buffer(buffer, sizemat);
+  return std::make_tuple(batch_sizes, output);
+}
+}
+
 Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) {
   // Tensor self = self_.contiguous();
   // Tensor mat2 = mat2_.contiguous();
aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp
@@ -128,7 +128,7 @@ Tensor fake_quantize_per_channel_affine_cachemask_backward(
   return dY * mask;
 }

-static Tensor _get_rounded_zero_point(
+Tensor _get_rounded_zero_point(
     const Tensor& zero_point,
     int64_t quant_min,
     int64_t quant_max) {
aten/src/ATen/native/quantized/FakeQuantPerTensorAffine.cpp
@@ -133,7 +133,7 @@ Tensor fake_quantize_per_tensor_affine_cachemask_backward(
   return dY * mask;
 }

-static int64_t _get_zero_point_from_tensor(
+int64_t _get_zero_point_from_tensor(
     const Tensor& zero_point,
     int64_t quant_min,
     int64_t quant_max,
2 changes: 1 addition & 1 deletion aten/src/ATen/native/quantized/QTensor.cpp
@@ -285,7 +285,7 @@ std::tuple<double, int64_t> _choose_qparams_per_tensor(
   return std::make_tuple(q_params.scale, q_params.zero_point);
 }

-static float calculate_quant_loss(
+float calculate_quant_loss(
     const float* input,
     int numel,
     float xmin,
