Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ struct TeamVectorApplyLeftHouseholderInternal {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) {
value_type tmp(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, m),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[&](const int &i, value_type &val) {
val += KokkosKernels::ArithTraits<value_type>::conj(u2[i * u2s]) * A2[i * as0 + j * as1];
},
Expand All @@ -58,14 +58,18 @@ struct TeamVectorApplyLeftHouseholderInternal {
// A2 -= u2 w1t (ger)
if (as0 <= as1) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m),
[&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; });
Kokkos::parallel_for(
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; });
});
} else {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m),
[&](const int &i) { A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j]; });
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m),
[&](const int &i) {
A2[i * as0 + j * as1] -= u2[i * u2s] * w1t[j];
});
});
}

return 0;
Expand Down Expand Up @@ -97,7 +101,7 @@ struct TeamVectorApplyRightHouseholderInternal {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
value_type tmp(0);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, n),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j, value_type &val) { val += A2[i * as0 + j * as1] * u2[j * u2s]; }, tmp);
Kokkos::single(Kokkos::PerThread(member), [&]() {
w1[i] = (tmp + a1[i * a1s]) * inv_tau; // \= (*tau);
Expand All @@ -111,16 +115,20 @@ struct TeamVectorApplyRightHouseholderInternal {
// A2 -= w1 * u2' (ger with conjugate)
if (as0 <= as1) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) {
A2[i * as0 + j * as1] -= w1[i] * KokkosKernels::ArithTraits<ValueType>::conj(u2[j * u2s]);
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[&](const int &i) {
A2[i * as0 + j * as1] -=
w1[i] * KokkosKernels::ArithTraits<ValueType>::conj(u2[j * u2s]);
});
});
} else {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
A2[i * as0 + j * as1] -= w1[i] * KokkosKernels::ArithTraits<ValueType>::conj(u2[j * u2s]);
});
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
A2[i * as0 + j * as1] -=
w1[i] * KokkosKernels::ArithTraits<ValueType>::conj(u2[j * u2s]);
});
});
}

return 0;
Expand Down
7 changes: 4 additions & 3 deletions batched/dense/impl/KokkosBatched_Axpy_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,8 @@ struct TeamVectorAxpyInternal {
KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, const int m, const ScalarType alpha,
const ValueType* KOKKOS_RESTRICT X, const int xs0,
/* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) {
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; });
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, m),
[&](const int& i) { Y[i * ys0] += alpha * X[i * xs0]; });
// member.team_barrier();
return 0;
}
Expand All @@ -159,7 +160,7 @@ struct TeamVectorAxpyInternal {
const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0,
const ValueType* KOKKOS_RESTRICT X, const int xs0,
/* */ ValueType* KOKKOS_RESTRICT Y, const int ys0) {
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m),
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, m),
[&](const int& i) { Y[i * ys0] += alpha[i * alphas0] * X[i * xs0]; });
// member.team_barrier();
return 0;
Expand All @@ -170,7 +171,7 @@ struct TeamVectorAxpyInternal {
const ScalarType* KOKKOS_RESTRICT alpha, const int alphas0,
const ValueType* KOKKOS_RESTRICT X, const int xs0, const int xs1,
/* */ ValueType* KOKKOS_RESTRICT Y, const int ys0, const int ys1) {
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, m * n), [&](const int& iTemp) {
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, m * n), [&](const int& iTemp) {
int i, j;
getIndices<int, layout>(iTemp, n, m, j, i);
Y[i * ys0 + j * ys1] += alpha[i * alphas0] * X[i * xs0 + j * xs1];
Expand Down
19 changes: 12 additions & 7 deletions batched/dense/impl/KokkosBatched_Copy_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ struct TeamVectorCopyInternal {
KOKKOS_FORCEINLINE_FUNCTION static int invoke(const MemberType &member, Op op, const int m,
const ValueType *KOKKOS_RESTRICT A, const int as0,
/* */ ValueType *KOKKOS_RESTRICT B, const int bs0) {
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, m), [&](const int &i) { B[i * bs0] = op(A[i * as0]); });
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, m),
[&](const int &i) { B[i * bs0] = op(A[i * as0]); });
// member.team_barrier();
return 0;
}
Expand All @@ -86,14 +87,18 @@ struct TeamVectorCopyInternal {
/* */ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) {
if (as0 > as1) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n),
[&](const int &j) { B[i * bs0 + j * bs1] = op(A[i * as0 + j * as1]); });
Kokkos::parallel_for(
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) { B[i * bs0 + j * bs1] = op(A[i * as0 + j * as1]); });
});
} else {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [&](const int &i) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
[&](const int &j) { B[i * bs0 + j * bs1] = op(A[i * as0 + j * as1]); });
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[&](const int &i) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n),
[&](const int &j) {
B[i * bs0 + j * bs1] = op(A[i * as0 + j * as1]);
});
});
}
// member.team_barrier();
return 0;
Expand Down
4 changes: 2 additions & 2 deletions batched/dense/impl/KokkosBatched_Dot_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ struct TeamVectorDotInternal {
using ats = KokkosKernels::ArithTraits<ValueType>;
ValueType t(0);
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(member, m),
Kokkos::RangePolicy(member, 0, m),
[&](const int &i, ValueType &update) {
const int idx_a = i * as0, idx_b = i * bs0;
update += ats::conj(A[idx_a]) * B[idx_b];
Expand All @@ -130,7 +130,7 @@ struct TeamVectorDotInternal {
const ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1;
const ValueType *KOKKOS_RESTRICT B_at_j = B + j * bs1;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, m),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[&](const int &i, ValueType &update) {
const int idx_a = i * as0, idx_b = i * bs0;
update += ats::conj(A_at_j[idx_a]) * B_at_j[idx_b];
Expand Down
31 changes: 17 additions & 14 deletions batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,14 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal<Algo::Gemm::Unblocked, false>:

Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
const ValueType *KOKKOS_RESTRICT pA = A + i * as0;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) {
const ValueType *KOKKOS_RESTRICT pB = B + j * bs1;

ValueType c = ValueType(0);
for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0];
C[i * cs0 + j * cs1] += alpha * c;
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) {
const ValueType *KOKKOS_RESTRICT pB = B + j * bs1;

ValueType c = ValueType(0);
for (int p = 0; p < k; ++p) c += pA[p * as1] * pB[p * bs0];
C[i * cs0 + j * cs1] += alpha * c;
});
});
}
return 0;
Expand Down Expand Up @@ -85,13 +86,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGemmInternal<Algo::Gemm::Unblocked, true>::

Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
const ValueType *KOKKOS_RESTRICT pA = A + i * as0;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) {
const ValueType *KOKKOS_RESTRICT pB = B + j * bs1;

ValueType c = ValueType(0);
for (int p = 0; p < k; ++p) c += KokkosKernels::ArithTraits<ValueType>::conj(pA[p * as1]) * pB[p * bs0];
C[i * cs0 + j * cs1] += alpha * c;
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) {
const ValueType *KOKKOS_RESTRICT pB = B + j * bs1;

ValueType c = ValueType(0);
for (int p = 0; p < k; ++p)
c += KokkosKernels::ArithTraits<ValueType>::conj(pA[p * as1]) * pB[p * bs0];
C[i * cs0 + j * cs1] += alpha * c;
});
});
}
return 0;
Expand Down
19 changes: 10 additions & 9 deletions batched/dense/impl/KokkosBatched_Gesv_Impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
reducer_value_type value;
Kokkos::MaxLoc<value_type, int> reducer_value(value);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, n),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j, reducer_value_type &update) {
if (Kokkos::abs(A(j, i)) > update.val) {
update.val = Kokkos::abs(A(j, i));
Expand All @@ -231,7 +231,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
reducer_value);
D2(i) = 1. / value.val;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, n),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j, reducer_value_type &update) {
if (Kokkos::abs(A(i, j)) > update.val) {
update.val = Kokkos::abs(A(i, j));
Expand All @@ -243,15 +243,15 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
});

Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D2(j); });
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n), [&](const int &j) { A(i, j) *= D2(j); });
});

Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) {
value_type D1_i = KokkosKernels::ArithTraits<value_type>::zero();
reducer_value_type value;
Kokkos::MaxLoc<value_type, int> reducer_value(value);
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange(member, n),
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j, reducer_value_type &update) {
if (Kokkos::abs(A(i, j)) > update.val) {
update.val = Kokkos::abs(A(i, j));
Expand All @@ -260,7 +260,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
},
reducer_value);
D1_i = 1. / value.val;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), [&](const int &j) { A(i, j) *= D1_i; });
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n), [&](const int &j) { A(i, j) *= D1_i; });
Y(i) *= D1_i;
});

Expand All @@ -269,7 +269,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
reducer_value_type value{};
Kokkos::MaxLoc<value_type, int> reducer_value(value);
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(member, n),
Kokkos::RangePolicy(member, 0, n),
[&](const int &j, reducer_value_type &update) {
if (tmp_v_1(j) > update.val) {
update.val = tmp_v_1(j);
Expand All @@ -281,7 +281,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
value.loc = 0;
value.val = KokkosKernels::ArithTraits<value_type>::zero();
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(member, n),
Kokkos::RangePolicy(member, 0, n),
[&](const int &j, reducer_value_type &update) {
if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) {
update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j));
Expand All @@ -294,7 +294,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting<MemberType>::invoke(const Me
tmp_v_1(row_index) = KokkosKernels::ArithTraits<value_type>::zero();
tmp_v_2(col_index) = KokkosKernels::ArithTraits<value_type>::zero();

Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n),
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, n),
[&](const int &j) { PDAD(col_index, j) = A(row_index, j); });
PDY(col_index) = Y(row_index);
}
Expand Down Expand Up @@ -323,7 +323,8 @@ KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, const
const VectorType3 DX) {
const size_t n = X.extent(0);

Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const size_t &i) { DX(i) = D(i) * X(i); });
Kokkos::parallel_for(Kokkos::RangePolicy(member, 0, n),
[&](const size_t &i) { DX(i) = D(i) * X(i); });
}

///
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,9 @@ class BatchedDblBufGemm {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_M), [&](const int &thread_id) {
int m_offset = thread_id + start_m;

Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_N), [&](const int &vlane_id) {
Kokkos::parallel_for(
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, STRIDE_N),
[&](const int &vlane_id) {
int n_offset = vlane_id + start_n;

// Here we populate scratch memory with one or more "k" tiles for
Expand Down Expand Up @@ -513,7 +515,9 @@ class BatchedDblBufGemm {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_N), [&](const int &thread_id) {
int n_offset = thread_id + start_n;

Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_M), [&](const int &vlane_id) {
Kokkos::parallel_for(
Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, STRIDE_M),
[&](const int &vlane_id) {
int m_offset = vlane_id + start_m;

// Here we populate scratch memory with one or more "k" tiles for
Expand Down
2 changes: 1 addition & 1 deletion batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct TeamVectorSetIdentityInternal {
/* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) {
const ValueType one(1), zero(0);
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n),
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, n),
[&](const int &j) { A[i * as0 + j * as1] = i == j ? one : zero; });
});

Expand Down
7 changes: 4 additions & 3 deletions batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,10 @@ struct TeamVectorSetLowerTriangularInternal {
/* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) {
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &j) {
const int jdist = j + dist;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, m), [=](const int &i) {
if (i >= jdist) A[i * as0 + j * as1] = alpha;
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, m),
[=](const int &i) {
if (i >= jdist) A[i * as0 + j * as1] = alpha;
});
});
return 0;
}
Expand Down
11 changes: 6 additions & 5 deletions batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,11 @@ KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftLower<Algo::Trsm::Unblocked
member.team_barrier();
}
Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend), [&](const int &j) {
// assume layout right for batched computation
B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1];
});
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, jend),
[&](const int &j) {
// assume layout right for batched computation
B2[i * bs0 + j * bs1] -= a21[i * as0] * b1t[j * bs1];
});
});
}
}
Expand Down Expand Up @@ -108,7 +109,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorTrsmInternalLeftUpper<Algo::Trsm::Unblocked
}

Kokkos::parallel_for(Kokkos::TeamThreadRange(member, iend), [&](const int &i) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, jend),
Kokkos::parallel_for(Kokkos::RangePolicy(Kokkos::ThreadHandle<MemberType>(member), 0, jend),
[&](const int &j) { B0[i * bs0 + j * bs1] -= a01[i * as0] * b1t[j * bs1]; });
});
}
Expand Down
Loading