Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
160 commits
Select commit Hold shift + click to select a range
ef6ff36
Adjust .gitignore for my tools
Oct 7, 2025
cb30403
add external fluid simulation example
petbab May 17, 2026
2f42a3f
Merge pull request #71 from petbab/fluid-sim-example
jiri-filipovic May 19, 2026
b9eb213
fix C++ backend: dataSize==0 convention and missing stdev in precise …
jiri-filipovic Apr 10, 2026
7b8cb5e
fix C++ backend: wait for async actions before cleanup and on synchro…
jiri-filipovic Apr 10, 2026
5281dc7
skip Local and Symbol arguments in C++ backend with warning
jiri-filipovic Apr 13, 2026
d729941
Add baseline for tuning compiler option from a separate group
Peter-Pis Apr 7, 2026
9936c9a
Add default values for compiler options that are tuned separately
Peter-Pis Apr 9, 2026
edf7e0e
better naming for separate compiler options group
Peter-Pis Apr 9, 2026
9c6192a
cleanup separate compiler options
Peter-Pis Apr 10, 2026
9def1e8
Add python bindings for separate compiler options tuning
Peter-Pis Apr 10, 2026
43ef8b4
Fix default values for separate compiler options tuning
Peter-Pis Apr 13, 2026
c990be5
Add script support for separate compiler options
Peter-Pis Apr 13, 2026
a418048
Add compilation data to de/serialization in T4 format
Mar 4, 2026
1d176cf
Fix naming bug to comply with T4 results schema
Mar 5, 2026
ead1bf9
Change parameters' values type in configuration in T4 results schema
Mar 13, 2026
e683001
more compiler options in CoulombSum3D + switch between tuning them se…
jiri-filipovic Apr 15, 2026
a190f67
updating and fixing onboarding guide
jiri-filipovic Apr 15, 2026
1375c28
Fix Doxygen annotation mismatches and configuration gaps
jiri-filipovic Apr 15, 2026
00f5eba
experimental sequential dump of compiled CUDA kernels
jiri-filipovic Apr 16, 2026
de51c7c
fix constraint ordering
jiri-filipovic Apr 27, 2026
a51ece1
Change profiling overhead accumulation
Mar 19, 2026
a7ba3b5
Pass duration to last pass AFTER overhead accounting
Apr 17, 2026
37fa0da
Refactor computation of total overhead
Apr 21, 2026
13a9900
Refactor extra duration overhead
Apr 28, 2026
48059e4
Remove undefined declaration
Apr 28, 2026
efe3f18
Update full search space files for 03KernelTuning
Apr 28, 2026
75a1720
Capture overhead for precise measurements
Apr 30, 2026
53a09ef
fixed kernel results validation if a kernel produces NaNs
jiri-filipovic May 5, 2026
980c902
FlushL2Cache is now Sanitize (additionally reset stack size in CUDA e…
jiri-filipovic May 5, 2026
10de115
use default stack size in sanitizing
jiri-filipovic May 5, 2026
178eef0
Prepare 2.3 release
jiri-filipovic May 6, 2026
7c0285b
fixing logo
jiri-filipovic May 6, 2026
a3cb4b6
fixing logo
jiri-filipovic May 6, 2026
400c5cc
add external fluid simulation example
petbab May 17, 2026
681758a
Add first draft of Example base class
Petronous Feb 20, 2026
f541ecd
Add option for choosing reference kernel file to Example base class.
Petronous Feb 20, 2026
05c36c2
Finish v1 of Example class: define Run, add virt funcs to constructor
Petronous Feb 20, 2026
5d3a3de
Convert Transpose.cpp to make use of Example class
Petronous Feb 20, 2026
3db1ecd
Modify premake5.lua to compile Example.cpp for examples
Petronous Feb 20, 2026
cef49cf
Start implementing factory function to be able to initialize with vir…
Petronous Feb 20, 2026
273a1cc
Fix InitReferenceDefault; Add InitKernelsDefault, InitBuffers
Petronous Feb 20, 2026
860c401
Fix factory method pattern in Example
Petronous Feb 20, 2026
cdb0556
Decompose Example class based on usage of reference kernel or computa…
Petronous Feb 20, 2026
e233758
Fix GlobalSizeType setting; Reformat function arguments.
Petronous Feb 20, 2026
694f058
Add support for custom searcher and stop condition.
Petronous Feb 20, 2026
973db91
Reformat function parameters in ExampleReferenceKernel; Remove debug …
Petronous Feb 20, 2026
a986892
Add ExampleReferenceComputation to finish decomposition of Example cl…
Petronous Feb 20, 2026
bda66be
Reformat Bicg indenting
Petronous Feb 20, 2026
f708f03
Remove Conv3d ref kernel; Adjust ALGORITHM handling in kernel to match.
Petronous Mar 25, 2026
1dddef9
Fix: Make ExampleReferenceComputation inherit ExampleBase publicly
Petronous Mar 25, 2026
987a8cf
Convert Convolution3d.cpp to inherit ExampleReferenceComputation
Petronous Mar 25, 2026
b3124ca
Remove redundant reference kernel from Convolution3d.cpp
Petronous Mar 25, 2026
a0fb61e
Fix: Add ExampleReferenceComputation as friend to Convolution3d
Petronous Mar 25, 2026
edec75d
Fix paths in Convolution3d
Petronous Mar 25, 2026
93d5d4c
Convert ThreadModifiers to count with CUDA GlobalSizeType in Convolut…
Petronous Mar 25, 2026
1f8078c
Fix typo (m_tuner -> tuner) in a comment in Convolution3d
Petronous Mar 25, 2026
12b15f1
Reformat Convolution3d: Add line breaks between funcs, Add comments t…
Petronous Mar 25, 2026
3db35a5
Change Example*::Create method to return unique_ptr
Petronous Mar 25, 2026
d94ee75
Remove std:: from Convolution3d
Petronous Mar 25, 2026
0f61bf1
Add min and max val params to ExampleBase::FillBuffers
Petronous Mar 25, 2026
43e374c
Convert Reduction.cpp to use ExampleReferenceComputation
Petronous Mar 25, 2026
5caa305
Rebase upstream changes onto examples-refactor branch
Petronous Apr 28, 2026
7cbfdd3
Add Example base class files to newly pulled projects in premake5.lua
Petronous Apr 28, 2026
1b799a1
WIP: Begin work on testing script
Petronous Apr 28, 2026
72198a9
WIP: Pull original versions of projects into separate folder for test…
Petronous Apr 28, 2026
661a6dc
WIP: Refactor premak5.lua example project declaration
Petronous Apr 28, 2026
677ba94
WIP: More work on testing
Petronous Apr 28, 2026
bab9957
Fix premake5.lua AddExampleProject: rename param enableOpenMP -> shou…
Petronous Apr 29, 2026
0f0587f
WIP: add caching and remove unneeded rm statements, fix premake calls
Petronous Apr 29, 2026
c851867
Make FillBuffers templated, move to header file.
Petronous Apr 29, 2026
e79b86c
FIx Transpose: Instantiate FillBuffers explicitly
Petronous Apr 29, 2026
4bd2df1
Fix Reduction: Instantiate FillBuffers explicitly
Petronous Apr 29, 2026
82423c7
WIP: fix space before ], use the REF_OUTPUT_JSON everywhere
Petronous Apr 29, 2026
595aae3
Fix ExampleBase::FillBuffers: Add conditional declaration of uniform_…
Petronous Apr 29, 2026
8eba418
Disable profiling in reference Transpose
Petronous Apr 29, 2026
8e20e93
Refactor Sort
Petronous Apr 29, 2026
1e7f1c4
Fix Transpose: Increase problemSize so it is consistent with reference.
Petronous Apr 29, 2026
77ee283
WIP: Fix caching in test script
Petronous Apr 29, 2026
bf1de14
WIP: Refactor Sort2 (broken: Floating point exception in unknown place)
Petronous Apr 29, 2026
a634960
Add fast math and OpenMP compiler option methods to ExampleBase
Petronous May 10, 2026
d783a07
Add precision parameter to InitReferenceKernel in ExampleReferenceKernel
Petronous May 10, 2026
b92dd84
Add summary print to ExampleBase
Petronous May 10, 2026
980b4f1
Fix assumption of 2D in ExampleBase::InitKernelDefault
Petronous May 10, 2026
3d84fd1
Fix virtual -> override in Convolution3d methods
Petronous May 10, 2026
bab2ea1
Modify Convolution3d reference version for approximate testing
Petronous May 10, 2026
1b5fcaa
Modify CoulombSum2d reference version for approximate testing
Petronous May 10, 2026
91a3e59
Modify CoulombSum3d reference version for approximate testing
Petronous May 10, 2026
de2b291
Modify Convolution3d for approximate testing
Petronous May 10, 2026
77bbf10
WIP: Refactor Nbody (broken kernel)
Petronous May 10, 2026
796911a
WIP: Add special case for output name in testing script
Petronous May 10, 2026
7820dd4
WIP: Fix Sort2 crashing by inserting constraint that disables last co…
Petronous May 10, 2026
d65be49
Refactor CoulombSum2d
Petronous May 10, 2026
3e1454f
Refactor CoulombSum3d
Petronous May 10, 2026
a1e3ebd
Modify ClTuneGemm reference version for approximate testing
Petronous May 11, 2026
4382ca2
Refactor ClTuneGemm
Petronous May 11, 2026
483c224
Rename ExampleBase::InitKernels to InitKernel
Petronous May 14, 2026
e015fcb
Remove superfluous InitReferenceOutputsDefault from ExampleReferenceK…
Petronous May 14, 2026
cb5be23
Rename ExampleBase::InitTuningParameters to InitTuningSpace
Petronous May 14, 2026
0c895ce
Add CLI handling helper for Examples
Petronous May 18, 2026
131d351
Add support for CLI handling to ExampleBase
Petronous May 18, 2026
5244c80
Add support for CLI handling to ExampleReferenceKernel
Petronous May 18, 2026
dea60f8
Add support for CLI handling to ExampleReferenceComputation
Petronous May 18, 2026
fb9e8ba
WIP: Rename testing scripts to remove mentions of the Sort Example
May 14, 2026
ff6dc52
Remove override from InitSearcher
Petronous May 18, 2026
15f8036
Update refactored Examples to match changes to base classes
Petronous May 18, 2026
19e04b1
Add support for precise measurement parameters to CLI and ExampleBase.
Petronous May 18, 2026
f0a1710
Add support for --useDynamicTuning option
Petronous May 18, 2026
0ff21c7
Remove debug prints from ExampleConfigurator.cpp
Petronous May 18, 2026
91bc3c6
Update Examples using ExampleReferenceComputation to friend ExampleBase
Petronous May 18, 2026
c923193
Refactor Bicg
Petronous May 18, 2026
c4a67a4
Split AtfCCSD from AtfSample
Petronous May 18, 2026
ef21ba4
Split AtfConvolution from AtfSample
Petronous May 18, 2026
0696e6c
Remove unused models from AtfConvolution
Petronous May 18, 2026
40b98a2
Refactor Covariance
Petronous May 18, 2026
5866e08
Split AtfGEMM from AtfSamples
Petronous May 18, 2026
b9802e3
Split AtfPRL from AtfSamples
Petronous May 18, 2026
1b8353b
Remove redundant AtfSamples folder
Petronous May 18, 2026
1f245fa
Remove redundant includes from Atf Examples
Petronous May 18, 2026
411c0e9
Add AtfSamples splitoffs to premake5.lua
Petronous May 19, 2026
fe48c52
Split ReferenceVersions/AtfSamples into CCSD, Convolution, GEMM, PRL …
Petronous May 19, 2026
577479f
Change constants in RefVersion AtfSamples splitoffs so they compile c…
Petronous May 19, 2026
95aa124
Fix Atf splitoffs build: rename .cpp files to correspond with Example…
Petronous May 19, 2026
1449ae6
Revert "Remove redundant AtfSamples folder"
Petronous May 19, 2026
d6a3465
Refactor AtfCCSD
Petronous May 19, 2026
81488a6
Refactor AtfGEMM
Petronous May 19, 2026
bea0e82
Refactor AtfPRL
Petronous May 19, 2026
9f1ffb9
Refactor AtfConvolution
Petronous May 19, 2026
130a271
Refactor ClTuneConvolution
Petronous May 19, 2026
cc98d98
Revert Nbody to before refactoring
Petronous May 19, 2026
edeb9e2
Refactor Nbody manually
Petronous May 19, 2026
2b76e33
Refactor KernelTunerConvolution
Petronous May 19, 2026
57c5720
Refactor KernelTunerPnpoly
Petronous May 19, 2026
ee0b20f
Add customizable problem size to Nbody
Petronous May 20, 2026
d0a6c62
Add customizable problem size to AtfConvolution
Petronous May 20, 2026
a25df2f
Add customizable problem size to AtfGEMM
Petronous May 20, 2026
1341098
Add customizable problem size AtfPRL
Petronous May 20, 2026
0513c0f
Make Example configuration includable
Petronous May 20, 2026
3093a0b
Add to ExampleBase: support for custom suffix; support for cpp kernels
Petronous May 20, 2026
461710c
Refactor RodiniaHotspot
Petronous May 20, 2026
6d0f69c
Add support for RodiniaHotspot to premake5.lua
Petronous May 20, 2026
b99c62a
Remove unused headers from RodiniaHotspot
Petronous May 20, 2026
cbdf473
Move GemmBatch to Examples
Petronous May 20, 2026
b99f215
Refactor GemmBatch
Petronous May 20, 2026
70d8142
Add GemmBatch to premake5.lua
Petronous May 20, 2026
8f6b2d7
Refactor Dummy
Petronous May 20, 2026
8da9808
Remove redundant Examples
Petronous May 20, 2026
07ddae0
WIP: Refactor Microbenchmarks
Petronous May 20, 2026
45977d3
Revert "WIP: Refactor Microbenchmarks"
Petronous May 20, 2026
fd4ba04
Delete unused AtfSamples.cpp
May 21, 2026
49a029c
Add customizable CLI to CoulombSum3d
Petronous May 26, 2026
ceab27b
Reintroduce problemSize handling to CoulombSum3d
Petronous May 26, 2026
a998fe4
Rename CoulombSum3d config members to keep naming scheme consistent
Petronous May 26, 2026
c15f789
Clean up unused code in RodiniaHotspot
Petronous May 26, 2026
b9f5f0a
Update reference version of CoulombSum3d from master
Petronous May 26, 2026
e39de70
Add support for separate compiler tuning in CoulombSum3d
Petronous May 26, 2026
8fe6e61
Merge remote-tracking branch 'upstream' into examples-refactor
Petronous May 26, 2026
b701bca
Merge remote-tracking branch 'upstream/development' into examples-ref…
Petronous May 26, 2026
8eb7da7
Rename ReferenceVersions to LegacyExamples and move test scripts there
Petronous May 29, 2026
fbae8d0
Fix Sort: divide GLOBAL_SIZE by LOCAL_SIZE for correct CUDA sizing
Petronous May 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ Build/*
premake5.exe
premake5
*.swp

.cache
compile_commands.json
460 changes: 460 additions & 0 deletions Examples/AtfCCSD/AtfCCSD.cpp

Large diffs are not rendered by default.

129,355 changes: 129,355 additions & 0 deletions Examples/AtfCCSD/TcAbcdefGebcDfga1.cl

Large diffs are not rendered by default.

129,398 changes: 129,398 additions & 0 deletions Examples/AtfCCSD/TcAbcdefGebcDfga1.cu

Large diffs are not rendered by default.

224,763 changes: 224,763 additions & 0 deletions Examples/AtfCCSD/TcAbcdefGebcDfga2.cl

Large diffs are not rendered by default.

224,806 changes: 224,806 additions & 0 deletions Examples/AtfCCSD/TcAbcdefGebcDfga2.cu

Large diffs are not rendered by default.

203 changes: 203 additions & 0 deletions Examples/AtfConvolution/AtfConvolution.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#include "../ExampleBase.h"

#include <cmath>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

using namespace std;

/**
 * Tuning example for the "gaussian_1" kernel, built on top of ExampleBase.
 *
 * The example owns one input buffer and two output-sized buffers, registers the tuning parameters
 * and constraints for the kernel, and maps the tuned NUM_WG / NUM_WI parameters onto the X and Y
 * thread dimensions through thread modifiers.
 */
class AtfConvolution : public ExampleBase {
protected:
    /**
     * Derives the (square) input extent from the problem size held by ExampleBase and configures the
     * tuner to interpret global sizes the OpenCL way.
     *
     * @throws invalid_argument if the problem size is smaller than 1. Such a size would produce an
     *         input extent of 0 (or an undefined value for negative sizes), and the unsigned
     *         subtraction of c_outputMargin used throughout this class would then wrap around.
     */
    AtfConvolution(shared_ptr<ExampleConfiguration> config, int defaultProblemSize,
        string exampleFolderPath, string defaultKernelFileBaseName) :
        ExampleBase(config, defaultProblemSize, exampleFolderPath, defaultKernelFileBaseName)
    {
        if (m_problemSize < 1)
        {
            throw invalid_argument("AtfConvolution: problem size must be at least 1");
        }

        // Keep OpenCL sizes as specified
        m_inputSize1 = static_cast<int>(sqrt(m_problemSize)) * 1024;
        m_inputSize2 = m_inputSize1;

        m_tuner.SetGlobalSizeType(ktt::GlobalSizeType::OpenCL);
    }

    friend ExampleBase;

    // Number of elements by which each output dimension is smaller than the matching input dimension.
    static constexpr uint64_t c_outputMargin = 4;

    // Input extents in both dimensions - kept as member variables
    uint64_t m_inputSize1;
    uint64_t m_inputSize2;

    // Host-side data vectors
    vector<float> m_in;
    vector<float> m_out;
    vector<float> m_intRes;

    // Argument IDs returned by the tuner for the vectors above
    ktt::ArgumentId m_inId;
    ktt::ArgumentId m_outId;
    ktt::ArgumentId m_intResId;

    /**
     * Builds the list of candidate values 1, 2, ..., max (inclusive).
     *
     * @param max Largest value in the range; 0 yields an empty vector.
     * @return Ascending vector of all integers in [1, max].
     */
    vector<uint64_t> ParameterRange(const uint64_t max)
    {
        vector<uint64_t> values;
        values.reserve(max);

        for (uint64_t i = 1; i <= max; ++i)
        {
            values.push_back(i);
        }

        return values;
    }

    /**
     * Sizes and fills the host buffers: the input gets the repeating pattern 1..100, both
     * output-sized buffers are zeroed.
     */
    void InitData() override
    {
        const uint64_t outputElements = (m_inputSize1 - c_outputMargin) * (m_inputSize2 - c_outputMargin);

        m_in.resize(m_inputSize1 * m_inputSize2);

        for (size_t i = 0; i < m_in.size(); ++i)
        {
            m_in[i] = static_cast<float>((i % 100) + 1);
        }

        // assign() both resizes and overwrites every element, so stale contents never survive.
        m_out.assign(outputElements, 0.0f);
        m_intRes.assign(outputElements, 0.0f);
    }

    /**
     * Registers the three buffers with the tuner and creates the kernel through
     * ExampleBase::InitKernelDefault.
     */
    void InitKernel() override
    {
        m_inId = m_tuner.AddArgumentVector(m_in, ktt::ArgumentAccessType::ReadOnly);
        m_outId = m_tuner.AddArgumentVector(m_out, ktt::ArgumentAccessType::ReadWrite);
        m_intResId = m_tuner.AddArgumentVector(m_intRes, ktt::ArgumentAccessType::ReadWrite);

        InitKernelDefault("gaussian_1", "Convolution", ktt::DimensionVector(), {m_inId, m_outId, m_intResId});
    }

    /**
     * Declares all tuning parameters, the constraints between them and the thread modifiers that
     * turn parameter values into global and local thread sizes.
     */
    void InitTuningSpace() override
    {
        // --- Constraint predicates -------------------------------------------------------------

        // True when the values never increase from left to right.
        auto DescendingConstraint = [](const vector<uint64_t>& v)
        {
            for (size_t i = 1; i < v.size(); ++i)
            {
                if (v[i - 1] < v[i])
                {
                    return false;
                }
            }

            return true;
        };

        // True when no two neighbouring values are equal; the last value is also compared with
        // the first one. Fewer than two values are always accepted.
        auto UnequalConstraint = [](const vector<uint64_t>& v)
        {
            if (v.size() < 2)
            {
                return true;
            }

            for (size_t i = 1; i < v.size(); ++i)
            {
                if (v[i - 1] == v[i])
                {
                    return false;
                }
            }

            return v.back() != v.front();
        };

        // v[0] <= ceil(v[1] / v[2])
        auto LessThanOrEqualCeilDivConstraint = [](const vector<uint64_t>& v) { return v[0] <= (v[1] + v[2] - 1) / v[2]; };
        // v[0] divides v[1]
        auto DividesConstraint = [](const vector<uint64_t>& v) { return v[1] % v[0] == 0; };
        // v[0] divides floor(v[1] / v[2])
        auto DividesDivConstraint = [](const vector<uint64_t>& v) { return (v[1] / v[2]) % v[0] == 0; };

        // --- Parameters ------------------------------------------------------------------------

        const uint64_t outputSize1 = m_inputSize1 - c_outputMargin;
        const uint64_t outputSize2 = m_inputSize2 - c_outputMargin;

        // Each range is shared by four parameters, so build it once per dimension.
        const vector<uint64_t> range1 = ParameterRange(outputSize1);
        const vector<uint64_t> range2 = ParameterRange(outputSize2);

        m_tuner.AddParameter(m_kernel, "CACHE_L_CB", vector<uint64_t>{0, 1});
        m_tuner.AddParameter(m_kernel, "CACHE_P_CB", vector<uint64_t>{0, 1});
        m_tuner.AddParameter(m_kernel, "G_CB_RES_DEST_LEVEL", vector<uint64_t>{2});
        m_tuner.AddParameter(m_kernel, "L_CB_RES_DEST_LEVEL", vector<uint64_t>{2, 1, 0});
        m_tuner.AddParameter(m_kernel, "P_CB_RES_DEST_LEVEL", vector<uint64_t>{2, 1, 0});

        m_tuner.AddParameter(m_kernel, "OCL_DIM_L_1", vector<uint64_t>{0, 1});
        m_tuner.AddParameter(m_kernel, "OCL_DIM_L_2", vector<uint64_t>{0, 1});

        m_tuner.AddParameter(m_kernel, "INPUT_SIZE_L_1", vector<uint64_t>{outputSize1});
        m_tuner.AddParameter(m_kernel, "L_CB_SIZE_L_1", range1);
        m_tuner.AddParameter(m_kernel, "P_CB_SIZE_L_1", range1);
        m_tuner.AddParameter(m_kernel, "NUM_WG_L_1", range1);
        m_tuner.AddParameter(m_kernel, "NUM_WI_L_1", range1);

        m_tuner.AddParameter(m_kernel, "INPUT_SIZE_L_2", vector<uint64_t>{outputSize2});
        m_tuner.AddParameter(m_kernel, "L_CB_SIZE_L_2", range2);
        m_tuner.AddParameter(m_kernel, "P_CB_SIZE_L_2", range2);
        m_tuner.AddParameter(m_kernel, "NUM_WG_L_2", range2);
        m_tuner.AddParameter(m_kernel, "NUM_WI_L_2", range2);

        m_tuner.AddParameter(m_kernel, "L_REDUCTION", vector<uint64_t>{1});
        m_tuner.AddParameter(m_kernel, "P_WRITE_BACK", vector<uint64_t>{0});
        m_tuner.AddParameter(m_kernel, "L_WRITE_BACK", vector<uint64_t>{2});

        // --- Constraints -----------------------------------------------------------------------

        m_tuner.AddConstraint(m_kernel, {"G_CB_RES_DEST_LEVEL", "L_CB_RES_DEST_LEVEL", "P_CB_RES_DEST_LEVEL"}, DescendingConstraint);
        m_tuner.AddConstraint(m_kernel, {"OCL_DIM_L_1", "OCL_DIM_L_2"}, UnequalConstraint);

        m_tuner.AddConstraint(m_kernel, {"L_CB_SIZE_L_1", "INPUT_SIZE_L_1"}, DividesConstraint);
        m_tuner.AddConstraint(m_kernel, {"P_CB_SIZE_L_1", "L_CB_SIZE_L_1"}, DividesConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WG_L_1", "INPUT_SIZE_L_1", "L_CB_SIZE_L_1"}, DividesDivConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WI_L_1", "L_CB_SIZE_L_1", "P_CB_SIZE_L_1"}, DividesDivConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WI_L_1", "INPUT_SIZE_L_1", "NUM_WG_L_1"}, LessThanOrEqualCeilDivConstraint);

        m_tuner.AddConstraint(m_kernel, {"L_CB_SIZE_L_2", "INPUT_SIZE_L_2"}, DividesConstraint);
        m_tuner.AddConstraint(m_kernel, {"P_CB_SIZE_L_2", "L_CB_SIZE_L_2"}, DividesConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WG_L_2", "INPUT_SIZE_L_2", "L_CB_SIZE_L_2"}, DividesDivConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WI_L_2", "L_CB_SIZE_L_2", "P_CB_SIZE_L_2"}, DividesDivConstraint);
        m_tuner.AddConstraint(m_kernel, {"NUM_WI_L_2", "INPUT_SIZE_L_2", "NUM_WG_L_2"}, LessThanOrEqualCeilDivConstraint);

        // --- Thread modifiers ------------------------------------------------------------------
        // OCL_DIM_L_n selects the thread dimension (0 = X, 1 = Y) used by loop n. The comparison
        // result (0 or 1) acts as a mask, so each modifier picks up only the loop mapped to its
        // dimension. The incoming default size (first lambda argument) is ignored.

        // Global X: NUM_WG * NUM_WI of the loop mapped to dimension 0
        m_tuner.AddThreadModifier(m_kernel, {m_definition}, ktt::ModifierType::Global, ktt::ModifierDimension::X,
            {"OCL_DIM_L_1", "NUM_WG_L_1", "NUM_WI_L_1", "OCL_DIM_L_2", "NUM_WG_L_2", "NUM_WI_L_2"},
            [](const uint64_t, const vector<uint64_t>& values)
            {
                return static_cast<uint64_t>(values[0] == 0) * values[1] * values[2]
                    + static_cast<uint64_t>(values[3] == 0) * values[4] * values[5];
            });

        // Global Y: NUM_WG * NUM_WI of the loop mapped to dimension 1
        m_tuner.AddThreadModifier(m_kernel, {m_definition}, ktt::ModifierType::Global, ktt::ModifierDimension::Y,
            {"OCL_DIM_L_1", "NUM_WG_L_1", "NUM_WI_L_1", "OCL_DIM_L_2", "NUM_WG_L_2", "NUM_WI_L_2"},
            [](const uint64_t, const vector<uint64_t>& values)
            {
                return static_cast<uint64_t>(values[0] == 1) * values[1] * values[2]
                    + static_cast<uint64_t>(values[3] == 1) * values[4] * values[5];
            });

        // Local X: NUM_WI of the loop mapped to dimension 0
        m_tuner.AddThreadModifier(m_kernel, {m_definition}, ktt::ModifierType::Local, ktt::ModifierDimension::X,
            {"OCL_DIM_L_1", "NUM_WI_L_1", "OCL_DIM_L_2", "NUM_WI_L_2"},
            [](const uint64_t, const vector<uint64_t>& values)
            {
                return static_cast<uint64_t>(values[0] == 0) * values[1]
                    + static_cast<uint64_t>(values[2] == 0) * values[3];
            });

        // Local Y: NUM_WI of the loop mapped to dimension 1
        m_tuner.AddThreadModifier(m_kernel, {m_definition}, ktt::ModifierType::Local, ktt::ModifierDimension::Y,
            {"OCL_DIM_L_1", "NUM_WI_L_1", "OCL_DIM_L_2", "NUM_WI_L_2"},
            [](const uint64_t, const vector<uint64_t>& values)
            {
                return static_cast<uint64_t>(values[0] == 1) * values[1]
                    + static_cast<uint64_t>(values[2] == 1) * values[3];
            });
    }
};

int main(int argc, char **argv)
{
unique_ptr<AtfConvolution> atfConvolution = AtfConvolution::Create<AtfConvolution>(argc, argv, 16, "Examples/AtfConvolution", "GaussianStatic1");
atfConvolution->Run();

return 0;
}
Loading