Ginkgo: The performance-debugging program (original) (raw)

The simple solver with performance debugging example..

This example depends on simple-solver-logging, minimal-cuda-solver.

Table of contents
Introduction About the example The commented program Results Comments about programming and debugging The plain program

Introduction

About the example

This example runs a solver on a test problem and shows how to use loggers to debug performance and convergence rate.

The commented program

creates a zero vector

template

std::unique_ptr<vec> create_vector(

std::shared_ptr exec, gko::size_type size,

ValueType value)

{

auto res = vec::create(exec);

return res;

}

utilities for computing norms and residuals

template

ValueType get_first_element(const vec* norm)

{

return norm->get_executor()->copy_val_to_host(norm->get_const_values());

}

template

{

auto exec = b->get_executor();

auto b_norm = gko::initialize<real_vec>({0.0}, exec);

b->compute_norm2(b_norm);

return get_first_element(b_norm.get());

}

template

const gko::LinOp* system_matrix, const vec* b,

const vec* x)

{

auto one = gko::initialize<vec>({1.0}, exec);

auto neg_one = gko::initialize<vec>({-1.0}, exec);

system_matrix->apply(one, x, neg_one, res);

return compute_norm(res.get());

}

}

namespace loggers {

A logger that accumulates the time of all operations. For each operation type (allocations, free, copy, internal operations i.e. kernels), the timing is taken before and after. This can create significant overhead since to ensure proper timings, calls to synchronize are required.

void on_allocation_started(const gko::Executor* exec,

{

this->start_operation(exec, "allocate");

}

void on_allocation_completed(const gko::Executor* exec,

{

this->end_operation(exec, "allocate");

}

{

this->start_operation(exec, "free");

}

{

this->end_operation(exec, "free");

}

{

this->start_operation(to, "copy");

}

{

this->end_operation(to, "copy");

}

void on_operation_launched(const gko::Executor* exec,

{

this->start_operation(exec, op->get_name());

}

void on_operation_completed(const gko::Executor* exec,

{

this->end_operation(exec, op->get_name());

}

void write_data(std::ostream& ostream)

{

for (const auto& entry : total) {

ostream << "\t" << entry.first.c_str() << ": "

<< std::chrono::duration_caststd::chrono::nanoseconds(

entry.second)

.count()

<< std::endl;

}

}

private:

Helper which synchronizes and starts the time before every operation.

const std::string& name) const

{

nested.emplace_back(0);

start[name] = std::chrono::steady_clock::now();

}

Helper to compute the end time and store the operation's time at its end. Also time nested operations.

void end_operation(const gko::Executor* exec, const std::string& name) const

{

const auto end = std::chrono::steady_clock::now();

const auto diff = end - start[name];

make sure timings for nested operations are not counted twice

total[name] += diff - nested.back();

nested.pop_back();

if (nested.size() > 0) {

nested.back() += diff;

}

}

mutable std::map<std::string, std::chrono::steady_clock::time_point> start;

mutable std::map<std::string, std::chrono::steady_clock::duration> total;

the position i of this vector holds the total time spend on child operations on nesting level i

mutable std::vectorstd::chrono::steady\_clock::duration nested;

};

This logger tracks the persistently allocated data

Store amount of bytes allocated on every allocation

{

storage[location] = num_bytes;

}

Reset the amount of bytes on every free

{

storage[location] = 0;

}

Write the data after summing the total from all allocations

void write_data(std::ostream& ostream)

{

for (const auto& e : storage) {

total += e.second;

}

ostream << "Storage: " << total << std::endl;

}

private:

mutable std::unordered_map<gko::uintptr, gko::size_type> storage;

};

Logs true and recurrent residuals of the solver

template

Depending on the available information, store the norm or compute it from the residual. If the true residual norm could not be computed, store the value -1.0.

const gko::LinOp* residual_norm) const override

{

if (residual_norm) {

rec_res_norms.push_back(utils::get_first_element(

gko::as<real_vec>(residual_norm)));

} else {

rec_res_norms.push_back(

utils::compute_norm(gko::as<vec>(residual)));

}

if (solution) {

true_res_norms.push_back(utils::compute_residual_norm(

matrix, b, gko::as<vec>(solution)));

} else {

true_res_norms.push_back(-1.0);

}

}

ResidualLogger(const gko::LinOp* matrix, const vec* b)

: gko:🪵:Logger(gko:🪵:Logger::iteration_complete_mask),

matrix{matrix},

b{b}

{}

void write_data(std::ostream& ostream)

{

ostream << "Recurrent Residual Norms: " << std::endl;

ostream << "[" << std::endl;

for (const auto& entry : rec_res_norms) {

ostream << "\t" << entry << std::endl;

}

ostream << "];" << std::endl;

ostream << "True Residual Norms: " << std::endl;

ostream << "[" << std::endl;

for (const auto& entry : true_res_norms) {

ostream << "\t" << entry << std::endl;

}

ostream << "];" << std::endl;

}

private:

const vec* b;

mutable std::vector<gko::remove_complex> rec_res_norms;

mutable std::vector<gko::remove_complex> true_res_norms;

};

}

namespace {

Print usage help

void print_usage(const char* filename)

{

std::cerr << "Usage: " << filename << " [executor] [matrix file]"

<< std::endl;

std::cerr << "matrix file should be a file in matrix market format. "

"The file data/A.mtx is provided as an example."

<< std::endl;

std::exit(-1);

}

template

{

std::cout << "[" << std::endl;

for (int i = 0; i < elements_to_print; ++i) {

std::cout << "\t" << vec->at(i) << std::endl;

}

std::cout << "];" << std::endl;

}

}

int main(int argc, char* argv[])

{

Parametrize the benchmark here Pick a value type

using ValueType = double;

using IndexType = int;

Pick a matrix format

Pick a solver

Pick a preconditioner type

Pick a residual norm reduction value

Pick an output file name

const auto of_name = "log.txt";

Simple shortcut

Print version information

Figure out where to run the code

if (argc == 2 && (std::string(argv[1]) == "--help")) {

std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;

std::exit(-1);

}

Figure out where to run the code

const auto executor_string = argc >= 2 ? argv[1] : "reference";

std::map<std::string, std::function<std::shared_ptrgko::Executor()>>

exec_map{

{"cuda",

[] {

}},

{"hip",

[] {

}},

{"dpcpp",

[] {

}},

{"reference", [] { return gko::ReferenceExecutor::create(); }}};

executor where Ginkgo will perform the computation

const auto exec = exec_map.at(executor_string)();

Read the input matrix file directory

std::string input_mtx = "data/A.mtx";

if (argc == 3) {

input_mtx = std::string(argv[2]);

}

Read data: A is read from disk Create a StorageLogger to track the size of A

auto storage_logger = std::make_sharedloggers::StorageLogger();

Add the logger to the executor

Read the matrix A from file

auto A = gko::share(gko::read(std::ifstream(input_mtx), exec));

Remove the storage logger

Pick a maximum iteration count

const auto max_iters = A->get_size()[0];

Generate b and x vectors

auto b = utils::create_vector(exec, A->get_size()[0], 1.0);

auto x = utils::create_vector(exec, A->get_size()[0], 0.0);

Declare the solver factory. The preconditioner's arguments should be adapted if needed.

auto solver_factory =

solver::build()

.with_criteria(

.with_reduction_factor(reduction_factor),

gko::stop::Iteration::build().with_max_iters(max_iters))

.with_preconditioner(preconditioner::create(exec))

.on(exec);

Declare the output file for all our loggers

std::ofstream output_file(of_name);

Do a warmup run

Clone x to not overwrite the original one

Generate and call apply on a solver

solver_factory->generate(A)->apply(b, x_clone);

}

Do a timed run

Clone x to not overwrite the original one

Synchronize ensures no operation are ongoing

Time before generate

auto g_tic = std::chrono::steady_clock::now();

Generate a solver

auto generated_solver = solver_factory->generate(A);

Time after generate

auto g_tac = std::chrono::steady_clock::now();

Compute the generation time

auto generate_time =

std::chrono::duration_caststd::chrono::nanoseconds(g_tac - g_tic);

Write the generate time to the output file

output_file << "Generate time (ns): " << generate_time.count()

<< std::endl;

Similarly time the apply

auto a_tic = std::chrono::steady_clock::now();

generated_solver->apply(b, x_clone);

auto a_tac = std::chrono::steady_clock::now();

auto apply_time =

std::chrono::duration_caststd::chrono::nanoseconds(a_tac - a_tic);

output_file << "Apply time (ns): " << apply_time.count() << std::endl;

Compute the residual norm

auto residual =

utils::compute_residual_norm(A.get(), b.get(), x_clone.get());

output_file << "Residual_norm: " << residual << std::endl;

}

Log the internal operations using the OperationLogger without timing

Create an OperationLogger to analyze the generate step

auto gen_logger = std::make_sharedloggers::OperationLogger();

Add the generate logger to the executor

Generate a solver

auto generated_solver = solver_factory->generate(A);

Remove the generate logger from the executor

Write the data to the output file

output_file << "Generate operations times (ns):" << std::endl;

gen_logger->write_data(output_file);

Create an OperationLogger to analyze the apply step

auto apply_logger = std::make_sharedloggers::OperationLogger();

Create a ResidualLogger to log the recurent residual

auto res_logger = std::make_shared<loggers::ResidualLogger>(

A.get(), b.get());

generated_solver->add_logger(res_logger);

Solve the system

generated_solver->apply(b, x);

Write the data to the output file

output_file << "Apply operations times (ns):" << std::endl;

apply_logger->write_data(output_file);

res_logger->write_data(output_file);

}

Print solution

std::cout << "Solution, first ten entries: \n";

print_vector(x.get());

Print output file location

std::cout << "The performance and residual data can be found in " << of_name

<< std::endl;

}

Results

This is the expected standard output:

Solution, first ten entries:

[

0.252218

0.108645

0.0662811

0.0630433

0.0384088

0.0396536

0.0402648

0.0338935

0.0193098

0.0234653

];

The performance and residual data can be found in log.txt

Here is a sample output in the file log.txt:

Generate time (ns): 861

Apply time (ns): 108144

Residual_norm: 2.10788e-15

Generate operations times (ns):

Apply operations times (ns):

allocate: 14991

cg::step_1#5: 7683

cg::step_2#7: 7756

copy: 7751

csr::advanced_spmv#5: 21819

csr::spmv#3: 20429

dense::compute_dot#3: 18043

dense::compute_norm2#2: 16726

free: 8857

residual_norm::residual_norm#9: 3614

Recurrent Residual Norms:

[

4.3589

2.30455

1.46771

0.984875

0.741833

0.513623

0.384165

0.316439

0.227709

0.170312

0.0973722

0.0616831

0.0454123

0.031953

0.0161606

0.00657015

0.00264367

0.000858809

0.000286461

1.64195e-15

];

True Residual Norms:

[

4.3589

2.30455

1.46771

0.984875

0.741833

0.513623

0.384165

0.316439

0.227709

0.170312

0.0973722

0.0616831

0.0454123

0.031953

0.0161606

0.00657015

0.00264367

0.000858809

0.000286461

2.10788e-15

];

Comments about programming and debugging

The plain program

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include

#include <unordered_map>

#include

#include

#include <ginkgo/ginkgo.hpp>

template

template

namespace utils {

template

std::unique_ptr<vec> create_vector(

std::shared_ptr exec, gko::size_type size,

ValueType value)

{

auto res = vec::create(exec);

return res;

}

template

ValueType get_first_element(const vec* norm)

{

return norm->get_executor()->copy_val_to_host(norm->get_const_values());

}

template

{

auto exec = b->get_executor();

auto b_norm = gko::initialize<real_vec>({0.0}, exec);

b->compute_norm2(b_norm);

return get_first_element(b_norm.get());

}

template

const gko::LinOp* system_matrix, const vec* b,

const vec* x)

{

auto one = gko::initialize<vec>({1.0}, exec);

auto neg_one = gko::initialize<vec>({-1.0}, exec);

system_matrix->apply(one, x, neg_one, res);

return compute_norm(res.get());

}

}

namespace loggers {

void on_allocation_started(const gko::Executor* exec,

{

this->start_operation(exec, "allocate");

}

void on_allocation_completed(const gko::Executor* exec,

{

this->end_operation(exec, "allocate");

}

{

this->start_operation(exec, "free");

}

{

this->end_operation(exec, "free");

}

{

this->start_operation(to, "copy");

}

{

this->end_operation(to, "copy");

}

void on_operation_launched(const gko::Executor* exec,

{

this->start_operation(exec, op->get_name());

}

void on_operation_completed(const gko::Executor* exec,

{

this->end_operation(exec, op->get_name());

}

void write_data(std::ostream& ostream)

{

for (const auto& entry : total) {

ostream << "\t" << entry.first.c_str() << ": "

<< std::chrono::duration_caststd::chrono::nanoseconds(

entry.second)

.count()

<< std::endl;

}

}

private:

const std::string& name) const

{

nested.emplace_back(0);

start[name] = std::chrono::steady_clock::now();

}

void end_operation(const gko::Executor* exec, const std::string& name) const

{

const auto end = std::chrono::steady_clock::now();

const auto diff = end - start[name];

total[name] += diff - nested.back();

nested.pop_back();

if (nested.size() > 0) {

nested.back() += diff;

}

}

mutable std::map<std::string, std::chrono::steady_clock::time_point> start;

mutable std::map<std::string, std::chrono::steady_clock::duration> total;

mutable std::vectorstd::chrono::steady\_clock::duration nested;

};

{

storage[location] = num_bytes;

}

{

storage[location] = 0;

}

void write_data(std::ostream& ostream)

{

for (const auto& e : storage) {

total += e.second;

}

ostream << "Storage: " << total << std::endl;

}

private:

mutable std::unordered_map<gko::uintptr, gko::size_type> storage;

};

template

const gko::LinOp* residual_norm) const override

{

if (residual_norm) {

rec_res_norms.push_back(utils::get_first_element(

gko::as<real_vec>(residual_norm)));

} else {

rec_res_norms.push_back(

utils::compute_norm(gko::as<vec>(residual)));

}

if (solution) {

true_res_norms.push_back(utils::compute_residual_norm(

matrix, b, gko::as<vec>(solution)));

} else {

true_res_norms.push_back(-1.0);

}

}

ResidualLogger(const gko::LinOp* matrix, const vec* b)

: gko:🪵:Logger(gko:🪵:Logger::iteration_complete_mask),

matrix{matrix},

b{b}

{}

void write_data(std::ostream& ostream)

{

ostream << "Recurrent Residual Norms: " << std::endl;

ostream << "[" << std::endl;

for (const auto& entry : rec_res_norms) {

ostream << "\t" << entry << std::endl;

}

ostream << "];" << std::endl;

ostream << "True Residual Norms: " << std::endl;

ostream << "[" << std::endl;

for (const auto& entry : true_res_norms) {

ostream << "\t" << entry << std::endl;

}

ostream << "];" << std::endl;

}

private:

const vec* b;

mutable std::vector<gko::remove_complex> rec_res_norms;

mutable std::vector<gko::remove_complex> true_res_norms;

};

}

namespace {

void print_usage(const char* filename)

{

std::cerr << "Usage: " << filename << " [executor] [matrix file]"

<< std::endl;

std::cerr << "matrix file should be a file in matrix market format. "

"The file data/A.mtx is provided as an example."

<< std::endl;

std::exit(-1);

}

template

{

std::cout << "[" << std::endl;

for (int i = 0; i < elements_to_print; ++i) {

std::cout << "\t" << vec->at(i) << std::endl;

}

std::cout << "];" << std::endl;

}

}

int main(int argc, char* argv[])

{

using ValueType = double;

using IndexType = int;

const auto of_name = "log.txt";

if (argc == 2 && (std::string(argv[1]) == "--help")) {

std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;

std::exit(-1);

}

const auto executor_string = argc >= 2 ? argv[1] : "reference";

std::map<std::string, std::function<std::shared_ptrgko::Executor()>>

exec_map{

{"cuda",

[] {

}},

{"hip",

[] {

}},

{"dpcpp",

[] {

}},

{"reference", [] { return gko::ReferenceExecutor::create(); }}};

const auto exec = exec_map.at(executor_string)();

std::string input_mtx = "data/A.mtx";

if (argc == 3) {

input_mtx = std::string(argv[2]);

}

auto storage_logger = std::make_sharedloggers::StorageLogger();

auto A = gko::share(gko::read(std::ifstream(input_mtx), exec));

const auto max_iters = A->get_size()[0];

auto b = utils::create_vector(exec, A->get_size()[0], 1.0);

auto x = utils::create_vector(exec, A->get_size()[0], 0.0);

auto solver_factory =

solver::build()

.with_criteria(

.with_reduction_factor(reduction_factor),

gko::stop::Iteration::build().with_max_iters(max_iters))

.with_preconditioner(preconditioner::create(exec))

.on(exec);

std::ofstream output_file(of_name);

{

solver_factory->generate(A)->apply(b, x_clone);

}

{

auto g_tic = std::chrono::steady_clock::now();

auto generated_solver = solver_factory->generate(A);

auto g_tac = std::chrono::steady_clock::now();

auto generate_time =

std::chrono::duration_caststd::chrono::nanoseconds(g_tac - g_tic);

output_file << "Generate time (ns): " << generate_time.count()

<< std::endl;

auto a_tic = std::chrono::steady_clock::now();

generated_solver->apply(b, x_clone);

auto a_tac = std::chrono::steady_clock::now();

auto apply_time =

std::chrono::duration_caststd::chrono::nanoseconds(a_tac - a_tic);

output_file << "Apply time (ns): " << apply_time.count() << std::endl;

auto residual =

utils::compute_residual_norm(A.get(), b.get(), x_clone.get());

output_file << "Residual_norm: " << residual << std::endl;

}

{

auto gen_logger = std::make_sharedloggers::OperationLogger();

auto generated_solver = solver_factory->generate(A);

output_file << "Generate operations times (ns):" << std::endl;

gen_logger->write_data(output_file);

auto apply_logger = std::make_sharedloggers::OperationLogger();

auto res_logger = std::make_shared<loggers::ResidualLogger>(

A.get(), b.get());

generated_solver->add_logger(res_logger);

generated_solver->apply(b, x);

output_file << "Apply operations times (ns):" << std::endl;

apply_logger->write_data(output_file);

res_logger->write_data(output_file);

}

std::cout << "Solution, first ten entries: \n";

print_vector(x.get());

std::cout << "The performance and residual data can be found in " << of_name

<< std::endl;

}