Scheduling multi-stage pipelines (original) (raw)

// Halide tutorial lesson 8: Scheduling multi-stage pipelines

// On linux, you can compile and run it like so: // g++ lesson_08*.cpp -g -std=c++17 -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -lpthread -ldl -o lesson_08 // LD_LIBRARY_PATH=<path/to/libHalide.so> ./lesson_08

// On os x: // g++ lesson_08*.cpp -g -std=c++17 -I <path/to/Halide.h> -L <path/to/libHalide.so> -lHalide -o lesson_08 // DYLD_LIBRARY_PATH=<path/to/libHalide.dylib> ./lesson_08

// If you have the entire Halide source tree, you can also build it by // running: // make tutorial_lesson_08_scheduling_2 // in a shell with the current directory at the top of the halide // source tree.

#include "Halide.h" #include <stdio.h>

using namespace Halide;

int main(int argc, char **argv) { // First we'll declare some Vars to use below. Var x("x"), y("y");

// Let's examine various scheduling options for a simple two stage
// pipeline. We'll start with the default schedule:
{
    Func producer("producer_default"), consumer("consumer_default");

    // The first stage will be some simple pointwise math similar
    // to our familiar gradient function. The value at position x,
    // y is the sin of product of x and y.
    producer(x, y) = sin(x * y);

    // Now we'll add a second stage which averages together multiple
    // points in the first stage.
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // We'll turn on tracing for both functions.
    consumer.trace_stores();
    producer.trace_stores();

    // And evaluate it over a 4x4 box.
    printf("\nEvaluating producer-consumer pipeline with default schedule\n");
    consumer.realize({4, 4});
    **// Click to show output ...**

> Begin pipeline consumer_default.0() > Tag consumer_default.0() tag = "func_type_and_dim: 1 2 32 1 2 0 4 0 4" > Store consumer_default.0(0, 0) = 0.210368 > Store consumer_default.0(1, 0) = 0.437692 > Store consumer_default.0(2, 0) = 0.262604 > Store consumer_default.0(3, 0) = -0.153921 > Store consumer_default.0(0, 1) = 0.437692 > Store consumer_default.0(1, 1) = 0.475816 > Store consumer_default.0(2, 1) = 0.003550 > Store consumer_default.0(3, 1) = 0.023565 > Store consumer_default.0(0, 2) = 0.262604 > Store consumer_default.0(1, 2) = 0.003550 > Store consumer_default.0(2, 2) = -0.225879 > Store consumer_default.0(3, 2) = 0.146372 > Store consumer_default.0(0, 3) = -0.153921 > Store consumer_default.0(1, 3) = 0.023565 > Store consumer_default.0(2, 3) = 0.146372 > Store consumer_default.0(3, 3) = -0.237233 > End pipeline consumer_default.0()

    // There were no messages about computing values of the
    // producer. This is because the default schedule fully
    // inlines 'producer' into 'consumer'. It is as if we had
    // written the following code instead:

    // consumer(x, y) = (sin(x * y) +
    //                   sin(x * (y + 1)) +
    //                   sin((x + 1) * y) +
    //                   sin((x + 1) * (y + 1))/4);

    // All calls to 'producer' have been replaced with the body of
    // 'producer', with the arguments substituted in for the
    // variables.

    // The equivalent C code is:
    float result[4][4];
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            result[y][x] = (sin(x * y) +
                            sin(x * (y + 1)) +
                            sin((x + 1) * y) +
                            sin((x + 1) * (y + 1))) / 4;
        }
    }
    printf("\n");

    // If we look at the loop nest, the producer doesn't appear
    // at all. It has been inlined into the consumer.
    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> produce consumer_default: > for y: > for x: > consumer_default(...) = ...

}

// Next we'll examine the next simplest option - computing all
// values required in the producer before computing any of the
// consumer. We call this schedule "root".
{
    // Start with the same function definitions:
    Func producer("producer_root"), consumer("consumer_root");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // Tell Halide to evaluate all of producer before any of consumer.
    producer.compute_root();

    // Turn on tracing.
    consumer.trace_stores();
    producer.trace_stores();

    // Compile and run.
    printf("\nEvaluating producer.compute_root()\n");
    consumer.realize({4, 4});
    **// Click to show output ...**

> Begin pipeline consumer_root.0() > Tag producer_root.0() tag = "func_type_and_dim: 1 2 32 1 2 0 5 0 5" > Tag consumer_root.0() tag = "func_type_and_dim: 1 2 32 1 2 0 4 0 4" > Store producer_root.0(0, 0) = 0.000000 > Store producer_root.0(1, 0) = 0.000000 > Store producer_root.0(2, 0) = 0.000000 > Store producer_root.0(3, 0) = 0.000000 > Store producer_root.0(4, 0) = 0.000000 > Store producer_root.0(0, 1) = 0.000000 > Store producer_root.0(1, 1) = 0.841471 > Store producer_root.0(2, 1) = 0.909297 > Store producer_root.0(3, 1) = 0.141120 > Store producer_root.0(4, 1) = -0.756802 > Store producer_root.0(0, 2) = 0.000000 > Store producer_root.0(1, 2) = 0.909297 > Store producer_root.0(2, 2) = -0.756802 > Store producer_root.0(3, 2) = -0.279415 > Store producer_root.0(4, 2) = 0.989358 > Store producer_root.0(0, 3) = 0.000000 > Store producer_root.0(1, 3) = 0.141120 > Store producer_root.0(2, 3) = -0.279415 > Store producer_root.0(3, 3) = 0.412118 > Store producer_root.0(4, 3) = -0.536573 > Store producer_root.0(0, 4) = 0.000000 > Store producer_root.0(1, 4) = -0.756802 > Store producer_root.0(2, 4) = 0.989358 > Store producer_root.0(3, 4) = -0.536573 > Store producer_root.0(4, 4) = -0.287903 > Store consumer_root.0(0, 0) = 0.210368 > Store consumer_root.0(1, 0) = 0.437692 > Store consumer_root.0(2, 0) = 0.262604 > Store consumer_root.0(3, 0) = -0.153921 > Store consumer_root.0(0, 1) = 0.437692 > Store consumer_root.0(1, 1) = 0.475816 > Store consumer_root.0(2, 1) = 0.003550 > Store consumer_root.0(3, 1) = 0.023565 > Store consumer_root.0(0, 2) = 0.262604 > Store consumer_root.0(1, 2) = 0.003550 > Store consumer_root.0(2, 2) = -0.225879 > Store consumer_root.0(3, 2) = 0.146372 > Store consumer_root.0(0, 3) = -0.153921 > Store consumer_root.0(1, 3) = 0.023565 > Store consumer_root.0(2, 3) = 0.146372 > Store consumer_root.0(3, 3) = -0.237233 > End pipeline consumer_root.0()

    // Reading the output we can see that:
    // A) There were stores to producer.
    // B) They all happened before any stores to consumer.

    // See below for a visualization.
    // The producer is on the left and the consumer is on the
    // right. Stores are marked in orange and loads are marked in
    // blue.

     ![](http://halide-lang.org/tutorials/figures/lesson_08_compute_root.gif)

    // Equivalent C:

    float result[4][4];

    // Allocate some temporary storage for the producer.
    float producer_storage[5][5];

    // Compute the producer.
    for (int y = 0; y < 5; y++) {
        for (int x = 0; x < 5; x++) {
            producer_storage[y][x] = sin(x * y);
        }
    }

    // Compute the consumer. Skip the prints this time.
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {
            result[y][x] = (producer_storage[y][x] +
                            producer_storage[y + 1][x] +
                            producer_storage[y][x + 1] +
                            producer_storage[y + 1][x + 1]) / 4;
        }
    }

    // Note that consumer was evaluated over a 4x4 box, so Halide
    // automatically inferred that producer was needed over a 5x5
    // box. This is the same 'bounds inference' logic we saw in
    // the previous lesson, where it was used to detect and avoid
    // out-of-bounds reads from an input image.

    // If we print the loop nest, we'll see something very
    // similar to the C above.
    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> produce producer_root: > for y: > for x: > producer_root(...) = ... > consume producer_root: > produce consumer_root: > for y: > for x: > consumer_root(...) = ...

}

// Let's compare the two approaches above from a performance
// perspective.

// Full inlining (the default schedule):
// - Temporary memory allocated: 0
// - Loads: 0
// - Stores: 16
// - Calls to sin: 64

// producer.compute_root():
// - Temporary memory allocated: 25 floats
// - Loads: 64
// - Stores: 41
// - Calls to sin: 25

// There's a trade-off here. Full inlining used minimal temporary
// memory and memory bandwidth, but did a whole bunch of redundant
// expensive math (calling sin). It evaluated most points in
// 'producer' four times. The second schedule,
// producer.compute_root(), did the mimimum number of calls to
// sin, but used more temporary memory and more memory bandwidth.

// In any given situation the correct choice can be difficult to
// make. If you're memory-bandwidth limited, or don't have much
// memory (e.g. because you're running on an old cell-phone), then
// it can make sense to do redundant math. On the other hand, sin
// is expensive, so if you're compute-limited then fewer calls to
// sin will make your program faster. Adding vectorization or
// multi-core parallelism tilts the scales in favor of doing
// redundant work, because firing up multiple cpu cores increases
// the amount of math you can do per second, but doesn't increase
// your system memory bandwidth or capacity.

// We can make choices in between full inlining and
// compute_root. Next we'll alternate between computing the
// producer and consumer on a per-scanline basis:
{
    // Start with the same function definitions:
    Func producer("producer_y"), consumer("consumer_y");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // Tell Halide to evaluate producer as needed per y coordinate
    // of the consumer:
    producer.compute_at(consumer, y);

    // This places the code that computes the producer just
    // *inside* the consumer's for loop over y, as in the
    // equivalent C below.

    // Turn on tracing.
    producer.trace_stores();
    consumer.trace_stores();

    // Compile and run.
    printf("\nEvaluating producer.compute_at(consumer, y)\n");
    consumer.realize({4, 4});
    **// Click to show output ...**

> Begin pipeline consumer_y.0() > Tag producer_y.0() tag = "func_type_and_dim: 1 2 32 1 2 0 5 0 5" > Tag consumer_y.0() tag = "func_type_and_dim: 1 2 32 1 2 0 4 0 4" > Store producer_y.0(0, 0) = 0.000000 > Store producer_y.0(1, 0) = 0.000000 > Store producer_y.0(2, 0) = 0.000000 > Store producer_y.0(3, 0) = 0.000000 > Store producer_y.0(4, 0) = 0.000000 > Store producer_y.0(0, 1) = 0.000000 > Store producer_y.0(1, 1) = 0.841471 > Store producer_y.0(2, 1) = 0.909297 > Store producer_y.0(3, 1) = 0.141120 > Store producer_y.0(4, 1) = -0.756802 > Store consumer_y.0(0, 0) = 0.210368 > Store consumer_y.0(1, 0) = 0.437692 > Store consumer_y.0(2, 0) = 0.262604 > Store consumer_y.0(3, 0) = -0.153921 > Store producer_y.0(0, 1) = 0.000000 > Store producer_y.0(1, 1) = 0.841471 > Store producer_y.0(2, 1) = 0.909297 > Store producer_y.0(3, 1) = 0.141120 > Store producer_y.0(4, 1) = -0.756802 > Store producer_y.0(0, 2) = 0.000000 > Store producer_y.0(1, 2) = 0.909297 > Store producer_y.0(2, 2) = -0.756802 > Store producer_y.0(3, 2) = -0.279415 > Store producer_y.0(4, 2) = 0.989358 > Store consumer_y.0(0, 1) = 0.437692 > Store consumer_y.0(1, 1) = 0.475816 > Store consumer_y.0(2, 1) = 0.003550 > Store consumer_y.0(3, 1) = 0.023565 > Store producer_y.0(0, 2) = 0.000000 > Store producer_y.0(1, 2) = 0.909297 > Store producer_y.0(2, 2) = -0.756802 > Store producer_y.0(3, 2) = -0.279415 > Store producer_y.0(4, 2) = 0.989358 > Store producer_y.0(0, 3) = 0.000000 > Store producer_y.0(1, 3) = 0.141120 > Store producer_y.0(2, 3) = -0.279415 > Store producer_y.0(3, 3) = 0.412118 > Store producer_y.0(4, 3) = -0.536573 > Store consumer_y.0(0, 2) = 0.262604 > Store consumer_y.0(1, 2) = 0.003550 > Store consumer_y.0(2, 2) = -0.225879 > Store consumer_y.0(3, 2) = 0.146372 > Store producer_y.0(0, 3) = 0.000000 > Store producer_y.0(1, 3) = 0.141120 > Store producer_y.0(2, 3) = -0.279415 > Store producer_y.0(3, 3) = 0.412118 > Store producer_y.0(4, 3) = -0.536573 > Store producer_y.0(0, 4) = 0.000000 > Store producer_y.0(1, 4) = -0.756802 > Store producer_y.0(2, 4) = 0.989358 > Store producer_y.0(3, 4) = -0.536573 > Store producer_y.0(4, 4) = -0.287903 > Store consumer_y.0(0, 3) = -0.153921 > Store consumer_y.0(1, 3) = 0.023565 > Store consumer_y.0(2, 3) = 0.146372 > Store consumer_y.0(3, 3) = -0.237233 > End pipeline consumer_y.0()

    // See below for a visualization.

     ![](http://halide-lang.org/tutorials/figures/lesson_08_compute_y.gif)

    // Reading the log or looking at the figure you should see
    // that producer and consumer alternate on a per-scanline
    // basis. Let's look at the equivalent C:

    float result[4][4];

    // There's an outer loop over scanlines of consumer:
    for (int y = 0; y < 4; y++) {

        // Allocate space and compute enough of the producer to
        // satisfy this single scanline of the consumer. This
        // means a 5x2 box of the producer.
        float producer_storage[2][5];
        for (int py = y; py < y + 2; py++) {
            for (int px = 0; px < 5; px++) {
                producer_storage[py - y][px] = sin(px * py);
            }
        }

        // Compute a scanline of the consumer.
        for (int x = 0; x < 4; x++) {
            result[y][x] = (producer_storage[0][x] +
                            producer_storage[1][x] +
                            producer_storage[0][x + 1] +
                            producer_storage[1][x + 1]) / 4;
        }
    }

    // Again, if we print the loop nest, we'll see something very
    // similar to the C above.
    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> produce consumer_y: > for y: > produce producer_y: > for y: > for x: > producer_y(...) = ... > consume producer_y: > for x: > consumer_y(...) = ...

    // The performance characteristics of this strategy are in
    // between inlining and compute root. We still allocate some
    // temporary memory, but less than compute_root, and with
    // better locality (we load from it soon after writing to it,
    // so for larger images, values should still be in cache). We
    // still do some redundant work, but less than full inlining:

    // producer.compute_at(consumer, y):
    // - Temporary memory allocated: 10 floats
    // - Loads: 64
    // - Stores: 56
    // - Calls to sin: 40
}

// We could also say producer.compute_at(consumer, x), but this
// would be very similar to full inlining (the default
// schedule). Instead let's distinguish between the loop level at
// which we allocate storage for producer, and the loop level at
// which we actually compute it. This unlocks a few optimizations.
{
    Func producer("producer_root_y"), consumer("consumer_root_y");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // Tell Halide to make a buffer to store all of producer at
    // the outermost level:
    producer.store_root();
    // ... but compute it as needed per y coordinate of the
    // consumer.
    producer.compute_at(consumer, y);

    producer.trace_stores();
    consumer.trace_stores();

    printf("\nEvaluating producer.store_root().compute_at(consumer, y)\n");
    consumer.realize({4, 4});
    **// Click to show output ...**

> Begin pipeline consumer_root_y.0() > Tag producer_root_y.0() tag = "func_type_and_dim: 1 2 32 1 2 0 5 0 5" > Tag consumer_root_y.0() tag = "func_type_and_dim: 1 2 32 1 2 0 4 0 4" > Store producer_root_y.0(0, 0) = 0.000000 > Store producer_root_y.0(1, 0) = 0.000000 > Store producer_root_y.0(2, 0) = 0.000000 > Store producer_root_y.0(3, 0) = 0.000000 > Store producer_root_y.0(4, 0) = 0.000000 > Store producer_root_y.0(0, 1) = 0.000000 > Store producer_root_y.0(1, 1) = 0.841471 > Store producer_root_y.0(2, 1) = 0.909297 > Store producer_root_y.0(3, 1) = 0.141120 > Store producer_root_y.0(4, 1) = -0.756802 > Store consumer_root_y.0(0, 0) = 0.210368 > Store consumer_root_y.0(1, 0) = 0.437692 > Store consumer_root_y.0(2, 0) = 0.262604 > Store consumer_root_y.0(3, 0) = -0.153921 > Store producer_root_y.0(0, 2) = 0.000000 > Store producer_root_y.0(1, 2) = 0.909297 > Store producer_root_y.0(2, 2) = -0.756802 > Store producer_root_y.0(3, 2) = -0.279415 > Store producer_root_y.0(4, 2) = 0.989358 > Store consumer_root_y.0(0, 1) = 0.437692 > Store consumer_root_y.0(1, 1) = 0.475816 > Store consumer_root_y.0(2, 1) = 0.003550 > Store consumer_root_y.0(3, 1) = 0.023565 > Store producer_root_y.0(0, 3) = 0.000000 > Store producer_root_y.0(1, 3) = 0.141120 > Store producer_root_y.0(2, 3) = -0.279415 > Store producer_root_y.0(3, 3) = 0.412118 > Store producer_root_y.0(4, 3) = -0.536573 > Store consumer_root_y.0(0, 2) = 0.262604 > Store consumer_root_y.0(1, 2) = 0.003550 > Store consumer_root_y.0(2, 2) = -0.225879 > Store consumer_root_y.0(3, 2) = 0.146372 > Store producer_root_y.0(0, 4) = 0.000000 > Store producer_root_y.0(1, 4) = -0.756802 > Store producer_root_y.0(2, 4) = 0.989358 > Store producer_root_y.0(3, 4) = -0.536573 > Store producer_root_y.0(4, 4) = -0.287903 > Store consumer_root_y.0(0, 3) = -0.153921 > Store consumer_root_y.0(1, 3) = 0.023565 > Store consumer_root_y.0(2, 3) = 0.146372 > Store consumer_root_y.0(3, 3) = -0.237233 > End pipeline consumer_root_y.0()

    // See below for a
    // visualization.

     ![](http://halide-lang.org/tutorials/figures/lesson_08_store_root_compute_y.gif)

    // Reading the log or looking at the figure you should see
    // that producer and consumer again alternate on a
    // per-scanline basis. It computes a 5x2 box of the producer
    // to satisfy the first scanline of the consumer, but after
    // that it only computes a 5x1 box of the output for each new
    // scanline of the consumer!
    //
    // Halide has detected that for all scanlines except for the
    // first, it can reuse the values already sitting in the
    // buffer we've allocated for producer. Let's look at the
    // equivalent C:

    float result[4][4];

    // producer.store_root() implies that storage goes here:
    float producer_storage[5][5];

    // There's an outer loop over scanlines of consumer:
    for (int y = 0; y < 4; y++) {

        // Compute enough of the producer to satisfy this scanline
        // of the consumer.
        for (int py = y; py < y + 2; py++) {

            // Skip over rows of producer that we've already
            // computed in a previous iteration.
            if (y > 0 && py == y) continue;

            for (int px = 0; px < 5; px++) {
                producer_storage[py][px] = sin(px * py);
            }
        }

        // Compute a scanline of the consumer.
        for (int x = 0; x < 4; x++) {
            result[y][x] = (producer_storage[y][x] +
                            producer_storage[y + 1][x] +
                            producer_storage[y][x + 1] +
                            producer_storage[y + 1][x + 1]) / 4;
        }
    }

    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> store producer_root_y: > produce consumer_root_y: > for y.: > produce producer_root_y: > for x: > producer_root_y(...) = ... > consume producer_root_y: > for x: > consumer_root_y(...) = ...

    // The performance characteristics of this strategy are pretty
    // good! The numbers are similar to compute_root, except locality
    // is better. We're doing the minimum number of sin calls,
    // and we load values soon after they are stored, so we're
    // probably making good use of the cache:

    // producer.store_root().compute_at(consumer, y):
    // - Temporary memory allocated: 10 floats
    // - Loads: 64
    // - Stores: 41
    // - Calls to sin: 25

    // Note that my claimed amount of memory allocated doesn't
    // match the reference C code. Halide is performing one more
    // optimization under the hood. It folds the storage for the
    // producer down into a circular buffer of two
    // scanlines. Equivalent C would actually look like this:

    {
        // Actually store 2 scanlines instead of 5
        float producer_storage[2][5];
        for (int y = 0; y < 4; y++) {
            for (int py = y; py < y + 2; py++) {
                if (y > 0 && py == y) continue;
                for (int px = 0; px < 5; px++) {
                    // Stores to producer_storage have their y coordinate bit-masked.
                    producer_storage[py & 1][px] = sin(px * py);
                }
            }

            // Compute a scanline of the consumer.
            for (int x = 0; x < 4; x++) {
                // Loads from producer_storage have their y coordinate bit-masked.
                result[y][x] = (producer_storage[y & 1][x] +
                                producer_storage[(y + 1) & 1][x] +
                                producer_storage[y & 1][x + 1] +
                                producer_storage[(y + 1) & 1][x + 1]) / 4;
            }
        }
    }
}

// We can do even better, by leaving the storage in the outermost
// loop, but moving the computation into the innermost loop:
{
    Func producer("producer_root_x"), consumer("consumer_root_x");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // Store outermost, compute innermost.
    producer.store_root().compute_at(consumer, x);

    producer.trace_stores();
    consumer.trace_stores();

    printf("\nEvaluating producer.store_root().compute_at(consumer, x)\n");
    consumer.realize({4, 4});
    **// Click to show output ...**

> Begin pipeline consumer_root_x.0() > Tag producer_root_x.0() tag = "func_type_and_dim: 1 2 32 1 2 0 5 0 5" > Tag consumer_root_x.0() tag = "func_type_and_dim: 1 2 32 1 2 0 4 0 4" > Store producer_root_x.0(0, 0) = 0.000000 > Store producer_root_x.0(1, 0) = 0.000000 > Store producer_root_x.0(2, 0) = 0.000000 > Store producer_root_x.0(3, 0) = 0.000000 > Store producer_root_x.0(4, 0) = 0.000000 > Store producer_root_x.0(0, 1) = 0.000000 > Store producer_root_x.0(1, 1) = 0.841471 > Store consumer_root_x.0(0, 0) = 0.210368 > Store producer_root_x.0(2, 1) = 0.909297 > Store consumer_root_x.0(1, 0) = 0.437692 > Store producer_root_x.0(3, 1) = 0.141120 > Store consumer_root_x.0(2, 0) = 0.262604 > Store producer_root_x.0(4, 1) = -0.756802 > Store consumer_root_x.0(3, 0) = -0.153921 > Store producer_root_x.0(0, 2) = 0.000000 > Store producer_root_x.0(1, 2) = 0.909297 > Store consumer_root_x.0(0, 1) = 0.437692 > Store producer_root_x.0(2, 2) = -0.756802 > Store consumer_root_x.0(1, 1) = 0.475816 > Store producer_root_x.0(3, 2) = -0.279415 > Store consumer_root_x.0(2, 1) = 0.003550 > Store producer_root_x.0(4, 2) = 0.989358 > Store consumer_root_x.0(3, 1) = 0.023565 > Store producer_root_x.0(0, 3) = 0.000000 > Store producer_root_x.0(1, 3) = 0.141120 > Store consumer_root_x.0(0, 2) = 0.262604 > Store producer_root_x.0(2, 3) = -0.279415 > Store consumer_root_x.0(1, 2) = 0.003550 > Store producer_root_x.0(3, 3) = 0.412118 > Store consumer_root_x.0(2, 2) = -0.225879 > Store producer_root_x.0(4, 3) = -0.536573 > Store consumer_root_x.0(3, 2) = 0.146372 > Store producer_root_x.0(0, 4) = 0.000000 > Store producer_root_x.0(1, 4) = -0.756802 > Store consumer_root_x.0(0, 3) = -0.153921 > Store producer_root_x.0(2, 4) = 0.989358 > Store consumer_root_x.0(1, 3) = 0.023565 > Store producer_root_x.0(3, 4) = -0.536573 > Store consumer_root_x.0(2, 3) = 0.146372 > Store producer_root_x.0(4, 4) = -0.287903 > Store consumer_root_x.0(3, 3) = -0.237233 > End pipeline consumer_root_x.0()

    // See below for a
    // visualization.

     ![](http://halide-lang.org/tutorials/figures/lesson_08_store_root_compute_x.gif)

    // You should see that producer and consumer now alternate on
    // a per-pixel basis. Here's the equivalent C:

    float result[4][4];

    // producer.store_root() implies that storage goes here, but
    // we can fold it down into a circular buffer of two
    // scanlines:
    float producer_storage[2][5];

    // For every pixel of the consumer:
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 4; x++) {

            // Compute enough of the producer to satisfy this
            // pixel of the consumer, but skip values that we've
            // already computed:
            if (y == 0 && x == 0) {
                producer_storage[y & 1][x] = sin(x * y);
            }
            if (y == 0) {
                producer_storage[y & 1][x + 1] = sin((x + 1) * y);
            }
            if (x == 0) {
                producer_storage[(y + 1) & 1][x] = sin(x * (y + 1));
            }
            producer_storage[(y + 1) & 1][x + 1] = sin((x + 1) * (y + 1));

            result[y][x] = (producer_storage[y & 1][x] +
                            producer_storage[(y + 1) & 1][x] +
                            producer_storage[y & 1][x + 1] +
                            producer_storage[(y + 1) & 1][x + 1]) / 4;
        }
    }

    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> store producer_root_x: > produce consumer_root_x: > for y.: > for x.: > produce producer_root_x: > producer_root_x(...) = ... > consume producer_root_x: > consumer_root_x(...) = ...

    // The performance characteristics of this strategy are the
    // best so far. One of the four values of the producer we need
    // is probably still sitting in a register, so I won't count
    // it as a load:
    // producer.store_root().compute_at(consumer, x):
    // - Temporary memory allocated: 10 floats
    // - Loads: 48
    // - Stores: 41
    // - Calls to sin: 25
}

// So what's the catch? Why not always do
// producer.store_root().compute_at(consumer, x) for this type of
// code?
//
// The answer is parallelism. In both of the previous two
// strategies we've assumed that values computed in previous
// iterations are lying around for us to reuse. This assumes that
// previous values of x or y happened earlier in time and have
// finished. This is not true if you parallelize or vectorize
// either loop. Darn. If you parallelize, Halide won't inject the
// optimizations that skip work already done if there's a parallel
// loop in between the store_at level and the compute_at level,
// and won't fold the storage down into a circular buffer either,
// which makes our store_root pointless.

// We're running out of options. We can make new ones by
// splitting. We can store_at or compute_at at the natural
// variables of the consumer (x and y), or we can split x or y
// into new inner and outer sub-variables and then schedule with
// respect to those. We'll use this to express fusion in tiles:
{
    Func producer("producer_tile"), consumer("consumer_tile");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // We'll compute 8x8 of the consumer, in 4x4 tiles.
    Var x_outer, y_outer, x_inner, y_inner;
    consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);

    // Compute the producer per tile of the consumer
    producer.compute_at(consumer, x_outer);

    // Notice that I wrote my schedule starting from the end of
    // the pipeline (the consumer). This is because the schedule
    // for the producer refers to x_outer, which we introduced
    // when we tiled the consumer. You can write it in the other
    // order, but it tends to be harder to read.

    // Turn on tracing.
    producer.trace_stores();
    consumer.trace_stores();

    printf("\nEvaluating:\n"
           "consumer.tile(x, y, x_outer, y_outer, x_inner, y_inner, 4, 4);\n"
           "producer.compute_at(consumer, x_outer);\n");
    consumer.realize({8, 8});
    **// Click to show output ...**

> Begin pipeline consumer_tile.0() > Tag producer_tile.0() tag = "func_type_and_dim: 1 2 32 1 2 0 9 0 9" > Tag consumer_tile.0() tag = "func_type_and_dim: 1 2 32 1 2 0 8 0 8" > Store producer_tile.0(0, 0) = 0.000000 > Store producer_tile.0(1, 0) = 0.000000 > Store producer_tile.0(2, 0) = 0.000000 > Store producer_tile.0(3, 0) = 0.000000 > Store producer_tile.0(4, 0) = 0.000000 > Store producer_tile.0(0, 1) = 0.000000 > Store producer_tile.0(1, 1) = 0.841471 > Store producer_tile.0(2, 1) = 0.909297 > Store producer_tile.0(3, 1) = 0.141120 > Store producer_tile.0(4, 1) = -0.756802 > Store producer_tile.0(0, 2) = 0.000000 > Store producer_tile.0(1, 2) = 0.909297 > Store producer_tile.0(2, 2) = -0.756802 > Store producer_tile.0(3, 2) = -0.279415 > Store producer_tile.0(4, 2) = 0.989358 > Store producer_tile.0(0, 3) = 0.000000 > Store producer_tile.0(1, 3) = 0.141120 > Store producer_tile.0(2, 3) = -0.279415 > Store producer_tile.0(3, 3) = 0.412118 > Store producer_tile.0(4, 3) = -0.536573 > Store producer_tile.0(0, 4) = 0.000000 > Store producer_tile.0(1, 4) = -0.756802 > Store producer_tile.0(2, 4) = 0.989358 > Store producer_tile.0(3, 4) = -0.536573 > Store producer_tile.0(4, 4) = -0.287903 > Store consumer_tile.0(0, 0) = 0.210368 > Store consumer_tile.0(1, 0) = 0.437692 > Store consumer_tile.0(2, 0) = 0.262604 > Store consumer_tile.0(3, 0) = -0.153921 > Store consumer_tile.0(0, 1) = 0.437692 > Store consumer_tile.0(1, 1) = 0.475816 > Store consumer_tile.0(2, 1) = 0.003550 > Store consumer_tile.0(3, 1) = 0.023565 > Store consumer_tile.0(0, 2) = 0.262604 > Store consumer_tile.0(1, 2) = 0.003550 > Store consumer_tile.0(2, 2) = -0.225879 > Store consumer_tile.0(3, 2) = 0.146372 > Store consumer_tile.0(0, 3) = -0.153921 > Store consumer_tile.0(1, 3) = 0.023565 > Store consumer_tile.0(2, 3) = 0.146372 > Store consumer_tile.0(3, 3) = -0.237233 > Store producer_tile.0(4, 0) = 0.000000 > Store producer_tile.0(5, 0) = 0.000000 > Store producer_tile.0(6, 0) = 0.000000 > Store producer_tile.0(7, 0) = 0.000000 > Store producer_tile.0(8, 0) = 0.000000 > Store producer_tile.0(4, 1) = -0.756802 > Store producer_tile.0(5, 1) = -0.958924 > Store producer_tile.0(6, 1) = -0.279415 > Store producer_tile.0(7, 1) = 0.656987 > Store producer_tile.0(8, 1) = 0.989358 > Store producer_tile.0(4, 2) = 0.989358 > Store producer_tile.0(5, 2) = -0.544021 > Store producer_tile.0(6, 2) = -0.536573 > Store producer_tile.0(7, 2) = 0.990607 > Store producer_tile.0(8, 2) = -0.287903 > Store producer_tile.0(4, 3) = -0.536573 > Store producer_tile.0(5, 3) = 0.650288 > Store producer_tile.0(6, 3) = -0.750987 > Store producer_tile.0(7, 3) = 0.836656 > Store producer_tile.0(8, 3) = -0.905578 > Store producer_tile.0(4, 4) = -0.287903 > Store producer_tile.0(5, 4) = 0.912945 > Store producer_tile.0(6, 4) = -0.905578 > Store producer_tile.0(7, 4) = 0.270906 > Store producer_tile.0(8, 4) = 0.551427 > Store consumer_tile.0(4, 0) = -0.428932 > Store consumer_tile.0(5, 0) = -0.309585 > Store consumer_tile.0(6, 0) = 0.094393 > Store consumer_tile.0(7, 0) = 0.411586 > Store consumer_tile.0(4, 1) = -0.317597 > Store consumer_tile.0(5, 1) = -0.579733 > Store consumer_tile.0(6, 1) = 0.207901 > Store consumer_tile.0(7, 1) = 0.587262 > Store consumer_tile.0(4, 2) = 0.139763 > Store consumer_tile.0(5, 2) = -0.295323 > Store consumer_tile.0(6, 2) = 0.134926 > Store consumer_tile.0(7, 2) = 0.158445 > Store consumer_tile.0(4, 3) = 0.184689 > Store consumer_tile.0(5, 3) = -0.023333 > Store consumer_tile.0(6, 3) = -0.137251 > Store consumer_tile.0(7, 3) = 0.188352 > Store producer_tile.0(0, 4) = 0.000000 > Store producer_tile.0(1, 4) = -0.756802 > Store producer_tile.0(2, 4) = 0.989358 > Store producer_tile.0(3, 4) = -0.536573 > Store producer_tile.0(4, 4) = -0.287903 > Store producer_tile.0(0, 5) = 0.000000 > Store producer_tile.0(1, 5) = -0.958924 > Store producer_tile.0(2, 5) = -0.544021 > Store producer_tile.0(3, 5) = 0.650288 > Store producer_tile.0(4, 5) = 0.912945 > Store producer_tile.0(0, 6) = 0.000000 > Store producer_tile.0(1, 6) = -0.279415 > Store producer_tile.0(2, 6) = -0.536573 > Store producer_tile.0(3, 6) = -0.750987 > Store producer_tile.0(4, 6) = -0.905578 > Store producer_tile.0(0, 7) = 0.000000 > Store producer_tile.0(1, 7) = 0.656987 > Store producer_tile.0(2, 7) = 0.990607 > Store producer_tile.0(3, 7) = 0.836656 > Store producer_tile.0(4, 7) = 0.270906 > Store producer_tile.0(0, 8) = 0.000000 > Store producer_tile.0(1, 8) = 0.989358 > Store producer_tile.0(2, 8) = -0.287903 > Store producer_tile.0(3, 8) = -0.905578 > Store producer_tile.0(4, 8) = 0.551427 > Store consumer_tile.0(0, 4) = -0.428932 > Store consumer_tile.0(1, 4) = -0.317597 > Store consumer_tile.0(2, 4) = 0.139763 > Store consumer_tile.0(3, 4) = 0.184689 > Store consumer_tile.0(0, 5) = -0.309585 > Store consumer_tile.0(1, 5) = -0.579733 > Store consumer_tile.0(2, 5) = -0.295323 > Store consumer_tile.0(3, 5) = -0.023333 > Store consumer_tile.0(0, 6) = 0.094393 > Store consumer_tile.0(1, 6) = 0.207901 > Store consumer_tile.0(2, 6) = 0.134926 > Store consumer_tile.0(3, 6) = -0.137251 > Store consumer_tile.0(0, 7) = 0.411586 > Store consumer_tile.0(1, 7) = 0.587262 > Store consumer_tile.0(2, 7) = 0.158445 > Store consumer_tile.0(3, 7) = 0.188352 > Store producer_tile.0(4, 4) = -0.287903 > Store producer_tile.0(5, 4) = 0.912945 > Store producer_tile.0(6, 4) = -0.905578 > Store producer_tile.0(7, 4) = 0.270906 > Store producer_tile.0(8, 4) = 0.551427 > Store producer_tile.0(4, 5) = 0.912945 > Store producer_tile.0(5, 5) = -0.132352 > Store producer_tile.0(6, 5) = -0.988032 > Store producer_tile.0(7, 5) = -0.428183 > Store producer_tile.0(8, 5) = 0.745113 > Store producer_tile.0(4, 6) = -0.905578 > Store producer_tile.0(5, 6) = -0.988032 > Store producer_tile.0(6, 6) = -0.991779 > Store producer_tile.0(7, 6) = -0.916522 > Store producer_tile.0(8, 6) = -0.768255 > Store producer_tile.0(4, 7) = 0.270906 > Store producer_tile.0(5, 7) = -0.428183 > Store producer_tile.0(6, 7) = -0.916522 > Store producer_tile.0(7, 7) = -0.953753 > Store producer_tile.0(8, 7) = -0.521551 > Store producer_tile.0(4, 8) = 0.551427 > Store producer_tile.0(5, 8) = 0.745113 > Store producer_tile.0(6, 8) = -0.768255 > Store producer_tile.0(7, 8) = -0.521551 > Store producer_tile.0(8, 8) = 0.920026 > Store consumer_tile.0(4, 4) = 0.351409 > Store consumer_tile.0(5, 4) = -0.278254 > Store consumer_tile.0(6, 4) = -0.512722 > Store consumer_tile.0(7, 4) = 0.284816 > Store consumer_tile.0(4, 5) = -0.278254 > Store consumer_tile.0(5, 5) = -0.775048 > Store consumer_tile.0(6, 5) = -0.831129 > Store consumer_tile.0(7, 5) = -0.341961 > Store consumer_tile.0(4, 6) = -0.512722 > Store consumer_tile.0(5, 6) = -0.831129 > Store consumer_tile.0(6, 6) = -0.944644 > Store consumer_tile.0(7, 6) = -0.790020 > Store consumer_tile.0(4, 7) = 0.284816 > Store consumer_tile.0(5, 7) = -0.341961 > Store consumer_tile.0(6, 7) = -0.790020 > Store consumer_tile.0(7, 7) = -0.269207 > End pipeline consumer_tile.0()

    // See below for a visualization.

     ![](http://halide-lang.org/tutorials/figures/lesson_08_tile.gif)

    // The producer and consumer now alternate on a per-tile
    // basis. Here's the equivalent C:

    float result[8][8];

    // For every tile of the consumer:
    for (int y_outer = 0; y_outer < 2; y_outer++) {
        for (int x_outer = 0; x_outer < 2; x_outer++) {
            // Compute the x and y coords of the start of this tile.
            int x_base = x_outer * 4;
            int y_base = y_outer * 4;

            // Compute enough of producer to satisfy this tile. A
            // 4x4 tile of the consumer requires a 5x5 tile of the
            // producer.
            float producer_storage[5][5];
            for (int py = y_base; py < y_base + 5; py++) {
                for (int px = x_base; px < x_base + 5; px++) {
                    producer_storage[py - y_base][px - x_base] = sin(px * py);
                }
            }

            // Compute this tile of the consumer
            for (int y_inner = 0; y_inner < 4; y_inner++) {
                for (int x_inner = 0; x_inner < 4; x_inner++) {
                    int x = x_base + x_inner;
                    int y = y_base + y_inner;
                    result[y][x] =
                        (producer_storage[y - y_base][x - x_base] +
                         producer_storage[y - y_base + 1][x - x_base] +
                         producer_storage[y - y_base][x - x_base + 1] +
                         producer_storage[y - y_base + 1][x - x_base + 1]) / 4;
                }
            }
        }
    }

    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> produce consumer_tile: > for y.y_outer: > for x.x_outer: > produce producer_tile: > for y: > for x: > producer_tile(...) = ... > consume producer_tile: > for y.y_inner in [0, 3]: > for x.x_inner in [0, 3]: > consumer_tile(...) = ...

    // Tiling can make sense for problems like this one with
    // stencils that reach outwards in x and y. Each tile can be
    // computed independently in parallel, and the redundant work
    // done by each tile isn't so bad once the tiles get large
    // enough.
}

// Let's try a mixed strategy that combines what we have done with
// splitting, parallelizing, and vectorizing. This is one that
// often works well in practice for large images. If you
// understand this schedule, then you understand 95% of scheduling
// in Halide.
{
    Func producer("producer_mixed"), consumer("consumer_mixed");
    producer(x, y) = sin(x * y);
    consumer(x, y) = (producer(x, y) +
                      producer(x, y + 1) +
                      producer(x + 1, y) +
                      producer(x + 1, y + 1)) / 4;

    // Split the y coordinate of the consumer into strips of 16 scanlines:
    Var yo, yi;
    consumer.split(y, yo, yi, 16);
    // Compute the strips using a thread pool and a task queue.
    consumer.parallel(yo);
    // Vectorize across x by a factor of four.
    consumer.vectorize(x, 4);

    // Now store the producer per-strip. This will be 17 scanlines
    // of the producer (16+1), but hopefully it will fold down
    // into a circular buffer of two scanlines:
    producer.store_at(consumer, yo);
    // Within each strip, compute the producer per scanline of the
    // consumer, skipping work done on previous scanlines.
    producer.compute_at(consumer, yi);
    // Also vectorize the producer (because sin is vectorizable on x86 using SSE).
    producer.vectorize(x, 4);

    // Let's leave tracing off this time, because we're going to
    // evaluate over a larger image.
    // consumer.trace_stores();
    // producer.trace_stores();

    Buffer<float> halide_result = consumer.realize({160, 160});

    // See below for a visualization.

     Your browser does not support the video tag :(

    // Here's the equivalent (serial) C:

    float c_result[160][160];

    // For every strip of 16 scanlines (this loop is parallel in
    // the Halide version)
    for (int yo = 0; yo < 160 / 16; yo++) {
        int y_base = yo * 16;

        // Allocate a two-scanline circular buffer for the producer
        float producer_storage[2][161];

        // For every scanline in the strip of 16:
        for (int yi = 0; yi < 16; yi++) {
            int y = y_base + yi;

            for (int py = y; py < y + 2; py++) {
                // Skip scanlines already computed *within this task*
                if (yi > 0 && py == y) continue;

                // Compute this scanline of the producer in 4-wide vectors
                for (int x_vec = 0; x_vec < 160 / 4 + 1; x_vec++) {
                    int x_base = x_vec * 4;
                    // 4 doesn't divide 161, so push the last vector left
                    // (see lesson 05).
                    if (x_base > 161 - 4) x_base = 161 - 4;
                    // If you're on x86, Halide generates SSE code for this part:
                    int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
                    float vec[4] = {sinf(x[0] * py), sinf(x[1] * py),
                                    sinf(x[2] * py), sinf(x[3] * py)};
                    producer_storage[py & 1][x[0]] = vec[0];
                    producer_storage[py & 1][x[1]] = vec[1];
                    producer_storage[py & 1][x[2]] = vec[2];
                    producer_storage[py & 1][x[3]] = vec[3];
                }
            }

            // Now compute consumer for this scanline:
            for (int x_vec = 0; x_vec < 160 / 4; x_vec++) {
                int x_base = x_vec * 4;
                // Again, Halide's equivalent here uses SSE.
                int x[] = {x_base, x_base + 1, x_base + 2, x_base + 3};
                float vec[] = {
                    (producer_storage[y & 1][x[0]] +
                     producer_storage[(y + 1) & 1][x[0]] +
                     producer_storage[y & 1][x[0] + 1] +
                     producer_storage[(y + 1) & 1][x[0] + 1]) /
                        4,
                    (producer_storage[y & 1][x[1]] +
                     producer_storage[(y + 1) & 1][x[1]] +
                     producer_storage[y & 1][x[1] + 1] +
                     producer_storage[(y + 1) & 1][x[1] + 1]) /
                        4,
                    (producer_storage[y & 1][x[2]] +
                     producer_storage[(y + 1) & 1][x[2]] +
                     producer_storage[y & 1][x[2] + 1] +
                     producer_storage[(y + 1) & 1][x[2] + 1]) /
                        4,
                    (producer_storage[y & 1][x[3]] +
                     producer_storage[(y + 1) & 1][x[3]] +
                     producer_storage[y & 1][x[3] + 1] +
                     producer_storage[(y + 1) & 1][x[3] + 1]) /
                        4};

                c_result[y][x[0]] = vec[0];
                c_result[y][x[1]] = vec[1];
                c_result[y][x[2]] = vec[2];
                c_result[y][x[3]] = vec[3];
            }
        }
    }
    printf("Pseudo-code for the schedule:\n");
    consumer.print_loop_nest();
    printf("\n");
    **// Click to show output ...**

> produce consumer_mixed: > parallel y.yo: > store producer_mixed: > for y.yi. in [-1, 15]: > produce producer_mixed: > for x.x: > vectorized x.v1 in [0, 3]: > producer_mixed(...) = ... > consume producer_mixed: > for x.x: > vectorized x.v0 in [0, 3]: > consumer_mixed(...) = ...

    // Look on my code, ye mighty, and despair!

    // Let's check the C result against the Halide result. Doing
    // this I found several bugs in my C implementation, which
    // should tell you something.
    for (int y = 0; y < 160; y++) {
        for (int x = 0; x < 160; x++) {
            float error = halide_result(x, y) - c_result[y][x];
            // It's floating-point math, so we'll allow some slop:
            if (error < -0.001f || error > 0.001f) {
                printf("halide_result(%d, %d) = %f instead of %f\n",
                       x, y, halide_result(x, y), c_result[y][x]);
                return -1;
            }
        }
    }
}

// This stuff is hard. We ended up in a three-way trade-off
// between memory bandwidth, redundant work, and
// parallelism. Halide can't make the correct choice for you
// automatically (sorry). Instead it tries to make it easier for
// you to explore various options, without messing up your
// program. In fact, Halide promises that scheduling calls like
// compute_root won't change the meaning of your algorithm -- you
// should get the same bits back no matter how you schedule
// things.

// So be empirical! Experiment with various schedules and keep a
// log of performance. Form hypotheses and then try to prove
// yourself wrong. Don't assume that you just need to vectorize
// your code by a factor of four and run it on eight cores and
// you'll get 32x faster. This almost never works. Modern systems
// are complex enough that you can't predict performance reliably
// without running your code.

// We suggest you start by scheduling all of your non-trivial
// stages compute_root, and then work from the end of the pipeline
// upwards, inlining, parallelizing, and vectorizing each stage in
// turn until you reach the top.

// Halide is not just about vectorizing and parallelizing your
// code. That's not enough to get you very far. Halide is about
// giving you tools that help you quickly explore different
// trade-offs between locality, redundant work, and parallelism,
// without messing up the actual result you're trying to compute.

printf("Success!\n");
return 0;

}