integratefields.clΒΆ

///
/// @file integratefields.cl
///
/// Streamline trajectory integration kernel and related tracing functions.
///
/// @author CPS
///
/// @todo Fix big-DTM crash issue
/// @todo Perhaps use compiler directive volatile where variables not const?
/// @todo Update doc about trajectory integration to describe subpixel seeding & jittering
///
/// @bug Crashes (reported as 'abort 6' by PyOpenCL) occur for very large DTMs.
///      The reason remains obscure: it may be because of GPU timeout, but more likely
///      is because of a memory leakage.
///

///
/// Byte reversal
///
/// Macro to perform byte reversal per GJS's suggestion.
///
/// @param[in,out] initial_rng_state: RNG state (thus initally the seed) and RNG variate
///
#define BYTE_REVERSAL(initial_rng_state) \
    initial_rng_state =   (initial_rng_state>>24)&0xff \
                        | (initial_rng_state>> 8)&0xff00 \
                        | (initial_rng_state<< 8)&0xff0000 \
                        | (initial_rng_state<<24)&0xff000000;

#ifdef KERNEL_INTEGRATE_FIELDS
///
/// GPU kernel that drives streamline integration from seed positions
/// given in @p seed_point_array, controlled by the 'flow' vector field
/// given in @p uv_array, and either terminated at pixels masked in
/// mask_array or because a streamline exceeds a threshold
/// distance (length or number of integration points) given by parameters
/// stored in info. Further integration parameters are provided in this struct.
///
/// The kernel acts on one seed point only. It chooses this seed point
/// by computing a global id and using it to index the @p seed_point_array.
/// UPDATE: now doing sub-pixel streamlines as a set per seed point... need to doc here
///
/// Compiled if KERNEL_INTEGRATE_FIELDS is defined.
///
/// @param[in]  seed_point_array: list of initial streamline point vectors,
///                               one allotted to each kernel instance
/// @param[in]  mask_array: grid pixel mask (padded),
///                         with @p true = masked, @p false = good
/// @param[in]  uv_array: flow unit velocity vector grid (padded)
/// @param[out] mapping_array: multi-flag array
/// @param[out] slc_array: grid recording accumulated count of streamline integration
///                        steps across each pixel (padded)
/// @param[out] slt_array: grid recording accumulated count of streamline segment lengths
///                        crossing each pixel (padded)
///
/// @returns void
///
/// @ingroup integrate
///
__kernel void integrate_fields( __global const float2 *seed_point_array,
                                __global const bool   *mask_array,
                                __global const float2 *uv_array,
                                __global       uint   *mapping_array,
                                __global       uint   *slc_array,
                                __global       uint   *slt_array )
{
    // global_id plus the chunk SEEDS_CHUNK_OFFSET is a seed point index
    const uint global_id = get_global_id(0u)+get_global_id(1u)*get_global_size(0u),
               seed_idx = (SEEDS_CHUNK_OFFSET)+global_id;
#ifdef VERBOSE
    // Report how kernel instances are distributed
    if (global_id==0 || global_id==get_global_offset(0u)) {
        printf("\n  >>> on GPU/OpenCL device: id=%d offset=%d ",
                get_global_id(0u),
                get_global_offset(0u));
        printf("#workitems=%d x #workgroups=%d = %d=%d\n",
                get_local_size(0u), get_num_groups(0u),
                get_local_size(0u)*get_num_groups(0u),
                get_global_size(0u));
    }
#endif
    if (seed_idx>=N_SEED_POINTS) {
        // This is a "padding" seed, so let's bail
        return;
    }

    const float2 current_seed_point_vec = seed_point_array[seed_idx];
    __private uint i=0,j=0, initial_rng_state;

    // Trace a set of streamlines from a grid of sub-pixel positions centered
    //    on the seed point
    // Generate an initial RNG state (aka 'seed the RNG')
    //   [was: using the sum of the current pixel index and the sub-pixel index]
    //   using the current pixel index ("seed_idx")
    //   byte-reversed per GJS suggestion
    initial_rng_state = seed_idx;
    BYTE_REVERSAL(initial_rng_state);
    lehmer_rand_uint(&initial_rng_state);
    BYTE_REVERSAL(initial_rng_state);


    for (j=0u;j<SUBPIXEL_SEED_POINT_DENSITY;j++) {
        for (i=0u;i<SUBPIXEL_SEED_POINT_DENSITY;i++){
            // Trace a jittered streamline from a sub-pixel-offset first point
            jittered_trajectory(uv_array, mask_array,
                                mapping_array, slc_array, slt_array,
                                global_id, seed_idx,
                                current_seed_point_vec + (float2)(
                                    (float)i*SUBPIXEL_SEED_STEP-SUBPIXEL_SEED_HALFSPAN,
                                    (float)j*SUBPIXEL_SEED_STEP-SUBPIXEL_SEED_HALFSPAN ),
                                initial_rng_state);
        }
    }
}
#endif