Commit aeb90d1f authored by Dorothea Vom Bruch's avatar Dorothea Vom Bruch
Browse files

Merge branch 'dcampora_fix_velo_determinism'

parents 5b355016 d06763d8
Pipeline #959835 passed with stages
in 13 minutes and 53 seconds
......@@ -94,20 +94,19 @@ struct CheckerInvoker {
// TODO: Enable back and understand if any duplicated hits exist
// Check all tracks for duplicate LHCb IDs
/* for (int i_track = 0; i_track < event_tracks.size(); ++i_track) { */
/* const auto& track = event_tracks[i_track]; */
/* auto ids = track.ids(); */
/* std::sort(std::begin(ids), std::end(ids)); */
/* bool containsDuplicates = (std::unique(std::begin(ids), std::end(ids))) != std::end(ids); */
/* if (containsDuplicates) { */
/* warning_cout << "WARNING: Track #" << i_track << " contains duplicate LHCb IDs" << std::endl << std::hex;
*/
/* for (auto id : ids) { */
/* warning_cout << "0x" << id << ", "; */
/* } */
/* warning_cout << std::endl << std::endl << std::dec; */
/* } */
/* } */
for (int i_track = 0; i_track < event_tracks.size(); ++i_track) {
const auto& track = event_tracks[i_track];
auto ids = track.ids();
std::sort(std::begin(ids), std::end(ids));
bool containsDuplicates = (std::unique(std::begin(ids), std::end(ids))) != std::end(ids);
if (containsDuplicates) {
warning_cout << "WARNING: Track #" << i_track << " contains duplicate LHCb IDs" << std::endl << std::hex;
for (auto id : ids) {
warning_cout << "0x" << id << ", ";
}
warning_cout << std::endl << std::endl << std::dec;
}
}
}
}
return scifi_ids_events;
......
......@@ -14,11 +14,9 @@ __global__ void estimate_input_size(
const uint event_number = blockIdx.x;
const uint selected_event_number = dev_event_list[event_number];
const uint raw_bank_starting_chunk = threadIdx.y; // up to 26
const uint raw_bank_chunk_size = Velo::Constants::n_sensors / blockDim.y; // blockDim.y = 26 -> chunk_size = 8
const char* raw_input = dev_raw_input + dev_raw_input_offsets[selected_event_number];
uint* estimated_input_size = dev_estimated_input_size + event_number * Velo::Constants::n_modules;
uint* module_cluster_num = dev_module_cluster_num + event_number * Velo::Constants::n_modules;
auto module_cluster_num = dev_module_cluster_num + event_number * Velo::Constants::n_modules;
uint* event_candidate_num = dev_event_candidate_num + event_number;
uint32_t* cluster_candidates = dev_cluster_candidates + event_number * VeloClustering::max_candidates_event;
......@@ -37,217 +35,211 @@ __global__ void estimate_input_size(
// Read raw event
const auto raw_event = VeloRawEvent(raw_input);
for (int raw_bank_rel_number = 0; raw_bank_rel_number < raw_bank_chunk_size; ++raw_bank_rel_number) {
const int raw_bank_number = raw_bank_starting_chunk * raw_bank_chunk_size + raw_bank_rel_number;
if (raw_bank_number < raw_event.number_of_raw_banks) {
// Read raw bank
const auto raw_bank = VeloRawBank(raw_event.payload + raw_event.raw_bank_offset[raw_bank_number]);
uint* estimated_module_size = estimated_input_size + (raw_bank.sensor_index >> 2);
for (int i = 0; i < (raw_bank.sp_count + blockDim.x - 1) / blockDim.x; ++i) {
const auto sp_index = i * blockDim.x + threadIdx.x;
if (sp_index < raw_bank.sp_count) {
// Decode sp
const uint32_t sp_word = raw_bank.sp_word[sp_index];
const uint32_t no_sp_neighbours = sp_word & 0x80000000U;
const uint32_t sp_addr = (sp_word & 0x007FFF00U) >> 8;
const uint8_t sp = sp_word & 0xFFU;
if (no_sp_neighbours) {
// The SP does not have any neighbours
// The problem is as simple as a lookup pattern
// It can be implemented in two operations
// Pattern 0:
// (x x)
// o o
// (x x
// x x)
//
for (int raw_bank_number = threadIdx.y; raw_bank_number < raw_event.number_of_raw_banks;
raw_bank_number += blockDim.y) {
// Read raw bank
const auto raw_bank = VeloRawBank(raw_event.payload + raw_event.raw_bank_offset[raw_bank_number]);
uint* estimated_module_size = estimated_input_size + (raw_bank.sensor_index >> 2);
for (int sp_index = threadIdx.x; sp_index < raw_bank.sp_count; sp_index += blockDim.x) { // Decode sp
const uint32_t sp_word = raw_bank.sp_word[sp_index];
const uint32_t no_sp_neighbours = sp_word & 0x80000000U;
const uint32_t sp_addr = (sp_word & 0x007FFF00U) >> 8;
const uint8_t sp = sp_word & 0xFFU;
if (no_sp_neighbours) {
// The SP does not have any neighbours
// The problem is as simple as a lookup pattern
// It can be implemented in two operations
// Pattern 0:
// (x x)
// o o
// (x x
// x x)
//
// Note: Pixel order in sp
// 0x08 | 0x80
// 0x04 | 0x40
// 0x02 | 0x20
// 0x01 | 0x10
const bool pattern_0 = (sp & 0x88) && !(sp & 0x44) && (sp & 0x33);
// Pattern 1:
// (x x
// x x)
// o o
// (x x)
const bool pattern_1 = (sp & 0xCC) && !(sp & 0x22) && (sp & 0x11);
const uint number_of_clusters = (pattern_0 | pattern_1) ? 2 : 1;
// Add the found clusters
uint current_estimated_module_size = atomicAdd(estimated_module_size, number_of_clusters);
assert(current_estimated_module_size < Velo::Constants::max_numhits_in_module);
}
else {
// Find candidates that follow this condition:
// For pixel o, all pixels x should *not* be populated
// x x
// o x
// x
// Load required neighbouring pixels in order to check the condition
// x x x
// o o x
// o o x
// o o x
// o o x
// x x
//
// Use an int for storing and calculating
// Bit order
//
// 4 10 16
// 3 9 15
// 2 8 14
// 1 7 13
// 0 6 12
// 5 11
//
// Bit masks
//
// 0x10 0x0400 0x010000
// 0x08 0x0200 0x8000
// 0x04 0x0100 0x4000
// 0x02 0x80 0x2000
// 0x01 0x40 0x1000
// 0x20 0x0800
uint32_t pixels = (sp & 0x0F) | ((sp & 0xF0) << 2);
// Current row and col
const uint32_t sp_row = sp_addr & 0x3FU;
const uint32_t sp_col = sp_addr >> 6;
for (uint k = 0; k < raw_bank.sp_count; ++k) {
const uint32_t other_sp_word = raw_bank.sp_word[k];
const uint32_t other_no_sp_neighbours = sp_word & 0x80000000U;
if (!other_no_sp_neighbours) {
const uint32_t other_sp_addr = (other_sp_word & 0x007FFF00U) >> 8;
const uint32_t other_sp_row = other_sp_addr & 0x3FU;
const uint32_t other_sp_col = (other_sp_addr >> 6);
const uint8_t other_sp = other_sp_word & 0xFFU;
// Populate pixels
// Note: Pixel order in sp
// 0x08 | 0x80
// 0x04 | 0x40
// 0x02 | 0x20
// 0x01 | 0x10
const bool pattern_0 = (sp & 0x88) && !(sp & 0x44) && sp & 0x33;
// Pattern 1:
// (x x
// x x)
// o o
// (x x)
const bool pattern_1 = (sp & 0xCC) && !(sp & 0x22) && sp & 0x11;
const uint number_of_clusters = 1 + (pattern_0 | pattern_1);
// Add the found clusters
uint current_estimated_module_size = atomicAdd(estimated_module_size, number_of_clusters);
assert(current_estimated_module_size < Velo::Constants::max_numhits_in_module);
}
else {
// Find candidates that follow this condition:
// For pixel o, all pixels x should *not* be populated
// x x
// o x
// x
// Load required neighbouring pixels in order to check the condition
// x x x
// o o x
// o o x
// o o x
// o o x
// x x
//
// Use an int for storing and calculating
// Bit order
//
// 4 10 16
// 3 9 15
// 2 8 14
// 1 7 13
// 0 6 12
// 5 11
//
// Bit masks
//
// 0x10 0x0400 0x010000
// 0x08 0x0200 0x8000
// 0x04 0x0100 0x4000
// 0x02 0x80 0x2000
// 0x01 0x40 0x1000
// 0x20 0x0800
uint32_t pixels = (sp & 0x0F) | ((sp & 0xF0) << 2);
// Current row and col
const uint32_t sp_row = sp_addr & 0x3FU;
const uint32_t sp_col = sp_addr >> 6;
for (uint k = 0; k < raw_bank.sp_count; ++k) {
const uint32_t other_sp_word = raw_bank.sp_word[k];
const uint32_t other_no_sp_neighbours = sp_word & 0x80000000U;
if (!other_no_sp_neighbours) {
const uint32_t other_sp_addr = (other_sp_word & 0x007FFF00U) >> 8;
const uint32_t other_sp_row = other_sp_addr & 0x3FU;
const uint32_t other_sp_col = (other_sp_addr >> 6);
const uint8_t other_sp = other_sp_word & 0xFFU;
// Populate pixels
// Note: Pixel order in sp
// 0x08 | 0x80
// 0x04 | 0x40
// 0x02 | 0x20
// 0x01 | 0x10
const bool is_top = other_sp_row == (sp_row + 1) && other_sp_col == sp_col;
const bool is_top_right = other_sp_row == (sp_row + 1) && other_sp_col == (sp_col + 1);
const bool is_right = other_sp_row == sp_row && other_sp_col == (sp_col + 1);
const bool is_right_bottom = other_sp_row == (sp_row - 1) && other_sp_col == (sp_col + 1);
const bool is_bottom = other_sp_row == (sp_row - 1) && other_sp_col == sp_col;
if (is_top || is_top_right || is_right || is_right_bottom || is_bottom) {
pixels |= is_top * ((other_sp & 0x01 | ((other_sp & 0x10) << 2)) << 4);
pixels |= is_top_right * ((other_sp & 0x01) << 16);
pixels |= is_right * ((other_sp & 0x0F) << 12);
pixels |= is_right_bottom * ((other_sp & 0x08) << 8);
pixels |= is_bottom * ((other_sp & 0x80) >> 2);
}
}
const bool is_top = other_sp_row == (sp_row + 1) && other_sp_col == sp_col;
const bool is_top_right = other_sp_row == (sp_row + 1) && other_sp_col == (sp_col + 1);
const bool is_right = other_sp_row == sp_row && other_sp_col == (sp_col + 1);
const bool is_right_bottom = other_sp_row == (sp_row - 1) && other_sp_col == (sp_col + 1);
const bool is_bottom = other_sp_row == (sp_row - 1) && other_sp_col == sp_col;
if (is_top || is_top_right || is_right || is_right_bottom || is_bottom) {
pixels |= is_top * ((other_sp & 0x01 | ((other_sp & 0x10) << 2)) << 4);
pixels |= is_top_right * ((other_sp & 0x01) << 16);
pixels |= is_right * ((other_sp & 0x0F) << 12);
pixels |= is_right_bottom * ((other_sp & 0x08) << 8);
pixels |= is_bottom * ((other_sp & 0x80) >> 2);
}
}
}
// 16 1024 65536
// 8 512 32768
// 4 256 16384
// 2 128 8192
// 1 64 4096
// 32 2048
//
// Look up pattern
// x x
// o x
// x
//
uint found_cluster_candidates = 0;
assert(raw_bank_number < Velo::Constants::n_sensors);
const uint32_t sp_inside_pixel = pixels & 0x3CF;
const uint32_t mask =
(sp_inside_pixel << 1) | (sp_inside_pixel << 5) | (sp_inside_pixel << 6) | (sp_inside_pixel << 7);
const uint32_t working_cluster = mask & (~pixels);
const uint32_t candidates_temp =
(working_cluster >> 1) & (working_cluster >> 5) & (working_cluster >> 6) & (working_cluster >> 7);
const uint32_t candidates = candidates_temp & pixels;
const uint8_t candidates_uint8 = (candidates & 0x03) | ((candidates & 0xC0) >> 4) |
((candidates & 0x0C) << 2) | ((candidates & 0x0300) >> 2);
// Add candidates 0, 1, 4, 5
// Only one of those candidates can be flagged at a time
if (candidates_uint8 & 0xF) {
// if ((candidates_uint8 & 0xF) >= 9) {
// auto print_candidates8 = [] (const uint8_t& candidates) {
// printf("%i%i\n%i%i\n%i%i\n%i%i\n\n",
// (candidates & 0x80) > 0, (candidates & 0x40) > 0,
// (candidates & 0x20) > 0, (candidates & 0x10) > 0,
// (candidates & 0x8) > 0, (candidates & 0x4) > 0,
// (candidates & 0x2) > 0, candidates & 0x1
// );
// };
// auto print_candidates = [] (const uint32_t& candidates) {
// printf("%i%i%i\n%i%i%i\n%i%i%i\n%i%i%i\n%i%i%i\n %i%i\n\n",
// (candidates & 0x10) > 0, (candidates & 0x0400) > 0, (candidates & 0x010000) > 0,
// (candidates & 0x08) > 0, (candidates & 0x0200) > 0, (candidates & 0x8000) > 0,
// (candidates & 0x04) > 0, (candidates & 0x0100) > 0, (candidates & 0x4000) > 0,
// (candidates & 0x02) > 0, (candidates & 0x80) > 0, (candidates & 0x2000) > 0,
// (candidates & 0x01) > 0, (candidates & 0x40) > 0, (candidates & 0x1000) > 0,
// (candidates & 0x20) > 0, (candidates & 0x0800) > 0
// );
// };
// printf("pixels:\n");
// print_candidates(pixels);
// printf("sp_inside_pixel:\n");
// print_candidates(sp_inside_pixel);
// printf("mask:\n");
// print_candidates(mask);
// printf("working_cluster:\n");
// print_candidates(working_cluster);
// printf("candidates:\n");
// print_candidates(candidates);
// printf("candidates_uint8:\n");
// print_candidates8(candidates_uint8);
// }
// Verify candidates are correctly created
assert((candidates_uint8 & 0xF) < 9);
// Decode the candidate number (ie. find out the active bit)
const uint8_t k = dev_velo_candidate_ks[candidates_uint8 & 0xF];
auto current_cluster_candidate = atomicAdd(event_candidate_num, 1);
const uint32_t candidate = (sp_index << 11) | (raw_bank_number << 3) | k;
assert(current_cluster_candidate < blockDim.x * VeloClustering::max_candidates_event);
cluster_candidates[current_cluster_candidate] = candidate;
++found_cluster_candidates;
}
// 16 1024 65536
// 8 512 32768
// 4 256 16384
// 2 128 8192
// 1 64 4096
// 32 2048
//
// Look up pattern
// x x
// o x
// x
//
uint found_cluster_candidates = 0;
assert(raw_bank_number < Velo::Constants::n_sensors);
const uint32_t sp_inside_pixel = pixels & 0x3CF;
const uint32_t mask =
(sp_inside_pixel << 1) | (sp_inside_pixel << 5) | (sp_inside_pixel << 6) | (sp_inside_pixel << 7);
const uint32_t working_cluster = mask & (~pixels);
const uint32_t candidates_temp =
(working_cluster >> 1) & (working_cluster >> 5) & (working_cluster >> 6) & (working_cluster >> 7);
const uint32_t candidates = candidates_temp & pixels;
const uint8_t candidates_uint8 =
(candidates & 0x03) | ((candidates & 0xC0) >> 4) | ((candidates & 0x0C) << 2) | ((candidates & 0x0300) >> 2);
// Add candidates 0, 1, 4, 5
// Only one of those candidates can be flagged at a time
if (candidates_uint8 & 0xF) {
// if ((candidates_uint8 & 0xF) >= 9) {
// auto print_candidates8 = [] (const uint8_t& candidates) {
// printf("%i%i\n%i%i\n%i%i\n%i%i\n\n",
// (candidates & 0x80) > 0, (candidates & 0x40) > 0,
// (candidates & 0x20) > 0, (candidates & 0x10) > 0,
// (candidates & 0x8) > 0, (candidates & 0x4) > 0,
// (candidates & 0x2) > 0, candidates & 0x1
// );
// };
// auto print_candidates = [] (const uint32_t& candidates) {
// printf("%i%i%i\n%i%i%i\n%i%i%i\n%i%i%i\n%i%i%i\n %i%i\n\n",
// (candidates & 0x10) > 0, (candidates & 0x0400) > 0, (candidates & 0x010000) > 0,
// (candidates & 0x08) > 0, (candidates & 0x0200) > 0, (candidates & 0x8000) > 0,
// (candidates & 0x04) > 0, (candidates & 0x0100) > 0, (candidates & 0x4000) > 0,
// (candidates & 0x02) > 0, (candidates & 0x80) > 0, (candidates & 0x2000) > 0,
// (candidates & 0x01) > 0, (candidates & 0x40) > 0, (candidates & 0x1000) > 0,
// (candidates & 0x20) > 0, (candidates & 0x0800) > 0
// );
// };
// printf("pixels:\n");
// print_candidates(pixels);
// printf("sp_inside_pixel:\n");
// print_candidates(sp_inside_pixel);
// printf("mask:\n");
// print_candidates(mask);
// printf("working_cluster:\n");
// print_candidates(working_cluster);
// printf("candidates:\n");
// print_candidates(candidates);
// printf("candidates_uint8:\n");
// print_candidates8(candidates_uint8);
// }
// Verify candidates are correctly created
assert((candidates_uint8 & 0xF) < 9);
// Decode the candidate number (ie. find out the active bit)
const uint8_t k = dev_velo_candidate_ks[candidates_uint8 & 0xF];
auto current_cluster_candidate = atomicAdd(event_candidate_num, 1);
const uint32_t candidate = (sp_index << 11) | (raw_bank_number << 3) | k;
assert(current_cluster_candidate < blockDim.x * VeloClustering::max_candidates_event);
cluster_candidates[current_cluster_candidate] = candidate;
++found_cluster_candidates;
}
// Add candidates 2, 3, 6, 7
// Only one of those candidates can be flagged at a time
if (candidates_uint8 & 0xF0) {
assert(((candidates_uint8 >> 4) & 0xF) < 9);
const uint8_t k = dev_velo_candidate_ks[(candidates_uint8 >> 4)] + 2;
auto current_cluster_candidate = atomicAdd(event_candidate_num, 1);
const uint32_t candidate = (sp_index << 11) | (raw_bank_number << 3) | k;
assert(current_cluster_candidate < blockDim.x * VeloClustering::max_candidates_event);
cluster_candidates[current_cluster_candidate] = candidate;
++found_cluster_candidates;
}
// Add candidates 2, 3, 6, 7
// Only one of those candidates can be flagged at a time
if (candidates_uint8 & 0xF0) {
assert(((candidates_uint8 >> 4) & 0xF) < 9);
const uint8_t k = dev_velo_candidate_ks[(candidates_uint8 >> 4)] + 2;
auto current_cluster_candidate = atomicAdd(event_candidate_num, 1);
const uint32_t candidate = (sp_index << 11) | (raw_bank_number << 3) | k;
assert(current_cluster_candidate < blockDim.x * VeloClustering::max_candidates_event);
cluster_candidates[current_cluster_candidate] = candidate;
++found_cluster_candidates;
}
// Add the found cluster candidates
if (found_cluster_candidates > 0) {
uint current_estimated_module_size = atomicAdd(estimated_module_size, found_cluster_candidates);
assert(current_estimated_module_size < Velo::Constants::max_numhits_in_module);
}
}
// Add the found cluster candidates
if (found_cluster_candidates > 0) {
uint current_estimated_module_size = atomicAdd(estimated_module_size, found_cluster_candidates);
assert(current_estimated_module_size < Velo::Constants::max_numhits_in_module);
}
}
}
......
......@@ -16,7 +16,8 @@ __device__ void track_forwarding(
Velo::TrackHits* tracks,
const uint number_of_hits,
int* dev_atomics_velo,
const int ip_shift);
const int ip_shift,
const int first_module);
/**
* @brief Finds candidates in the specified module.
......@@ -49,7 +50,7 @@ __device__ std::tuple<int, int> find_forward_candidates(
track_extrapolation_phi,
Velo::Tracking::forward_phi_tolerance);
first_candidate += module.hitStart;
last_candidate = last_candidate == 0 ? first_candidate + 1 : first_candidate + last_candidate;
last_candidate = first_candidate + last_candidate;
}
return std::tuple<int, int> {first_candidate, last_candidate};
......
......@@ -35,39 +35,39 @@ __device__ void fill_candidates_impl(
const auto phi_window =
Velo::Tracking::phi_extrapolation_base + std::abs(hit_Zs[h1_index]) * Velo::Tracking::phi_extrapolation_coef;
int first_h0_bin = -1, last_h0_bin = -1;
int first_h0_bin = -1, size_h0_bin = 0;
if (m0_hitNums > 0) {
// Do a binary search for h0 candidates
first_h0_bin = binary_search_first_candidate(hit_Phis + m0_hitStarts, m0_hitNums, h1_phi, phi_window);
if (first_h0_bin != -1) {
// Find last h0 candidate
last_h0_bin = binary_search_second_candidate(
size_h0_bin = binary_search_second_candidate(
hit_Phis + m0_hitStarts + first_h0_bin, m0_hitNums - first_h0_bin, h1_phi, phi_window);
first_h0_bin += m0_hitStarts;
last_h0_bin = last_h0_bin == 0 ? first_h0_bin + 1 : first_h0_bin + last_h0_bin;
// size_h0_bin = (size_h0_bin == 0) ? 1 : size_h0_bin;
}
}
h0_candidates[2 * h1_index] = first_h0_bin;
h0_candidates[2 * h1_index + 1] = last_h0_bin;
h0_candidates[2 * h1_index + 1] = size_h0_bin;
int first_h2_bin = -1, last_h2_bin = -1;
int first_h2_bin = -1, size_h2_bin = 0;
if (m2_hitNums > 0) {
// Do a binary search for h2 candidates
first_h2_bin = binary_search_first_candidate(hit_Phis + m2_hitStarts, m2_hitNums, h1_phi, phi_window);
if (first_h2_bin != -1) {
// Find last h0 candidate
last_h2_bin = binary_search_second_candidate(
size_h2_bin = binary_search_second_candidate(
hit_Phis + m2_hitStarts + first_h2_bin, m2_hitNums - first_h2_bin, h1_phi, phi_window);
first_h2_bin += m2_hitStarts;
last_h2_bin = last_h2_bin == 0 ? first_h2_bin + 1 : first_h2_bin + last_h2_bin;
// size_h2_bin = (size_h2_bin == 0) ? 1 : size_h2_bin;
}
}
h2_candidates[2 * h1_index] = first_h2_bin;
h2_candidates[2 * h1_index + 1] = last_h2_bin;
h2_candidates[2 * h1_index + 1] = size_h2_bin;
}
}
......
......@@ -29,8 +29,8 @@ __device__ void process_modules(
// Prepare the first seeding iteration
// Load shared module information
if (threadIdx.x < 6) {
const auto module_number = first_module - threadIdx.x;
if (threadIdx.x < 4) {
const auto module_number = first_module - threadIdx.x - 2;
module_data[threadIdx.x].hitStart = module_hitStarts[module_number] - hit_offset;
module_data[threadIdx.x].hitNums = module_hitNums[module_number];
module_data[threadIdx.x].z = dev_velo_module_zs[module_number];
......@@ -57,15 +57,15 @@ __device__ void process_modules(
uint last_ttf = 0;
first_module -= 2;
while (first_module >= 4) {
while (first_module > 4) {
// Due to WAR between trackSeedingFirst and the code below
__syncthreads();
// Iterate in modules
// Load in shared
if (threadIdx.x < 6) {
const auto module_number = first_module - threadIdx.x;
if (threadIdx.x < 4) {
const auto module_number = first_module - threadIdx.x - 2;
module_data[threadIdx.x].hitStart = module_hitStarts[module_number] - hit_offset;
module_data[threadIdx.x].hitNums = module_hitNums[module_number];
module_data[threadIdx.x].z = dev_velo_module_zs[module_number];
......@@ -95,7 +95,8 @@ __device__ void process_modules(
tracks,
number_of_hits,
dev_atomics_velo,
ip_shift);
ip_shift,
first_module);
// Due to ttf_insert_pointer
__syncthreads();
......@@ -125,21 +126,17 @@ __device__ void process_modules(
const auto diff_ttf = last_ttf - prev_ttf;
// Process the last bunch of track_to_follows
for (int i = 0; i < (diff_ttf + blockDim.x - 1) / blockDim.x; ++i) {
const auto ttf_element = blockDim.x * i + threadIdx.x;