I'm trying to optimize an algorithm (Lattice Boltzmann) for parallel computing using C++ AMP. And looking for some suggestions to optimize the memory layout, just found out that removing one parameter from the structure into another vector (the blocked vector) gave and increase of about 10%.
Anyone got any tips that can further improve this, or something i should take into consideration? Below is the most time consuming function that is executed for each timestep, and the structure used for the layout.
struct grid_cell { // int blocked; // Define if blocked float n; // North float ne; // North-East float e; // East float se; // South-East float s; float sw; float w; float nw; float c; // Center }; int collision(const struct st_parameters param, vector<struct grid_cell> &node, vector<struct grid_cell> &tmp_node, vector<int> &obstacle) { int x,y; int i = 0; float c_sq = 1.0f/3.0f; // Square of speed of sound float w0 = 4.0f/9.0f; // Weighting factors float w1 = 1.0f/9.0f; float w2 = 1.0f/36.0f; int chunk = param.ny/20; float total_density = 0; float u_x,u_y; // Avrage velocities in x and y direction float u[9]; // Directional velocities float d_equ[9]; // Equalibrium densities float u_sq; // Squared velocity float local_density; // Sum of densities in a particular node for(y=0;y<param.ny;y++) { for(x=0;x<param.nx;x++) { i = y*param.nx + x; // Node index // Dont consider blocked cells if (obstacle[i] == 0) { // Calculate local density local_density = 0.0; local_density += tmp_node[i].n; local_density += tmp_node[i].e; local_density += tmp_node[i].s; local_density += tmp_node[i].w; local_density += tmp_node[i].ne; local_density += tmp_node[i].se; local_density += tmp_node[i].sw; local_density += tmp_node[i].nw; local_density += tmp_node[i].c; // Calculate x velocity component u_x = (tmp_node[i].e + tmp_node[i].ne + tmp_node[i].se - (tmp_node[i].w + tmp_node[i].nw + tmp_node[i].sw)) / local_density; // Calculate y velocity component u_y = (tmp_node[i].n + tmp_node[i].ne + tmp_node[i].nw - (tmp_node[i].s + tmp_node[i].sw + tmp_node[i].se)) / local_density; // Velocity squared u_sq = u_x*u_x +u_y*u_y; // Directional velocity components; u[1] = u_x; // East u[2] = u_y; // North u[3] = -u_x; // West u[4] = - u_y; // South u[5] = u_x + u_y; // North-East u[6] = -u_x + u_y; // North-West u[7] = -u_x - u_y; // South-West u[8] = u_x - u_y; // South-East // Equalibrium densities // Zero velocity density: weight w0 d_equ[0] = w0 * local_density * (1.0f - u_sq / (2.0f * c_sq)); // Axis speeds: weight w1 d_equ[1] = w1 * local_density * (1.0f + u[1] / c_sq + (u[1] * u[1]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[2] = w1 * local_density * (1.0f + u[2] / c_sq + (u[2] * u[2]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[3] = w1 * local_density * (1.0f + u[3] / c_sq + (u[3] * u[3]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[4] = w1 * local_density * (1.0f + u[4] / c_sq + (u[4] * u[4]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); // Diagonal speeds: weight w2 d_equ[5] = w2 * local_density * (1.0f + u[5] / c_sq + (u[5] * u[5]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[6] = w2 * local_density * (1.0f + u[6] / c_sq + (u[6] * u[6]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[7] = w2 * local_density * (1.0f + u[7] / c_sq + (u[7] * u[7]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); d_equ[8] = w2 * local_density * (1.0f + u[8] / c_sq + (u[8] * u[8]) / (2.0f * c_sq * c_sq) - u_sq / (2.0f * c_sq)); // Relaxation step node[i].c = (tmp_node[i].c + param.omega * (d_equ[0] - tmp_node[i].c)); node[i].e = (tmp_node[i].e + param.omega * (d_equ[1] - tmp_node[i].e)); node[i].n = (tmp_node[i].n + param.omega * (d_equ[2] - tmp_node[i].n)); node[i].w = (tmp_node[i].w + param.omega * (d_equ[3] - tmp_node[i].w)); node[i].s = (tmp_node[i].s + param.omega * (d_equ[4] - tmp_node[i].s)); node[i].ne = (tmp_node[i].ne + param.omega * (d_equ[5] - tmp_node[i].ne)); node[i].nw = (tmp_node[i].nw + param.omega * (d_equ[6] - tmp_node[i].nw)); node[i].sw = (tmp_node[i].sw + param.omega * (d_equ[7] - tmp_node[i].sw)); node[i].se = (tmp_node[i].se + param.omega * (d_equ[8] - tmp_node[i].se)); } } } return 1; }