Before writing this post I looked around for a library that could solve this problem. I didn't find much so I decided to try to write this.
My problem is the following:
I have two sets of points with coordinates x1, y1, and x2, y2. The sets have different number of elements. I want to find what is the average distance between all the elements in set 1 vs all the elements in the set 2 given a certain cutoff. This mean that, if two points (of the two different sets) are further than cutoff they should not be considered.
The easiest solution is to perform \$O(n^2)\$ search and then filter the results based on the distance, but it's inefficient.
I tried to write an algorithm that divide the space of the sets in squares of size "cutoff". For each point I can associate two indexes that tell me to which box the point belong. Looking at the indexes I can generate the lists of neighbors points and calculate the distances only between points that are in confining boxes.
#include <vector> #include <algorithm> #include <iostream> #include <time.h> #include <numeric> using namespace std; // euclidean distance double euc(double x, double y) { return sqrt(x * x + y * y); } //calculate the distance vector between two different sets of points // set 1 of coordinate x1, y1 // set 2 of coordinate x2, y2 vector <double> all_dist(vector <double>& x1, vector <double>& x2, vector <double>& y1, vector <double>& y2) { vector <double> d(x1.size()*x2.size()); for (int i = 0; i < x1.size(); ++i) { for (int j = 0; j < x2.size(); ++j) { d[i*x2.size()+j]=euc(x1[i] - x2[j], y1[i] - y2[j]); } } return d; } vector <double> dist(vector <double>& x1, vector <double>& x2, vector <double>& y1, vector <double>& y2, double cutoff) { //we divide the space of the vectors in squares of size cutoff // each square has two indexes, one for the x coordinates and one for the y // these vectors contain the indexes for each point for the vector x1 and y1 // means: by reading the content of box_x1[n] box_y1[n] we know to which spatial // box the point "n" belongs vector <int> box_x1(x1.size()); vector <int> box_y1(y1.size()); // these vectors contain the indexes for each point for the vector x2 and y2 vector <int> box_x2(x2.size()); vector <int> box_y2(y2.size()); vector <double> res; // we compute the maximum number of sections (or divisions) in the x and y dimension // we need to find the maximum and minimum value for each vector double maxx = max(*max_element(x1.begin(), x1.end()), *max_element(x2.begin(), x2.end())); double minx = min(*min_element(x1.begin(), x1.end()), *min_element(x2.begin(), x2.end())); double maxy = max(*max_element(y1.begin(), y1.end()), *max_element(y2.begin(), y2.end())); double miny = min(*min_element(y1.begin(), y1.end()), *min_element(y2.begin(), y2.end())); int max_box_x = int((maxx - minx) / (1.1 * cutoff)); int max_box_y = int((maxy - miny) / (1.1 * cutoff)); for (int i = 0; i < x1.size(); ++i) { box_x1[i]=(int((x1[i] - minx) / (1.1*cutoff) )); box_y1[i]=(int((y1[i] - miny) / (1.1*cutoff) )); } for (int i = 0; i < x2.size(); ++i) { box_x2[i]=(int((x2[i] - minx) / (1.1*cutoff))); box_y2[i]=(int((y2[i] - miny) / (1.1*cutoff))); } // we need to create the list of neighbors points for a specific box // we need to consider all the boxes that are neighboring a specific box // this mean looking at the boxes +1 and -1 for (int i = 0; i < max_box_x; ++i) { for (int j = 0; j < max_box_y; ++j) { vector <double> points_x1c, points_y1c, points_x2c, points_y2c; for (int k = 0; k < box_x1.size(); ++k) { if ((box_x1[k] == i || box_x1[k] == i + 1 || box_x1[k] == i - 1) && (box_y1[k] == j || box_y1[k] == j + 1 || box_y1[k] == j - 1)) { points_x1c.push_back(x1[k]); points_y1c.push_back(y1[k]); } } for (int k = 0; k < box_x2.size(); ++k) { if ((box_x2[k] == i || box_x2[k] == i + 1 || box_x2[k] == i - 1) && (box_y2[k] == j || box_y2[k] == j + 1 || box_y2[k] == j - 1)) { points_x2c.push_back(x2[k]); points_y2c.push_back(y2[k]); } } // now that we have the two list of points (we have four vectors) // we can calculate the distances between these points vector <double> temp = all_dist(points_x1c, points_x2c, points_y1c, points_y2c); // we still accept only the distances below the cutoff vector <double> temp2; for (int m = 0; m < temp.size(); ++m) if (temp[m] < cutoff) temp2.push_back(temp[m]); move(temp2.begin(), temp2.end(), back_inserter(res)); } } return res; } int main() { int num_el = 50000; double cutoff = 200; vector<double> x1(num_el); vector<double> y1(num_el); vector<double> x2(num_el/2); vector<double> y2(num_el/2); generate(x1.begin(), x1.end(), rand); generate(y1.begin(), y1.end(), rand); generate(x2.begin(), x2.end(), rand); generate(y2.begin(), y2.end(), rand); clock_t begin_time = clock(); vector <double> res = dist(x1, x2, y1, y2,cutoff); cout << float(clock() - begin_time) / CLOCKS_PER_SEC<<endl; cout << accumulate(res.begin(), res.end(), 0.0) /res.size() << endl; begin_time = clock(); res=all_dist(x1, x2, y1, y2); cout << float(clock() - begin_time) / CLOCKS_PER_SEC << endl; vector <double> res2; for (int i = 0; i < res.size(); ++i) if (res[i] < cutoff) res2.push_back(res[i]); cout << accumulate(res2.begin(), res2.end(), 0.0) / res2.size() << endl; } I wonder if there are some optimizations that can be applied to the code and I also wonder if, somewhere, there is a library that does what need. I tried to measure the speed of the simple \$O(n^2)\$ solution vs the "box solution" (\$O(n \log n)\$? I wish that). The box solution is faster but the result is not exactly the same as the \$O(n^2)\$. Is this an acceptable error in such algorithm?
I have put an image to explain my reasoning. The boxes should be squared of size cutoff (approximately). In the code we can read the variables max_box_x and max_box_y that, for the image, are respectively 4 and 3. The size of the vectors box_x1 and box_y1 are as big as one of the set. By looking at box_x1[n] and box_y1[n] we can tell to which box the particle with index n belongs.
