You can solve this in O(N log2 N) by dividing the first list in half to create a sublist of first-half numbers and a sublist of second-half numbers. Then:
Separate the other two lists as well into first-half and second-half versions, where the first-half version contains the first-half numbers in the same order, and the second-half version contains the second-half numbers in the same order.
Count all the valid pairings of a first-half number with a second-half number. You can do this in O(N log N) time.
Recurse separately on the first-half numbers and second-half numbers to count the pairings within each sublist.
All that remains, then, is to figure out how to do step (2). To aid in visualization, I will refer each number as a point, its position in list 2 as "x", and its position in list 3 as "y".
So, to find valid pairs of first-half and second-half points, first sort the points in each half by y, and initialize an empty order statistic tree that sorts points by x.
Then iterate through all the points in y order (i.e., the original list 3):
For each first-half point, add it to the order statistic tree; and
For each second-half point, retrieve the number of points in the tree with smaller x, and add it to the total.
Note that for each bunch of points you count, they all occur before the second-half point in list1, because they're first-half points. They all occur before the second-half point in list3, because the came earlier in y-order. And they all occur before the second-half point in list 2, because that's what we used the order statistic tree for.
Here's a somewhat-optimized C++ implementation with tests + timings:
#include <iostream>
#include <vector>
#include <stdint.h>
#include <algorithm>
#include <chrono>
void partitionSection(std::vector<int>& A, size_t pos, size_t len, int splitVal, std::vector<int>& scratch) {
scratch.clear();
size_t d = 0;
for (size_t i = 0; i < len; ++i) {
int v = A[pos + i];
if (v >= splitVal) {
scratch.push_back(v);
}
else {
A[pos + d] = v;
++d;
}
}
size_t s = 0;
for (;d < len; ++d, ++s) {
A[pos + d] = scratch[s];
}
}
uint64_t pairsIn3_helper(
std::vector<int>& X, std::vector<int>& Y,
std::vector<int> scratch,
const int pos, const int len
) {
if (len < 2) {
return 0;
}
const int mid = len >> 1;
uint64_t count = 0;
//Use scratch as a compact order statistic tree.
//It stores the number of items in each slot, followed by the
//number of items in each pair of slots, etc.
int levelStart = 0, levelLen = len;
while (levelLen > 0) {
levelStart += levelLen;
levelLen >>= 1;
}
scratch.assign(levelStart + len, 0);
const size_t xmapStart = levelStart;
// Add a lookup for x coord by z coord
for (int x = 0;x < len;++x) {
scratch[xmapStart + X[x + pos] - pos] = x;
}
for (int y = 0;y < len;++y) {
int z = Y[y + pos] - pos;
int x = scratch[xmapStart + z];
if (z < mid) {
// add val to the tree
levelStart = 0;
levelLen = len;
while (levelLen > x) {
scratch[levelStart + x] += 1;
x >>= 1;
levelStart += levelLen;
levelLen >>= 1;
}
}
else {
// count values < x
levelStart = 0;
levelLen = len;
while (levelLen) {
if (x & 1) {
count += scratch[levelStart + x - 1];
}
x >>= 1;
levelStart += levelLen;
levelLen >>= 1;
}
}
}
partitionSection(X, pos, len, pos + mid, scratch);
partitionSection(Y, pos, len, pos + mid, scratch);
count += pairsIn3_helper(X, Y, scratch, pos, mid);
count += pairsIn3_helper(X, Y, scratch, pos + mid, len - mid);
return count;
}
uint64_t pairsIn3(const std::vector<int>& X, const std::vector<int>& Y, std::vector<int>& Z) {
if (Z.size() < 2) {
return 0;
}
std::vector<int> scratch(Z.size() + 2, 0);
// replace each integer with its Z-position
for (size_t i = 0; i < Z.size(); ++i) {
scratch[Z[i]] = (int)i;
}
std::vector<int> newX, newY;
for (size_t i = 0; i < Z.size(); ++i) {
newX.push_back(scratch[X[i]]);
newY.push_back(scratch[Y[i]]);
}
return pairsIn3_helper(newX, newY, scratch, 0, Z.size());
}
uint64_t slow(const std::vector<int>& X, const std::vector<int>& Y, std::vector<int>& Z) {
uint64_t count = 0;
for (int z2 = 1; z2 < Z.size(); ++z2) {
int val2 = Z[z2];
size_t x2 = std::find(X.begin(), X.end(), val2) - X.begin();
size_t y2 = std::find(Y.begin(), Y.end(), val2) - Y.begin();
for (int z1 = 0; z1 < z2; ++z1) {
int val1 = Z[z1];
size_t x1 = std::find(X.begin(), X.end(), val1) - X.begin();
size_t y1 = std::find(Y.begin(), Y.end(), val1) - Y.begin();
if (x1 < x2 && y1 < y2) {
++count;
}
}
}
return count;
}
int main()
{
auto sizes = std::vector<int>({ 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000, 1000000, 2000000, 3000000});
for (auto N : sizes) {
std::cout << "N=" << N << std::endl;
std::vector<int> X, Y, Z;
for (int i = 1; i <= N; ++i) {
X.push_back(i);
Y.push_back(i);
Z.push_back(i);
}
std::random_shuffle(X.begin(), X.end());
std::random_shuffle(Y.begin(), Y.end());
std::random_shuffle(Z.begin(), Z.end());
if (N < 10000) {
auto t0 = std::chrono::high_resolution_clock::now();
auto slowCount = slow(X, Y, Z);
auto t1 = std::chrono::high_resolution_clock::now();
auto slowMs = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() * 0.001;
std::cout << "slow impl found " << slowCount << " in " << slowMs << " ms" << std::endl;
}
auto t2 = std::chrono::high_resolution_clock::now();
auto fastCount = pairsIn3(X, Y, Z);
auto t3 = std::chrono::high_resolution_clock::now();
auto fastMs = std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count() * 0.001;
std::cout << "fast impl found " << fastCount << " in " << fastMs << " ms" << std::endl;
}
}
And results on a Ryzen 5600x. For lists of 200,000 elements, it takes 137ms:
N=1000
slow impl found 121406 in 129.437 ms
fast impl found 121406 in 0.414 ms
N=2000
slow impl found 505122 in 939.681 ms
fast impl found 505122 in 1.177 ms
N=5000
slow impl found 3205125 in 13789.7 ms
fast impl found 3205125 in 2.544 ms
N=10000
fast impl found 12642459 in 5.007 ms
N=20000
fast impl found 50166685 in 10.621 ms
N=50000
fast impl found 313241083 in 29.631 ms
N=100000
fast impl found 1252007335 in 63.935 ms
N=200000
fast impl found 5013227960 in 137.03 ms
N=500000
fast impl found 31346510805 in 383.266 ms
N=1000000
fast impl found 124806065267 in 826.671 ms
N=2000000
fast impl found 499940410615 in 1782.94 ms
N=3000000
fast impl found 1124759267734 in 2924.1 ms