The files 1..64 are each 160 MBytes and stored in a RAM disk.
Generated by:
seq 120 | parallel -k 'seq {}0000000 {}9999999 | fmt -30' | head -c 10G > 10G parallel --pipepart --block -1 -a 10G -j 64 'cat > {#}' nocat:
#!/bin/bash export LC_ALL=C sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 1; sort ) < 1) \ <((rm 2; sort ) < 2) ) \ <(sort -m \ <((rm 3; sort ) < 3) \ <((rm 4; sort ) < 4) ) ) \ <(sort -m \ <(sort -m \ <((rm 5; sort ) < 5) \ <((rm 6; sort ) < 6) ) \ <(sort -m \ <((rm 7; sort ) < 7) \ <((rm 8; sort ) < 8) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 9; sort ) < 9) \ <((rm 10; sort ) < 10) ) \ <(sort -m \ <((rm 11; sort ) < 11) \ <((rm 12; sort ) < 12) ) ) \ <(sort -m \ <(sort -m \ <((rm 13; sort ) < 13) \ <((rm 14; sort ) < 14) ) \ <(sort -m \ <((rm 15; sort ) < 15) \ <((rm 16; sort ) < 16) ) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 17; sort ) < 17) \ <((rm 18; sort ) < 18) ) \ <(sort -m \ <((rm 19; sort ) < 19) \ <((rm 20; sort ) < 20) ) ) \ <(sort -m \ <(sort -m \ <((rm 21; sort ) < 21) \ <((rm 22; sort ) < 22) ) \ <(sort -m \ <((rm 23; sort ) < 23) \ <((rm 24; sort ) < 24) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 25; sort ) < 25) \ <((rm 26; sort ) < 26) ) \ <(sort -m \ <((rm 27; sort ) < 27) \ <((rm 28; sort ) < 28) ) ) \ <(sort -m \ <(sort -m \ <((rm 29; sort ) < 29) \ <((rm 30; sort ) < 30) ) \ <(sort -m \ <((rm 31; sort ) < 31) \ <((rm 32; sort ) < 32) ) ) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 33; sort ) < 33) \ <((rm 34; sort ) < 34) ) \ <(sort -m \ <((rm 35; sort ) < 35) \ <((rm 36; sort ) < 36) ) ) \ <(sort -m \ <(sort -m \ <((rm 37; sort ) < 37) \ <((rm 38; sort ) < 38) ) \ <(sort -m \ <((rm 39; sort ) < 39) \ <((rm 40; sort ) < 40) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 41; sort ) < 41) \ <((rm 42; sort ) < 42) ) \ <(sort -m \ <((rm 43; sort ) < 43) \ <((rm 44; sort ) < 44) ) ) \ <(sort -m \ <(sort -m \ <((rm 45; sort ) < 45) \ <((rm 46; sort ) < 46) ) \ <(sort -m \ <((rm 47; sort ) < 47) \ <((rm 48; sort ) < 48) ) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 49; sort ) < 49) \ <((rm 50; sort ) < 50) ) \ <(sort -m \ <((rm 51; sort ) < 51) \ <((rm 52; sort ) < 52) ) ) \ <(sort -m \ <(sort -m \ <((rm 53; sort ) < 53) \ <((rm 54; sort ) < 54) ) \ <(sort -m \ <((rm 55; sort ) < 55) \ <((rm 56; sort ) < 56) ) ) ) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 57; sort ) < 57) \ <((rm 58; sort ) < 58) ) \ <(sort -m \ <((rm 59; sort ) < 59) \ <((rm 60; sort ) < 60) ) ) \ <(sort -m \ <(sort -m \ <((rm 61; sort ) < 61) \ <((rm 62; sort ) < 62) ) \ <(sort -m \ <((rm 63; sort ) < 63) \ <((rm 64; sort ) < 64) ) ) ) ) ) | md5sum withcat:
#!/bin/bash export LC_ALL=C sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 1; sort ) < 1) \ <((rm 2; sort ) < 2) | cat) \ <(sort -m \ <((rm 3; sort ) < 3) \ <((rm 4; sort ) < 4) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 5; sort ) < 5) \ <((rm 6; sort ) < 6) | cat) \ <(sort -m \ <((rm 7; sort ) < 7) \ <((rm 8; sort ) < 8) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 9; sort ) < 9) \ <((rm 10; sort ) < 10) | cat) \ <(sort -m \ <((rm 11; sort ) < 11) \ <((rm 12; sort ) < 12) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 13; sort ) < 13) \ <((rm 14; sort ) < 14) | cat) \ <(sort -m \ <((rm 15; sort ) < 15) \ <((rm 16; sort ) < 16) | cat) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 17; sort ) < 17) \ <((rm 18; sort ) < 18) | cat) \ <(sort -m \ <((rm 19; sort ) < 19) \ <((rm 20; sort ) < 20) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 21; sort ) < 21) \ <((rm 22; sort ) < 22) | cat) \ <(sort -m \ <((rm 23; sort ) < 23) \ <((rm 24; sort ) < 24) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 25; sort ) < 25) \ <((rm 26; sort ) < 26) | cat) \ <(sort -m \ <((rm 27; sort ) < 27) \ <((rm 28; sort ) < 28) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 29; sort ) < 29) \ <((rm 30; sort ) < 30) | cat) \ <(sort -m \ <((rm 31; sort ) < 31) \ <((rm 32; sort ) < 32) | cat) | cat) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 33; sort ) < 33) \ <((rm 34; sort ) < 34) | cat) \ <(sort -m \ <((rm 35; sort ) < 35) \ <((rm 36; sort ) < 36) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 37; sort ) < 37) \ <((rm 38; sort ) < 38) | cat) \ <(sort -m \ <((rm 39; sort ) < 39) \ <((rm 40; sort ) < 40) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 41; sort ) < 41) \ <((rm 42; sort ) < 42) | cat) \ <(sort -m \ <((rm 43; sort ) < 43) \ <((rm 44; sort ) < 44) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 45; sort ) < 45) \ <((rm 46; sort ) < 46) | cat) \ <(sort -m \ <((rm 47; sort ) < 47) \ <((rm 48; sort ) < 48) | cat) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 49; sort ) < 49) \ <((rm 50; sort ) < 50) | cat) \ <(sort -m \ <((rm 51; sort ) < 51) \ <((rm 52; sort ) < 52) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 53; sort ) < 53) \ <((rm 54; sort ) < 54) | cat) \ <(sort -m \ <((rm 55; sort ) < 55) \ <((rm 56; sort ) < 56) | cat) | cat) | cat) \ <(sort -m \ <(sort -m \ <(sort -m \ <((rm 57; sort ) < 57) \ <((rm 58; sort ) < 58) | cat) \ <(sort -m \ <((rm 59; sort ) < 59) \ <((rm 60; sort ) < 60) | cat) | cat) \ <(sort -m \ <(sort -m \ <((rm 61; sort ) < 61) \ <((rm 62; sort ) < 62) | cat) \ <(sort -m \ <((rm 63; sort ) < 63) \ <((rm 64; sort ) < 64) | cat) | cat) | cat) | cat) | cat) | cat | md5sum The only difference is that in withcat every sort -m is piped to cat.
The "useless use of cat"-people will make you believe withcat would be slower than nocat. However, the opposite is true by a wide margin:
$ time bash nocat c933d81faea7b8dec8eb64ca0b044d74 - real 3m40.854s user 2m48.687s sys 0m49.135s $ time bash withcat c933d81faea7b8dec8eb64ca0b044d74 - real 2m21.812s user 2m16.651s sys 1m36.135s The test is run on a 64-core machine, that does nothing else. Everything is in RAM (so this is not due to slow disks). Every test was run 3 times, and the best time is given above. All three tests completed within 5 seconds of the best time (so it is not a fluke).
Why is it faster to pipe the output to cat?
Edit
Does cat group input in bigger chunks? And/or does sort flush output for every line?
To test this I tried:
$ strace -ff sort -m <(sort 1) <(sort 2) 2>fromsort | cat >/dev/null $ strace -ff sort -m <(sort 1 | cat ) <(sort 2 | cat) 2>fromcat | cat >/dev/null If cat made it into bigger chunks, we would expect read to return larger chunks. But it does not:
$ grep -E 'read|write' fromsort |field 1,5|sort | uniq -c 1 openat(AT_FDCWD, 3 8 pread64(3, = 1 read(3, 3771 40989 read(3, 4096 2 read(3, 832 1 read(3, unknown 1 read(4, 0 1 read(4, 2241 40959 read(4, 4096 1 write(1, 1916 81949 write(1, 4096 $ grep -E 'read|write' fromcat |field 1,5|sort | uniq -c 1 openat(AT_FDCWD, 3 8 pread64(3, = 1 read(3, 3771 40989 read(3, 4096 2 read(3, 832 1 read(3, unknown 1 read(4, 2241 40959 read(4, 4096 1 read(4, unknown 1 write(1, 1916 81949 write(1, 4096 In both cases both read and write are 4K.
(Incidentally, sort does read (much) bigger chunks if reading from a file and not from a pipe, but that is not the case here).
Edit 2
The goal of the above is to demonstrate that an additional cat is not always useless; and to figure out what causes this.
The goal is not to sort data.
But if your goal was to sort data, why not just use sort's built-in --parallel?
By default sort seems to use --parallel 8 on a 64-core machine. top shows it uses up to 800% CPU. You can force it to use 64 cores with --parallel 64:
$ time sort {1..64} | md5sum real 9m4.005s user 29m56.454s sys 5m49.560s $ time sort --parallel 64 {1..64} | md5sum real 6m50.332s user 35m55.040s sys 11m37.609s So GNU sort's --parallel is much slower than the above. The above is now available as parsort: http://git.savannah.gnu.org/cgit/parallel.git/tree/src/parsort
sort -m fileA fileBis alternating blocking reads fromfileAwith those fromfileBand adding another buffering layer (which a UUoC is doing, by putting another pipe in between) may cause it to always have what to read from the pipe, and not have to wait for the producer to put it there. But that is simply a fluke; the shells process substs and general purpose tools like sort are not appropriate for taking advantage of multiprocessing machines (even of my "4-core" phone, much less of a 64-core monster)sort -m. The innersorts could use the built-in parallelization, but they only run at 100% for ~15 secs. So: Yes, it sucks so bad.