I was tasked with making a program that uploads a .csv to a NoSQL cluster. We were doing 2-5 gig files, and it was taking 4-8 hours. Well when we hit 17 gig files that was just to slow.
When I remade my program I realized working in batch made is much much faster. I could get a 17 gig file done in 6 hours. So then I decided to make a consumer-producer multithreading structure. This caused it to be just as slow as before. Although my program is fast and working great, I want to know why the producer-consumer construct was slower than a batch produce, batch consume method.
The batch looks like this:
int count = 0;
//
Row r;
while ((r = rm.getNextRow()) != null)
{
RowQueue.Enqueue(r);
while (RowQueue.Count <= ROWMAX)
{
if ((r = rm.getNextRow()) != null)
RowQueue.Enqueue(r);
else
break;
} //
int uniqueIdentifer = -1;
if (count > 1000)
{
PrintAndSavePosition(count, rm, positionQueue, true);
count = 0;
}
//give it some extra room to be safe
while (RowQueue.Count != 0)
{
r = RowQueue.Dequeue();
while (uniqueIdentifer == -1)
{
uniqueIdentifer = nsqw.tryPut(r);
if (uniqueIdentifer == -1)
Thread.Sleep(1);
}
count++;
}
positionQueue.Add(new Tuple<int, long>(uniqueIdentifer, rm.Position));
}
As compaired to
public void produceLoop(){
while (true)
{
while (RowQueue.Count <= ROWMAX && (r = rm.getNextRow()) != null){
RowQueue.Enqueue(r);
}
}
}
public void consumeLoop(){
while(true){
while (RowQueue.Count != 0)
{
RowQueue.TryDequeue(out r);
while (uniqueIdentifer == -1)
{
uniqueIdentifer = nsqw.tryPut(r);
if (uniqueIdentifer == -1)
Thread.Sleep(1);
}
count++;
}
positionQueue.Add(new Tuple<int, long>(uniqueIdentifer, rm.Position));
}
}
}
EDIT:
The bottom half are infinite loops, but i just left it like that for speed test.