I am new to multithreaded programming. Below is my attempt to read and process a large text file, using one thread to read and multiple threads to process it line by line. For now I use a trivial processing function; in my real application the processing function takes longer and then writes the processed result to a new file.
The code only works intermittently, so there is probably a race condition somewhere. I tried debugging with gdb
and extensive logging, but could not figure out where the bug is. Please help; any suggestion is appreciated.
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <semaphore.h>
#include <string.h>
#include <zlib.h>
#define READER_SIZE 30
#define NUM_PROCESSORS 4
// State shared between the reader thread and the processor threads
// READER_SIZE is the number of slots in reader_buffer; NUM_PROCESSORS is the
// number of processor (consumer) threads that main() starts.
//------------------------------------------------
char reader_buffer[READER_SIZE][64];
int reader_in = 0; // index of the slot the reader fills next (wraps modulo READER_SIZE)
int reader_out = 0; // index of the slot a processor takes next (wraps modulo READER_SIZE)
sem_t reads_mutex; // binary semaphore (initialised to 1 in main) guarding reader_buffer, reader_in and reader_out
sem_t reads_full; // counts filled slots; initialised to 0 in main
sem_t reads_empty; // counts free slots; initialised to READER_SIZE in main
gzFile fileIn;
//------------------------------------------------
// State shared between the processor threads for writing output
//------------------------------------------------
pthread_mutex_t writes_mutex; // serialises gzputs() calls on fileOut
gzFile fileOut;
//------------------------------------------------
// Function reader
void *producer()
{
while (!gzeof(fileIn))
{
printf("reader turn ");
sem_wait(&reads_empty);
sem_wait(&reads_mutex);
gzgets(fileIn, reader_buffer[reader_in], 64);
// printf("reader is working!\n");
// printf("producer read in: %d, %s \n", reader_in, reader_buffer[reader_in]);
int empty, full;
sem_getvalue(&reads_empty, &empty);
sem_getvalue(&reads_full, &full);
reader_in = (reader_in + 1) % READER_SIZE;
printf("reader_in: %d, empty is %d, full is %d\n", reader_in, empty, full + 1);
sem_post(&reads_mutex);
sem_post(&reads_full);
}
}
void *consumer(void *id)
{
int consumer_id = *((int *)id);
while (!gzeof(fileIn))
{
printf("consumer %d turn ", consumer_id);
sem_wait(&reads_full);
sem_wait(&reads_mutex);
// copy the data
char processor_buffer[64];
char cell_barcode[17];
strcpy(processor_buffer, reader_buffer[reader_out]);
reader_out = (reader_out + 1) % READER_SIZE;
int empty, full;
sem_getvalue(&reads_empty, &empty);
sem_getvalue(&reads_full, &full);
printf("reader_out: %d, empty is %d, full is %d \n", reader_out, empty + 1, full);
sem_post(&reads_mutex);
sem_post(&reads_empty);
// processing, here I made trivial to take substring
if(processor_buffer[0] == '@')
{
pthread_mutex_lock(&writes_mutex);
gzputs(fileOut, processor_buffer);
pthread_mutex_unlock(&writes_mutex);
}
}
}
int main()
{
fileIn = gzopen("temp.gz", "rb");
fileOut = gzopen("test.gz", "wb");
int *proc_id = malloc(NUM_PROCESSORS * sizeof(int));
pthread_t reader_thread;
pthread_t processor_thread[NUM_PROCESSORS];
// initialize semaphores
sem_init(&reads_mutex, 0, 1);
sem_init(&reads_full, 0, 0);
sem_init(&reads_empty, 0, READER_SIZE);
pthread_mutex_init(&writes_mutex, NULL);
pthread_create(&reader_thread, NULL, producer, NULL);
for (int i = 0; i < NUM_PROCESSORS; i++)
{
proc_id[i] = i;
pthread_create(&processor_thread[i], NULL, consumer, &proc_id[i]);
}
// wait for the reader and processor threads to finish
pthread_join(reader_thread, NULL);
for (int i = 0; i < NUM_PROCESSORS; i++)
{
pthread_join(processor_thread[i], NULL);
}
sem_destroy(&reads_mutex);
sem_destroy(&reads_full);
sem_destroy(&reads_empty);
gzclose(fileIn);
gzclose(fileOut);
return 0;
}