1

this is a code for chunk.c file which divides a large file (filename.txt) into smaller chunked files by lines.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>

#define DEFAULT_PREFIX "x"
#define DEFAULT_CHUNK_SIZE 1000
#define ALPHABET_SIZE 26
#define MAX_DIGITS 2

void print_usage() {
    printf("Usage: chunk [-l line_count | -w word_count | -c character_count] [-p prefix] [-s suffix] [-f filename.txt | < filename.txt]\n");
}

int main(int argc, char *argv[]) {
    char *prefix = DEFAULT_PREFIX;
    int chunk_size = DEFAULT_CHUNK_SIZE;
    int suffix_start = 0;
    char *filename = NULL;

    // Parse command line arguments
    int opt;
    while ((opt = getopt(argc, argv, "l:p:s:f:")) != -1) {
        switch (opt) {
            case 'l':
                chunk_size = atoi(optarg);
                break;
  
            case 'p':
                prefix = optarg;
                break;
  
            case 's':
                suffix_start = atoi(optarg);
                break;
  
            case 'f':
                filename = optarg;
                break;
  
            default:
                print_usage();
                return 1;
        }
    }

    // Open input file
    int input_fd = STDIN_FILENO;
    if (filename != NULL) {
        input_fd = open(filename, O_RDONLY);
        if (input_fd == -1) {
            printf("Error: could not open file '%s': %s\n", filename, strerror(errno));
            return -1;
        }
    }

    // Read input file and write output files
    int line_count = 0;
    int chunk_count = 0;
    char suffix[MAX_DIGITS + 1];
    suffix[MAX_DIGITS] = '\0';
    int output_fd = -1;

    while (1) {
        if (line_count == 0) {
            // Close previous output file
            if (output_fd != -1) {
                close(output_fd);
                output_fd = -1;
            }

            // Open new output file (get new filename)
            snprintf(suffix, MAX_DIGITS + 1, "%02d", suffix_start + chunk_count);
            char *filename = malloc(strlen(prefix) + strlen(suffix) + 1);
            strcpy(filename, prefix);
            strcat(filename, suffix);
            
            output_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | 
                        S_IRGRP | S_IWGRP | S_IROTH);
            if (output_fd == -1) {
                printf("Error: could not create file '%s': %s\n", filename, 
                        strerror(errno));
                return -1;
            }
            free(filename);

            chunk_count++;
        } // close if loop 

        // Read input
        char buffer[chunk_size];
        ssize_t bytes_read = read(input_fd, buffer, chunk_size);
        if (bytes_read == -1) {
            printf("Error: could not read input: %s\n", strerror(errno));
            return -1;
        }
        if (bytes_read == 0) {
            break;
        }
        
        // write output
        ssize_t bytes_written = write(output_fd, buffer, bytes_read);
        if (bytes_written == -1) {
            printf("Error: could not write output : %s\n", strerror(errno));
            return -1;
        }
    
        // Update line count
        for (int i = 0; i < bytes_written; i++) {
            if (buffer[i] == '\n') {
                line_count++;
            }
        }
        // Check if it's time to start a new chunk
        if (line_count >= chunk_size) {
            line_count = 0;
        }
    } // close while loop

    // Close input and output files
    if (input_fd != STDIN_FILENO) {
        close(input_fd);
    }
    if (output_fd != -1) {
        close(output_fd);
    }

    return 0;
} // close main

the example run and expected result is

$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $?   # check exit status
0
$ wc *part* z_answer.jok.txt 
  100   669  4052 part-00
  100   725  4221 part-01
  100   551  3373 part-02
  100   640  3763 part-03
  100   588  3685 part-04
  100   544  3468 part-05
   90   473  3017 part-06
  690  4190 25579 z_answer.jok.txt
 1380  8380 51158 total

but when i run that code above, the result came out like this.

$ chunk -l 100 -f z_answer.jok.txt -p part- -s 00
$ echo $?   # check exit status
0
$ wc *part* z_answer.jok.txt 
  102   675  4100 part-00
  101   745  4300 part-01
  100   554  3400 part-02
  101   640  3800 part-03
  103   609  3800 part-04
  100   534  3400 part-05
   83   434  2779 part-06
  690  4190 25579 z_answer.jok.txt
 1380  8381 51158 total

i thought the problem is in the // Update line count section and tried to fix but still stucked. any idea that can divide line by number that the user set?

chqrlie
  • 131,814
  • 10
  • 121
  • 189
springbook
  • 21
  • 2
  • 1
    Please don't write too much code at once. Try to solve the core problem first, piece by piece, and using hard-coded values. Then you can add other things like argument parsing. That will make is *much* easier to [*debug*](https://ericlippert.com/2014/03/05/how-to-debug-small-programs/) your programs. – Some programmer dude Mar 22 '23 at 07:46
  • 4
    Well, you are reading and writing a fixed size of bytes and counting the lines only after each chunk of data. So of course, if your linecount is already 99 and the next chunk of data contains 3 line breaks you will end up with a linecount of 102. If you want to have a fixed number of lines in each of the parts, you have to check the number of line breaks in your read buffer *before* you write it to the output file and only write the respective part of the buffer, which fits within your limit – derpirscher Mar 22 '23 at 07:47
  • 2
    Also, you have to decide if you want a *line* based split of the input file, or a *block* based split. You really can't have both, as lines in a text-file are notoriously non-conforming to specific lengths. Also, what if the input is *not* a text-file? What is your actual assignment? What are your requirements and limitations? Please [edit] your question to include those details. – Some programmer dude Mar 22 '23 at 07:49
  • 1
    The simplest way to get a fixed linecount is probably reading the file byte by byte. So you will immediately encounter line breaks and can switch to the next output file when necessary – derpirscher Mar 22 '23 at 07:51
  • `chunk_size` and lines per file should be separate parameters. – chux - Reinstate Monica Mar 22 '23 at 10:04

1 Answers1

0

OP needs a new algorithm.

Per each buffer, multiple times: look for a \n, write the line and maybe close/open a file.

// Pseudo code
read_chunk = 4k (Size is independent of lines/file)
while ((length = read(buffer, read_chunk)) > 0) {
  if needed, open next destination file
  start = buffer
  end = start + length
  while (start < end) {
    line_end = strchr(start, '\n')
    if (line_end) line_end++; line_count++;
    else line_end = end
    // write a (maybe partial) line 
    write(start, line_end - start)
    if (line_count big enough)
      reset line_count
      close destination file
    start = line_end
  }
}
close file, if needed
chux - Reinstate Monica
  • 143,097
  • 13
  • 135
  • 256