I made these different programs in different programming languages to count the number of lines of a file, and it turns out that the output differs according to the program, but the strange thing is that some programs have the same results, I was testing them with a 6gb utf-8 xml file with about 146 million lines.
# Python
# Output -> 146114085 lines
import time
lines = 0
start = time.perf_counter()
with open('file_path') as myfile:
for line in myfile:
lines += 1
print("{} lines".format(lines))
end = time.perf_counter()
elapsed = end - start
print(f'Elapsed time: {elapsed:.3f} seconds')
// Java
// Output -> 146114085 lines (just as with python)
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
public class Main {
public static void main(String[] args) {
try {
long startTime = System.currentTimeMillis();
int BUFFER_SIZE = 1024*1024;
String filePath = "file_path";
FileReader file = file = new FileReader(filePath);
BufferedReader reader = new BufferedReader(file, BUFFER_SIZE);
long lines = reader.lines().count();
reader.close();
System.out.println("The number of lines is " + lines);
long elapsedTime = System.currentTimeMillis() - startTime;
System.out.println("Duration in seconds: " + elapsedTime/1000);
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
// Rust
// Output -> 146113746 lines
use std::fs::File;
use std::io::{BufRead, BufReader, Error, Read};
use std::time::Instant;
fn main() {
let file_path = "file_path";
let buffer_size = 1024*1024;
let start = Instant::now();
if let Err(err) = read_file(buffer_size, file_path) {
println!("{}", err);
}
let duration = start.elapsed();
println!("The function took {} seconds to execute", duration.as_secs());
}
fn read_file(buffer_size: usize, file_path: &str) -> Result<(), Error> {
let file = File::open(file_path)?;
let reader = BufReader::with_capacity(buffer_size, file);
let lines = reader.lines().fold(0, |sum, _| sum + 1);
println!("Number of lines {}", lines);
Ok(())
}
// C
// Output -> 146113745 lines (one line less than rust output)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char *argv[]) {
// start time
clock_t start = clock();
// File path
const char* file_path = "file_path";
// Open the file for reading
FILE *fp = fopen(file_path, "r");
// Allocate a buffer to hold the data
const size_t BUFFER_SIZE = 1024*1024;
char *buffer = malloc(BUFFER_SIZE);
// Declare the number of lines variable
unsigned int lines = 0;
// Read the data in chunks
while (!feof(fp)) {
// Read a chunk of data from the file
size_t bytes_read = fread(buffer, 1, BUFFER_SIZE, fp);
// Process the data here...
for (int i = 0; i < bytes_read; i++) {
if (buffer[i] == '\n') {
lines++;
}
}
}
printf("The number of lines %u\n", lines);
// Clean up
free(buffer);
fclose(fp);
// End
clock_t end = clock();
// Calculate the elapsed time in seconds
double elapsed = (double) ((end - start) / CLOCKS_PER_SEC);
printf("Elapsed time: %f seconds", elapsed);
return 0;
}
Finally, the command wc
Output -> 146113745 lines (just as with C)
wc -l file_path
I think the correct answer is Rust's, because it has one more than wc/C, and it is the last line that has no line change as it reaches the end of the file. The cases that cause me confusion are java and python.