1

My goal is to rename all the files inside a directory with their MD5 hashes to make checking for duplicates much easier.

I currently have around 30,000 files I want to process, however, after testing with small batches of files to make sure the code works I run into this error

Error: EMFILE: too many open files ...

And yes I have tried to look at this and multiple other similar issues. node and Error: EMFILE, too many open files

I figured it has to do with how I open files and something to do with asynchronous operations, however, I have no clue how to correctly code in such a manner.

This is my original attempt at tackling this.

const md5hashtable = [];
async function processFilesMD5(routePath) {
    // Get files/folders in path
    await fs.readdirSync(routePath).forEach((file) => {
        const filepath = path.join(routePath, file);
        // Check if folder is dir to do tree walk
        fs.stat(filepath, async (err, stat) => {
            if (stat.isDirectory()) {
                await processFilesMD5(filepath);
            // Calculate md5 of file
            } else {
                let filename = path.basename(filepath).replace(path.extname(filepath), "")
                if (RegExp('^[a-f0-9]{32}$', 'gm').test(filename)){
                    if (md5hashtable.includes(filename)){
                        console.log(`\nFound dup: ${filename} loc: ${filepath}\n`)
                        fs.unlinkSync(filepath)
                    } else {
                        if (!(path.basename(filepath) === `${filename}${path.extname(filepath)}`)){
                            fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${filename}${path.extname(filepath)}`)
                        }
                        md5hashtable.push(filename)
                    }
                    countProcess++;
                } else {
                    countProcess++;
                    countOpen++;
                    let hash = crypto.createHash('md5')
                    let stream = fs.createReadStream(filepath)
                    console.log(`Created Stream with ID: ${countOpen}`)
                    await stream.on('data', function (data) {
                        hash.update(data, 'utf8')
                        countRead++;
                        // console.log(`Reading Stream with chunk ID: ${countRead}`)
                    })
                    await stream.on('end', function () {
                        countClose++;
                        // console.log(`Closing Steam with ID: ${countClose}`)
                        const md5name = hash.digest('hex')
                        if (md5hashtable.includes(md5name)){
                            console.log(`\nFound dup: ${md5name} loc: ${filepath}\n`)
                            fs.unlinkSync(filepath)
                        } else {
                            if (!(path.basename(filepath) === `${md5name}${path.extname(filepath)}`)){
                                fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${md5name}${path.extname(filepath)}`)
                            }
                            md5hashtable.push(md5name)
                        }
                        console.log(`File: ${filepath} has hash: ${md5name}`)
                        stream.destroy()
                    })
                }
            }
        });
    });
    console.log(`Current Route: ${routePath}\nTotal files processed: ${countProcess}\nFiles Opened: ${countOpen}\nChunks Read: ${countRead}\nFiles Closed: ${countClose}`)
}

processFilesMD5(`${path.join(__dirname, 'media')}`).then(() => {
    console.log('Done')
})

This is my second attempt to fix this issue, and I also cleaned it up for simplicity.

const md5hashtable = [];

function calculateMD5(filepath) {
    let hash = crypto.createHash('md5')
    let stream = fs.createReadStream(filepath)
    console.log(`Created Stream`)

    stream.on('data', function (data) {
        hash.update(data, 'utf8')
        console.log(`Reading Stream`)
    })

    stream.on('end', function () {
        const MD5hash = hash.digest('hex')
        if (dupHashCheck(MD5hash)){ // Hash already exsit
            console.log(`\nFound dup: ${filename} loc: ${filepath}\n`) 
            fs.unlink(filepath) // Deletes duplicate
        } else { // Hash doest not exsit
            md5hashtable.push(md5name)
        }
        console.log(`File: ${filepath}\nHash: ${md5name}\n`)
        stream.destroy()
        console.log(`Closing Steam`)
    })
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filepath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filepath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is folder, run recursivley
                    processImageRoute(filepath);
                        } else { // Continue
                            let filename = path.basename(filepath).replace(path.extname(filepath), "") // Get filename without extension
                            if (validateMD5(filename)){ // Filename is a valid md5 hash
                                if (dupHashCheck(filename)){ // Hash already exsit
                                    console.log(`\nFound dup: ${filename} loc: ${filepath}\n`) 
                                    fs.unlink(filepath) // Deletes duplicate
                                } else { // Hash doest not exsit
                                    md5hashtable.push(filename)
                                }
                            } else { // Isnt a valid md5 hash
                                calculateMD5(filepath)
                            }
                        }
                    })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)

Both of these codes do not work as they open too many files but in small batches, they work perfectly. Also, this is my first question to I am opened to any suggestions and comments.

Kvrn
  • 11
  • 5

1 Answers1

0

Following with what @codeness93 said promisifying the code; I made this

global.fs = require('fs-extra');

const md5hashtable = [];

function calculateMD5(filePath) {
    return new Promise((resolve, reject) => {
        let hash = crypto.createHash('md5')
        let stream = fs.createReadStream(filePath)

        stream.on('error', function (err) {
            reject(err);
        })

        stream.on('data', function (data) {
            hash.update(data, 'utf8')
        })

        stream.on('end', function () {
            stream.close();
            resolve(hash.digest('hex'));
        })
    });
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function renameFile(filePath, fileHash){
    try {
        fs.renameSync(filePath, `${filePath.replace(path.basename(filePath), "")}${fileHash}${path.extname(filePath)}`)
    } catch (e){
        throw new Error(e)
    }
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filePath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filePath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is folder, run recursively
                    processImageRoute(filePath);
                } else { // Continue
                    let fileName = path.basename(filePath).replace(path.extname(filePath), "") // Get fileName without extension
                    if (validateMD5(fileName)){ // fileName is a valid md5 hash
                        if (dupHashCheck(fileName)){ // Hash already exist
                            console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                            fs.unlink(filePath) // Deletes duplicate
                            } else { // Hash doest not exist
                                md5hashtable.push(fileName)
                            }
                    } else { // Isn't a valid md5 hash
                        await calculateMD5(filePath).then(function(fileHash){
                            if (validateMD5(fileHash)){
                                if (dupHashCheck(fileHash)){ // Hash already exist
                                    console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                                    fs.unlink(filePath) // Deletes duplicate
                                } else { // Hash doest not exist
                                    renameFile(filePath, fileHash); // Renames the file to its hash plus extension
                                    md5hashtable.push(fileHash)
                                }
                                console.log(`File: ${filePath}\nHash: ${fileHash}\n`)
                            } else {
                                throw new Error(`Unable to calculate hash for file: ${fileName}\nError: ${fileHash}\n`)
                            }
                        })
                    }
                }
            })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)

I am not sure if it was adding the promise thus adding a delay in opening streams vs reading them and subsequently closing them, or the substitution of fs with fs-extra that made it work, or both, or magic dust, but it works.

At the end it was able to process all 29088 Files totaling 400 GBs so I'll call that a success. Feel free to use, or leave suggestions.

Kvrn
  • 11
  • 5
  • 1
    Promisifying would favor fs.promises... over callbacks. Your code could be working but it's unnecessary complicated. Refector until there is no single callback left and all async code is done with async/await. – Wiktor Zychla Mar 23 '21 at 17:51