I am trying to return a list of all the file names (in each existing folder) on a particular website. My web application uses Node.js, Express, Cheerio, and Request for web scraping. My code first gets a list of all the folder names. After retrieving the folder names, it requests each folder to get a list of its file names and stores them in the 'files' array. Finally, the 'files' array is what will be sent to the client side.
Right now I am having a big issue with asynchronous execution: my request always returns an empty list of 'files'. I have the Q node module installed and have tried using promises, but have had no luck getting the results I want. I am still new to Node.js and would love it if someone could help me out. :)
exports.getAllImages = function(req, res) {
var folders = [];
var files = [];
//Step 1: Get folder names and store all of them in the 'folders' array
var foldersUrl = 'http://students.washington.edu/jmzhwng/Images/';
request(foldersUrl, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a:contains('-')").filter(function(){
var data = $(this)[0].attribs.href;
folders.push(data);
})
//Step 2: Using the 'folders' array, get file names in each folder and store all of them in the 'files' array
for (var i=0; i < folders.length; i++) {
var imagesUrl = 'http://students.washington.edu/jmzhwng/Images/' + folders[i];
request(imagesUrl, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
$("a:contains('.')").filter(function(){
var data = $(this)[0].attribs.href;
files.push(data);
})
}
})
}
//Step 3: Return all file names to client-side
res.json({
images: files
}, 200);
console.log('GET ALL IMAGES - ' + JSON.stringify(files));
}
})
For better readability or support, you can view the JSFiddle I created here: http://jsfiddle.net/fKGrm/