0

I am trying to learn JS/Puppeteer by building a simple web scraper that scrapes book information for educational purposes. I am trying to get the web scraper to enter UPC numbers from a CSV file into the search bar of a book website. I managed to get the web scraper to scrape the website when I use a single UPC number.

But I have a CSV with a list of UPCs and would love for the web scraper:

  1. to read the CSV file,
  2. grab the UPC from first line,
  3. search for the UPC on website,
  4. scrape the information,
  5. grab the UPC from 2nd line,
  6. repeat 3, 4

Sample CSV:

DATE,QUANTITY,NAME,CODECONTENT,CODETYPE
2021-10-13 20:16:44 +1100,1,"Book 1","9781250035288",9
2021-10-13 20:16:40 +1100,1,"Book 2","9781847245601",9
2021-10-13 20:16:35 +1100,1,"Book 3","9780007149247",9
2021-10-13 20:16:30 +1100,1,"Book 4","9780749958084",9
2021-10-13 20:16:26 +1100,1,"Book 5","9781405920384",9

This is my code so far. I am stuck at the async function for the CSV parser, which gives me an undefined result when I do a

console.log(allupcs);

Plus I am not sure how to get the

await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');

to accept the UPCs

See code below:

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');

/**
 * Starts streaming Book_Bulk.csv through the CSV parser and collects the
 * CODECONTENT column of every row (coerced to a number by the unary plus)
 * into a local array, which is logged once the stream ends.
 *
 * This function contains no `return` and no `await`: it only sets the stream
 * up and finishes before any 'data' or 'end' handler has run. Awaiting it
 * therefore always yields `undefined`, never the collected UPCs.
 */
async function getupcs(){
var upcData=[];
fs.createReadStream('Book_Bulk.csv')
    // NOTE(review): csv-parser documents `separator`, not `delimiter`, so this
    // option is presumably ignored and ',' is used - confirm against its README.
    .pipe(parse({delimiter: ':'}))
    .on('data', function(csvrow) {
        // console.log(+csvrow.CODECONTENT);
        //do something with csvrow
        upcData.push(+csvrow.CODECONTENT);        
    })
    .on('end',function() {
      //do something with csvData
      // A `return` here would only return from this callback, not from getupcs().
      // return upcData;
      console.log(upcData);
    });
}

/**
 * Scrapes one hard-coded book from bookdepository.com and logs its details.
 *
 * Launches a visible browser, searches for a single ISBN/UPC, reads title,
 * author, genre, format, publisher, year and price from the page the search
 * lands on, and prints each value to the console. The browser is closed
 * afterwards, also when scraping fails.
 *
 * @returns {Promise<void>} Resolves once the values are logged and the browser is closed.
 * @throws {Error} If a required element never appears or no price element is found.
 */
async function main(){

  // const allupcs = await upcData();

  // console.log(allupcs);

  const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized']});

  try {
    const page = await browser.newPage();

    // Waits for `selector` to appear, then returns the innerText of its first match.
    const readText = async (selector, options) => {
      await page.waitForSelector(selector, options);
      return page.$eval(selector, (el) => el.innerText);
    };

    await page.goto('https://www.bookdepository.com/');
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input','9781509847556');
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

    //Title (textContent, unlike the other fields which use innerText)
    await page.waitForSelector('.item-info h1');
    const title = await page.$eval('.item-info h1', h1 => h1.textContent);

    const author = await readText('div.author-info.hidden-md > span > a > span');
    const genre = await readText('.active a');
    const format = await readText('.item-info li');
    const publisher = await readText('div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');

    //Year: only the last four characters of the publication date are kept
    const year = await readText('div.biblio-wrap > div > ul > li:nth-child(3) > span');
    const newyear = year.slice(-4);

    // Price: try the candidates in priority order and stop at the first one
    // that exists. The previous try/catch/finally always ran the sale-price
    // lookup last, which overwrote any earlier result and threw after a long
    // timeout on pages without a sale price.
    const priceSelectors = [
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
      'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
      'p.list-price',
    ];
    let newprice;
    let lastError;
    for (const selector of priceSelectors) {
      try {
        const price = await readText(selector, { timeout: 1000 });
        newprice = price.slice(-6);
        break;
      } catch (err) {
        // This layout does not have the element; remember why and try the next one.
        lastError = err;
      }
    }
    if (newprice === undefined) {
      throw new Error('No price element found on the page', { cause: lastError });
    }

    console.log(title);
    console.log(author);
    console.log(genre);
    console.log(format);
    console.log(publisher);
    console.log(newyear);
    console.log(newprice);
  } finally {
    // Always release the browser, otherwise the Node process never exits.
    await browser.close();
  }
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

Updated: with code from Answer

const puppeteer = require('puppeteer');
const parse = require('csv-parser');
const fs = require('fs');


// Longest time (ms) to wait for any single price selector before falling back to the next.
const PRICE_WAIT_MS = 1000;

// Candidate price elements in priority order: sale price, any price span, list price.
const PRICE_SELECTORS = [
    'div.price.item-price-wrap.hidden-xs.hidden-sm > span.sale-price',
    'div.price.item-price-wrap.hidden-xs.hidden-sm > span',
    'p.list-price',
];

// Waits for `selector` to appear, then returns the innerText of its first match.
async function readInnerText(page, selector, options) {
    await page.waitForSelector(selector, options);
    return page.$eval(selector, (el) => el.innerText);
}

// Returns the text of the first price selector present on the page; throws if none is.
async function readPriceText(page) {
    let lastError;
    for (const selector of PRICE_SELECTORS) {
        try {
            return await readInnerText(page, selector, { timeout: PRICE_WAIT_MS });
        } catch (err) {
            // This layout does not have the element; remember why and try the next one.
            lastError = err;
        }
    }
    throw new Error('No price element found on the page', { cause: lastError });
}

/**
 * Searches bookdepository.com for one UPC/ISBN and scrapes the resulting page.
 *
 * @param {object} page - Puppeteer page (tab) to drive; it is navigated away from its current URL.
 * @param {string|number} upc - Code to search for. It is converted to a string
 *   because `page.type` iterates over its text argument and fails on a number
 *   with "TypeError: text is not iterable".
 * @returns {Promise<{title: string, author: string, genre: string, format: string,
 *   publisher: string, year: string, price: string}>} The scraped fields; `year` is the
 *   last 4 characters of the publication date and `price` the last 6 characters of the price text.
 * @throws {Error} If a required element never appears or no price element is found.
 */
async function getpageData(page,upc){
    await page.goto('https://www.bookdepository.com/');
    await page.type('#book-search-form > div.el-wrap.header-search-el-wrap > input.text-input', String(upc));
    await page.click('#book-search-form > div.el-wrap.header-search-el-wrap > button');

    // Title is read via textContent; every other field uses innerText.
    await page.waitForSelector('.item-info h1');
    const title = await page.$eval('.item-info h1', (h1) => h1.textContent);

    const author = await readInnerText(page, 'div.author-info.hidden-md > span > a > span');
    const genre = await readInnerText(page, '.active a');
    const format = await readInnerText(page, '.item-info li');
    const publisher = await readInnerText(page, 'div.biblio-wrap > div > ul > li:nth-child(4) > span > a > span');
    const year = await readInnerText(page, 'div.biblio-wrap > div > ul > li:nth-child(3) > span');

    // The previous try/catch/finally always ran the sale-price lookup last, which
    // overwrote any earlier result and threw on pages without a sale price.
    const price = await readPriceText(page);

    return {
        title: title,
        author: author,
        genre: genre,
        format: format,
        publisher: publisher,
        year: year.slice(-4),
        price: price.slice(-6)
    };
}


/**
 * Reads a CSV file and resolves with the CODECONTENT column of every data row.
 *
 * @param {string} filename - Path of the CSV file to read.
 * @param {string} [delimiter=','] - Column separator. It is passed to csv-parser
 *   as `separator`; the `delimiter` key used before is not a csv-parser option
 *   and was silently ignored.
 * @param {string} [encoding='utf-8'] - Encoding used to decode the file.
 * @returns {Promise<number[]>} One number per row, produced by unary-plus
 *   coercion of CODECONTENT (NaN when the column is missing or not numeric).
 *   NOTE(review): numeric coercion drops leading zeros; convert back with
 *   String() before typing a code into a page.
 */
function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
    // A synchronous throw inside the executor rejects the promise on its own,
    // so no surrounding try/catch is needed.
    return new Promise((resolve, reject) => {
        const rows = [];
        fs.createReadStream(filename, {encoding: encoding})
            // pipe() does not forward source errors (e.g. a missing file), so
            // the read stream needs its own handler or the promise never settles.
            .on('error', reject)
            .pipe(parse({separator: delimiter}))
            .on('data', (row) => rows.push(+row.CODECONTENT))
            .on('end', () => resolve(rows))
            .on('error', reject);
    });
}

/**
 * Loads every UPC listed in Book_Bulk.csv.
 *
 * @returns {Promise<number[]>} The UPCs in file order, or an empty array when
 *   the file cannot be read, so callers can always iterate the result.
 */
async function upcData() {
    try {
        // The sample file is comma-separated, so the default separator is used.
        return await readCsvAsync('Book_Bulk.csv');
    } catch (err) {
        console.error(err);
        return [];
    }
}


/**
 * Scrapes book details for every UPC returned by upcData() and logs the results.
 *
 * All lookups reuse a single browser tab and run one after another. A UPC that
 * fails to scrape is reported and skipped so the results already collected are
 * kept. The browser is always closed at the end.
 *
 * @returns {Promise<void>} Resolves once the collected data has been logged.
 */
async function main(){
    const loaded = await upcData();
    // Guard against a loader that yields nothing, so the loop below cannot throw.
    const allupcs = Array.isArray(loaded) ? loaded : [];

    // console.log(allupcs);
    const browser = await puppeteer.launch({ headless: false, defaultViewport: null, args: ['--start-maximized']});
    const scrapedData = [];

    try {
        const page = await browser.newPage();

        for (const upc of allupcs) {
            try {
                scrapedData.push(await getpageData(page, upc));
            } catch (err) {
                // One bad UPC must not discard everything scraped so far.
                console.error(`Failed to scrape UPC ${upc}:`, err);
            }
        }
    } finally {
        // Always release the browser, otherwise the Node process never exits.
        await browser.close();
    }

    console.log(scrapedData);
}

main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});

Ben
  • 77
  • 9
  • 1
    The question seems to be about an issue with the csv-parser package. Puppeteer seems to be unrelated to the actual issue. Consider condensing the question to just the relevant part. It would make it much easier to understand and answer. – Boaz Oct 17 '21 at 10:27
  • Also, are you sure this is actually a CSV file? It looks like JSON. – Boaz Oct 17 '21 at 10:45
  • Thanks for pointing that out. Yes, it's a CSV. I've updated the question's description to reflect that. – Ben Oct 17 '21 at 11:19

1 Answer

1

As you have noticed, the CSV parser is asynchronous. "asynchronous" means you can't do this:

// Anti-example: the trailing numbers show the order in which the lines run.
// The final console.log executes right after the stream is set up (step 3),
// before any 'data' handler (steps 5-9) has filled the array.
const upcData = [];                           // 1
fs.createReadStream('Book_Bulk.csv')          // 2
    .pipe(parse({delimiter: ':'}))
    .on('data', (csvrow) => {                 // 5 6 7 8 9
        upcData.push(+csvrow.CODECONTENT);
    })
    .on('end',function() {                    // 10
      console.log(upcData);
    });
console.log(upcData);                         // 3
// call puppeteer or whatever                 // 4

I've outlined the order of execution. The last console.log() runs immediately after you set up the read stream. upcData will not contain anything at this point.

But it will contain data at point #10, and #5 etc will fill it.

That means: Whatever you want to do with upcData, do it inside the 'end' event handler.

    // Step 10 runs after the 'data' handlers (steps 5-9), so upcData is filled here.
    .on('end',function() {                    // 10
      console.log(upcData);
      for (let upc of upcData) {
        // call puppeteer or whatever
      }
    });

Since csv reader will give you one row per data event, you can also do things directly in the data event handler and not build an upcData array at all.

    .on('data', (csvrow) {                    // 5 6 7 8 9
        const upc = +csvrow.CODECONTENT;
        // call puppeteer or whatever
    })

If you want to be able to await the whole thing, you must turn it into a promise first. In this case again the relevant step (promise resolution) happens in the end callback:

/**
 * Reads a CSV file and resolves with all of its parsed rows.
 *
 * @param {string} filename - Path of the CSV file to read.
 * @param {string} [delimiter=','] - Column separator. It is passed to csv-parser
 *   as `separator`; `delimiter` is not a csv-parser option and would be ignored.
 * @param {string} [encoding='utf-8'] - Encoding used to decode the file.
 * @returns {Promise<object[]>} One object per data row, as emitted by the parser.
 */
function readCsvAsync(filename, delimiter=',', encoding='utf-8') {
    // A synchronous throw inside the executor rejects the promise on its own,
    // so no surrounding try/catch is needed.
    return new Promise((resolve, reject) => {
        const rows = [];
        fs.createReadStream(filename, {encoding: encoding})
            // pipe() does not forward source errors (e.g. a missing file), so
            // the read stream needs its own handler or the promise never settles.
            .on('error', reject)
            .pipe(parse({separator: delimiter}))
            .on('data', (row) => rows.push(row))
            .on('end', () => resolve(rows))
            .on('error', reject);
    });
}

/**
 * Entry point: waits until the whole CSV has been read before using the rows.
 */
async function main() {
    try {
        // The sample file is comma-separated, so the default separator is used.
        const rows = await readCsvAsync('Book_Bulk.csv');
        // call puppeteer or whatever
    } catch (err) {
        console.error(err);
    }
}
Tomalak
  • 332,285
  • 67
  • 532
  • 628
  • Thanks for the answer. I have tried using the await part, and I managed to get Puppeteer to open the browser and go to the website. But when it comes to entering the UPC into the text field, it throws an error - TypeError: text is not iterable – Ben Oct 17 '21 at 14:09
  • 1
    @Ben Good thing you got that part working. Don't get me wrong, but your question was not about Puppeteer at all (also noted by the first comment beneath it) - it was about how to structure your code so that it finishes reading the CSV file *before* it tries to use the data. I don't know the first thing about Puppeteer, so I could not help you with that, and interactive debugging in the Stack Overflow comments generally is a fruitless endeavor. Obviously you're calling Puppeteer wrong, and you need to do some more debugging. If you can't figure it out, post a new question just about this issue. – Tomalak Oct 17 '21 at 14:40
  • Thank you so much for your answer. I really appreciate it. – Ben Oct 17 '21 at 22:33
  • @Ben It's best to concentrate on one issue at a time for each question. Of course you primarily want to use Puppeteer, but this particular timing issue needed to be addressed before you could do anything at all with the data. Splitting up a complex issue into distinct questions helps everybody, including yourself - for example, I bet "how to wait until CSV is read in node" has been answered a couple of times already here, and you could have found one of those threads. – Tomalak Oct 18 '21 at 07:01