0

I've been developing a scraper-type chrome extension for internal/personal use to scrape course data from a university's website.

The high-level algorithm is as follows:

  1. Open up the main page where the user can input the class data they want to find. (The point is to use the data in this page to generate every possible url for every unique course page)
  2. Generate the first endpoint and open a new tab with that url. (This is what I'm calling the "second degree scrape")
  3. Begin the second degree scrape and when it's done, set the chrome.storage.local to true. Then close the tab.
  4. The content script from the main page reads the local storage and sees that the state is true so it resolves the promise. It resets the local storage to false.
  5. It generates the new url and recursively repeats the process until every possible url is created.

The extension works well when I set the storage true and never modify it and simply console.log every possible url. The error arrises when I let the program open up a new tab and let it update local.storage. Before using local.storage I tried a similar implementation using messaging (simple and long-lived) and background but I had the same issue then.

Any ideas of what I could try?

Here's my code:

background/index.ts

chrome.storage.local.set({ secondDegreeState: false });

content/index.ts

const un:string = "***";
const pw:string = "***"
const levels:Array<string> = ['L', 'U', 'G'];
let ccyys:string = `20212`;
let search_type_main:string = `FIELD`


let signInSequence:Function = () => {
  if(document.getElementById("login-button")){
    let signInButton:HTMLInputElement = document.getElementById("login-button")?.children[0] as HTMLInputElement;
    let username: HTMLInputElement = document.getElementById("username") as HTMLInputElement;
    let password: HTMLInputElement = document.getElementById("password") as HTMLInputElement;
    username.value = un;
    password.value = pw;
    signInButton.value = "Begin Scraping";
    setTimeout(() => {
      signInButton.click();
      console.log('Sign in button pressed');
    }, 2000);
  }
}
let scrapingSeqence:Function = () => {
  if(window.location.href === "https://utdirect.utexas.edu/apps/registrar/course_schedule/20212/"){ // If we are in the main registration page
    firstDegreeScrape(0, 1);
  }
  if(window.location.hostname == "utdirect.utexas.edu"){ // Make sure that we're on a proper hostname
    secondDegreeScrape();
  }
}


function secondDegreePromise(url:string) : Promise<any> {
  /// Open up a new tab with the generated URL
  window.open(url, '_blank');
  return new Promise (function callback(resolve:Function, reject:Function) {
    chrome.storage.local.get(['secondDegreeState'], (response) => {
      if(chrome.runtime.lastError) {
        console.error(chrome.runtime.lastError.message);
        reject("Chrome error");
      }else if (response.secondDegreeState === false){ // If the second degree state is still not done
        console.log('Still waiting for 2nd degree scrape to finish...'+' Its state is '+response.secondDegreeState);
        setTimeout(callback, 5000); //    repeat promise after n-seconds until state is true.
      }else if(response.secondDegreeState === true){ // If the promise is complete
        resolve("2nd degree scrape was complete!");
      }else {
        reject("Oopsie...");
      }
    }) 
    });
}




// Two base cases, 1: it reaches the end of the levels array, 2: it reaches the end of the FOS list.
let firstDegreeScrape:Function = (levelNum: number, fosNum: number) => {
  // Reset the scrape state (Turns it false)
  chrome.storage.local.set({ secondDegreeState: false });
  if (levelNum < levels.length){ // If not base case #1
    const fosParent:HTMLElement|null = document.getElementById("fos_fl"); // Define the FOS parent element.
    if(fosParent){  // If the fosParent is present. (Will most likely return true... just for extra safety)
      let fosChildren = fosParent.children;
      if(fosNum < fosChildren.length){ // If not base case #2
        let fos:HTMLOptionElement = fosChildren[fosNum] as HTMLOptionElement; // The individual field of study.
        let fosValue:string = fos.value.split(' ').join('+'); // Format FOS
        const url:string = `https://utdirect.utexas.edu/apps/registrar/course_schedule/20212/results/?ccyys=${ccyys}&search_type_main=${search_type_main}&fos_fl=${fosValue}&level=${levels[levelNum]}`;
        secondDegreePromise(url)
          .then((res)=>{ // If the second degree scrape promise is resolved
            console.log(res+"Now moving along to next URL.");
            firstDegreeScrape(levelNum, fosNum+1); // Generate the next URL and scrape it
          })
          .catch(res=>{console.log(res)});
        
      }else { 
        firstDegreeScrape(levelNum+1, 1);
      }
    }
  }
}

let secondDegreeScrape:Function = () => {
  // make sure that there is something to scrape
  let table: HTMLTableElement = document.getElementsByClassName('rwd-table')[0] as HTMLTableElement;

  if(table){
    let t_rows:HTMLCollection = table.children[1].children as HTMLCollection;
    let t_rows_arr:Element[] = Array.from(t_rows);
    for(let i=0; i < t_rows_arr.length; i++){
      // console.log(t_rows_arr[i].childElementCount);
      if(t_rows_arr[i].childElementCount == 1){ // If the row is a title
        let course_title:any = t_rows_arr[i].childNodes[1].firstChild?.firstChild?.textContent;
        let divisionRegex = /^[a-z\s]{0,3}/gi;
        let courseNumRegex = /\d*\w/m;

        console.log("Division: "+course_title.match(divisionRegex)[0]);
        course_title = course_title.replace(divisionRegex, "");
        console.log("Course Number: "+course_title.match(courseNumRegex)[0]);
        course_title = course_title.replace(courseNumRegex, "");
        console.log("Course Name: "+course_title);
        
      }else { // If it's a sub-column
        let row = t_rows_arr[i];
        let rowChildren = row.childNodes;
        let unique = rowChildren[1].childNodes[0].childNodes[0].textContent; //
        console.log("Unique: "+unique);

        let days = rowChildren[3].textContent;
        console.log("Days: "+days); 
        
        let hour = rowChildren[5].textContent; 
        console.log("Hour: "+hour); 
        
        // let room;
        
        let instruction_mode = rowChildren[9].textContent;
        console.log("Instruction Mode: "+instruction_mode); 
        
        let instructor = rowChildren[11].textContent;
        console.log("Instructor: "+instructor);
        
        let status = rowChildren[13].textContent;
        console.log("Status: "+status);

        let flags = rowChildren[15].textContent;
        console.log("Flags: "+flags);
        
        let core = rowChildren[17].textContent;
        console.log("Core: "+core);
        console.log("\n"); 
      }
    }
    if(document.getElementById("next_nav_link")){ // If there is a next page
      setTimeout(()=>{
        document.getElementById("next_nav_link")?.click(); // Click the next button
      }, 5000)
    }else {
      setTimeout(()=>{
        // Let's complete the 2nd degree scrape (Sets true) & update the local variable
        chrome.storage.local.set({ secondDegreeState: true });
        //close the tab
        window.close();
      }, 1000)
    }
  } 
}

let main:Function = () => {
  signInSequence();
  scrapingSeqence();
}

main();

manifest.json permissions:

  • tabs
  • declarativeContent
  • storage
  • activeTab

Thanks for the help!

  • Use chrome.tabs.create instead of window.open. – wOxxOm Jun 09 '21 at 03:34
  • Thanks for the suggestion! I also thought this was the issue; however, I soon found out that [chrome.tabs.create is not available for content scripts](https://stackoverflow.com/a/15044140/13223787). Maybe I can try messaging the unique url to the background and call tabs.create from there? I'll give it a try. – Rodolfo J. Galván Jun 09 '21 at 05:14

0 Answers0