I've been developing a scraper-type chrome extension for internal/personal use to scrape course data from a university's website.
The high-level algorithm is as follows:
- Open up the main page where the user can input the class data they want to find. (The point is to use the data in this page to generate every possible url for every unique course page)
- Generate the first endpoint and open a new tab with that url. (This is what I'm calling the "second degree scrape")
- Begin the second degree scrape and when it's done, set the
chrome.storage.local
totrue
. Then close the tab. - The content script from the main page reads the local storage and sees that the state is true so it resolves the promise. It resets the local storage to
false
. - It generates the new url and recursively repeats the process until every possible url is created.
The extension works well when I set the storage true
and never modify it and simply console.log every possible url. The error arrises when I let the program open up a new tab and let it update local.storage
. Before using local.storage
I tried a similar implementation using messaging (simple and long-lived) and background but I had the same issue then.
Any ideas of what I could try?
Here's my code:
background/index.ts
chrome.storage.local.set({ secondDegreeState: false });
content/index.ts
const un:string = "***";
const pw:string = "***"
const levels:Array<string> = ['L', 'U', 'G'];
let ccyys:string = `20212`;
let search_type_main:string = `FIELD`
let signInSequence:Function = () => {
if(document.getElementById("login-button")){
let signInButton:HTMLInputElement = document.getElementById("login-button")?.children[0] as HTMLInputElement;
let username: HTMLInputElement = document.getElementById("username") as HTMLInputElement;
let password: HTMLInputElement = document.getElementById("password") as HTMLInputElement;
username.value = un;
password.value = pw;
signInButton.value = "Begin Scraping";
setTimeout(() => {
signInButton.click();
console.log('Sign in button pressed');
}, 2000);
}
}
let scrapingSeqence:Function = () => {
if(window.location.href === "https://utdirect.utexas.edu/apps/registrar/course_schedule/20212/"){ // If we are in the main registration page
firstDegreeScrape(0, 1);
}
if(window.location.hostname == "utdirect.utexas.edu"){ // Make sure that we're on a proper hostname
secondDegreeScrape();
}
}
function secondDegreePromise(url:string) : Promise<any> {
/// Open up a new tab with the generated URL
window.open(url, '_blank');
return new Promise (function callback(resolve:Function, reject:Function) {
chrome.storage.local.get(['secondDegreeState'], (response) => {
if(chrome.runtime.lastError) {
console.error(chrome.runtime.lastError.message);
reject("Chrome error");
}else if (response.secondDegreeState === false){ // If the second degree state is still not done
console.log('Still waiting for 2nd degree scrape to finish...'+' Its state is '+response.secondDegreeState);
setTimeout(callback, 5000); // repeat promise after n-seconds until state is true.
}else if(response.secondDegreeState === true){ // If the promise is complete
resolve("2nd degree scrape was complete!");
}else {
reject("Oopsie...");
}
})
});
}
// Two base cases, 1: it reaches the end of the levels array, 2: it reaches the end of the FOS list.
let firstDegreeScrape:Function = (levelNum: number, fosNum: number) => {
// Reset the scrape state (Turns it false)
chrome.storage.local.set({ secondDegreeState: false });
if (levelNum < levels.length){ // If not base case #1
const fosParent:HTMLElement|null = document.getElementById("fos_fl"); // Define the FOS parent element.
if(fosParent){ // If the fosParent is present. (Will most likely return true... just for extra safety)
let fosChildren = fosParent.children;
if(fosNum < fosChildren.length){ // If not base case #2
let fos:HTMLOptionElement = fosChildren[fosNum] as HTMLOptionElement; // The individual field of study.
let fosValue:string = fos.value.split(' ').join('+'); // Format FOS
const url:string = `https://utdirect.utexas.edu/apps/registrar/course_schedule/20212/results/?ccyys=${ccyys}&search_type_main=${search_type_main}&fos_fl=${fosValue}&level=${levels[levelNum]}`;
secondDegreePromise(url)
.then((res)=>{ // If the second degree scrape promise is resolved
console.log(res+"Now moving along to next URL.");
firstDegreeScrape(levelNum, fosNum+1); // Generate the next URL and scrape it
})
.catch(res=>{console.log(res)});
}else {
firstDegreeScrape(levelNum+1, 1);
}
}
}
}
let secondDegreeScrape:Function = () => {
// make sure that there is something to scrape
let table: HTMLTableElement = document.getElementsByClassName('rwd-table')[0] as HTMLTableElement;
if(table){
let t_rows:HTMLCollection = table.children[1].children as HTMLCollection;
let t_rows_arr:Element[] = Array.from(t_rows);
for(let i=0; i < t_rows_arr.length; i++){
// console.log(t_rows_arr[i].childElementCount);
if(t_rows_arr[i].childElementCount == 1){ // If the row is a title
let course_title:any = t_rows_arr[i].childNodes[1].firstChild?.firstChild?.textContent;
let divisionRegex = /^[a-z\s]{0,3}/gi;
let courseNumRegex = /\d*\w/m;
console.log("Division: "+course_title.match(divisionRegex)[0]);
course_title = course_title.replace(divisionRegex, "");
console.log("Course Number: "+course_title.match(courseNumRegex)[0]);
course_title = course_title.replace(courseNumRegex, "");
console.log("Course Name: "+course_title);
}else { // If it's a sub-column
let row = t_rows_arr[i];
let rowChildren = row.childNodes;
let unique = rowChildren[1].childNodes[0].childNodes[0].textContent; //
console.log("Unique: "+unique);
let days = rowChildren[3].textContent;
console.log("Days: "+days);
let hour = rowChildren[5].textContent;
console.log("Hour: "+hour);
// let room;
let instruction_mode = rowChildren[9].textContent;
console.log("Instruction Mode: "+instruction_mode);
let instructor = rowChildren[11].textContent;
console.log("Instructor: "+instructor);
let status = rowChildren[13].textContent;
console.log("Status: "+status);
let flags = rowChildren[15].textContent;
console.log("Flags: "+flags);
let core = rowChildren[17].textContent;
console.log("Core: "+core);
console.log("\n");
}
}
if(document.getElementById("next_nav_link")){ // If there is a next page
setTimeout(()=>{
document.getElementById("next_nav_link")?.click(); // Click the next button
}, 5000)
}else {
setTimeout(()=>{
// Let's complete the 2nd degree scrape (Sets true) & update the local variable
chrome.storage.local.set({ secondDegreeState: true });
//close the tab
window.close();
}, 1000)
}
}
}
let main:Function = () => {
signInSequence();
scrapingSeqence();
}
main();
manifest.json permissions:
- tabs
- declarativeContent
- storage
- activeTab
Thanks for the help!