I am new to Python/Selenium and wrote the following Python script (on Windows) to scrape the demographic details of the 5,484 physicians listed on the MA Board of Registration website.
My issue: the website is an .aspx site, so I initially chose Selenium. However, I would really appreciate any insights or recommendations on coding the next steps (see below). More specifically, is it more efficient to continue with Selenium, or should I incorporate Scrapy? Any insights are greatly appreciated!
- Select each physician's hyperlink (1-10 per page) by clicking each "PhysicianProfile.aspx?PhysicianID=XXXX" link on the "ChooseAPhysician" page.
- Follow each link and extract the demographic info: "phy_name", "lic_issue_date", "prim_worksetting", etc.
- Return to the "ChooseAPhysician" page and click "Next".
- Repeat for the remaining 5,474 physicians.
"""Scrape physician profile details from the MA physician-profile search site.

Flow: open the search page, search by zip code / distance with the
primary-care box ticked, collect the profile links shown on the results
page, then open each profile and read its demographic fields.

Only the first page of results is processed; following the "Next" link to
further result pages is not implemented here.
"""
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

SEARCH_URL = 'http://profiles.ehs.state.ma.us/Profiles/Pages/ChooseAPhysician.aspx?Page=1'

# Seconds to wait for a page to show the element we need (same as the
# original ``WebDriverWait(driver, 10)``).
WAIT_SECONDS = 10

# First-column links of every row in the results grid.  The original XPath
# was pinned to ``tr[2]`` and therefore only ever matched one physician.
RESULT_LINKS_XPATH = '//*[@id="PhysicianSearchResultGrid"]/tbody/tr/td[1]/a'

# Shared prefix of the XPaths that point into the second table of a profile.
_LICENSE_ROWS = '//*[@id="content"]/center/table[2]/tbody/tr[3]/td/table/tbody/tr'

# (field name, XPath) pairs read from a physician profile page.
PROFILE_FIELDS = [
    ("phy_name", '//*[@id="content"]/center/p[1]'),
    ("lic_status", _LICENSE_ROWS + '/td[1]/table/tbody/tr[2]/td[2]/a[1]'),
    ("lic_issue_date", _LICENSE_ROWS + '/td[1]/table/tbody/tr[3]/td[2]'),
    ("prim_worksetting", _LICENSE_ROWS + '/td[1]/table/tbody/tr[5]/td[2]'),
    ("npi", _LICENSE_ROWS + '/td[2]/table/tbody/tr[6]/td[2]'),
    ("Med_sch_grad_date",
     '//*[@id="content"]/center/table[3]/tbody/tr[3]/td/table/tbody/tr[2]/td[2]'),
    ("Area_of_speciality",
     '//*[@id="content"]/center/table[4]/tbody/tr[3]/td/table/tbody/tr/td[2]'),
]


def search_profile_urls(driver, zip_code="02109", distance="15"):
    """Run the physician search and return the profile URLs on the results page.

    Args:
        driver: an open Selenium WebDriver.
        zip_code: text typed into the zip-code box.
        distance: visible text of the distance option to select.

    Returns:
        A list of absolute profile URLs (strings), in page order.

    Raises:
        selenium.common.exceptions.TimeoutException: if no result link
            appears within ``WAIT_SECONDS``.
    """
    driver.get(SEARCH_URL)

    # ``zip_input`` rather than ``zip`` so the builtin is not shadowed.
    zip_input = driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_txtZip")
    distance_select = Select(
        driver.find_element(By.ID, "ctl00_ContentPlaceHolder1_cmbDistance"))
    print(distance_select.options)
    print([option.text for option in distance_select.options])
    distance_select.select_by_visible_text(distance)

    primary_care_checkbox = driver.find_element(
        By.ID, "ctl00_ContentPlaceHolder1_SpecialtyGroupsCheckbox_6")
    search_button = driver.find_element(
        By.ID, "ctl00_ContentPlaceHolder1_btnSearch")

    # Input zip code, tick the primary-care box, and submit the search.
    zip_input.send_keys(zip_code)
    primary_care_checkbox.click()
    search_button.click()

    # Wait for the result links to exist.  ``element_to_be_selected`` only
    # succeeds for selected checkboxes/options, so it would always time out
    # on a hyperlink.
    links = WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_all_elements_located((By.XPATH, RESULT_LINKS_XPATH)))

    # Read every href *before* navigating anywhere: the link elements go
    # stale as soon as the browser leaves the results page.
    hrefs = [link.get_attribute("href") for link in links]
    return [href for href in hrefs if href and "PhysicianProfile.aspx" in href]


def extract_profile(driver, url):
    """Open one physician profile and return its demographic fields.

    Args:
        driver: an open Selenium WebDriver.
        url: absolute URL of the profile page.

    Returns:
        A dict with one key per entry in ``PROFILE_FIELDS`` (each value is
        the list of texts of the elements matched by that field's XPath,
        empty if nothing matched) plus ``"link"`` holding ``url``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the physician-name
            element does not appear within ``WAIT_SECONDS``.
    """
    driver.get(url)

    # Explicit wait on the name element instead of a fixed sleep.
    name_xpath = PROFILE_FIELDS[0][1]
    WebDriverWait(driver, WAIT_SECONDS).until(
        EC.presence_of_element_located((By.XPATH, name_xpath)))

    # WebElements expose ``.text``; ``.extract()`` belongs to Scrapy
    # selectors and does not exist here.
    item = {}
    for field, xpath in PROFILE_FIELDS:
        item[field] = [el.text for el in driver.find_elements(By.XPATH, xpath)]

    # The results grid is not present on a profile page, so record the URL
    # that was opened rather than looking the link up again.
    item["link"] = url
    return item


def parse(self, response):
    """Scrapy-style callback kept for compatibility with the original script.

    ``self`` is unused and ``response`` only needs a ``url`` attribute.  The
    page is loaded with the module-level ``driver``, so this must be called
    while that browser session is still open.  ``SummaryItem`` is not defined
    anywhere in this script, so a plain dict with the same keys is returned.
    """
    return extract_profile(driver, response.url)


driver = webdriver.Chrome()
try:
    records = []
    for profile_url in search_profile_urls(driver):
        record = extract_profile(driver, profile_url)
        print(record)
        records.append(record)
finally:
    # Always close the browser, even if a lookup or wait fails.
    driver.quit()