this is the code I have so far, and this is what it should print
http://schulnetz.nibis.de/
http://schulnetz.nibis.de/admin/eingeben.phtml?schulnr=35877&aendern=4
http://schulnetz.nibis.de/admin1/schule_edit.php?schulnr=35877
http://schulnetz.nibis.de/daten_mail.php?x=1&Schul_Nr=35877
http://www.grundschule-bierden.de/
the only hrefs I am intrested in obtaining are
http://schulnetz.nibis.de/
http://schulnetz.nibis.de/admin/eingeben.phtml?schulnr=35877&aendern=4
http://schulnetz.nibis.de/admin1/schule_edit.php?schulnr=35877
http://schulnetz.nibis.de/daten_mail.php?x=1&Schul_Nr=35877
so i would need a loop that only look for these hrefs http://schulnetz.nibis.de/
but I cannot seem to find out how to do this. for example if the href was http://schulnetz.nibis.de/one
http://schulnetz.nibis.de/two
http://schulnetz.nibis.de/tree
http://schulnetz.nibis.de/four
http://another.wildow/
i should be able to store one-four and not the one that is another.
include('simple_html_dom.php');
function getHost($url) {
$parseUrl = parse_url(trim($url));
return trim($parseUrl[host] ? $parseUrl[host] : array_shift(explode('/', $parseUrl[path], 2)));
}
$url = "http://schulnetz.nibis.de/db/schulen/schule.php?schulnr=35877&lschb=";
//simple way to add the http:// that dom requires, using curl is a better option
if (substr($url, 0, 4) != "http") {
$url = "http://$url";
}
$parsed_url = getHost($url);
$http_parsed_host = "http://$parsed_url/";
$html = file_get_html($url);
foreach($html->find('a') as $element)
$dom = new DOMDocument();
@$dom->loadHTML($html);
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");
for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$href_link = $href->getAttribute('href');
if (substr($href_link, 0, 1) == "/") {
$href_link = trim($href_link,"/");
}
if (substr($href_link, 0, 2) == "//") {
$href_link = trim($href_link,"//");
}
if (substr($href_link, 0, 3) == "///") {
$href_link = trim($href_link,"///");
}
if ((substr($href_link, 0, 8) == "https://")) {
$final_href_link[] = $href_link;
} else {
if (substr($href_link, 0, 1) != "/") {
$final_href_link[] = "$http_parsed_host$href_link";
}
}
}
$links_array = array_unique($final_href_link);
sort($links_array);
foreach ($links_array as $links) {
//echo "$links<br />";
echo "<a href='$links'>$links</a><br />";
}