0

this is the code I have so far, and this is what it should print

http://schulnetz.nibis.de/
http://schulnetz.nibis.de/admin/eingeben.phtml?schulnr=35877&aendern=4
http://schulnetz.nibis.de/admin1/schule_edit.php?schulnr=35877
http://schulnetz.nibis.de/daten_mail.php?x=1&Schul_Nr=35877
http://www.grundschule-bierden.de/

the only hrefs I am intrested in obtaining are

http://schulnetz.nibis.de/
http://schulnetz.nibis.de/admin/eingeben.phtml?schulnr=35877&aendern=4
http://schulnetz.nibis.de/admin1/schule_edit.php?schulnr=35877
http://schulnetz.nibis.de/daten_mail.php?x=1&Schul_Nr=35877

so i would need a loop that only look for these hrefs http://schulnetz.nibis.de/ but I cannot seem to find out how to do this. for example if the href was http://schulnetz.nibis.de/one http://schulnetz.nibis.de/two http://schulnetz.nibis.de/tree http://schulnetz.nibis.de/four http://another.wildow/ i should be able to store one-four and not the one that is another.

include('simple_html_dom.php');

function getHost($url) {
            $parseUrl = parse_url(trim($url));
            return trim($parseUrl[host] ? $parseUrl[host] : array_shift(explode('/', $parseUrl[path], 2)));
        }
$url = "http://schulnetz.nibis.de/db/schulen/schule.php?schulnr=35877&lschb=";
//simple way to add the http:// that dom requires, using curl is a better option
if (substr($url, 0, 4) != "http") {
$url = "http://$url";
}

$parsed_url = getHost($url);

$http_parsed_host = "http://$parsed_url/";
$html = file_get_html($url);

foreach($html->find('a') as $element) 

$dom = new DOMDocument();
@$dom->loadHTML($html);


$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//a");

for ($i = 0; $i < $hrefs->length; $i++) {

$href = $hrefs->item($i);

$href_link = $href->getAttribute('href');

 if (substr($href_link, 0, 1) == "/") {
$href_link = trim($href_link,"/");

}   

 if (substr($href_link, 0, 2) == "//") {
$href_link = trim($href_link,"//");

} 

 if (substr($href_link, 0, 3) == "///") {
$href_link = trim($href_link,"///");

}  


      if ((substr($href_link, 0, 8) == "https://")) {

         $final_href_link[] = $href_link;


} else {

  if (substr($href_link, 0, 1) != "/") {
$final_href_link[] = "$http_parsed_host$href_link";
}

}             
}
$links_array = array_unique($final_href_link);
sort($links_array);
foreach ($links_array as $links) {

//echo "$links<br />";
echo "<a href='$links'>$links</a><br />";

}
user3368897
  • 41
  • 1
  • 6
  • that code makes no sense. you are using simplehtmldom and domdocument together. you only need one of them, preferably the latter. Also, your foreach is broken. – Gordon Mar 02 '14 at 08:20

1 Answers1

0

Maybe I'm misunderstanding the problem here, but couldn't you just use strstr() to see if the $url contains "schulnetz.nibis.de"

if(strstr($url,'schulnetz.nibis.de')){...}

jonlink
  • 542
  • 6
  • 18
  • I already tried it, did not work for me. Unless I was doing it wrong if(strstr($url,'schulnetz.nibis.de')) { $final_href_link[] = $href_link; } echo "$final_href_link[]"; – user3368897 Mar 03 '14 at 04:17