Friday, November 19, 2010

Grab the URLs from a Site or Page

If you want to crawl the links from an entire site or from a single page, use the function below:

// Accumulator shared with fnGrabTHeURLFromLink() through `global`:
// every same-host link the function collects is appended to this array.
$arrGetAllLinks = array();

/**
 * Collect the absolute links on a page that point back to the page's own host.
 *
 * The page at $strLink is downloaded with cURL and scanned for
 * href="http://www...." / href="https://www...." attributes. Every match whose
 * host equals the host of $strLink and that has not been seen before is
 * appended to the global $arrGetAllLinks array.
 *
 * Set $isRecuressive = 1 to collect the links of the whole site: each newly
 * collected link is then fetched and scanned in turn.
 * Set $isRecuressive = 0 (default) to collect the links of the single page only.
 *
 * Results accumulate in the global $arrGetAllLinks across calls; reset that
 * array before starting an unrelated crawl.
 *
 * @param string $strLink       Absolute URL of the page to scan.
 * @param int    $isRecuressive 1 to follow collected links, 0 to scan only $strLink.
 * @return array Unique links collected so far (the contents of $arrGetAllLinks).
 */
function fnGrabTHeURLFromLink($strLink,$isRecuressive = 0)
{
    global $arrGetAllLinks;

    // Tolerate a missing or clobbered accumulator instead of failing in in_array().
    if (!is_array($arrGetAllLinks)) {
        $arrGetAllLinks = array();
    }

    // parse_url() returns false on malformed input and omits 'host' for
    // scheme-less strings; without a host no link can ever match, so stop early.
    $arrParsedLink = parse_url($strLink);
    if (!is_array($arrParsedLink) || !isset($arrParsedLink['host'])) {
        return array_unique($arrGetAllLinks);
    }
    $strMainHost = $arrParsedLink['host'];

    $ch = curl_init();
    if ($ch === false) {
        return array_unique($arrGetAllLinks);
    }
    curl_setopt($ch, CURLOPT_URL, $strLink);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30); // give up after 30 seconds
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it
    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure; an empty body has nothing to scan.
    if (!is_string($result) || $result === '') {
        return array_unique($arrGetAllLinks);
    }

    // Absolute http/https links on a "www." host, delimited by double quotes.
    // The character right after "www." must not be a digit.
    $intMatches = preg_match_all(
        '/href="(https?:\/\/www\.[^0-9].+?)"/',
        $result,
        $output,
        PREG_SET_ORDER
    );
    if (!$intMatches) {
        return array_unique($arrGetAllLinks);
    }

    foreach ($output as $item) {
        $strFoundLink = $item[1];

        $arrParsedFound = parse_url($strFoundLink);
        if (!is_array($arrParsedFound) || !isset($arrParsedFound['host'])) {
            continue;
        }

        // Keep only links on the starting host that were not collected yet.
        if ($arrParsedFound['host'] !== $strMainHost
            || in_array($strFoundLink, $arrGetAllLinks, true)
        ) {
            continue;
        }

        // Record the link before descending so pages that link to each other
        // cannot cause endless recursion.
        $arrGetAllLinks[] = $strFoundLink;

        if ($isRecuressive == 1) {
            // Forward the flag so the crawl continues past the first level.
            // The nested call appends to the same global array, so its
            // return value does not need to be merged here.
            fnGrabTHeURLFromLink($strFoundLink, $isRecuressive);
        }
    }

    return array_unique($arrGetAllLinks);
}

// Example usage: replace the placeholder with the URL of the page to scan,
// then the collected links are printed with print_r().
print_r(fnGrabTHeURLFromLink("MENTION SITE HERE"));

No comments: