Modify it to fit your own needs (proxies, URL length, number of pages scraped, etc.). Enjoy.
Code:<?php
/**
 * Minimal cURL-based scraper: requests successive Google search result pages
 * for a keyword, extracts the link targets found between two hard-coded HTML
 * markers, and prints the de-duplicated list.
 */
class scraper
{
    /** cURL handle created by init() and reused for every request made by get(). */
    public $ch;

    /** Not read or written by this class; kept so the public property set is unchanged. */
    public $result;

    public function __construct()
    {
    }

    /**
     * Creates the cURL handle and applies the options shared by all requests.
     *
     * One user agent is picked at random from a fixed list and used for the
     * lifetime of the handle.
     */
    private function init()
    {
        $this->ch = curl_init();

        $agent = array(
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)',
            'Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)',
            'Opera/9.20 (Windows NT 6.0; U; en)',
            'Opera/9.00 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0',
            'Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.5) Gecko/20060127 Netscape/8.1'
        );

        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($this->ch, CURLOPT_USERAGENT, $agent[array_rand($agent)]);
        curl_setopt($this->ch, CURLOPT_TIMEOUT, 10);
        // The constant had been masked to "CURLOPT_******SESSION", which does not
        // parse; COOKIESESSION is the cURL option that fits the masked name.
        curl_setopt($this->ch, CURLOPT_COOKIESESSION, 1);
        // NOTE(review): TLS host/peer verification is switched off here, as in the
        // original. Re-enable it if this is ever pointed at an https:// URL that
        // matters.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, 0);
    }

    /**
     * Performs a GET request on the shared handle.
     *
     * @param string $url Address to fetch.
     * @return string|false Response body, or false when curl_exec() fails.
     */
    private function get($url)
    {
        curl_setopt($this->ch, CURLOPT_URL, $url);
        curl_setopt($this->ch, CURLOPT_POST, 0);

        return curl_exec($this->ch);
    }

    /* parse related */

    /**
     * Returns every substring of $source that sits between $tag1 and $tag2.
     *
     * Both markers are first rewritten to a private placeholder element so a
     * single fixed regex can be used regardless of what the markers contain.
     * The regex has no "s" modifier, so a match never spans a newline.
     *
     * @param string $source Text to search.
     * @param string $tag1   Literal opening marker.
     * @param string $tag2   Literal closing marker.
     * @return array List of captured substrings (empty when nothing matches).
     */
    private function parse_all($source, $tag1, $tag2)
    {
        $source = str_replace($tag1, '<tiny:parse>', $source);
        $source = str_replace($tag2, '</tiny:parse>', $source);
        preg_match_all('#<tiny:parse>(.*?)</tiny:parse>#', $source, $result);

        return $result[1];
    }

    /**
     * Extracts the result links from one page of HTML and drops the short ones.
     *
     * @param string $html         Page source as returned by get().
     * @param int    $minUrlLength Only links strictly longer than this are kept.
     * @return array List of link strings, in page order.
     */
    private function extract_urls($html, $minUrlLength)
    {
        $kept = array();
        foreach ($this->parse_all($html, '<h3 class=r><a href="', '" class=l>') as $url) {
            if (strlen($url) > $minUrlLength) {
                $kept[] = $url;
            }
        }

        return $kept;
    }

    /**
     * Scrapes the result pages for $keyword and echoes each unique link
     * followed by "<br />".
     *
     * @param string $keyword      Search phrase; URL-encoded before use.
     * @param int    $maxStart     Exclusive upper bound for the "start" offset;
     *                             pages are requested at offsets 0, 10, 20, ...
     *                             below this value.
     * @param int    $minUrlLength Links of this length or shorter are skipped.
     */
    public function go($keyword, $maxStart = 200, $minUrlLength = 40)
    {
        $this->init();

        // Initialised up front so a run with zero matches prints nothing
        // instead of operating on an undefined variable.
        $fin = array();

        for ($start = 0; $start < $maxStart; $start += 10) {
            $s = $this->get('google.com/search?hl=en&q=' . urlencode($keyword) . '&start=' . $start . '&sa=N');
            if (!is_string($s)) {
                // Request failed (curl_exec() returned false); move on to the next page.
                continue;
            }

            foreach ($this->extract_urls($s, $minUrlLength) as $url) {
                $fin[] = $url;
            }
        }

        // array_unique() returns the de-duplicated array; it does not modify its argument.
        foreach (array_unique($fin) as $result) {
            // The links come from remote HTML, so escape them before printing.
            // double_encode=false leaves entities already present (e.g. &amp;) intact.
            echo htmlspecialchars($result, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false) . '<br />';
        }
    }
}

// Accept only a plain string for "q"; a missing or array-valued parameter is
// treated the same as an empty query.
$q = (isset($_GET['q']) && is_string($_GET['q'])) ? $_GET['q'] : '';

if (trim($q) === '') {
    echo 'You must provide a query';
} else {
    $scr = new scraper();
    $scr->go($q);
}
// The closing tag is required here: non-PHP text follows in this file.
?>




LinkBack URL
About LinkBacks

Reply With Quote








Bookmarks