PHP 爬虫原型
/**
 * Crawler prototype: fetches a page, hands the result to a callback, then
 * follows the links found on that page depth-first.
 *
 * The page is fetched through the `http` class (defined outside this block)
 * via `new http($url)` followed by `->get()->exec()`.
 *
 * @author Lee <complet@163.com>
 *
 * @param string   $url      Initial URL to fetch.
 * @param callable $callback Business-logic handler; invoked once per fetched
 *                           page with whatever the HTTP client returned.
 * @param int      $depth    Number of levels to crawl, counting the initial
 *                           page (default 3). Values <= 0 make the call a no-op.
 *
 * @return void
 */
function crawl($url, $callback, $depth = 3)
{
    if ($depth <= 0) {
        return;
    }
    $depth--;

    $http = new http($url);
    $content = $http->get()->exec();

    // Business processing: the callback always receives the raw fetch result,
    // even when the request produced nothing.
    call_user_func($callback, $content);

    // No levels left, or nothing to parse: skip link mining entirely.
    if ($depth <= 0 || !is_string($content) || $content === '') {
        return;
    }

    // Keyed by URL so that different spellings of the same target
    // ("a.html", "./a.html", "a.html#top") are fetched only once per page.
    $targets = array();
    foreach (crawl_extract_links($content) as $link) {
        $resolved = crawl_resolve_link($url, $link);
        if ($resolved !== null) {
            $targets[$resolved] = true;
        }
    }

    foreach (array_keys($targets) as $target) {
        crawl($target, $callback, $depth);
    }
}

/**
 * Returns the distinct href values of the <a> tags found in an HTML string.
 *
 * @param string $content HTML source to scan.
 *
 * @return array List of raw (unresolved) href values, entity-decoded, in
 *               document order.
 */
function crawl_extract_links($content)
{
    // "<a" must be followed by whitespace so that <abbr>, <area>, ... are not
    // mistaken for anchors; "\shref" avoids matching data-href and friends.
    $pattern = '/<a(?=\s)[^>]*?\shref\s*=\s*["\']?([^"\'\s>]+)/i';

    if (!preg_match_all($pattern, $content, $matches)) {
        // Covers both "no match" (0) and a PCRE failure (false).
        return array();
    }

    $links = array();
    foreach ($matches[1] as $href) {
        // Attribute values are HTML-encoded ("&amp;" inside query strings).
        $links[] = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
    }

    return array_values(array_unique($links));
}

/**
 * Resolves a link found on a page against that page's URL.
 *
 * @param string $base URL of the page the link was found on.
 * @param string $link Raw href value.
 *
 * @return string|null Absolute http(s) URL without fragment, or null when the
 *                     link is not crawlable (empty, fragment-only, mailto:,
 *                     javascript:, ...) or the base URL cannot be parsed.
 */
function crawl_resolve_link($base, $link)
{
    $link = trim($link);

    // A fragment never changes which document is fetched.
    $hashPos = strpos($link, '#');
    if ($hashPos !== false) {
        $link = (string) substr($link, 0, $hashPos);
    }
    if ($link === '') {
        return null;
    }

    if (preg_match('#^https?://#i', $link) === 1) {
        return $link;
    }

    // Any other explicit scheme (mailto:, javascript:, tel:, ftp:, ...).
    if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $link) === 1) {
        return null;
    }

    $info = parse_url($base);
    if (is_array($info) && !isset($info['host'])) {
        // Scheme-less start URLs such as "example.com/dir" parse as a bare
        // path; the original code effectively treated them as http.
        $info = parse_url('http://' . $base);
    }
    if (!is_array($info) || !isset($info['host'])) {
        return null;
    }

    $scheme = isset($info['scheme']) ? $info['scheme'] : 'http';

    // Protocol-relative link: "//cdn.example.com/x".
    if (strncmp($link, '//', 2) === 0) {
        return $scheme . ':' . $link;
    }

    $origin = $scheme . '://';
    if (isset($info['user'])) {
        $origin .= $info['user'];
        if (isset($info['pass'])) {
            $origin .= ':' . $info['pass'];
        }
        $origin .= '@';
    }
    $origin .= $info['host'];
    if (isset($info['port'])) {
        $origin .= ':' . $info['port'];
    }

    $basePath = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';
    if ($basePath[0] !== '/') {
        $basePath = '/' . $basePath;
    }

    // Keep the query string out of dot-segment normalisation.
    $query = '';
    $queryPos = strpos($link, '?');
    if ($queryPos !== false) {
        $query = substr($link, $queryPos);
        $link = (string) substr($link, 0, $queryPos);
    }

    if ($link === '') {
        // Query-only link: same document, different query.
        $path = $basePath;
    } elseif ($link[0] === '/') {
        // Root-relative link.
        $path = $link;
    } else {
        // Path-relative link: resolve against the directory of the base
        // document, not against the document itself.
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $link;
    }

    // Collapse "." and ".." segments; never climb above the root.
    $segments = array();
    $parts = explode('/', $path);
    $lastIndex = count($parts) - 1;
    foreach ($parts as $index => $part) {
        if ($part === '.' || $part === '..') {
            if ($part === '..' && count($segments) > 1) {
                array_pop($segments);
            }
            if ($index === $lastIndex) {
                // A trailing dot segment denotes a directory: keep the slash.
                $segments[] = '';
            }
            continue;
        }
        $segments[] = $part;
    }

    return $origin . implode('/', $segments) . $query;
}
声明：本站所有文章资源内容，如无特殊说明或标注，均为采集的网络资源。如若本站内容侵犯了原作者的合法权益，可联系本站删除。