/**
 * Crawler prototype: fetch a page, hand its content to a callback, then
 * follow the anchors found in it, depth-first, up to a maximum depth.
 *
 * Each distinct URL is fetched and reported to the callback at most once per
 * call, so link cycles and navigation links shared between pages do not
 * trigger repeated downloads.
 *
 * @author Lee <complet@163.com>
 *
 * @param string   $url      Start URL.
 * @param callable $callback Business-logic callback; receives the fetched
 *                           content exactly as returned by http::exec().
 * @param int      $depth    Number of levels to crawl, the start page
 *                           included (default 3). Values <= 0 do nothing.
 *
 * @return void
 */
function crawl($url, $callback, $depth = 3)
{
    $visited = array();
    _crawl_visit($url, $callback, $depth, $visited);
}

/**
 * Depth-first worker behind crawl(); $visited maps each fetched URL to the
 * largest remaining depth it was expanded with and to its outgoing links.
 */
function _crawl_visit($url, $callback, $depth, array &$visited)
{
    if ($depth <= 0) {
        return;
    }

    if (isset($visited[$url])) {
        // Already fetched. Only walk its links again when we now have more
        // depth budget than last time; otherwise nothing new is reachable.
        if ($visited[$url]['depth'] >= $depth) {
            return;
        }
        $visited[$url]['depth'] = $depth;
        $links = $visited[$url]['links'];
    } else {
        $http = new http($url);
        $content = $http->get()->exec();

        // Business processing.
        call_user_func($callback, $content);

        $links = array();
        foreach (_crawl_extract_links($content) as $href) {
            $target = _crawl_resolve_url($url, $href);
            if ($target !== null) {
                $links[] = $target;
            }
        }
        $links = array_values(array_unique($links));

        // Record before recursing so that cycles terminate.
        $visited[$url] = array('depth' => $depth, 'links' => $links);
    }

    foreach ($links as $link) {
        _crawl_visit($link, $callback, $depth - 1, $visited);
    }
}

/**
 * Returns the raw href values of all <a> tags in $html (empty array when
 * $html is not a non-empty string).
 */
function _crawl_extract_links($html)
{
    if (!is_string($html) || $html === '') {
        return array();
    }

    // "<a\b" keeps <abbr>, <area>, <article>... out; "\shref" keeps
    // attributes such as data-href out.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*["\']?([^"\'\s>]+)/i';
    if (!preg_match_all($pattern, $html, $matches)) {
        return array();
    }

    $hrefs = array();
    foreach ($matches[1] as $raw) {
        // Attribute values are HTML-escaped (e.g. "&amp;" in query strings).
        $hrefs[] = html_entity_decode($raw, ENT_QUOTES);
    }

    return $hrefs;
}

/**
 * Resolves $href against $base and returns an absolute http(s) URL without
 * fragment, or null when the link is not crawlable (empty, fragment-only,
 * non-http scheme, or the base URL has no host).
 */
function _crawl_resolve_url($base, $href)
{
    $href = trim($href);

    // A fragment never changes which document is fetched.
    $hashPos = strpos($href, '#');
    if ($hashPos !== false) {
        $href = substr($href, 0, $hashPos);
    }
    if ($href === '') {
        return null;
    }

    // Already absolute: keep http/https, drop mailto:, javascript:, tel:...
    if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $m)) {
        $linkScheme = strtolower($m[1]);
        return ($linkScheme === 'http' || $linkScheme === 'https') ? $href : null;
    }

    $parts = parse_url($base);
    if ($parts === false || empty($parts['host'])) {
        return null;
    }

    $scheme = !empty($parts['scheme']) ? $parts['scheme'] : 'http';

    // Protocol-relative link: inherits only the scheme.
    if (strncmp($href, '//', 2) === 0) {
        return $scheme . ':' . $href;
    }

    $root = $scheme . '://';
    if (isset($parts['user']) && $parts['user'] !== '') {
        $root .= $parts['user'];
        if (isset($parts['pass']) && $parts['pass'] !== '') {
            $root .= ':' . $parts['pass'];
        }
        $root .= '@';
    }
    $root .= $parts['host'];
    if (isset($parts['port'])) {
        $root .= ':' . $parts['port'];
    }

    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    // Keep the query out of path normalisation.
    $query = '';
    $queryPos = strpos($href, '?');
    if ($queryPos !== false) {
        $query = substr($href, $queryPos);
        $href = substr($href, 0, $queryPos);
    }

    if ($href === '') {
        // Query-only reference ("?page=2") keeps the base document path.
        $path = $basePath;
    } elseif ($href[0] === '/') {
        // Root-relative reference.
        $path = $href;
    } else {
        // Relative to the directory of the base document, not to the file.
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $href;
    }

    return $root . _crawl_normalize_path($path) . $query;
}

/**
 * Collapses "." and ".." segments of an absolute path ("/a/b/../c" => "/a/c").
 */
function _crawl_normalize_path($path)
{
    $segments = explode('/', substr($path, 1));
    $last = end($segments);

    $kept = array();
    foreach ($segments as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            array_pop($kept);
            continue;
        }
        $kept[] = $segment;
    }

    // A trailing "." or ".." denotes a directory: keep the trailing slash.
    if ($last === '.' || $last === '..') {
        $kept[] = '';
    }

    return '/' . implode('/', $kept);
}
来源: http://www.bubuko.com/infodetail-2632870.html