- <?php
- namespace Think;
// Tell the client that the response body is UTF-8 encoded HTML.
header('Content-Type: text/html;charset=utf-8');
/**
 * Downloads one page with cURL and extracts article fields from its HTML.
 *
 * The HTTP request is performed in the constructor; the response body is
 * stored in $data and every extraction method works on that string.
 */
class Mycurl
{
    /** @var resource|object|false|null cURL handle created by the constructor. */
    public $ch = null;

    /** @var string|false|null Response body, or false when the request failed. */
    public $data = null;

    /**
     * Fetch $url and keep the response body in $data.
     *
     * @param string $url Address of the page to download.
     */
    public function __construct($url)
    {
        $this->ch = curl_init($url);
        if (!$this->ch) {
            // No usable handle: report the same value curl_exec() uses for failure.
            $this->data = false;
            return;
        }

        // Do not include the response headers in the returned body.
        curl_setopt($this->ch, CURLOPT_HEADER, false);
        // Return the body from curl_exec() instead of printing it.
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, true);
        $this->data = curl_exec($this->ch);
    }

    /**
     * Release the cURL handle.
     */
    public function __destruct()
    {
        // Before PHP 8.0 the handle is a resource that must be closed
        // explicitly. From 8.0 on it is an object that is freed when the last
        // reference is dropped, so curl_close() is skipped there.
        if (is_resource($this->ch)) {
            curl_close($this->ch);
        }
        $this->ch = null;
    }

    /**
     * Extract the article body with a regular expression.
     *
     * @return string Inner HTML of the "article_content" div up to the first
     *                closing div tag, or '' when the page has no such div.
     */
    public function regmatch()
    {
        $html = is_string($this->data) ? $this->data : '';
        // Lazy match so the capture stops at the first </div>; the "s"
        // modifier lets "." span line breaks.
        $reg = '/<div\sid="article_content"\sclass="article_content">(.*?)<\/div>/si';
        if (preg_match($reg, $html, $out) === 1) {
            return $out[1];
        }
        return '';
    }

    /**
     * Extract the text between two markers with plain string search.
     *
     * Both markers are matched case-insensitively. The end marker is looked
     * up only after the start marker.
     *
     * @param string $pos1 Start marker; it is included in the result.
     * @param string $pos2 End marker; it is not included in the result.
     *
     * @return string The slice from $pos1 up to $pos2, or '' when either
     *                marker is missing or no page was downloaded.
     */
    public function result($pos1, $pos2)
    {
        $html = is_string($this->data) ? $this->data : '';

        $from = stripos($html, $pos1);
        if ($from === false) {
            return '';
        }

        $to = stripos($html, $pos2, $from + strlen($pos1));
        if ($to === false) {
            return '';
        }

        return substr($html, $from, $to - $from);
    }

    /**
     * Build the article record from the downloaded page.
     *
     * @return array Record with the keys title, content, atime, author,
     *               sort and summary.
     */
    public function exec()
    {
        $titleOpen = '<title>';

        $title = $this->result($titleOpen, '-卢松松博客</title>');
        if ($title === '') {
            // Pages whose title lacks the site suffix still have a plain title.
            $title = $this->result($titleOpen, '</title>');
        }
        // result() keeps the start marker, so strip the opening tag here.
        $title = (string) substr($title, strlen($titleOpen));

        $content = $this->result('<dd class="post-info">', '<center>');
        // Turn site-relative upload paths into absolute URLs so images load.
        $content = str_ireplace("/upload/", "http://lusongsong.com/upload/", $content);
        // Undo the doubled host produced when the URL was already absolute.
        $content = str_ireplace("http://lusongsong.comhttp://lusongsong.com", "http://lusongsong.com", $content);
        // Undo the host injected into ".../blog/upload/..." paths.
        $content = str_ireplace("bloghttp://lusongsong.com", "blog", $content);

        $data = array();
        $data['title'] = $title;
        $data['content'] = $content;
        $data['atime'] = time();
        $data['author'] = 'Internet';
        $data['sort'] = '精彩博文';
        // Plain-text excerpt of at most 180 bytes.
        $data['summary'] = $this->truncateUtf8(strip_tags($content), 180);

        return $data;
    }

    /**
     * Cut $text to at most $maxBytes bytes without splitting a UTF-8 character.
     *
     * @param string $text     Text to shorten.
     * @param int    $maxBytes Upper bound for the result length in bytes.
     *
     * @return string The shortened text; when the text is not valid UTF-8
     *                the plain byte-wise cut is returned.
     */
    private function truncateUtf8($text, $maxBytes)
    {
        if (strlen($text) <= $maxBytes) {
            return $text;
        }

        $cut = (string) substr($text, 0, $maxBytes);
        // A UTF-8 character is at most 4 bytes long, so at most 3 trailing
        // bytes can belong to a character split by the cut.
        for ($drop = 0; $drop <= 3; $drop++) {
            $candidate = (string) substr($cut, 0, max(0, $maxBytes - $drop));
            // An empty pattern with the "u" modifier only matches when the
            // whole subject is valid UTF-8.
            if (preg_match('//u', $candidate) === 1) {
                return $candidate;
            }
        }

        return $cut;
    }
}
// Test run: download a range of article pages and store each one.
$url = 'http://lusongsong.com/reed/';
$num = 100;   // number of articles to fetch
$start = 350; // id of the first article to fetch
$Art = M('article');
for ($i = $start; $i < $start + $num; $i++) {
    $posurl = $url . $i . '.html';

    $curl = new Mycurl($posurl);
    $data = $curl->exec();
    // Release the cURL handle right away, also for pages that are skipped below.
    $curl = null;

    $data['oldlink'] = $posurl;
    $title = (string) $data['title'];

    // Nothing usable was extracted (failed request or page without a title).
    if ($title === '') {
        continue;
    }
    // strpos() returns 0 when the title starts with the marker, so the result
    // must be compared against false instead of being tested for truthiness.
    if (strpos($title, "出现404错误页面了") !== false) {
        continue;
    }

    $Art->add($data);
}
$this->success("执行完成!","index");
- ?>
来源: http://www.phpxs.com/code/1003609/