建表语句:CREATE TABLE dy2018_url (id int(9) NOT NULL AUTO_INCREMENT, url varchar(2000) NOT NULL, status tinyint(2) NOT NULL, PRIMARY KEY(id));
代码:
- <?php
// Execute the tick machinery after every statement; pcntl uses ticks to hand
// pending signals to the callbacks registered below.
declare(ticks = 1);

// Bookkeeping shared with the functions below through `global`: the number of
// crawlers counted as finished and the PIDs of the forked crawlers.
$finish_count = 0;
$crawlers_pid = array();

// Both termination signals are routed to the same shutdown handler.
pcntl_signal(SIGTERM, 'signal_handler');
pcntl_signal(SIGQUIT, 'signal_handler');
/**
 * Signal handler: on SIGQUIT or SIGTERM, forwards SIGTERM to every PID stored
 * in the global $crawlers_pid, prints an exit banner and terminates the
 * current process. Any other signal number is ignored.
 *
 * @param int $signal Number of the signal that was delivered.
 * @return void Does not return when the signal is SIGQUIT or SIGTERM.
 */
function signal_handler($signal)
{
    global $crawlers_pid;

    // Only the two termination signals trigger a shutdown.
    if ($signal !== SIGQUIT && $signal !== SIGTERM) {
        return;
    }

    foreach ($crawlers_pid as $pid) {
        posix_kill($pid, SIGTERM);
    }

    echo "---------- crawl task exit ----------";
    exit();
}
/**
 * Fetches the content behind a URL with a plain GET request.
 *
 * @param string $url Address to read.
 * @return string|false Whatever file_get_contents() yields for $url
 *                      (false when the read fails).
 */
function get_page_content($url)
{
    return file_get_contents($url);
}
/**
 * Fetches the content behind a URL by sending a form-encoded POST request.
 *
 * @param string $url Address to post to.
 * @param array  $arr Form fields; encoded with http_build_query().
 * @return string|false Response body, or false when the request fails.
 */
function get_page_content_by_post($url, $arr)
{
    // Keep the encoded body in its own variable: it is needed both for the
    // Content-Length header and as the request content.
    $data = http_build_query($arr);

    $opts = array(
        'http' => array(
            'method'  => 'POST',
            // Header lines must each be terminated by CRLF.
            'header'  => "Content-Type: application/x-www-form-urlencoded\r\n"
                ."Content-Length: ".strlen($data)."\r\n",
            'content' => $data,
        ),
    );

    $context = stream_context_create($opts);

    return file_get_contents($url, false, $context);
}
/**
 * Main flow of the dy2018 crawl: forks one worker process per entry URL and
 * waits until every worker has exited.
 *
 * Each worker opens its own MySQL connection, crawls its entry URL with
 * crawl_process(), closes the connection and exits. The parent stores the
 * worker PIDs in the global $crawlers_pid (signal_handler() uses that list to
 * stop the workers) and counts reaped workers in the global $finish_count.
 *
 * This function never returns: every process that executes it ends in exit().
 *
 * NOTE(review): the mysql_* extension used here no longer exists in PHP 7+.
 * It is kept because crawl_process() issues its queries through the same
 * extension; migrating to mysqli/PDO has to change both places together.
 *
 * @return void
 */
function run_dy2018()
{
    global $crawlers_pid;
    global $finish_count;

    // TV category listings first ...
    $crawl_urls = array(
        'http://www.dy2018.com/html/tv/hytv/',
        'http://www.dy2018.com/html/tv/hepai/',
        'http://www.dy2018.com/html/tv/gangtai/',
        'http://www.dy2018.com/html/tv/oumeitv/',
        'http://www.dy2018.com/html/tv/rihantv/',
        'http://www.dy2018.com/html/tv/tvzz/',
    );
    // ... followed by the numbered sections /0/ through /20/.
    for ($section = 0; $section <= 20; $section++) {
        $crawl_urls[] = 'http://www.dy2018.com/'.$section.'/';
    }

    foreach ($crawl_urls as $i => $url) {
        $pid = pcntl_fork();

        if ($pid == -1) {
            echo "system error. check it now!";
            exit();
        }

        if ($pid > 0) {
            // Parent: remember the worker and go on forking the next one.
            $crawlers_pid[$i] = $pid;
            continue;
        }

        // Worker process from here on.
        // Drop the inherited PID list so that the inherited signal handler
        // does not make a worker send SIGTERM to its siblings.
        $crawlers_pid = array();

        $con = mysql_connect("localhost", "root", "123456");
        if (!$con) {
            die('Could not connect: '.mysql_error());
        }
        mysql_select_db("mysql", $con);

        crawl_process($url);

        mysql_close($con);
        // A worker must stop here; otherwise it would fall back into the
        // fork loop and spawn workers of its own.
        exit(0);
    }

    // Parent: reap workers without blocking. A blocking pcntl_waitpid() can
    // keep signal handlers from running, so poll with WNOHANG and sleep
    // between polls instead.
    $total = count($crawlers_pid);
    while ($finish_count < $total) {
        $reaped = pcntl_waitpid(-1, $status, WNOHANG);
        if ($reaped > 0) {
            $finish_count++;
            continue;
        }
        if ($reaped == -1) {
            // No child processes are left to wait for.
            break;
        }
        sleep(1);
    }

    echo "---------- crawl task finish ----------";
    exit();
}
/**
 * Crawls one entry URL: walks its paginated index (index.html, index_2.html,
 * ...) until a page can no longer be fetched, opens every detail page linked
 * from each index page, extracts the ftp:// download links found there and
 * inserts them into the dy2018_url table with status 0.
 *
 * Queries go through mysql_query() without an explicit link, so a MySQL
 * connection must already have been opened by the caller.
 *
 * @param string $url Entry URL of a listing section, ending with "/".
 * @return void
 */
function crawl_process($url)
{
    echo "start handle url:".$url;

    // Relative links to detail pages, e.g. /i/12345.html
    $info_url_pattern = '/\/i\/\d+\.html/';
    // ftp links ending in a known media extension. The links are embedded in
    // the page markup, so ^ and $ anchors cannot be used. "rmvb" must be tried
    // before "rm" (and the match must end on a word boundary), otherwise
    // ".rmvb" links are cut off after ".rm".
    $ftp_url_pattern = '/ftp:\/\/.*?\.(swf|avi|flv|mpg|rmvb|rm|mov|wav|asf|3gp|mkv)\b/i';

    for ($page_idx = 1; ; $page_idx++) {
        $page_url = get_page_index_url($url, $page_idx);
        // echo instead of printf: a URL is not a format string.
        echo "start crawl url:".$page_url."\n";

        $page_content = get_page_content($page_url);
        if (!is_valid_page($page_content)) {
            // First index page that cannot be fetched ends the pagination.
            break;
        }

        $matches_urls = array();
        preg_match_all($info_url_pattern, $page_content, $matches_urls);

        // A listing may link to the same detail page more than once; fetch it only once.
        foreach (array_unique($matches_urls[0]) as $detail_path) {
            $detail_page_content = get_page_content('http://www.dy2018.com'.$detail_path);

            if ($detail_page_content !== false) {
                // Detail pages are converted from GBK; stored links are UTF-8.
                $detail_page_content = mb_convert_encoding($detail_page_content, "UTF-8", "GBK");

                $ftp_urls = array();
                preg_match_all($ftp_url_pattern, $detail_page_content, $ftp_urls);

                foreach (array_unique($ftp_urls[0]) as $ftp_link) {
                    // The link comes from a scraped page: escape it before
                    // placing it inside the SQL string.
                    $safe_link = mysql_real_escape_string($ftp_link);
                    mysql_query("insert into dy2018_url (url, status) values('$safe_link','0')");
                }
            }

            // Pause between detail-page requests.
            sleep(1);
        }
    }
}
/**
 * Builds the URL of a numbered index page below a listing URL.
 *
 * Page 1 maps to "index.html", page N (N > 1) to "index_N.html"; for any
 * other page number the listing URL is returned unchanged.
 *
 * @param string $url Listing URL the file name is appended to.
 * @param int    $idx Page number.
 * @return string URL of the requested index page.
 */
function get_page_index_url($url, $idx)
{
    if ($idx == 1) {
        return $url.'index.html';
    }

    if ($idx > 1) {
        return $url.'index_'.$idx.'.html';
    }

    return $url;
}
/**
 * Decides from the fetched content whether a page is usable.
 *
 * @param mixed $content Content returned by a fetch (string or false).
 * @return bool true when $content is truthy, false otherwise.
 */
function is_valid_page($content)
{
    return (bool) $content;
}
// Start the crawl. run_dy2018() ends the process itself through exit(), so no
// statement placed after this call would ever execute.
run_dy2018();
- ?>
来源: http://www.phpxs.com/code/1003015/