use strict;
use warnings;

use Encode;
use LWP::Simple;
use IO::Handle;
use HTML::Element;
use HTML::TreeBuilder;
use Data::Dumper qw(Dumper);

$Data::Dumper::Indent = 1;

# Scrapes one result page per issue number (07001 .. 07093) and appends one
# line per issue to "<$mday>.txt" in the form
#     <issue> <red values ...> <blue values ...>
# where the values are the texts of the elements whose "color" attribute is
# "red" (two-digit texts only) or "blue".
# NOTE(review): this looks like a lottery-draw scraper -- confirm.

# Basename of the output file.
# NOTE(review): despite its name the value looks like a year, not a day.
my $mday      = "2013";
my $out_file  = $mday . ".txt";
my $temp_file = "temp.txt";

# Downloads $url, dumps the EUC-CN encoded page to $temp_file, parses that
# file and returns the extracted values as one string in which every value is
# followed by a single space.
#
# Parameters:
#   $url       - address of the page to download
#   $temp_file - path of the scratch file the page is written to
# Returns:
#   the value string; an empty string (after a warning) when the page cannot
#   be downloaded or parsed.
# Dies when the scratch file cannot be written.
sub fetch_numbers {
    my ($url, $temp_file) = @_;

    # LWP::Simple::get() returns undef on any failure.
    my $content = get($url);
    if (!defined $content) {
        warn "Could not fetch $url\n";
        return "";
    }

    # Re-encode the page as EUC-CN before dumping and parsing it.
    $content = encode("euc-cn", $content);

    # The original author noted that saving the fetched page is not strictly
    # necessary; the dump is kept so temp.txt still holds the last page seen.
    open(my $dump, '>', $temp_file)
        or die "Cannot open $temp_file for writing: $!";
    print {$dump} $content;
    close($dump) or die "Cannot close $temp_file: $!";

    my $tree = HTML::TreeBuilder->new;
    if (!defined $tree->parse_file($temp_file)) {
        warn "Could not parse $temp_file: $!\n";
        $tree->delete();
        return "";
    }

    my $numbers = "";
    my $body    = $tree->find_by_tag_name('body');
    if (defined $body) {
        # Red entries are kept only when their text contains two
        # consecutive digits.
        my @red = grep { /[0-9]{2}/ }
                  map  { $_->as_text() }
                  $body->find_by_attribute('color', 'red');

        # Blue entries are taken unfiltered.
        my @blue = map { $_->as_text() }
                   $body->find_by_attribute('color', 'blue');

        # Each value is followed by one space, including the last one.
        $numbers = join "", map { $_ . " " } @red, @blue;
    }

    # Release the parse tree explicitly before the next page is processed.
    $tree->delete();

    return $numbers;
}

# Open the result file once, in append mode, and flush after every record so
# finished lines reach the disk even if a later download aborts the run.
open(my $out, '>>', $out_file)
    or die "Cannot open $out_file for appending: $!";
$out->autoflush(1);

for my $issue (7001 .. 7093) {
    # Placeholder address left by the original author ("look the URL up
    # yourself"); replace it with the real endpoint before running.
    my $url = "一个网址可自行查找?issue=0" . $issue;

    my $numbers = fetch_numbers($url, $temp_file);

    # One record per issue, terminated by a real newline.
    print {$out} "0" . $issue . " " . $numbers . "\n";
}

close($out) or die "Cannot close $out_file: $!";

# This snippet comes from http://www.codesnippet.cn/detail/300820135471.html
# Source: http://www.codesnippet.cn/detail/300820135471.html