- package com.thief.parser.impl;
- import java.io.IOException;
- import java.net.URISyntaxException;
- import java.util.ArrayList;
- import java.util.HashMap;
- import java.util.List;
- import java.util.Map;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import org.apache.http.HttpException;
- import org.apache.http.client.HttpClient;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.DefaultHttpClient;
- import org.apache.http.protocol.HTTP;
- import org.apache.log4j.Logger;
- import org.htmlparser.Node;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.AndFilter;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.NodeClassFilter;
- import org.htmlparser.tags.TableColumn;
- import org.htmlparser.tags.TableTag;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- import com.thief.parser.IMail163Parser;
- import com.thief.po.Contact;
- import com.thief.util.HttpUtil;
- import com.thief.util.StringUtil;
- public class Mail163ParserImpl implements IMail163Parser{
- private String loginUrl;
- private static final String charCode = HTTP.UTF_8;
- public List<Contact> parser(String email, String password) throws HttpException,
- IOException, InterruptedException, URISyntaxException {
- DefaultHttpClient client = new DefaultHttpClient();
- try {
- String loginRes = login(email, password,client);
- return parser(client,loginRes, email);
- } catch (Exception e) {
- // TODO: handle exception
- }finally{
- client.getConnectionManager().shutdown();
- }
- return null;
- }
- public String login(String email, String password, HttpClient client)
- throws IllegalStateException, URISyntaxException, IOException, HttpException, InterruptedException{
- Map<String,String> map = new HashMap<String, String>();
- map.put(".verifycookie", "1");
- map.put("style", "35");
- map.put("product", "mail163");
- map.put("username", email);
- map.put("password", password);
- map.put("selType=", "jy");
- map.put("remUser", "on");
- map.put("secure", "on");
- String res = HttpUtil.doPost(client, loginUrl,map, charCode);
- if (res.indexOf("跳转提示") != -1) {
- HttpGet get = new HttpGet(
- "http://entry.mail.163.com/coremail/fcg/ntesdoor2?username="+email+"&lightweight=1&verifycookie=1&language=-1&style=-1");
- res = StringUtil.readInputStream(client.execute(get)
- .getEntity().getContent(), charCode);
- }else if(res.indexOf("errorType")!=-1){
- throw new RuntimeException("帐号或密码错误");
- }
- return res;
- }
- String getUsers="http://tg4a84.mail.163.com/jy3/address/addrprint.jsp?sid=";
- private static String regex = "iframe src=\\"index.jsp\\\\?sid=([^\\"]+)";
- public List<Contact> parser(HttpClient client, String content, String email) throws IllegalStateException, URISyntaxException, IOException, HttpException, InterruptedException {
- //iframe src="index.jsp?sid=zBObqxwciWMxDZiIlwccEFhCuYOLgipm"
- String id = StringUtil.getByRegex(regex, 1, content);
- if(id == null || "".equals(id.trim())){
- throw new RuntimeException("没能获取到关键ID");
- }
- String userJson = getUsers+id;
- Map<String, String> map = new HashMap<String, String>();
- //String res = HttpUtil.doPost(client, userJson,map , charCode);
- log.info(userJson);
- userJson = userJson+"&dd=" + System.currentTimeMillis();
- String res = HttpUtil.doGet(client, userJson, null);
- log.info(res);
- List<Contact> contactList = new ArrayList<Contact>();
- /*try {
- parseByHtmlParser(res);
- } catch (ParserException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- if(1==1 )return contactList;
- */ String aa = "<div class=\\"gTitleSub\\"><div align=\\"left\\"><b class=\\"mTT\\">(.*?)</b></div><div class=\\"Extra\\"></div></div><table class=\\"gTable\\"><tr id=\\"tr_base_0\\" style=\\"\\"><th>邮件地址:</th><td>(.*?)</td></tr>(.*?)</table>";
- Pattern p = Pattern.compile(aa,Pattern.DOTALL);
- Matcher m = p.matcher(res);
- int groupNum = 0;
- int firstIndex = 0;
- while(m.find(firstIndex))
- {
- String name = m.group(1);
- String email1 = m.group(2);
- contactList.add(new Contact(name,email1));
- firstIndex = m.end();
- groupNum++;
- }
- return contactList;
- }
- NodeFilter filter = new AndFilter(new NodeClassFilter(TableTag.class), new HasAttributeFilter("class", "gTable"));//new HasAttributeFilter("class","gTable");
- private List<Contact> parseByHtmlParser(String content) throws ParserException{
- List<Contact> contactList = new ArrayList<Contact>();
- Parser p = new Parser();
- p.setInputHTML(content);
- NodeList nodeList = p.extractAllNodesThatMatch(filter);
- if(nodeList != null && nodeList.size()!=0){
- for(int i=0;i<nodeList.size();i++){
- Node node = nodeList.elementAt(i);
- System.out.println(node.toHtml());
- }
- }
- return contactList;
- }
- public void setLoginUrl(String loginUrl) {
- this.loginUrl = loginUrl;
- }
- public static void main(String[] args) {
- String content = "<!doctype html><html><head><meta http-equiv=\\"Content-Type\\" content=\\"text/html; charset=utf-8\\" /><meta name=\\"application-name\\" content=\\"网易电子邮箱 - 极速4.0\\" /><link rel=\\"shortcut icon\\" href=\\"http://mimg.127.net/p/images/favicon3.ico\\" type=\\"image/x-icon\\"/><title>网易电子邮箱 - 极速4.0</title><style type=\\"text/CSS\\">.Patch118-safe-tit{ border-bottom:#DADADA 1px solid; padding:15px 0 25px 86px; position:relative; zoom:1}.Patch118-safe-tit .ico{ position:absolute; left:20px; top:10px}.Patch118-safe-ct{ padding:20px 25px; line-height:22px}</style></head><body style=\\"margin:0;padding:0;overflow:hidden\\" scroll=\\"no\\"><iframe src=\\"index.jsp?sid=zBObqxwciWMxDZiIlwccEFhCuYOLgipm\\" name=\\"index\\" style=\\"width:100%;height:100%;position:absolute\\" frameborder=\\"0\\" border=\\"0\\"></iframe></body></html>";
- String res = StringUtil.getByRegex(regex, 1, content);
- System.out.println(res);
- }
- Logger log = Logger.getLogger(Mail163ParserImpl.class);
- }
// This snippet comes from http://www.codesnippet.cn/detail/210820135272.html
// Source: http://www.codesnippet.cn/detail/210820135272.html